In [2]:
import json
import os
import pandas as pd

In [3]:
# Quick overview of the updated dataset.

# Read the science_submissions_2022_10 CSV export into a DataFrame.
submissions_csv = './data/science_submissions_2022_10.csv'
df = pd.read_csv(submissions_csv)

In [4]:
# Preview the first rows of the loaded submissions to check the columns and sample values.
df.head()

Unnamed: 0,id,author,created_utc,subreddit,score,url,title,domain,upvote_ratio,link_flair_text,author_flair_text,num_comments,comment_id,comment,comment_url
0,xsl4ut,MistWeaver80,1664595769,science,20794,https://link.springer.com/article/10.1007/s108...,A new look at an extremely rare female infant ...,link.springer.com,0.95,Anthropology,,556.0,,,
1,xsmhqo,[deleted],1664600103,science,1,,Dogs can discriminate between human baseline a...,,1.0,Animal Science,,2.0,,,
2,xsmuda,BoundariesAreFun,1664601279,science,145,https://www.insidehighered.com/news/2022/09/29...,New study explores why people drop out or don'...,insidehighered.com,0.87,Social Science,,62.0,,,
3,xsmxjz,TurretLauncher,1664601581,science,365,https://pubmed.ncbi.nlm.nih.gov/36175792/,Researchers identify the sodium leak channel n...,pubmed.ncbi.nlm.nih.gov,0.94,Medicine,,11.0,iqlc1th,**Abstract**\n\nWe identify the sodium leak ch...,
4,xsmxjz,TurretLauncher,1664601581,science,365,https://pubmed.ncbi.nlm.nih.gov/36175792/,Researchers identify the sodium leak channel n...,pubmed.ncbi.nlm.nih.gov,0.94,Medicine,,11.0,iqlcbfk,**Potential cancer breakthrough as scientists ...,


In [5]:
# For each post, evaluate the jargon metric on its title. The metric is computed for every subcategory of the category the post belongs to (taken from the link_flair_text column).
from jargon_metric import jargon_proportions as jp

0
link_flair_text
Health               984
Environment          285
Psychology           259
Medicine             246
Social Science       246
Biology              200
Astronomy            174
Neuroscience         154
Animal Science       133
Computer Science     130
Cancer                91
Earth Science         89
Physics               81
Epidemiology          74
Genetics              69
Engineering           51
Economics             44
Chemistry             41
Materials Science     30
Anthropology          28
Paleontology          24
Nanoscience           22
Mathematics           13
Geology               11
Breaking News          5
Name: count, dtype: int64


In [14]:
# For all the posts in each of the files in ./data/science_csvs, calculate the proportion of jargon words (jargon words for the category are defined in reddit_categories_pmi folder with a file named <link_flair_text>) in the title of the post
# The results will be saved in a new column in the dataframe

# Load the jargon words for each category.
# Maps a category name (the file name) to the list of jargon words in that file.
jargon_words = {}

for file in os.listdir('./data/reddit_categories_pmi'):
    # Each file is named after its category and holds one jargon word per line.
    # The encoding is explicit so decoding does not depend on the platform default.
    with open(f'./data/reddit_categories_pmi/{file}', 'r', encoding='utf-8') as f:
        # Strip surrounding whitespace and drop blank lines. A plain
        # read().split('\n') would keep a phantom '' entry for a file that
        # ends with a newline, and whitespace-padded words could never match.
        jargon_words[file] = [line.strip() for line in f if line.strip()]


In [15]:
# List the categories that have a jargon word list loaded (one key per file read from ./data/reddit_categories_pmi).
print(jargon_words.keys())

dict_keys(['Anthropology', 'Nanoscience', 'Neuroscience', 'Astronomy', 'Chemistry', 'Health', 'Psychology', 'Mathematics', 'Paleontology', 'Earth Science', 'Epidemiology', 'Materials Science', 'Geology', 'Cancer', 'Computer Science', 'Medicine', 'Economics', 'Environment', 'Genetics', 'Physics', 'Social Science', 'Engineering', 'Animal Science', 'Biology'])


In [37]:
def calculate_jargon_proportion(text: str, category: str):
    """
    Calculate the proportion of jargon words in the text for the category.

    The text is split on whitespace and every token found in the category's
    jargon list (``jargon_words[category]``) counts as jargon. A jargon word
    that occurs several times is counted every time, so numerator and
    denominator are both token counts. Matching is exact: no lower-casing or
    punctuation stripping is applied.

    Args:
        text: The text to score (a post title in this notebook).
        category: Key into the module-level ``jargon_words`` dict.

    Returns:
        Number of jargon tokens divided by the total number of tokens, a
        float between 0 and 1. Returns 0.0 when ``category`` is not a string,
        when ``text`` is not a string (e.g. a missing value read by pandas),
        or when ``text`` contains no tokens.

    Raises:
        KeyError: If ``category`` is a string with no entry in ``jargon_words``.
    """
    # A non-string category (e.g. NaN for a missing flair) cannot be looked up.
    if not isinstance(category, str):
        return 0.0
    # Look the category up before inspecting the text so that an unknown
    # category always surfaces as KeyError to the caller.
    jargon_set = set(jargon_words[category])
    # A non-string text has no words to score; avoid AttributeError on .split().
    if not isinstance(text, str):
        return 0.0
    # Calculate the proportion of jargon tokens among all tokens in the text.
    words = text.split()
    num_words = len(words)
    if num_words == 0:
        return 0.0
    # Count occurrences, not unique words: the denominator is a token count too.
    num_jargon_words = sum(1 for word in words if word in jargon_set)
    return num_jargon_words / num_words

    

In [None]:

# Score every post title in ./data/science_csvs and write one annotated CSV per
# input file. Failures are tallied in the counters below instead of aborting.
num_no_category = 0          # rows whose link_flair_text is missing or empty
num_no_title = 0             # rows whose title is missing or empty
num_no_jargon_words = 0      # kept for compatibility; never updated in this cell
invalid_files = set()        # file names that pandas could not read
invalid_categories = {}      # category -> number of rows with no jargon list
invalid_posts = []           # post id of every row with an invalid category

input_dir = './data/science_csvs'
output_dir = './data/r_science_jargon_metrics'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

for file in os.listdir(input_dir):
    try:
        df = pd.read_csv(f'{input_dir}/{file}')
    except Exception as e:
        # Best effort: report why the file is unreadable and keep going.
        print("Invalid file: ", file, f"({type(e).__name__}: {e})")
        invalid_files.add(file)
        continue

    # One value per row; rows that cannot be scored stay NaN.
    proportions = []
    for post_id, title, category in zip(df['id'], df['title'], df['link_flair_text']):
        proportion = float('nan')
        # pandas reads missing fields as float NaN, which is truthy, so a
        # plain `not title` check would never catch them.
        if not isinstance(title, str) or not title:
            num_no_title += 1
        elif not isinstance(category, str) or not category:
            num_no_category += 1
        else:
            try:
                proportion = calculate_jargon_proportion(title, category)
            except KeyError:
                # No jargon list exists for this flair. Announce each
                # category only once to avoid flooding the output.
                if category not in invalid_categories:
                    print("Invalid category: ", category)
                invalid_posts.append(post_id)
                invalid_categories[category] = invalid_categories.get(category, 0) + 1
        proportions.append(proportion)

    # Assign the whole column at once instead of writing cell by cell.
    df['jargon_proportion'] = proportions
    # NOTE: the output name keeps the input's full file name (extension
    # included) before the "_jargon.csv" suffix; left unchanged so existing
    # consumers of these files keep working.
    df.to_csv(f'{output_dir}/{file}_jargon.csv', index=False)

In [51]:
# Summary of the failures collected while scoring the titles.
n_invalid_categories = len(invalid_categories)
n_invalid_posts = len(invalid_posts)
n_invalid_files = len(invalid_files)

print(f"Number of invalid categories: {n_invalid_categories}")
print(f"Number of invalid posts: {n_invalid_posts}")
print(f"Number of posts with no title: {num_no_title}")
print(f"Number of posts with no category: {num_no_category}")
print(f"Number of invalid files: {n_invalid_files}")

# invalid_posts holds one id per failing row, so de-duplicate to count posts.
n_unique_invalid_posts = len(set(invalid_posts))
print(f"Number of unique invalid posts: {n_unique_invalid_posts}")

# Invalid categories whose name does not end with 'AMA'.
non_ama_invalid = [name for name in invalid_categories if not name.endswith('AMA')]
print(f"Number of invalid categories that do not end with AMA: {len(non_ama_invalid)}")

Number of invalid categories: 630
Number of invalid posts: 49986
Number of posts with no title: 0
Number of posts with no category: 0
Number of invalid files: 1
Number of unique invalid posts: 981
Number of invalid categories that do not end with AMA: 107


In [46]:
# print the invalid categories that do not end with AMA: 



107
['DNA Day Series | Genomics', 'Best of r/science', 'Food Chemistry', 'Social Media Discussion', 'Race in Applied Science Discussion', 'Air Purification', 'Meta', 'Computer Sci', 'Science Discussion', 'Deaths + injuries', 'Diversity in Stem Discussion', 'Marine Ecology in Coastal Systems', 'Includes treatment', 'Police Discussion', 'Climate Discussion', 'Ape Communication', 'March for Science Discussion', 'Breaking News', 'Realistic Robots', 'CRISPR Babies Discussion', 'Citizen science and the Flint water crisis', 'Extremophiles', 'Psychology - Author in Comments', 'Nobel Prize Discussion', 'Pet for Better Mental Health', 'Health Inequity Discussion', 'Science Communication', 'Science Political Action', 'NASA Kepler Mission', 'DNA Day Series | The Cancer Genome Atlas', 'Possibly Misleading', 'Personal Genomics Discussion', 'RETRACTED - Biology', 'Harmful Algal Bloom Oceanography', 'Announcement', 'Traumatic Brain Injury Discussion', 'Climate Change Communication', 'RETRACTED - Socia