In [1]:
import json
import os
import pandas as pd
from jedi.inference.helpers import is_string

In [2]:
# Get an overview of the updated dataset

# Load the dataset
df = pd.read_csv('./data/science_submissions_2022_10.csv')

In [3]:
df.head()

Unnamed: 0,id,author,created_utc,subreddit,score,url,title,domain,upvote_ratio,link_flair_text,author_flair_text,num_comments,comment_id,comment,comment_url
0,xsl4ut,MistWeaver80,1664595769,science,20794,https://link.springer.com/article/10.1007/s108...,A new look at an extremely rare female infant ...,link.springer.com,0.95,Anthropology,,556.0,,,
1,xsmhqo,[deleted],1664600103,science,1,,Dogs can discriminate between human baseline a...,,1.0,Animal Science,,2.0,,,
2,xsmuda,BoundariesAreFun,1664601279,science,145,https://www.insidehighered.com/news/2022/09/29...,New study explores why people drop out or don'...,insidehighered.com,0.87,Social Science,,62.0,,,
3,xsmxjz,TurretLauncher,1664601581,science,365,https://pubmed.ncbi.nlm.nih.gov/36175792/,Researchers identify the sodium leak channel n...,pubmed.ncbi.nlm.nih.gov,0.94,Medicine,,11.0,iqlc1th,**Abstract**\n\nWe identify the sodium leak ch...,
4,xsmxjz,TurretLauncher,1664601581,science,365,https://pubmed.ncbi.nlm.nih.gov/36175792/,Researchers identify the sodium leak channel n...,pubmed.ncbi.nlm.nih.gov,0.94,Medicine,,11.0,iqlcbfk,**Potential cancer breakthrough as scientists ...,


In [4]:
# For each post, we will evaluate the jargon metric for the title. We will evaluate the jargon metric for all the subcategories in the belonging category from the link_flair_text column
from jargon_metric import jargon_proportions as jp

In [5]:
# For all the posts in each of the files in ./data/science_csvs, calculate the proportion of jargon words (jargon words for the category are defined in reddit_categories_pmi folder with a file named <link_flair_text>) in the title of the post
# The results will be saved in a new column in the dataframe

# Load the jargon words for each category
jargon_words = {}

for file in os.listdir('./data/reddit_categories_pmi'):
    # Each file contains a list of jargon words for the category with its name. The words are separated by a newline character
    with open(f'./data/reddit_categories_pmi/{file}', 'r') as f:
        jargon_words[file] = f.read().split('\n')


In [6]:
print(jargon_words.keys())

dict_keys(['Anthropology', 'Nanoscience', 'Neuroscience', 'Astronomy', 'Chemistry', 'Health', 'Psychology', 'Mathematics', 'Paleontology', 'Earth Science', 'Epidemiology', 'Materials Science', 'Geology', 'Cancer', 'Computer Science', 'Medicine', 'Economics', 'Environment', 'Genetics', 'Physics', 'Social Science', 'Engineering', 'Animal Science', 'Biology'])


In [7]:
def calculate_jargon_proportion(text: str, category: str):
    """
    Calculate the proportion of jargon words in the text for the category
    """
    # if category is not a string: return 0
    if not isinstance(category, str):
        return 0
    # Get the jargon words for the category
    jargon_words_for_cat = jargon_words[category]
    # Calculaate the proportion of jargon words in the text compared to the total amount of words in the text
    words = text.split()
    num_words = len(words)
    if num_words == 0:
        return 0
    num_jargon_words = len(set(words).intersection(set(jargon_words_for_cat)))
    proportion = num_jargon_words / num_words
    return proportion

    

In [11]:

num_no_category = 0
num_no_title = 0
num_no_jargon_words = 0
invalid_files = set()
invalid_categories = {}
invalid_posts = []
posts_read = set()
num_irrelevant_categories = 0

for file in os.listdir('./data/science_csvs'):
    try: 
        df = pd.read_csv(f'./data/science_csvs/{file}')
    except Exception as e:
        print("Invalid file: ", file)
        invalid_files.add(file)
        continue
    for i, row in df.iterrows():
        post_id = row['id']
        if post_id in posts_read:
            continue
        category = row['link_flair_text']
        # filter away categories we dont want to include
        # We dont want to include AMA categories (they end with AMA), or dicussion categories
        if not category or not isinstance(category, str) or category.endswith('AMA') or category.endswith('Discussion'):
            num_irrelevant_categories += 1
            continue
        if category == "Computer Sci": 
            category = "Computer Science"
        title = row['title']
        posts_read.add(post_id)
        if not title:
            num_no_title += 1
            continue
        if not category:
            num_no_category += 1
            continue
        try: 
            jargon_proportion = calculate_jargon_proportion(title, category)
        except KeyError as e:
            invalid_posts.append(id)
            if category in invalid_categories:
                invalid_categories[category] += 1
            else:
                invalid_categories[category] = 1
            continue
        df.at[i, 'jargon_proportion'] = jargon_proportion
    df.to_csv(f'./data/r_science_jargon_metrics/{file}_jargon.csv', index=False)

Invalid category:  Planetary Exploration
Invalid category:  Ape Communication
Invalid category:  Swimming Pool Chemistry
Invalid category:  Meta
Invalid category:  Meta
Invalid category:  Meta
Invalid category:  History of Science
Invalid category:  Diversity in STEM
Invalid category:  Diversity in STEM
Invalid category:  Possibly Misleading
Invalid category:  Meta
Invalid category:  Science Communication
Invalid category:  Marine Ecology in Coastal Systems
Invalid category:  Hurricane Research
Invalid category:  News
Invalid category:  Subreddit News
Invalid file:  .DS_Store
Invalid category:  Best of r/science
Invalid category:  Misleading Title
Invalid category:  Misleading Title
Invalid category:  Battery Discussion Series
Invalid category:  Breaking News
Invalid category:  Breaking News
Invalid category:  Breaking News
Invalid category:  Breaking News
Invalid category:  Best of r/science
Invalid category:  Science Political Action
Invalid category:  Breaking News
Invalid category:

In [12]:
# Number of fails: 
print(f"Number of invalid categories: {len(invalid_categories)}")
print(f"Number of invalid posts: {len(invalid_posts)}")
print(f"Number of posts with no title: {num_no_title}")
print(f"Number of posts with no category: {num_no_category}")
print(f"Number of invalid files: {len(invalid_files)}")
print(f"Number of irrelevant categories: {num_irrelevant_categories}")

Number of invalid categories: 78
Number of invalid posts: 164
Number of posts with no title: 0
Number of posts with no category: 0
Number of invalid files: 1
Number of irrelevant categories: 121544


In [16]:
# Sort invalid categories by number of occurences
sorted_invalid_categories = sorted(invalid_categories.items(), key=lambda x: x[1], reverse=True)
print(sorted_invalid_categories)


[('Breaking News', 34), ('Conservation Panel Discussion ', 10), ('Meta', 8), ('Subreddit News', 8), ('Best of r/science', 7), ('Retraction', 6), ('Misleading Title', 4), ('Subreddit Feature', 4), ('News', 3), ('RETRACTED - Epidemiology', 3), ('RETRACTED - Medicine', 3), ('Diversity in STEM', 2), ('RETRACTED - Biology', 2), ('Subreddit Policy', 2), ('fucntion test ama', 2), ('In Mice', 2), ('RETRACTED - Social Science', 2), ('test', 2), ('Planetary Exploration', 1), ('Ape Communication', 1), ('Swimming Pool Chemistry', 1), ('History of Science', 1), ('Possibly Misleading', 1), ('Science Communication', 1), ('Marine Ecology in Coastal Systems', 1), ('Hurricane Research', 1), ('Battery Discussion Series', 1), ('Science Political Action', 1), ('AMA: Microbes, Health, and Society', 1), ('Net Neutrality', 1), ('Citizen science and the Flint water crisis', 1), ('Test post, ignore', 1), ('Extremophiles', 1), ('Adolescent Health', 1), ('RETRACTED - Health', 1), ('Unanswered Questions in Science