In [1]:
'''
VMP 2022-02-09: 
check overlap between psychology, economy, political science, sociology and reproducibility, open science and replication
'''

'\nVMP 2022-02-09: \ncheck overlap between psychology, economy, political science, sociology and reproducibility, open science and replication\n'

In [2]:
import sys  
sys.path.insert(0, '/home/vicp')
from MAGsparkmasters import get_mag_with_cluster_connection
from MAGmasters import MicrosoftAcademicGraph
import os
from pyspark.sql import functions as F, Window
import pandas as pd
import matplotlib.pyplot as plt
import datetime, time
import numpy as np
from pyspark.sql.functions import avg
# Work from the home directory, then attach to the running cluster job
# (66005 is the id listed for cluster_new.job in the job listing this cell prints).
home_dir = '/home/vicp'
os.chdir(home_dir)
executor_options = {'memory_per_executor': 16000}  # presumably MB -- TODO confirm the unit
mag, spark = get_mag_with_cluster_connection(66005, **executor_options)

['NAME STATE JOBID', 'cluster_new.job RUNNING 66005', 'jupyter-notebook RUNNING 65985', '1.train.0 RUNNING 65995', '4.train.0 RUNNING 65986', '4.train.1 RUNNING 65987', '4.train.2 RUNNING 65988', '4.train.3 RUNNING 65989', '4.train.4 RUNNING 65990', '4.train.5 RUNNING 65991', '']


In [3]:
# Bare expression: echoes the `spark` object returned by the connection cell as this cell's output.
spark

# Load data

In [23]:
# One "<field>_total" dataframe per main field, loaded in a fixed order:
# psychology, economics, political science, sociology.
psych, econ, pol, soc = [
    mag.getDataframe(table_name)
    for table_name in (
        'psychology_total',
        'economics_total',
        'politicalscience_total',
        'sociology_total',
    )
]

In [24]:
# Lookup tables: the field-of-study catalogue (FoS) and the
# paper <-> field-of-study link table (PaperFieldsOfStudy).
FoS, pfs = (
    mag.getDataframe('FoS'),
    mag.getDataframe('PaperFieldsOfStudy'),
)

# Gather by search terms (open science, reproducibility, replication)

In [25]:
# FoS rows whose normalized name is exactly "open science" (no level restriction).
open_science = (
    FoS
    .filter(F.col('NormalizedName') == 'open science')
    .select('FieldOfStudyId', 'NormalizedName')
)

In [26]:
# FoS rows whose normalized name is exactly "reproducibility" (no level restriction).
reproducibility = (
    FoS
    .filter(F.col('NormalizedName') == 'reproducibility')
    .select('FieldOfStudyId', 'NormalizedName')
)

In [27]:
# FoS rows named exactly "replication", restricted to Level == 2.
replication = (
    FoS
    .filter(
        (F.col('NormalizedName') == 'replication')
        & (F.col('Level') == 2)
    )
    .select('FieldOfStudyId', 'NormalizedName')
)

# Get overlap between main fields & sub-categories of interest

In [34]:
def get_overlap(fos, sub_fos, level=2, out_dir="/home/vicp/data/2021-08-02/masters"):
    '''
    Write the papers that belong to both a main field and a sub-field to a csv file.

    fos: <str> main field of study (e.g. "psychology" or "economics");
        the dataframe named "<fos>_total" is loaded through `mag`.
    sub_fos: <str> sub-field, e.g. "open science" or "reproducibility";
        compared for exact equality with the FoS "NormalizedName" column.
    level: <int or None> value the FoS "Level" column must equal for the sub-field.
        Defaults to 2, the value this function has always used.
        Pass None to match on the name only (no level restriction).
    out_dir: <str> directory the csv is written to; defaults to the
        directory this function has always used.

    Returns None. The file <out_dir>/<fos>_<sub_fos with spaces removed>.csv
    receives the distinct (PaperId, NormalizedName) rows of the overlap.
    '''

    # load stuff (local names chosen so the notebook-level FoS / pfs are not shadowed)
    fos_total = mag.getDataframe(f'{fos}_total')
    fields_of_study = mag.getDataframe('FoS')
    paper_fields = mag.getDataframe('PaperFieldsOfStudy')

    # select the sub-field by name, optionally pinned to one hierarchy level
    subfos_filter = F.col('NormalizedName') == sub_fos
    if level is not None:
        subfos_filter = subfos_filter & (F.col('Level') == level)
    subfos_df = fields_of_study.filter(subfos_filter) \
        .select('FieldOfStudyId', 'NormalizedName')

    # for output: file names carry the sub-field without spaces
    sub_fos_name = sub_fos.replace(" ", "")
    out_path = os.path.join(out_dir, f"{fos}_{sub_fos_name}.csv")

    # join everything to get the articles:
    # sub-field -> papers tagged with it -> papers that are also in the main field.
    # toPandas() pulls the full result onto the driver before it is written.
    subfos_df.join(paper_fields, ['FieldOfStudyId'], "inner") \
        .join(fos_total, ['PaperId'], "inner") \
        .select('PaperId', 'NormalizedName') \
        .distinct() \
        .toPandas() \
        .to_csv(out_path, index = False)

    # print
    print(f"--- finished computing overlap between {fos} and {sub_fos} \n")

In [None]:
# Every (main field, sub-field) combination, main field varying slowest.
field_pairs = [
    (main_field, sub_field)
    for main_field in ["psychology", "economics", "politicalscience", "sociology"]
    for sub_field in ["open science", "replication", "reproducibility"]
]
for fos, sub_fos in field_pairs:
    get_overlap(fos, sub_fos)

--- finished computing overlap between psychology and open science 

--- finished computing overlap between psychology and replication 

--- finished computing overlap between psychology and reproducibility 

--- finished computing overlap between economics and open science 

--- finished computing overlap between economics and replication 

--- finished computing overlap between economics and reproducibility 



# Rough check on results

In [None]:
# just run this loop:
# Count the rows of each overlap csv. Sub-field names are spelled without spaces
# because that is how the files are named. A combination whose csv has not been
# written yet is reported and skipped, so one missing file does not abort the
# whole check.
for fos in ["psychology", "economics", "politicalscience", "sociology"]:
    for sub_fos in ["openscience", "replication", "reproducibility"]:
        csv_path = f"/home/vicp/data/2021-08-02/masters/{fos}_{sub_fos}.csv"
        if not os.path.exists(csv_path):
            print(f"--- no overlap file yet for {fos} and {sub_fos} ({csv_path}) \n")
            continue
        case = pd.read_csv(csv_path)
        n_records = len(case)
        print(f"--- overlap between {fos} and {sub_fos} is {n_records} \n")