In [1]:
'''
VMP 2022-03-10: 
Checking fraction of publications getting from preprint to publication stage. 
Using early preprocessing from "preprocessing" (all psychology papers). 
'''

'\nVMP 2022-03-10: \nChecking fraction of publications getting from preprint to publication stage. \nUsing early preprocessing from "preprocessing". \n'

In [2]:
import sys  
sys.path.insert(0, '/home/vicp/reform-psychology/MAG-data-curation')
from MAGsparkmasters import get_mag_with_cluster_connection
from MAGmasters import MicrosoftAcademicGraph
import os
from pyspark.sql import functions as F, Window
import pandas as pd
import matplotlib.pyplot as plt
import datetime, time
import numpy as np
from pyspark.sql.functions import avg
# Work from the project directory (hard-coded absolute path; adjust per machine).
os.chdir('/home/vicp/reform-psychology/MAG-data-curation')
# NOTE(review): 67299 looks like the id of the running cluster job (the job
# listing printed by this cell shows 'cluster_new.job RUNNING 67299'), so it
# presumably has to be updated for every new cluster session — TODO confirm.
# memory_per_executor is presumably in MB; verify against MAGsparkmasters.
mag, spark = get_mag_with_cluster_connection(67299, 
                               memory_per_executor=16000)

['NAME STATE JOBID', 'mpi RUNNING 67405', 'cluster_new.job RUNNING 67299', 'run_all_benchmarks.job RUNNING 67318', 'jupyter-notebook RUNNING 67298', '4.train.0 RUNNING 67406', '4.train.1 RUNNING 67407', '4.train.2 RUNNING 67408', '4.train.3 RUNNING 67409', 'bash RUNNING 67402', 'jupyter-notebook RUNNING 67397', '']


In [3]:
# Display the `spark` object obtained from get_mag_with_cluster_connection.
spark

# Load data
Data sets curated in "preprocessing". <br/>
The curated data sets cover psychology, economics, sociology and political science; this notebook loads only the full psychology data set.

In [4]:
# Curated psychology paper set ('psychology_papers'); joined on PaperId further down.
psychology = mag.getDataframe('psychology_papers')

In [5]:
# 'Papers' dataframe (presumably the complete MAG Papers table — TODO confirm).
# Supplies the PaperId, FamilyId, PaperTitle, DocType and Date columns used below.
papers = mag.getDataframe('Papers')

# Select preprint data
Only preprints dated from 2005-01-01 up to 2020-01-01 are included, because there can be a delay from preprint to publication.

In [6]:
# Psychology papers with DocType 'Repository' (i.e. preprints) dated
# 2005-01-01 through 2020-01-01, both bounds inclusive.
psychology_preprints = (
    papers.join(psychology, on=['PaperId'], how='inner')
    .where(F.col('DocType') == 'Repository')
    .where(F.col('Date').between(datetime.date(2005, 1, 1), datetime.date(2020, 1, 1)))
    .select('PaperId', 'FamilyId', 'PaperTitle', 'DocType', 'Date')
    .distinct()
)

# Join by FamilyId
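NB: some preprints have no FamilyId. They cannot match in this join and so drop out of the family data, but they are still counted in the total number of preprints used as the denominator below.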

In [7]:
# Every paper (any DocType) that shares a FamilyId with a psychology preprint.
psychology_selection = (
    psychology_preprints
    .select('FamilyId')
    .join(papers, on=['FamilyId'], how='inner')
    .select('PaperId', 'FamilyId', 'PaperTitle', 'DocType', 'Date')
    .distinct()
)

# Convert to pandas

In [8]:
# Collect both Spark dataframes into pandas for the rest of the analysis.
# TODO: should probably just save here & do the rest of the analysis on Ucloud
psych_pp_df = psychology_preprints.toPandas()
psych_fam_df = psychology_selection.toPandas()

# Keyword / Query

In [9]:
# Preprints whose title contains "replicat" (case-insensitive), which covers
# replicate / replication / replicating etc.
# na=False treats a missing title as "no match"; without it str.contains
# yields NaN for such rows and .loc refuses the non-boolean mask.
psych_pp_keyword = psych_pp_df.loc[psych_pp_df['PaperTitle'].str.contains("replicat", case=False, na=False)]

In [10]:
## prepare keyword subset 
def prepare_keys(df_keyword, df_family): 
    '''
    Collect all papers that belong to a family containing a keyword preprint.

    df_keyword: <pd.dataframe> the preprints that match the keyword;
        needs the columns "PaperId" and "FamilyId".
    df_family: <pd.dataframe> all papers that share a FamilyId with a preprint;
        needs the columns "PaperId" and "FamilyId".

    returns: <pd.dataframe> the rows of df_family (plus the "_merge" indicator
        column from the PaperId merge) whose FamilyId holds at least one
        keyword preprint. Each df_family row appears at most once.
    '''
    # keyword preprints without a FamilyId cannot be linked to a family
    df_keyword_fam = df_keyword[~df_keyword["FamilyId"].isnull()]
    df_keyword_paperid = df_keyword_fam[["PaperId"]].drop_duplicates()
    # flag which family rows are themselves keyword preprints (_merge == 'both')
    df_family_keyword = df_keyword_paperid.merge(df_family,
                                                    on = 'PaperId', 
                                                    how = 'outer', 
                                                    indicator = True)
    is_keyword_row = df_family_keyword['_merge'] == 'both'
    # One row per family: several keyword preprints can share a FamilyId, and
    # without de-duplication the merge below would repeat every row of that
    # family once per keyword preprint. NaN ids are dropped because pandas
    # merges NaN keys with each other.
    df_family_sub = df_family_keyword.loc[is_keyword_row, ["FamilyId"]] \
        .dropna() \
        .drop_duplicates()
    # keep every paper of the selected families
    df_keyword_family = df_family_keyword.merge(df_family_sub, on = 'FamilyId', how = 'inner')
    return df_keyword_family

In [11]:
# Family records restricted to families that contain a keyword-matching preprint.
psych_key_fam = prepare_keys(psych_pp_keyword, psych_fam_df)

# total number of preprints (match & no-match)

In [12]:
# Row counts of the preprint frames, taken before any FamilyId matching;
# passed to get_fraction as the denominators.
total_preprints = len(psych_pp_df)
total_preprints_keyword = len(psych_pp_keyword)

# Get fraction that get published later

In [13]:
def get_fraction(df_family, total_preprints): 
    '''
    Print how many preprint families ended up peer-reviewed, and the fraction.

    A family counts as published when its most recent record (by Date) among
    Repository / Conference / Journal records is a Journal or Conference record.

    df_family: <pd.dataframe> the preprints that are matched on FamilyId;
        needs the columns "FamilyId", "DocType" and "Date".
    total_preprints: <int> total number of preprints before matching
        (the denominator of the fraction).

    returns: None (the three summary lines are printed).
    '''
    preprint_type = 'Repository'
    peerreview_types = ['Journal', 'Conference']
    df_focus = df_family[df_family["DocType"].isin([preprint_type] + peerreview_types)]
    # latest record(s) of each family
    df_maxdate = df_focus.groupby('FamilyId')['Date'].max().reset_index(name = 'Date')
    df_lastpub = df_maxdate.merge(df_focus, on = ['FamilyId', 'Date'], how = 'inner')
    df_peerreview = df_lastpub[df_lastpub["DocType"].isin(peerreview_types)]
    # Count families, not rows: a family can have several peer-reviewed records
    # on its latest date and must still be counted only once.
    total_peerreview = df_peerreview['FamilyId'].nunique()
    # an empty selection has no defined fraction; report nan instead of crashing
    if total_preprints:
        fraction = round(total_peerreview/total_preprints*100, 2)
    else:
        fraction = float('nan')
    print(f"total preprints: {total_preprints}")
    print(f"total preprint --> publication: {total_peerreview}")
    print(f"total fraction preprint --> publication: {fraction}%")

In [14]:
# fraction of all psychology preprints whose family's latest record is a
# Journal or Conference publication
get_fraction(psych_fam_df, total_preprints)

total preprints: 71836
total preprint --> publication: 14770
total fraction preprint --> publication: 20.56%


In [15]:
# same fraction, restricted to psychology preprints whose title matches the keyword
get_fraction(psych_key_fam, total_preprints_keyword)

total preprints: 126
total preprint --> publication: 36
total fraction preprint --> publication: 28.57%
