In [1]:
'''
VMP 2022-02-24: 
Checking fraction of publications getting from preprint to publication stage. 
Using early preprocessing from "preprocessing". 
'''

'\nVMP 2022-02-24: \nChecking fraction of publications getting from preprint to publication stage. \nUsing early preprocessing from "preprocessing". \n'

In [2]:
import sys  
sys.path.insert(0, '/home/vicp')
from MAGsparkmasters import get_mag_with_cluster_connection
from MAGmasters import MicrosoftAcademicGraph
import os
from pyspark.sql import functions as F, Window
import pandas as pd
import matplotlib.pyplot as plt
import datetime, time
import numpy as np
from pyspark.sql.functions import avg
# Run from the home directory (the project-local MAG helper modules are imported from there).
os.chdir('/home/vicp')

# Connection settings, kept as named constants so they are easy to find and update.
# 66655 is the JOBID of a running 'cluster_new.job' in the job listing this cell prints;
# presumably it must be updated whenever the cluster job is restarted — confirm.
CLUSTER_JOB_ID = 66655
# Forwarded as memory_per_executor; the unit is not visible here (presumably MB — confirm).
MEMORY_PER_EXECUTOR = 16000

mag, spark = get_mag_with_cluster_connection(CLUSTER_JOB_ID,
                                             memory_per_executor=MEMORY_PER_EXECUTOR)

['NAME STATE JOBID', 'main_job PENDING 66636', 'smaller PENDING 66656', 'cluster_new.job RUNNING 66655', 'cluster_new.job RUNNING 66653', 'cluster_new.job RUNNING 66650', 'main_job RUNNING 66634', 'resnet50-5-epochs-cn4 RUNNING 66654', 'jupyter-notebook RUNNING 66649', 'tpcx-ai_benchmark.job RUNNING 66629', 'train-gan RUNNING 66518', '3.deberta.0 RUNNING 66631', '3.deberta.1 RUNNING 66630', 'small.2 RUNNING 66546', '']


In [3]:
# Display the `spark` object returned by get_mag_with_cluster_connection
# (presumably the SparkSession — confirm) to check that the connection is up.
spark

# Load data
Data-sets curated in "preprocessing". <br/>
Taking the full data-sets for psychology, economy, sociology and political science. 

In [4]:
# Load the 'psychology_total' table through the MAG helper; it is joined on PaperId below.
psychology = mag.getDataframe('psychology_total')

In [5]:
# Load the 'Papers' table through the MAG helper; it is joined on PaperId and on FamilyId below.
papers = mag.getDataframe('Papers')

# Select preprint data
Only preprints dated up to 2020-01-01 are kept, because there can be a delay between preprint and publication. 

In [7]:
# Psychology papers that are repository (preprint) records dated within
# 2005-01-01 .. 2020-01-01, both bounds inclusive.
first_date = datetime.date(2005, 1, 1)
last_date = datetime.date(2020, 1, 1)
is_repository = F.col('DocType') == 'Repository'
in_date_window = (F.col('Date') >= first_date) & (F.col('Date') <= last_date)

psychology_preprints = (
    papers.join(psychology, ['PaperId'], 'inner')
    .filter(is_repository)
    .filter(in_date_window)
    .select('PaperId', 'FamilyId', 'PaperTitle', 'DocType', 'Date')
    .distinct()
)

# Join by FamilyId

In [9]:
# Collect every Papers row that shares a FamilyId with a selected psychology preprint.
# FamilyIds are de-duplicated before the join; because of the final .distinct() the
# result is the same either way, the join just has fewer rows to handle.
psychology_selection = (
    psychology_preprints.select('FamilyId')
    .distinct()
    .join(papers, ['FamilyId'], 'inner')
    .select('PaperId', 'FamilyId', 'PaperTitle', 'DocType', 'Date')
    .distinct()
    .toPandas()
)

# Write the CSV in its own statement: DataFrame.to_csv returns None when a path is
# given, so chaining it onto the assignment left psychology_selection bound to None.
psychology_selection.to_csv("/home/vicp/data/2021-08-02/masters/psychology_preprints.csv", index = False)

# Quick data check (before Ucloud)

## General sanity check & cleaning

In [1]:
# NOTE(review): execution counts restart at 1 here, so this part appears to run in a
# fresh kernel (hence the repeated pandas import) — confirm.
import pandas as pd
# Read back the CSV written by the Spark section above (same path).
psych = pd.read_csv("/home/vicp/data/2021-08-02/masters/psychology_preprints.csv")

In [4]:
# Sanity check: preview the first rows. Looks good, although the field of study
# should perhaps have been gathered as well.
psych.head(5)

Unnamed: 0,PaperId,FamilyId,PaperTitle,DocType,Date
0,16083728,16083728,the virtue ethics hypothesis is there a nexus ...,Repository,2013-01-01
1,3123588271,16083728,the virtue ethics hypothesis is there a nexus ...,Repository,2013-01-01
2,64646616,64646616,how often to a museum motivations matter,Repository,2014-08-01
3,3121719317,64646616,how often to a museum motivations matter,Repository,2014-01-01
4,2952193591,81493280,ambiguous proximity distribution,Repository,2014-06-02


In [6]:
# Some preprint families also have records of other types, e.g. Book or BookChapter.
# NOTE: despite its name, 'overall_fraction' holds the number of rows per DocType
# (it comes from .size()), not a fraction.
doc_type_counts = psych.groupby('DocType').size()
doc_type_counts.reset_index(name = 'overall_fraction')

Unnamed: 0,DocType,overall_fraction
0,Book,31
1,BookChapter,44
2,Conference,4779
3,Dataset,1
4,Journal,20597
5,Repository,46844
6,Thesis,3


In [7]:
# filter out everything besides Repository, Conference and Journal

In [8]:
# Keep only the three document types of interest; rows of any other DocType are dropped.
kept_doc_types = ["Repository", "Conference", "Journal"]
keep_mask = psych["DocType"].isin(kept_doc_types)
psych_clean = psych[keep_mask]

In [10]:
# Re-count rows per DocType after filtering
# ('overall_fraction' holds counts from .size(), not fractions).
(
    psych_clean
    .groupby('DocType')
    .size()
    .reset_index(name = 'overall_fraction')
)

Unnamed: 0,DocType,overall_fraction
0,Conference,4779
1,Journal,20597
2,Repository,46844


In [11]:
# Now check how many FamilyIds occur with more than just a preprint record.
# A paper can (apparently) be published multiple times as a preprint.
# Also be careful: a paper might be published in both a Journal and a Conference after the repository version.

In [14]:
# One row per distinct (FamilyId, PaperTitle, DocType) combination.
pair_columns = ["FamilyId", "PaperTitle", "DocType"]
psych_pairs = psych_clean.loc[:, pair_columns].drop_duplicates()

In [25]:
# Preview the de-duplicated (FamilyId, PaperTitle, DocType) rows.
psych_pairs.head(5)

Unnamed: 0,FamilyId,PaperTitle,DocType
0,16083728,the virtue ethics hypothesis is there a nexus ...,Repository
2,64646616,how often to a museum motivations matter,Repository
4,81493280,ambiguous proximity distribution,Repository
5,81493280,ambiguous proximity distribution,Conference
6,99661415,becker meets ricardo a social and cognitive sk...,Repository


In [None]:
# check that all of the single ones are repository (yes)

In [30]:
# Keep only the families that occur exactly once in psych_pairs:
# keep=False discards every row whose FamilyId appears more than once.
psych_control = psych_pairs.drop_duplicates(
    subset = ['FamilyId'],
    keep = False,
)

In [31]:
# Row count per DocType among the single-row families (expected: Repository only).
psych_control.groupby('DocType').size()

DocType
Repository    9587
dtype: int64

# check overall 

In [23]:
# Step 1: number of psych_pairs rows per FamilyId (n_family).
# Step 2: number of families (occurence) for each value of n_family.
# NOTE(review): psych_pairs is de-duplicated on (FamilyId, PaperTitle, DocType), so a
# family whose Repository rows carry different titles gets n_family > 1 without any
# Journal/Conference record — confirm whether counting distinct DocTypes was intended.
rows_per_family = psych_pairs.groupby('FamilyId').size().reset_index(name = 'n_family')
psych_overall = rows_per_family.groupby('n_family').size().reset_index(name = 'occurence')

In [24]:
# Display the number of families for each rows-per-family value.
psych_overall

Unnamed: 0,n_family,occurence
0,1,9587
1,2,24554
2,3,544
3,4,66
4,5,16
5,6,5
6,7,1


In [26]:
# gathered n_family = 1 against everything else. 

In [33]:
# Share of all families that falls into each n_family bucket.
total_families = psych_overall["occurence"].sum()
psych_fraction = psych_overall.assign(fraction = psych_overall["occurence"] / total_families)

In [34]:
# Display the per-bucket counts together with their fraction of all families.
psych_fraction

Unnamed: 0,n_family,occurence,fraction
0,1,9587,0.275702
1,2,24554,0.706123
2,3,544,0.015644
3,4,66,0.001898
4,5,16,0.00046
5,6,5,0.000144
6,7,1,2.9e-05


In [None]:
# 27.6% of psychology preprint families (n_family == 1) have only a single record, i.e. they are not published in either Journals or at Conferences

# check replications

In [50]:
# Title substring used for the replication check below.
keyword = "replicat"

In [35]:
# Reminder of the psych_pairs layout before filtering on the keyword.
psych_pairs.head(5)

Unnamed: 0,FamilyId,PaperTitle,DocType
0,16083728,the virtue ethics hypothesis is there a nexus ...,Repository
2,64646616,how often to a museum motivations matter,Repository
4,81493280,ambiguous proximity distribution,Repository
5,81493280,ambiguous proximity distribution,Conference
6,99661415,becker meets ricardo a social and cognitive sk...,Repository


In [51]:
# Restrict to the preprint (Repository) rows, so the keyword is matched against
# the preprint title only.
is_preprint = psych_pairs["DocType"] == "Repository"
df_keyword_preprint = psych_pairs.loc[is_preprint]

In [53]:
# Keep the preprint rows whose title contains the keyword (case-insensitive).
# Uses the `keyword` variable instead of repeating the literal, and na=False so that
# a missing title counts as "no match" rather than breaking the boolean mask.
title_has_keyword = df_keyword_preprint['PaperTitle'].str.contains(keyword, case=False, na=False)
df_keyword_preprint = df_keyword_preprint.loc[title_has_keyword]

In [54]:
# Number of preprint rows whose title matches the keyword (we have very few papers here).
# `df_keyword` is not defined anywhere in the visible notebook (NameError on a fresh
# kernel); the frame built in the preceding cells is df_keyword_preprint.
len(df_keyword_preprint)

60

In [None]:
# merge back in 

In [66]:
# FamilyIds of the preprints whose title matches the keyword.
# Reads df_keyword_preprint: `df_keyword` is not defined anywhere in the visible
# notebook, so the original line fails on a fresh kernel.
replication_ids = list(df_keyword_preprint["FamilyId"])

In [67]:
# Flag every row (preprint or published version) whose FamilyId is in replication_ids.
# Vectorised isin replaces a per-row `x in list` scan; the resulting labels are the same.
in_replication_family = psych_pairs["FamilyId"].isin(replication_ids)
psych_pairs["keyword"] = in_replication_family.map({True: "match", False: "no_match"})

In [72]:
# Simple title matching over all rows gave 100. Here a family is flagged when its
# ORIGINAL (preprint) title contains *replicat*, which is what we want, so this is good.
keyword_counts = psych_pairs.groupby('keyword').size()
keyword_counts.reset_index(name = 'n_replicat').head()

Unnamed: 0,keyword,n_replicat
0,match,105
1,no_match,60603


In [73]:
# Check that the new 'keyword' column is present.
psych_pairs.head(5)

Unnamed: 0,FamilyId,PaperTitle,DocType,keyword
0,16083728,the virtue ethics hypothesis is there a nexus ...,Repository,no_match
2,64646616,how often to a museum motivations matter,Repository,no_match
4,81493280,ambiguous proximity distribution,Repository,no_match
5,81493280,ambiguous proximity distribution,Conference,no_match
6,99661415,becker meets ricardo a social and cognitive sk...,Repository,no_match


In [None]:
# subset the *match* records

In [74]:
# Rows belonging to families flagged as keyword matches.
is_match = psych_pairs["keyword"] == "match"
df_match = psych_pairs.loc[is_match]

In [76]:
# Rows per matching family (n_family), then number of families per n_family value.
# Derived from psych_pairs instead of from df_match itself, so re-running this cell
# gives the same result (the self-referential form fails on a second run because
# 'FamilyId' is no longer a column of df_match).
df_match = (
    psych_pairs.loc[psych_pairs["keyword"] == "match"]
    .groupby('FamilyId').size().reset_index(name = 'n_family')
    .groupby('n_family').size().reset_index(name = 'occurence')
)

In [77]:
# Share of the matching families that falls into each n_family bucket.
n_match_families = df_match["occurence"].sum()
keyword_fraction = df_match.assign(fraction = df_match["occurence"] / n_match_families)

In [78]:
keyword_fraction # 27.1% of the matching families have a single record, i.e. are only published as preprints (close to the 27.6% for all psychology preprints).

Unnamed: 0,n_family,occurence,fraction
0,1,16,0.271186
1,2,40,0.677966
2,3,3,0.050847
