In [2]:
'''
VMP 2022-02-09: 
Checks whether the intended analysis is feasible & 
provides a second sanity check (besides 'check_preprocessing.ipynb')
'''

"\nVMP 2022-02-09: \nChecks whether the intended analysis is feasible & \nprovides a second sanity check (besides 'check_preprocessing.ipynb')\n"

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime, time
pd.options.mode.chained_assignment = None  # default='warn'

# Load relevant files

In [4]:
inpath = "/home/vicp/data/2021-08-02/masters/"

In [None]:
# Read the per-field paper metadata tables (one CSV per discipline), in a fixed
# order so the unpacking below lines up with the path list.
psych, econ, pol, soc = map(
    pd.read_csv,
    (
        f"{inpath}psychology_paper_meta.csv",
        f"{inpath}economics_paper_meta.csv",
        f"{inpath}politicalscience_paper_meta.csv",
        f"{inpath}sociology_paper_meta.csv",
    ),
)

ERROR! Session/line number was not unique in database. History logging moved to new session 823


In [None]:
# to datetime 
def to_datetime(df):
    """Convert the ``Date`` column of ``df`` to ``datetime.date`` objects.

    The column is parsed with ``pd.to_datetime`` and then reduced to its
    calendar-date part via ``.dt.date``. The frame is modified in place and
    the same object is returned.
    """
    parsed = pd.to_datetime(df["Date"])
    df["Date"] = parsed.dt.date
    return df

In [None]:
# Normalise the Date column of every field-level metadata frame.
psych, econ, pol, soc = map(to_datetime, (psych, econ, pol, soc))

In [None]:
# Read the per-field subset tables, in the same field order as the metadata
# frames they are merged with later.
psych_sub, econ_sub, pol_sub, soc_sub = map(
    pd.read_csv,
    (
        f"{inpath}psychology_subset.csv",
        f"{inpath}economics_subset.csv",
        f"{inpath}politicalscience_subset.csv",
        f"{inpath}sociology_subset.csv",
    ),
)

# Replication 
Quick check, whether "replicat*" is present (enough) in titles to analyze. <br/>
Other terms, e.g. "reproduc*", or variants of open science / open data could also be interesting. <br/>
Psychology is golden, but the other fields are very small (for replication). <br/>
Options: <br/>
(1) Do just psychology (do this first) <br/>
(2) Less than five-year citation delay (could be okay to get more recent trends) <br/>
(3) Add another field (e.g. something biomedical?)

In [None]:
def check_replication(df, df_sub):
    """Find papers in the selected subset whose title mentions replication.

    Parameters
    ----------
    df : pd.DataFrame
        Paper metadata with at least ``PaperId``, ``PaperTitle`` and ``Date``.
        ``Date`` is compared against a ``datetime.date``, so it must hold
        values comparable to one.
    df_sub : pd.DataFrame
        Selection of papers; must contain ``PaperId``. Only papers present in
        both frames are considered (inner merge).

    Returns
    -------
    pd.DataFrame
        Unique ``PaperId`` / ``PaperTitle`` / ``Date`` rows whose title
        contains "replicat" (case-insensitive) and whose date is on or before
        2016-01-01.

    Side effects
    ------------
    Prints the total number of title matches, the number on/before the cutoff,
    and the first three rows of the returned frame.
    """
    df = df.merge(df_sub, on = 'PaperId', how = 'inner') # only from selection (could also be done smarter).
    df = df[["PaperId", "PaperTitle", "Date"]].drop_duplicates()
    # na=False: papers with a missing title count as non-matches. Without it
    # the mask contains NaN and boolean indexing raises ValueError.
    is_replication = df['PaperTitle'].str.contains("replicat", case=False, na=False)
    df_replication = df.loc[is_replication]
    # NOTE(review): `<=` keeps papers dated exactly 2016-01-01 even though the
    # printed label says "before 2016" -- confirm this is intended.
    df_replication_before2016 = df_replication[df_replication["Date"] <= datetime.date(2016, 1, 1)]
    print(f"total matches: \n{len(df_replication)} \n")
    print(f"before 2016: \n{len(df_replication_before2016)} \n")
    print(f"overview: \n{df_replication_before2016.head(3)}")
    return df_replication_before2016

In [None]:
# psychology: 
# (1) enough data to be doable.
# (2) several large-scale replications (i.e. large difference between papers and authorships)
psych_replication = check_replication(psych, psych_sub)

In [None]:
# econ
# (1) could still be useful even though there are fewer matches. 
econ_replication = check_replication(econ, econ_sub)

In [None]:
# pol
# (1) probably not enough. 
pol_replication = check_replication(pol, pol_sub)

In [None]:
# soc
# (1) could be enough. 
soc_replication = check_replication(soc, soc_sub)

# Check a few manually to verify term

In [None]:
pd.options.display.max_colwidth = 100

In [None]:
def print_first_n_rows(df, n):
    """Print the value in the second column for up to the first ``n`` rows.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect; the column at position 1 is printed.
    n : int
        Maximum number of rows to print. If ``df`` has fewer rows, all of
        them are printed.
    """
    # Cap at len(df) so a short frame does not raise IndexError.
    for i in range(min(n, len(df))):
        print(f"row {i}: {df.iloc[i, 1]} \n")

In [None]:
# psychology
# many of these are actually replication studies. 
# could screen e.g. 50-100 manually 
print_first_n_rows(psych_replication, 10)

In [None]:
# a lot of this is NOT replication studies. 
print_first_n_rows(econ_replication, 10)

In [None]:
# also a lot of not actually replication
print_first_n_rows(soc_replication, 10)

In [None]:
# mixed bag here. 
print_first_n_rows(pol_replication, 10)

In [None]:
## take-away: 
# (1) there is a lot of what we are actually looking for in *psychology* but not in the other fields. 
# Could do any one of four things: 
## 1. look only at psychology
## 2. take the few studies that are there in the other fields 
## 3. take other fields (e.g. bio-medicine) 
## 4. partition psychology into sub-fields (e.g. social psychology) - should be in MAG, but is it reliable?

# Citations over time

In [None]:
# load the "reference" dataframe
# Build the path from `inpath` like every other input, so changing the data
# directory in one place moves this read as well.
psych_references = pd.read_csv(f"{inpath}psychology_reference.csv")

In [None]:
# pick a paper-id (this one is a systematic replication)
focus_paperid = psych_replication.iloc[5, 0]

In [None]:
refs_focus = psych_references[psych_references["PaperReferenceId"] == focus_paperid][['PaperId']]
print(len(refs_focus))

In [None]:
# bind with our other data-frame 
refs_focus_meta = psych.merge(refs_focus, on='PaperId', how = "inner")
print(len(refs_focus_meta))

In [None]:
### check that dates make sense ### 

In [None]:
# `refs_focus_meta` is the citing-paper metadata built above; `cites_focus_meta`
# is not defined yet at this point and would raise NameError on a fresh kernel.
refs_focus_meta.sort_values('Date', ascending=True).head(5)

In [None]:
# bind focus article with other data-frame
focus_article_meta = psych[psych["PaperId"] == focus_paperid]
focus_article_meta.head()

In [None]:
### plot it ### 
# (1) of course this is terrible 
# (2) we need to insert when the article was published, but this shows that it works 
# (3) need to implement the c_5 so that we cap citations at five years. 

In [None]:
# Citing papers in chronological order. Derived from `refs_focus_meta` (the
# merged citing-paper metadata) rather than from itself, so the cell works on
# a fresh kernel and gives the same result when re-run.
cites_focus_meta = refs_focus_meta.sort_values('Date', ascending=True).reset_index()

In [None]:
# Running citation count: the k-th citing paper (in date order) gets value k.
# Vectorised instead of enumerating rows with iterrows().
cites_focus_meta['value'] = np.arange(1, len(cites_focus_meta) + 1)

In [None]:
plt.fill_between(cites_focus_meta.Date, cites_focus_meta.value)
plt.xticks(rotation = 45)
plt.title(f"citations for id: {focus_paperid}")
plt.show();

# Matching on team size & date

In [None]:
### NB: 
# (1) we need to make sure that the papers we are matching with are NOT in e.g. "replication" set. 

In [None]:
# same paper as before: 
focus_paperid = psych_replication.iloc[5, 0]

In [None]:
# get "team-size" for all papers in psychology
psych_paa = pd.read_csv(f"{inpath}psychology_paper_author.csv")
# One row per PaperId with the number of paper-author rows as `n_authors`.
psych_team_size = psych_paa.groupby('PaperId').size().reset_index(name='n_authors')

In [None]:
# get date as well 
psych_team_date = psych.merge(psych_team_size, on = 'PaperId', how = 'inner')[["PaperId", "Date", "n_authors"]]

In [None]:
# get the data for our focus paper
focus_paper_date_authors = psych_team_date[psych_team_date["PaperId"] == focus_paperid]

In [None]:
focus_date = focus_paper_date_authors.iloc[0, 1]
focus_nauthors = focus_paper_date_authors.iloc[0, 2]

In [None]:
print(focus_date)
print(focus_nauthors)

In [None]:
# exact matching on publication date and team size.
# The focus paper always matches itself, so exclude it from the candidate pool;
# otherwise the randomly sampled "match" can be the focus paper.
# NOTE(review): other papers from the replication set are not excluded here.
matches = psych_team_date[(psych_team_date["Date"] == focus_date) & 
                (psych_team_date["n_authors"] == focus_nauthors) &
                (psych_team_date["PaperId"] != focus_paperid)]

In [None]:
# randomly sample a matching study: 
random_match = matches.sample(n = 1, random_state = 231)

In [None]:
# we now have a random match
random_match