In [None]:
'''
VMP 2022-02-08: 
key document for checking preprocessing. 
also checks whether the analysis broadly is feasible. 
'''

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime, time
pd.options.mode.chained_assignment = None  # default='warn'

# Read paper-meta files

In [2]:
inpath = "/home/vicp/data/2021-08-02/masters/"

In [3]:
psych = pd.read_csv(f"{inpath}psychology_paper_meta.csv")
econ = pd.read_csv(f"{inpath}economics_paper_meta.csv")
pol = pd.read_csv(f"{inpath}politicalscience_paper_meta.csv")
soc = pd.read_csv(f"{inpath}sociology_paper_meta.csv")

In [20]:
# to datetime 
def to_datetime(df): 
    df["Date"] = pd.to_datetime(df["Date"]).dt.date
    return df

In [21]:
psych = to_datetime(psych)
econ = to_datetime(econ)
pol = to_datetime(pol)
soc = to_datetime(soc)

# Quick Check

In [10]:
def quick_check(df): 
    doctype = df.groupby('DocType').size().to_frame('size').reset_index().sort_values('size', ascending=False)
    field = df.groupby('NormalizedName').size().to_frame('size').reset_index().sort_values('size', ascending=False)
    print(f"*rows*: \n{len(df)} \n")
    print(f"*DocType*: \n{doctype.head(5)} \n")
    print(f"*field*: \n{field.head(5)}")

In [11]:
# economics
quick_check(econ)

*rows*: 
1970876 

*DocType*: 
       DocType     size
4      Journal  1401606
6   Repository   130121
2   Conference    41301
7       Thesis    34896
1  BookChapter    33365 

*field*: 
       NormalizedName    size
5           economics  739880
2            business  383239
16  political science  165167
4    computer science  146163
17         psychology  101906


In [12]:
# psychology
quick_check(psych)

*rows*: 
4757436 

*DocType*: 
       DocType     size
4      Journal  3751064
2   Conference   110516
6   Repository    86661
7       Thesis    74429
1  BookChapter    68592 

*field*: 
      NormalizedName     size
17        psychology  2342205
13          medicine   968751
4   computer science   347418
18         sociology   237605
1            biology   187561


In [13]:
# pol
quick_check(pol)

*rows*: 
1794879 

*DocType*: 
       DocType     size
4      Journal  1426469
1  BookChapter    32583
6   Repository    31346
7       Thesis    26069
2   Conference    16399 

*field*: 
       NormalizedName    size
16  political science  919993
18          sociology  167959
2            business  145217
17         psychology  138343
13           medicine  100862


In [14]:
# sociology
quick_check(soc)

*rows*: 
2414270 

*DocType*: 
       DocType     size
4      Journal  1820678
1  BookChapter    57329
7       Thesis    47623
2   Conference    35779
6   Repository    28303 

*field*: 
       NormalizedName     size
18          sociology  1088640
17         psychology   372368
16  political science   283122
2            business   166410
4    computer science   108061


In [None]:
## overall looks good, but one caveat: 
# (1) we are not seing "None" for DocType here because it is somehow "hidden". 

# How many are part of original "focus" articles

In [22]:
psych_sub = pd.read_csv(f"{inpath}psychology_subset.csv")
econ_sub = pd.read_csv(f"{inpath}economics_subset.csv")
pol_sub = pd.read_csv(f"{inpath}politicalscience_subset.csv")
soc_sub = pd.read_csv(f"{inpath}sociology_subset.csv")

In [35]:
def sanity_check(df, df_sub, fos): 
    df_paperids = df[["PaperId"]].drop_duplicates()
    #df_sub_paperids = df_sub[["PaperId"]].drop_duplicates()
    # overlap should be all of subset 
    overlap = df_sub.merge(df_paperids, on = "PaperId", how = "inner").drop_duplicates()
    # overlap (& subset) should be the same as filtering
    # should become more or less equal to "subset" if we also do by age...
    filtering = len(df[(df["NormalizedName"] == f'{fos}') & 
                       (df["DocType"] == 'Journal') & 
                       (df["Date"] >= datetime.date(2010, 1, 1)) & 
                       (df["Date"] <= datetime.date(2021, 1, 1))])
    print(f"total papers {fos}: \n{len(df)} \n")
    print(f"subset papers {fos}: \n{len(df_sub)} \n")
    print(f"overlap: \n{len(overlap)} \n")
    print(f"by filtering: \n{filtering}")

In [36]:
sanity_check(psych, psych_sub, "psychology")

total papers psychology: 
4757436 

subset papers psychology: 
1888024 

overlap: 
1888024 

by filtering: 
1888024


In [37]:
sanity_check(econ, econ_sub, "economics")

total papers economics: 
1970876 

subset papers economics: 
582689 

overlap: 
582689 

by filtering: 
582689


In [38]:
sanity_check(soc, soc_sub, "sociology")

total papers sociology: 
2414270 

subset papers sociology: 
953341 

overlap: 
953341 

by filtering: 
953341


In [39]:
sanity_check(pol, pol_sub, "political science")

total papers political science: 
1794879 

subset papers political science: 
768704 

overlap: 
768704 

by filtering: 
768704


# authorships

In [74]:
psych_paa = pd.read_csv(f"{inpath}psychology_paper_author.csv")
econ_paa = pd.read_csv(f"{inpath}economics_paper_author.csv")
pol_paa = pd.read_csv(f"{inpath}politicalscience_paper_author.csv")
soc_paa = pd.read_csv(f"{inpath}sociology_paper_author.csv")

In [75]:
# psychology slightly larger studies on avg. driven by outliers?
print(f"psychology: \nauthorships: {len(psych_paa)} \nstudies: {len(psych)} \navg: {len(psych_paa)/len(psych)} \n")
print(f"economy: \nauthorships: {len(econ_paa)} \nstudies: {len(econ)} \navg: {len(econ_paa)/len(econ)} \n")
print(f"political science: \nauthorships: {len(pol_paa)} \nstudies: {len(pol)} \navg: {len(pol_paa)/len(pol)} \n")
print(f"sociology: \nauthorships: {len(soc_paa)} \nstudies: {len(soc)} \navg: {len(soc_paa)/len(soc)} \n")

psychology: 
authorships: 17881799 
studies: 4757436 
avg: 3.758705109222699 

economy: 
authorships: 5228483 
studies: 1970876 
avg: 2.652872631256355 

political science: 
authorships: 4651242 
studies: 1794879 
avg: 2.591395854539498 

sociology: 
authorships: 5493290 
studies: 2414270 
avg: 2.275342028853442 

