In [None]:
'''
VMP 2022-02-08: 
key document for preprocessing main scientific fields. 
'''

In [1]:
import sys  
sys.path.insert(0, '/home/vicp')
from MAGsparkmasters import get_mag_with_cluster_connection
from MAGmasters import MicrosoftAcademicGraph
import os
from pyspark.sql import functions as F, Window
import pandas as pd
import matplotlib.pyplot as plt
import datetime, time
import numpy as np
from pyspark.sql.functions import avg
# Run the rest of the notebook from the home directory.
os.chdir('/home/vicp')
# 65953 is the JOBID listed for 'cluster_new.job' in the job table this cell prints;
# presumably it has to be updated whenever the cluster job is restarted -- TODO confirm.
mag, spark = get_mag_with_cluster_connection(65953, 
                               memory_per_executor=16000)

['NAME STATE JOBID', '1.train.0 RUNNING 65946', 'cluster_new.job RUNNING 65953', 'jupyter-notebook RUNNING 65952', '4.train.0 RUNNING 65958', '4.train.1 RUNNING 65959', '4.train.2 RUNNING 65960', '4.train.3 RUNNING 65961', '4.train.4 RUNNING 65962', '4.train.5 RUNNING 65963', '']


In [2]:
# Display the `spark` object returned by get_mag_with_cluster_connection as a quick connection check.
spark

# NormalizedName
Create file called *FoS.txt*: <br/>
File contains *COLUMNS*: 
* FieldOfStudyId `<int>`
* NormalizedName `<str>`
* Level `<int>`

In [3]:
def NormalizedName():
    """Save the distinct (FieldOfStudyId, NormalizedName, Level) rows as FoS.txt.

    Pulls the three columns from "FieldsOfStudy" with ``mag.getSubset``,
    drops duplicate rows, prints the dtypes of the result and passes it to
    ``mag.saveFile(..., "masters", "FoS.txt")``.
    """
    wanted_columns = ['FieldOfStudyId', 'NormalizedName', 'Level']
    fos_lookup = mag.getSubset("FieldsOfStudy", wanted_columns).distinct()
    print(f"dtypes: {fos_lookup.dtypes}")

    mag.saveFile(fos_lookup, "masters", "FoS.txt")

In [4]:
# Build and save FoS.txt; the call prints the dtypes of the saved frame.
NormalizedName()

dtypes: [('FieldOfStudyId', 'bigint'), ('NormalizedName', 'string'), ('Level', 'int')]


# Subset by FoS
Based on a list of fields of study, we create one subset of papers per field.

In [5]:
# Fields to subset, spelled (with spaces) as they are matched against FoS.NormalizedName below.
focus_list = ['sociology', 'political science', 'psychology', 'economics']

In [6]:
# Look up the FoS rows whose NormalizedName is in focus_list and collect them
# into a pandas DataFrame (one row per focus field in the output shown below).
focus_fos = mag.getDataframe('FoS') \
    .filter(F.col('NormalizedName').isin(focus_list)) \
    .toPandas()

In [7]:
# Inspect the matched rows (FieldOfStudyId, NormalizedName, Level).
focus_fos.head()

Unnamed: 0,FieldOfStudyId,NormalizedName,Level
0,162324750,economics,0
1,17744445,political science,0
2,144024400,sociology,0
3,15744967,psychology,0


In [8]:
## one way of doing it ##
# could just inner join & take NormalizedName instead. 
# actually, at this point we only need PaperId. 

In [9]:
# create dataframes 
def fos_papers(fos, name):
    """Save the distinct PaperIds attached to the given field(s) of study.

    fos:  value forwarded to ``isin`` on the 'FieldOfStudyId' column of
          'PaperFieldsOfStudy' (the notebook passes one id per call).
    name: file-name prefix; the result is saved as "{name}_papers.txt".
    """
    paper_fields = mag.getDataframe('PaperFieldsOfStudy')
    in_field = F.col('FieldOfStudyId').isin(fos)
    paper_ids = paper_fields.filter(in_field).select('PaperId').distinct()

    print(f"dtypes {name}: {paper_ids.dtypes}")
    mag.saveFile(paper_ids, "masters", f"{name}_papers.txt")

In [10]:
# One "{name}_papers.txt" per focus field; spaces are stripped from the field
# name so that e.g. 'political science' becomes the file prefix 'politicalscience'.
for index, row in focus_fos.iterrows():
    fos_papers(row['FieldOfStudyId'], row['NormalizedName'].replace(" ", ""))

dtypes economics: [('PaperId', 'bigint')]
dtypes politicalscience: [('PaperId', 'bigint')]
dtypes sociology: [('PaperId', 'bigint')]
dtypes psychology: [('PaperId', 'bigint')]


# Subset by Year (2010-2020) and DocType (Journal)
Including 2010 and 2020 (not including 2021). <br/>
only doctype journal. <br/>
subset needed to filter PaperReferences. 
Here, again - we do not need FieldOfStudyId. 

In [11]:
### this works, but two things:
# (1) we do not need FieldOfStudyId - and
# (2) we need to write this as a .csv as well

In [12]:
def subset_data(fos):
    """Save the PaperIds of 2010-2020 journal papers belonging to one field.

    fos: field-of-study file prefix (no spaces); "{fos}_papers" must already
         have been saved by ``fos_papers``.

    Writes the distinct PaperIds twice: through ``mag.saveFile`` as
    "{fos}_subset.txt" and as "{fos}_subset.csv" in the masters data folder.
    Returns nothing.
    """
    papers = mag.getDataframe('Papers')
    # Single column (PaperId), as written by fos_papers. Named differently from
    # that function so the local does not shadow it.
    field_paper_ids = mag.getDataframe(f'{fos}_papers')

    # Half-open window [2010-01-01, 2021-01-01): the notebook text asks for 2010
    # through 2020 with 2021 excluded. The previous inclusive upper bound
    # (<= 2021-01-01) let papers dated on the first day of 2021 through.
    first_day = datetime.date(2010, 1, 1)
    end_exclusive = datetime.date(2021, 1, 1)

    # Filter Papers first, then restrict to the field via the inner join.
    sub = papers.filter(F.col('DocType') == 'Journal') \
        .filter((F.col('Date') >= first_day) & (F.col('Date') < end_exclusive)) \
        .join(field_paper_ids, ['PaperId'], 'inner') \
        .select('PaperId') \
        .distinct()
    print(f"{fos} dtypes: {sub.dtypes}")

    mag.saveFile(sub, "masters", f"{fos}_subset.txt")
    sub.toPandas().to_csv(f"/home/vicp/data/2021-08-02/masters/{fos}_subset.csv", index=False)

In [13]:
# NOTE: rebinds the global focus_list to the space-free file prefixes; re-running the
# earlier FoS lookup cell after this one would no longer match 'political science'.
focus_list = ['sociology', 'politicalscience', 'psychology', 'economics']
for fos in focus_list:
    subset_data(fos)

sociology dtypes: [('PaperId', 'bigint')]
politicalscience dtypes: [('PaperId', 'bigint')]
psychology dtypes: [('PaperId', 'bigint')]
economics dtypes: [('PaperId', 'bigint')]


In [14]:
## write to csv ## 
# really should delete FieldOfStudyId

# PaperReferences (v2)

In [3]:
def get_citation_reference(fos):
    '''
    ## input
    fos: field of study (without space); "{fos}_subset" must already be saved.

    ## output / save
    {fos}_total.txt: one column of PaperId -- the focus papers of fos plus every paper that has one of them as a reference.
    {fos}_reference.csv: the rows of PaperReferences whose PaperReferenceId is a focus paper of fos.
    '''
    all_references = mag.getDataframe('PaperReferences')
    focus_ids = mag.getDataframe(f'{fos}_subset')

    # Put the focus papers on the cited side: match their ids against
    # PaperReferenceId, so that PaperId in the result is the citing paper.
    cited_focus = focus_ids.withColumnRenamed('PaperId', 'PaperReferenceId')
    reference = cited_focus.join(all_references, ['PaperReferenceId'], "inner").distinct()

    # The outer join on PaperId keeps (1) every citing paper and (2) every focus
    # paper, including focus papers that nobody cites. Meta-data is gathered
    # for this combined id list later.
    citing_ids = reference.select('PaperId')
    papers_total = citing_ids.join(focus_ids, 'PaperId', 'outer').distinct()

    print(f"{fos} papers_total: {papers_total.dtypes}")
    mag.saveFile(papers_total, "masters", f"{fos}_total.txt")
    reference.toPandas().to_csv(f"/home/vicp/data/2021-08-02/masters/{fos}_reference.csv", index = False)

In [4]:
# Build "{fos}_total.txt" and "{fos}_reference.csv" for every focus field.
focus_list = ['psychology', 'sociology', 'politicalscience', 'economics']
for fos in focus_list:
    get_citation_reference(fos)

psychology papers_total: [('PaperId', 'bigint')]
sociology papers_total: [('PaperId', 'bigint')]
politicalscience papers_total: [('PaperId', 'bigint')]
economics papers_total: [('PaperId', 'bigint')]


# Add Field Of Study to papers we care about

# metadata
we add metadata to the subsetted fields

In [9]:
def get_metadata_papers(fos):
    """Write "{fos}_paper_meta.csv": title, date, doc type and level-0 field per paper.

    fos: field-of-study file prefix (no spaces); "{fos}_total" must already be saved.

    Every PaperId in "{fos}_total" is joined to Papers, PaperFieldsOfStudy and
    FoS; only fields with Level == 0 are kept. The CSV holds the distinct
    (PaperId, PaperTitle, Date, DocType, NormalizedName) rows, so a paper
    appears once per level-0 field it is linked to.
    """
    paper_columns = ['PaperId', 'PaperTitle', 'DocType', 'Date', 'Year']
    papers = mag.getSubset('Papers', paper_columns)
    total_ids = mag.getDataframe(f'{fos}_total')           # PaperId
    paper_fields = mag.getDataframe('PaperFieldsOfStudy')  # couples PaperId and FieldOfStudyId
    field_names = mag.getDataframe('FoS')                  # couples FieldOfStudyId and NormalizedName

    # Start from the id list so that every join only narrows / decorates it.
    enriched = total_ids.join(papers, ['PaperId'], 'inner')
    enriched = enriched.join(paper_fields, ['PaperId'], 'inner')
    enriched = enriched.join(field_names, ['FieldOfStudyId'], 'inner')

    top_level = enriched.filter(F.col('Level') == 0)
    meta = top_level.select('PaperId', 'PaperTitle', 'Date', 'DocType', 'NormalizedName').distinct()

    out_path = f"/home/vicp/data/2021-08-02/masters/{fos}_paper_meta.csv"
    meta.toPandas().to_csv(out_path, index = False)
    print(f"-- finished processing {fos}")
    

In [10]:
# Write "{fos}_paper_meta.csv" for every focus field.
focus_list = ['psychology', 'sociology', 'politicalscience', 'economics']
for fos in focus_list: 
    get_metadata_papers(fos)

-- finished processing psychology
-- finished processing sociology
-- finished processing politicalscience
-- finished processing economics


In [3]:
def get_paper_author(fos):
    """Write "{fos}_paper_author.csv": the distinct (PaperId, AuthorId) pairs.

    fos: field-of-study file prefix (no spaces); "{fos}_total" must already be saved.

    Only papers listed in "{fos}_total" are kept (inner join on PaperId with
    the PaperId / AuthorId columns of PaperAuthorAffiliations).
    """
    total_ids = mag.getDataframe(f'{fos}_total')  # PaperId
    paper_author = mag.getSubset('PaperAuthorAffiliations', ['PaperId', 'AuthorId'])

    pairs = total_ids.join(paper_author, ['PaperId'], "inner").distinct()

    out_path = f"/home/vicp/data/2021-08-02/masters/{fos}_paper_author.csv"
    pairs.toPandas().to_csv(out_path, index = False)
    print(f"-- finished processing {fos}")

In [4]:
# Write "{fos}_paper_author.csv" for every focus field.
focus_list = ['psychology', 'sociology', 'politicalscience', 'economics']
for fos in focus_list: 
    get_paper_author(fos)

-- finished processing psychology
-- finished processing sociology
-- finished processing politicalscience
-- finished processing economics
