In [None]:
import pandas as pd
from tqdm import tqdm
import time
import requests
from IPython.display import clear_output

from semanticscholar import SemanticScholar

# Semantic Scholar API client used by the paper lookups in the cells below.
# NOTE(review): timeout=2 is presumably in seconds — confirm against the semanticscholar package.
sch = SemanticScholar(timeout=2)

In [None]:
# Accumulator for the influential-reference records appended by the lookup loop.
influent_list = list()

# Starting corpus, read from the "After Content Filter" CSV.
papers_df = pd.read_csv('../Data/Papers - After Content Filter.csv')

## Get Influential References from Semantic Scholar

In [None]:
# For every paper in the corpus: look it up on Semantic Scholar by DOI, store
# how many citations the API returned, and record each reference that is
# flagged influential and has a DOI.
papers_df['SSCitations'] = 0
c = 0
for index, row in tqdm(papers_df.iterrows(), total=papers_df.shape[0]):
    try:
        paper = sch.paper(row['DOI'])
    except requests.exceptions.ReadTimeout:
        # A single slow response must not abort the whole crawl. The paper is
        # skipped: its 'SSCitations' stays at the initial 0 and none of its
        # references are collected, so the message below is the only trace.
        print(f"ReadTimeout occurred at id {index}")
        continue

    papers_df.loc[index,'SSCitations'] = len(paper['citations'])

    for ref in paper['references']:
        # References without a DOI cannot be looked up later, so they are ignored.
        if ref['isInfluential'] and ref['doi'] is not None:
            influent_list.append({'Orig DOI': row['DOI'],'Ref DOI': ref['doi'], 'Ref Title': ref['title']})

    # Pause after every 50 completed lookups.
    # NOTE(review): presumably to stay under the API rate limit — confirm.
    c+=1
    if c%50==0:
        print('Waiting 150 seconds...')
        time.sleep(150)

papers_df.to_csv('../Data/Papers - After Content Filter with SSCitations.csv', index=False)

In [None]:
# Turn the collected records into a frame and report how many influential
# references were found, in total and by distinct referenced DOI.
influent_df = pd.DataFrame(influent_list)

total_refs = len(influent_df)
unique_ref_dois = influent_df['Ref DOI'].unique()

print(f"Total Influential References: {total_refs}")
print(f"Unique Influential References: {len(unique_ref_dois)}")

## Get Info of Influential References

In [None]:
# Fetch metadata (title, DOI, topics, venue, year, abstract) for every distinct
# influential reference collected above.
sch = SemanticScholar(timeout=5)

# Drop the citing paper's DOI, then keep one row per distinct
# (Ref DOI, Ref Title) pair so each reference is requested once.
df = influent_df.drop('Orig DOI', axis=1)
df = df.drop_duplicates(keep='first')

ref_list = []

c = 0
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    try:
        paper = sch.paper(row['Ref DOI'])
    except requests.exceptions.ReadTimeout:
        # Skip references whose lookup timed out; they are not retried.
        print(f"ReadTimeout occured at id {index}")
        continue

    try:
        ref_list.append({'Title': paper['title'],
                        'DOI': paper['doi'], 
                        'Topics': paper['topics'],
                        'Venue': paper['venue'],
                        'Year': paper['year'],
                        'Abstract': paper['abstract']})
    except Exception as exc:
        # Best effort: a response missing one of the expected fields is
        # reported and skipped. `Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt / SystemExit still stop the loop.
        print(f"Error in {index}: {exc!r}")
    
    # Pause after every 50 completed lookups.
    # NOTE(review): presumably to stay under the API rate limit — confirm.
    c+=1
    if c%50==0:
        print('Waiting 150 seconds...')
        time.sleep(150)


In [None]:
# Build the frame of fetched references and keep only those whose DOI is not
# already present in the corpus, then save and display the result.
ref_df = pd.DataFrame(ref_list)

known_dois = set(papers_df['DOI'])
new_papers = list(set(ref_df['DOI']).difference(known_dois))

is_new = ref_df['DOI'].isin(new_papers)
ref_df = ref_df[is_new]

ref_df.to_csv('../Data/Influential papers.csv')
ref_df

## Filter Influential References by Title and Abstract

In [None]:
# Interactive screening: show each candidate's title, topics and abstract and
# ask whether to remove it. Answers are stored in the 'Remove' column
# (0 = keep, 1 = remove); typing 'stop' ends the session early and leaves the
# remaining rows at None.
ref_df['Remove'] = None

for index, row in ref_df.iterrows():
    print()
    print(f"Title: {row['Title']}")
    print()
    print("Topics:")
    print(row['Topics'])
    print()
    print('Abstract: ')
    print(row['Abstract'])

    # NOTE(review): the short pause presumably lets the notebook render the
    # text above before the input box appears — confirm.
    time.sleep(2)

    # Re-prompt on anything other than the three accepted answers, so a typo
    # or an empty answer no longer raises ValueError in int() and discards
    # the review session.
    remove = input("Remove this paper? (0: No, 1:Yes)").strip()
    while remove not in ('0', '1', 'stop'):
        remove = input("Invalid answer. Remove this paper? (0: No, 1:Yes)").strip()

    clear_output(wait=True)

    if remove == 'stop':
        break

    ref_df.loc[index,'Remove'] = int(remove)

In [None]:
# Keep only rows whose 'Remove' flag equals 0 (rows marked 1, or never
# reviewed and still None, are dropped) and save them without the index.
not_removed = ref_df['Remove'].eq(0)
ref_df = ref_df.loc[not_removed]
ref_df.to_csv('../Data/Influential papers.csv', index=False)

## Aggregate New Papers

In [None]:
# Reload the reviewed references, keep the rows flagged keep == True, expose
# the venue under a 'Publisher' column and drop the bookkeeping columns.
# NOTE(review): the 'keep' and 'id' columns are not created in the visible
# cells; presumably they were added to the CSV outside this notebook — confirm.
reviewed = pd.read_csv('../Data/Influential papers.csv')
kept_mask = reviewed['keep'] == True
ref_df = (
    reviewed[kept_mask]
    .assign(Publisher=lambda frame: frame['Venue'])
    .drop(columns=['Topics', 'Remove', 'keep', 'Venue', 'id'])
)

In [None]:
# Append the accepted influential references to the corpus (with its
# Semantic Scholar citation counts) and save the snowballed paper list.
papers_df = pd.read_csv('../Data/Papers - After Content Filter with SSCitations.csv')

# 'Unnamed: 0' is a leftover index column; tolerate its absence so the cell
# still runs when the CSV was written without one.
papers_df = papers_df.drop(columns='Unnamed: 0', errors='ignore')

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (row-wise, union of columns, original index labels kept).
papers_df = pd.concat([papers_df, ref_df])

papers_df.to_csv('../Data/Papers - After Snowballing.csv', index=False)