In [118]:
import json
import numpy as np
from collections import defaultdict
import os
import datetime

In [119]:
# Step up one directory when the current working directory path ends in
# 'pagerank' (presumably so the relative 'pagerank/...' and 'data/...' paths
# used in later cells resolve from the parent folder).
working_dir = os.getcwd()
if working_dir.endswith('pagerank'):
    os.chdir('..')

In [120]:
# Load the three JSON inputs used by the rest of the notebook.
# The encoding is passed explicitly: JSON text is UTF-8, while open() would
# otherwise fall back to the platform's locale encoding.

# paperId -> pagerank score
with open('pagerank/pageranks.json', 'r', encoding='utf-8') as file:
    pageranks = json.load(file)

# paperId -> publication date string (or null)
with open('pagerank/pub_dates.json', 'r', encoding='utf-8') as file:
    pub_dates = json.load(file)

# tiraId -> retrieved paper record (or null); records carry a 'paperId' field
with open('data/tira_documents_retrieved.json', 'r', encoding='utf-8') as file:
    retrieved_papers_info = json.load(file)

In [121]:
# Print the number of entries of each loaded collection, one per line:
# pub_dates, pageranks, retrieved_papers_info.
for loaded_collection in (pub_dates, pageranks, retrieved_papers_info):
    print(len(loaded_collection))

104397
538561
126958


In [122]:
# Keep only the (tira_id, record) pairs of retrieved_papers_info whose record is not None
retrieved_papers_info = dict(
    filter(lambda entry: entry[1] is not None, retrieved_papers_info.items())
)


In [123]:
# Convert the pub_dates from string to datetime64
def convert_date(date_str):
    '''
    Parse a date string into a np.datetime64.

    None is passed through unchanged so that missing dates stay missing.
    '''
    return None if date_str is None else np.datetime64(date_str)

# Re-build pub_dates with the same keys and every value run through convert_date
pub_dates = dict(zip(pub_dates.keys(), map(convert_date, pub_dates.values())))

# Spot-check one known paper
print(pub_dates["ec87bf9b1423a6598d0ea43d7fb9f6db0fd6305b"])

2018-07-01


In [124]:
# Tally the publications per calendar year (skipping missing dates),
# then pick the year with the highest count
years = defaultdict(int)
for known_date in filter(lambda d: d is not None, pub_dates.values()):
    pub_year = known_date.astype(datetime.datetime).year
    years[pub_year] += 1

max_pubs_year = max(years, key=years.get)
print(max_pubs_year)

2020


In [125]:
# Fallback values for papers without a pagerank or a publication date

# Pagerank fallback: the mean over all known pageranks
pagerank_values = list(pageranks.values())
imputed_pagerank = np.mean(pagerank_values)

# Publication date fallback: 1 January of the year with the most publications
imputed_pub_date = np.datetime64(str(int(max_pubs_year)) + '-01-01')
print(imputed_pub_date)

2020-01-01


In [126]:
# Re-key pagerank and publication date by tiraId, looking each one up via the paperId (Id from the Semantic Scholar API) stored in retrieved_papers_info
# Lookup tables keyed by tiraId. Being defaultdicts, indexing an unknown
# tiraId later yields 0 (pagerank) or np.datetime64() (NaT, publication date).
tiraId_to_pagerank = defaultdict(int)
tiraId_to_pub_date = defaultdict(np.datetime64)

for tiraId, paper_info in retrieved_papers_info.items():
    paperId = paper_info['paperId']

    # Fall back to the imputed value when the paperId has no entry
    tiraId_to_pagerank[tiraId] = pageranks.get(paperId, imputed_pagerank)
    tiraId_to_pub_date[tiraId] = pub_dates.get(paperId, imputed_pub_date)

In [127]:
def discount_pagerank(pub_date, pagerank, current_date=None, yearly_factor=0.9):
    '''
    Discount the pagerank by 10% for each year since publication

    Parameters
    ----------
    pub_date : np.datetime64
        Publication date of the paper.
    pagerank : float
        Undiscounted pagerank score.
    current_date : np.datetime64, optional
        Date the age is measured against. Defaults to today's date; pass a
        fixed date to get reproducible results.
    yearly_factor : float, optional
        Multiplier applied per year of age (0.9 corresponds to a 10% discount).

    Returns
    -------
    float
        ``yearly_factor ** age_in_years * pagerank``. A pub_date later than
        current_date gives a negative age and therefore a value above pagerank.
    '''
    if current_date is None:
        current_date = np.datetime64("today")

    # Dividing by a one-day timedelta gives the age in days whatever the
    # resolution of the difference is; a plain astype(int) would return
    # seconds (or another unit) for dates that carry a time component.
    age_in_days = (current_date - pub_date) / np.timedelta64(1, 'D')
    age_in_years = age_in_days / 365.25

    return (yearly_factor ** age_in_years) * pagerank

In [128]:
# Build one record per TIRA document with its title, abstract, pagerank,
# age-discounted pagerank and publication date.
with open('data/tira_documents.json', 'r', encoding='utf-8') as file:
    tira_documents = json.load(file)

tira_documents = {item['docno']: item['text'] for item in tira_documents} # Convert the list of dictionaries to a dictionary

docs_with_infos = defaultdict(dict)

for tira_id, text in tira_documents.items():
    text_lines = text.split('\n')  # split once instead of once per access
    title = text_lines[0]

    # The abstract is taken from the fourth line, so at least four lines are
    # required (a `> 2` check lets a three-line text through and raises IndexError).
    abstract = text_lines[3] if len(text_lines) > 3 else None

    if not abstract:
        # No abstract in the document text: fall back to the one in
        # retrieved_papers_info, or to "" when that is missing or None too.
        paper_info = retrieved_papers_info.get(tira_id) or {}
        abstract = paper_info.get('abstract') or ""

    # .get() is used for both lookups so that documents without an entry do
    # not add keys to the lookup tables as a side effect of being read.
    pagerank = tiraId_to_pagerank.get(tira_id, imputed_pagerank)
    if pagerank is None or pagerank == 0:
        pagerank = imputed_pagerank

    pub_date = tiraId_to_pub_date.get(tira_id)
    # NaT never compares equal to anything, so it has to be detected with np.isnat
    if pub_date is None or np.isnat(pub_date):
        pub_date = imputed_pub_date

    docs_with_infos[tira_id] = {
        'title': title,
        'abstract': abstract,
        'pagerank': pagerank,
        'discounted_pagerank': discount_pagerank(pub_date, pagerank),
        'pub_date': str(pub_date),
    }

In [129]:
# Count the records whose abstract ended up as the empty string
missing_abstracts = sum(1 for doc in docs_with_infos.values() if doc['abstract'] == "")
print("Missing abstracts:", missing_abstracts)

Missing abstracts: 14104


In [130]:
# Spot-check one record. .get() is used instead of indexing because
# docs_with_infos is a defaultdict: indexing a missing key would insert an
# empty record, which would then end up in the saved JSON file.
print(docs_with_infos.get('L02-1310'))

{'title': 'Bootstrapping Large Sense Tagged Corpora', 'abstract': 'The performance of Word Sense Disambiguation systems largely depends on the availability of sense tagged corpora. Since the semantic annotations are usually done by humans, the size of such corpora is limited to a handful of tagged texts. This paper proposes a generation algorithm that may be used to automatically create large sense tagged corpora. The approach is evaluated through comparative sense disambiguation experiments performed on data provided during the SENSEVAL-2 English all words and English lexical sample tasks.', 'pagerank': 3.489394849931522e-06, 'discounted_pagerank': 3.343848786101388e-07, 'pub_date': '2002-05-01'}


In [131]:
# Write the assembled records to disk as JSON
with open('data/docs_with_all_info.json', 'w') as out_file:
    json.dump(docs_with_infos, out_file)

In [132]:
# Number of records that were assembled
doc_count = len(docs_with_infos)
print(doc_count)

126958
