In [35]:
import json
import numpy as np
from collections import defaultdict
import os

In [36]:
# When the kernel was started from a directory whose path ends in 'pagerank',
# step up one level so the relative paths used by the following cells
# ('pagerank/...', 'data/...') resolve.
current_dir = os.getcwd()
if current_dir.endswith('pagerank'):
    os.chdir(os.pardir)

In [37]:
# Load the two JSON inputs. The encoding is pinned to UTF-8 (the encoding JSON
# is specified in) so that decoding does not depend on the platform's locale
# default, which is what open() falls back to when no encoding is given.

# pageranks: looked up by paperId in the cells below.
with open('pagerank/pageranks.json', 'r', encoding='utf-8') as file:
    pageranks = json.load(file)

# retrieved_papers_info: keyed by tira id; each value is either None or a
# record that is read for its 'paperId' and 'abstract' fields further down.
with open('data/tira_documents_retrieved.json', 'r', encoding='utf-8') as file:
    retrieved_papers_info = json.load(file)

In [38]:
# Keep only the tira ids that actually have a record; entries whose value is
# None are dropped so later cells can index into the record directly.
retrieved_papers_info = {
    tira_id: paper_info
    for tira_id, paper_info in retrieved_papers_info.items()
    if paper_info is not None
}


In [39]:
# Fallback score for documents without a known pagerank: the arithmetic mean
# of every pagerank value that was loaded.
imputed_pagerank = np.asarray(list(pageranks.values())).mean()

In [40]:
 # Map every retrieved tira id to a pagerank score: the record's paperId is
# looked up in `pageranks`, and ids whose paperId has no score receive the
# imputed mean. Note: being a defaultdict(int), indexing this mapping with an
# unknown tira id yields 0 (and stores that 0) rather than raising KeyError.
tiraId_to_pagerank = defaultdict(int)

for tira_id, paper_info in retrieved_papers_info.items():
    tiraId_to_pagerank[tira_id] = pageranks.get(paper_info['paperId'], imputed_pagerank)

In [41]:
# Load the document list. The encoding is pinned to UTF-8 so decoding of the
# document text does not depend on the platform's locale default.
with open('data/tira_documents.json', 'r', encoding='utf-8') as file:
    tira_documents = json.load(file)

# Convert the list of {'docno': ..., 'text': ...} dictionaries into a single
# docno -> text dictionary (if a docno repeats, the last occurrence wins).
tira_documents = {item['docno']: item['text'] for item in tira_documents}

# Build one record per document: {'title', 'abstract', 'pagerank'}.
docs_with_infos = defaultdict(dict)

for tira_id, text in tira_documents.items():
    # Split once and reuse; the first line of the text is taken as the title.
    lines = text.split('\n')
    title = lines[0]

    # The abstract is read from the fourth line (index 3), so that index must
    # exist. The previous guard (`len(...) > 2`) let three-line texts through
    # and then failed with an uncaught IndexError.
    abstract = lines[3] if len(lines) > 3 else None

    if not abstract:
        # No usable abstract in the document text (absent or empty): fall back
        # to the abstract from retrieved_papers_info. Ids that were not
        # retrieved, or whose record has no abstract, get an empty string.
        paper_info = retrieved_papers_info.get(tira_id) or {}
        abstract = paper_info.get('abstract') or ""

    # .get() reads without inserting a 0 entry into the defaultdict for unknown
    # ids. A missing or zero score is replaced by the imputed mean, which is
    # the same outcome the old `pagerank == 0` check produced.
    pagerank = tiraId_to_pagerank.get(tira_id) or imputed_pagerank

    docs_with_infos[tira_id] = {
        'title': title,
        'abstract': abstract,
        'pagerank': pagerank,
    }

In [42]:
# Count the records whose abstract ended up as the empty string.
missing_abstracts = sum(1 for doc in docs_with_infos.values() if doc['abstract'] == "")
print("Missing abstracts:", missing_abstracts)

Missing abstracts: 14108


In [43]:
# Spot-check the merged record of a single document.
sample_id = 'L02-1310'
print(docs_with_infos[sample_id])

{'title': 'Bootstrapping Large Sense Tagged Corpora', 'abstract': 'The performance of Word Sense Disambiguation systems largely depends on the availability of sense tagged corpora. Since the semantic annotations are usually done by humans, the size of such corpora is limited to a handful of tagged texts. This paper proposes a generation algorithm that may be used to automatically create large sense tagged corpora. The approach is evaluated through comparative sense disambiguation experiments performed on data provided during the SENSEVAL-2 English all words and English lexical sample tasks.', 'pagerank': 3.489394849931522e-06}


In [44]:
# Write the merged title/abstract/pagerank records to disk as JSON.
with open('data/docs_with_all_info.json', 'w') as out_file:
    json.dump(docs_with_infos, out_file)

In [45]:
# Report how many document records were built.
doc_count = len(docs_with_infos)
print(doc_count)

126958
