In [None]:
import pyalex
from pyalex import Authors
from pyalex import Works
import os
import itertools
import pandas as pd
import pickle
import time

In [None]:
# Ask for a contact email so requests use the OpenAlex "polite pool";
# submitting an empty answer leaves the email blank.
# Surrounding whitespace is stripped because the value is later concatenated
# verbatim into a request URL as the `mailto` parameter.
email = input("Email for the OpenAlex polite pool (leave blank to skip): ").strip()
pyalex.config.email = email

In [None]:
# Output folder for downloaded data and input folder for the author CSV files
data_dir = "data/"
source_dir = "sources/"

# exist_ok=True keeps the cell safely re-runnable and avoids the
# check-then-create race of a separate os.path.exists() test.
for directory in (data_dir, source_dir):
    os.makedirs(directory, exist_ok=True)

## Download OpenAlex (OA) author ids using names

In [None]:
# Read the list of author names from a CSV in source_dir. The file name is
# typed without its ".csv" extension; the file must provide a 'name' column,
# which the search loop iterates over.
authors_sources_name = input(f"Author names CSV in {source_dir} (file name without .csv): ").strip()
authors_sources = pd.read_csv(source_dir + authors_sources_name + ".csv")
authors_sources

In [None]:
# Search OpenAlex once per source name; every search returns a list of
# candidate author records, so the result is a list of lists
author_objects = []

for display_name in authors_sources['name']:
    print(display_name)
    candidates = Authors().search_filter(display_name=display_name).get()
    author_objects.append(candidates)

In [None]:
# Get author names and OA ids and save to list – need to check these by hand!
# Each search result list is flattened into (display name, OpenAlex id,
# relevance score) rows.

# `author_objects` is the list of search results built by the search loop;
# iterating the misspelled `authors_objects` raised NameError.
authors_ids = [
    (author['display_name'], author['id'], author['relevance_score'])
    for author_list in author_objects
    for author in author_list
]

authors_ids_df = pd.DataFrame( authors_ids, columns=['name', 'id', 'relevance'] )

authors_ids_df

In [None]:
# Save to csv for hand-checking.
# index=False keeps the DataFrame's row index out of the file, so 'name' and
# 'id' are the first two columns -- the positions the works-download loop reads.
authors_ids_df.to_csv(source_dir + 'authors_ids_unchecked.csv', index=False)

## Download works using OA author ids

In [None]:
# Read the hand-checked author ids from a CSV in source_dir. The file name is
# typed without its ".csv" extension. The download loop reads the first column
# as the author name and the second as the OpenAlex author id.
authors_ids_name = input(f"Checked author ids CSV in {source_dir} (file name without .csv): ").strip()
authors_ids = pd.read_csv(source_dir + authors_ids_name + ".csv")
authors_ids

In [None]:
# Download every work of every checked author id.
# Result shape: {author name: {OpenAlex author id: [work records]}}, so an
# author listed with several ids keeps one entry per id.
authors_works = {}

for i, row in authors_ids.iterrows():

    # Columns are read by position (name first, id second). `.iloc` is used
    # because plain `row[0]` on a label-indexed Series is deprecated in pandas.
    author_name = row.iloc[0]
    author_id = row.iloc[1].replace('https://openalex.org/', '')

    print(i, author_name, 'https://openalex.org/' + author_id)

    # Page through the author's works, 200 per request, and flatten the pages
    paginator = Works().filter( authorships={"author" : {'id' : author_id}} ).paginate(per_page=200)
    works = list(itertools.chain.from_iterable(paginator))

    authors_works.setdefault(author_name, {})[author_id] = works

    # Pause between authors (presumably to stay within API rate limits)
    time.sleep(10)

In [None]:
# Number of distinct author names collected, followed by the names themselves
author_names = authors_works.keys()
print(len(author_names))
print( author_names )

In [None]:
# Persist the nested {author name: {author id: works}} dict to data_dir
authors_works_path = data_dir + 'authors_works.p'
with open(authors_works_path, 'wb') as pickle_file:
    pickle.dump(authors_works, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

## List works by css author and references by css paper

In [None]:
# Reload the works dict pickled earlier in this notebook.
# NOTE: pickle.load executes arbitrary code -- only open files you produced yourself.
authors_works_path = data_dir + 'authors_works.p'
with open(authors_works_path, 'rb') as pickle_file:
    authors_works = pickle.load(pickle_file)

authors_works.keys()

In [None]:
# Flatten the nested works dict into one tuple per (author id, work) and
# collect each work's list of referenced works keyed by work id.
works_list = []
works_references = {}

for author_name, works_by_id in authors_works.items():

    # For authors with multiple OA ids, use just one in data
    author_common_id = list(works_by_id)[0]

    # Iterate over author OA ids and works
    for author_orig_id, works in works_by_id.items():

        for work in works:

            # Get OA id of the publication source if exists. Only a missing
            # key or a null intermediate level counts as "no source"; any
            # other exception propagates instead of being silently swallowed.
            try:
                source_id = work['primary_location']['source']['id'] or ''
            except (KeyError, TypeError):
                source_id = ''

            work_id = work['id'].replace('https://openalex.org/', '')

            # Create work data tuple
            d = (
                author_orig_id, # Store original OA author id
                author_common_id,
                author_name,
                work_id,
                '', # For storing paper family id
                work['doi'],
                work['title'],
                work['type'],
                source_id.replace('https://openalex.org/', ''),
                work['publication_date'],
                work['publication_year'],
                work['cited_by_count']
            )

            works_list.append( d )

            # Store references by work id (a work reached through several
            # authors/ids simply overwrites its own entry)
            works_references[work_id] = work['referenced_works']

In [None]:
# Unique papers by css authors: works_references has one key per distinct
# work id, so its length is the number of unique papers
len( works_references )

In [None]:
# Column labels, in the same order as the fields of each tuple in works_list
works_columns = [
    'AuthorOrigId', 'AuthorId', 'AuthorName', 'PaperId', 'FamilyId', 'Doi',
    'PaperTitle', 'DocType', 'SourceId', 'Date', 'Year', 'CitationCount',
]
works_df = pd.DataFrame(works_list, columns=works_columns)

works_df

In [None]:
# Write the author-paper table to data_dir, leaving out the DataFrame index column
works_df.to_csv(data_dir + 'cssAuthorPapers.csv', index=False)

In [None]:
# Persist the {work id: referenced works} dict to data_dir
works_references_path = data_dir + 'works_references.p'
with open(works_references_path, 'wb') as pickle_file:
    pickle.dump(works_references, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

## Get references by css paper

In [None]:
# Reload the references dict pickled earlier in this notebook.
# NOTE: pickle.load executes arbitrary code -- only open files you produced yourself.
works_references_path = data_dir + 'works_references.p'
with open(works_references_path, 'rb') as pickle_file:
    works_references = pickle.load(pickle_file)

In [None]:
# References count: every cited id across all works (URL prefix stripped),
# then the same ids with duplicates removed
ref_ids = [
    ref_url.replace('https://openalex.org/', '')
    for ref_url in itertools.chain.from_iterable(works_references.values())
]
unique_ref_ids = list(set(ref_ids))

print(len(ref_ids))
print(len(unique_ref_ids))

In [None]:
# Persist the de-duplicated list of reference ids to data_dir
unique_ids_path = data_dir + 'unique_references_ids.p'
with open(unique_ids_path, 'wb') as pickle_file:
    pickle.dump(unique_ref_ids, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# https://github.com/eschares/OpenAlex-CitedReferences/blob/main/notebooks/1-Pull_the_data_OpenAlex-citedreferences.ipynb
import requests

def get_references(reference_ids, chunk_size, mailto):
    """Yield OpenAlex work records for `reference_ids`, one API response at a time.

    The ids are split into consecutive chunks of at most `chunk_size`. Each
    chunk is sent as a single `/works` request whose `openalex` filter joins
    the ids with "|", and the `results` list of the JSON response is yielded.

    Parameters
    ----------
    reference_ids : sequence of str
        OpenAlex work ids; must support `len()` and slicing (e.g. a list).
    chunk_size : int
        Maximum number of ids per request, also sent as `per_page`. Must be >= 1.
    mailto : str
        Contact email sent as the `mailto` query parameter; left out of the
        URL when empty.

    Yields
    ------
    list
        The `results` entry of each JSON response.

    Raises
    ------
    ValueError
        If `chunk_size` is smaller than 1 (raised when iteration starts).
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    if not reference_ids:
        return  # nothing to fetch, so no session is opened

    # The context manager closes the session's pooled connections once the
    # generator is exhausted or closed.
    with requests.Session() as session:
        for i in range(0, len(reference_ids), chunk_size):
            chunk = reference_ids[i:i + chunk_size]

            api_url = 'https://api.openalex.org/works?filter=openalex:' + "|".join( chunk )
            api_url += '&per_page=' + str( chunk_size )
            if mailto:
                api_url += '&mailto=' + mailto

            response = session.get(api_url)
            # Fail with the HTTP error itself rather than a KeyError on 'results'
            response.raise_for_status()
            yield response.json()['results']

## Download unique references

In [None]:
# Ceiling division: number of requests needed when each carries `per_page` ids
per_page = 50
count = len(unique_ref_ids)
full_pages, leftover = divmod(count, per_page)
number_of_pages_needed = full_pages + (1 if leftover else 0)
print(f"number of requests needed (with per_page set to {per_page}): {number_of_pages_needed}")

In [None]:
%%time

# Fetch all unique references in batches of 50 ids and keep one tuple per
# returned work: (id, doi, title, type, source id, date, year, citation count)
unique_references_data = []
results_per_page = get_references(unique_ref_ids, 50, email)

# call OpenAlex API 
for i, results in enumerate(results_per_page):

    if i % 100 == 0: print(f'{i} requests sent')

    for work in results:

        # Get OA id of the publication source if exists. Only a missing key
        # or a null intermediate level counts as "no source"; any other
        # exception propagates instead of being silently swallowed.
        try:
            source_id = work['primary_location']['source']['id'] or ''
        except (KeyError, TypeError):
            source_id = ''

        # Create reference data tuple
        r = (
            work['id'].replace('https://openalex.org/', ''),
            work['doi'],
            work['title'],
            work['type'],
            source_id.replace('https://openalex.org/', ''),
            work['publication_date'],
            work['publication_year'],
            work['cited_by_count']
        )

        unique_references_data.append( r )

In [None]:
# Records downloaded vs. unique ids requested; a smaller first number means
# some requested references were not returned
print( len(unique_references_data), "/", len(unique_ref_ids) )

In [None]:
# Persist the list of downloaded reference tuples to data_dir
references_data_path = data_dir + 'unique_references_data.p'
with open(references_data_path, 'wb') as pickle_file:
    pickle.dump(unique_references_data, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

### Try to get missing refs one by one

In [None]:
# Reload the requested ids and the downloaded records pickled earlier in this notebook.
# NOTE: pickle.load executes arbitrary code -- only open files you produced yourself.
unique_ids_path = data_dir + 'unique_references_ids.p'
with open(unique_ids_path, 'rb') as pickle_file:
    unique_ref_ids = pickle.load(pickle_file)

references_data_path = data_dir + 'unique_references_data.p'
with open(references_data_path, 'rb') as pickle_file:
    unique_references_data = pickle.load(pickle_file)

In [None]:
# Ids already present in the downloaded records (first field of each tuple)
ref_ids = {record[0] for record in unique_references_data}

# Requested ids for which no record came back, kept in their original order
missing_ref_ids = [candidate for candidate in unique_ref_ids if candidate not in ref_ids]

len( missing_ref_ids )

In [None]:
%%time

# Fetch each missing reference individually. Ids whose lookup fails are
# collected in `errors`; works titled 'Deleted Work' are skipped.
missing_references = []
errors = []

for i, ref_id in enumerate(missing_ref_ids):

    if i % 10 == 0: print(f'{i} requests sent with', len(errors), 'errors', end='\r' )

    # Record any failed lookup and move on. `except Exception` (rather than a
    # bare except) still lets KeyboardInterrupt stop this long-running loop.
    try:
        work = Works()[ref_id]
    except Exception:
        errors.append(ref_id)
        continue

    if work['title'] == 'Deleted Work':
        continue

    # Get OA id of the publication source if exists. Only a missing key or a
    # null intermediate level counts as "no source".
    try:
        source_id = work['primary_location']['source']['id'] or ''
    except (KeyError, TypeError):
        source_id = ''

    # Create reference data tuple; the requested id is stored (not work['id'])
    r = (
        ref_id,
        work['doi'],
        work['title'],
        work['type'],
        source_id.replace('https://openalex.org/', ''),
        work['publication_date'],
        work['publication_year'],
        work['cited_by_count']
    )

    missing_references.append( r )

In [None]:
# Number of missing references recovered by the one-by-one lookups
len( missing_references )

In [None]:
# Append the individually fetched references to the batch results.
# Records whose id (first tuple field) is already present are skipped, so
# re-running this cell does not duplicate rows.
known_ref_ids = {ref[0] for ref in unique_references_data}
unique_references_data = unique_references_data + [
    ref for ref in missing_references if ref[0] not in known_ref_ids
]

In [None]:
# Records available after the merge vs. unique ids requested
print( len(unique_references_data), '/', len(unique_ref_ids) )

In [None]:
# Overwrite the pickled reference list with the merged (batch + one-by-one) records
references_data_path = data_dir + 'unique_references_data.p'
with open(references_data_path, 'wb') as pickle_file:
    pickle.dump(unique_references_data, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

## Create full references dataset by combining css paper ids with referenced paper data

In [None]:
# Reload the per-work reference lists and the downloaded reference records.
# NOTE: pickle.load executes arbitrary code -- only open files you produced yourself.
works_references_path = data_dir + 'works_references.p'
with open(works_references_path, 'rb') as pickle_file:
    works_references = pickle.load(pickle_file)

references_data_path = data_dir + 'unique_references_data.p'
with open(references_data_path, 'rb') as pickle_file:
    unique_references_data = pickle.load(pickle_file)

In [None]:
# Invert works_references: map every referenced id (URL prefix stripped) to
# the list of css work ids that cite it
refs_works = {}

for paper_id, cited_urls in works_references.items():
    for ref_id in cited_urls:
        ref_id = ref_id.replace('https://openalex.org/', '')
        refs_works.setdefault(ref_id, []).append(paper_id)

In [None]:
# One output row per (reference record, citing css work): the reference tuple
# with the citing work id appended as a last field.
full_references_data = []

for ref in unique_references_data:

    # The batch download stores the id returned by the API, not the requested
    # one. An id that no css work lists is absent from refs_works; such a
    # record has no citing work, so it yields no row instead of a KeyError.
    for citing_id in refs_works.get(ref[0], ()):

        full_references_data.append( ref + (citing_id,) )

In [None]:
# Field order of the tuples in full_references_data: the eight reference
# fields followed by the id of the citing css work
fetched_columns = [
    'PaperId', 'Doi', 'PaperTitle', 'DocType', 'SourceId',
    'Date', 'Year', 'CitationCount', 'PaperCitedId',
]
# Final column order of the exported table, with an empty FamilyId placeholder
export_columns = [
    'PaperCitedId', 'PaperId', 'FamilyId', 'Doi', 'PaperTitle',
    'DocType', 'SourceId', 'Date', 'Year', 'CitationCount',
]

full_refs_df = pd.DataFrame(full_references_data, columns=fetched_columns)
full_refs_df['FamilyId'] = ''
full_refs_df = full_refs_df[export_columns]

full_refs_df

In [None]:
# Write the reference table to data_dir, leaving out the DataFrame index column
full_refs_df.to_csv(data_dir + 'papersReferredToByCssAuthors.csv', index=False)