In [None]:
import itertools
import os
import time

import pandas as pd
import pyalex
from pyalex import Sources
from pyalex import Works

In [None]:
# Give an email address to use OpenAlex's polite pool; press Enter to skip.
# Surrounding whitespace is removed and a blank answer is stored as None, so
# the config stays unset instead of holding an empty string.
pyalex.config.email = input().strip() or None

## Get sources

In [None]:
# Subject field to process; it selects the journal list journals/<field>.csv
# and is reused later for the output file names.
field = "techlaw"
journals = pd.read_csv(f"journals/{field}.csv")

In [None]:
# Number of journals in the input list
journals.shape[0]

In [None]:
# Get OpenAlex sources by journal ISSN.
# Query in small chunks of ISSNs joined with "|"; per the original note,
# larger requests seemed not to return all results.
CHUNK_SIZE = 10

# Build the ISSN list once and skip rows without an ISSN: a NaN entry would
# make the "|".join below raise a TypeError.
issns = journals['issn_l'].dropna().tolist()
chunks = [issns[i:i + CHUNK_SIZE] for i in range(0, len(issns), CHUNK_SIZE)]

sources = []
for c in chunks:
    query = "|".join(c)
    results = Sources().filter(issn=query).get()
    sources += results

len(sources)

#### Check and drop duplicates

In [None]:
# Load the collected source records into a DataFrame so duplicates can be
# inspected and removed with pandas
sources = pd.DataFrame(data=sources)

In [None]:
# Show every row whose linking ISSN occurs more than once (all copies kept)
is_duplicate = sources.duplicated(subset='issn_l', keep=False)
sources[is_duplicate]

In [None]:
# Keep one row per linking ISSN (the last occurrence wins).
# Rebind the name instead of using inplace=True, which is the idiomatic pandas form.
sources = sources.drop_duplicates(subset='issn_l', keep="last")
len(sources)

#### Check missing journals

In [None]:
# Convert the de-duplicated frame back into a list of per-source dicts
sources = sources.to_dict(orient='records')

In [None]:
# Journals whose linking ISSN is not among the sources found so far
issn_list = [s['issn_l'] for s in sources]
# Vectorised membership test instead of scanning the list once per row
missing = journals[~journals['issn_l'].isin(issn_list)]
missing

In [None]:
# Try to get missing journals by title.
# A title search is fuzzy and can return several sources for one journal.
# Keep only the first hit (OpenAlex orders search results by relevance by
# default) so each journal adds at most one source; appending every hit would
# pull in unrelated sources and break the count check that follows.
missing_titles = list(missing['journal'])

found_titles = []
for title in missing_titles:
    results = Sources().filter(display_name={"search": title}).get()
    if results:
        sources.append(results[0])
        found_titles.append(title)

# Journals for which the title search returned nothing
missing = missing[~missing['journal'].isin(found_titles)]

# Write missing journals to file (create the output directory if needed)
os.makedirs("data", exist_ok=True)
missing.to_csv("data/" + field + "_missing.csv", index=False)
missing

In [None]:
# Every journal should be either matched to a source or listed as missing.
# A bare comparison is only displayed when it is the cell's last expression,
# so the result is printed explicitly. Should be True.
counts_match = len(sources) + len(missing) == len(journals)
print("Sources + missing equals number of journals:", counts_match)

print(len(sources), "sources in total for", field)

## Get works

In [None]:
# Map each source's short OpenAlex id (URL prefix removed) to its display name
s_ids_names = {}
for source in sources:
    short_id = source['id'].replace('https://openalex.org/','')
    s_ids_names[short_id] = source['display_name']

In [None]:
# Download every work of every collected source.
# Pause between sources, as the original did, to space out the requests.
SLEEP_BETWEEN_SOURCES = 30  # seconds

works = []
for s_id, s_name in s_ids_names.items():
    print("Getting source:", s_name)
    # paginate() stops after n_max=10000 records by default, which would
    # silently truncate large journals; n_max=None retrieves all results.
    paginator = (
        Works()
        .filter(primary_location={"source" : {"id" : s_id}})
        .paginate(per_page=200, n_max=None)
    )
    # The paginator yields pages (lists of works); flatten them into one list
    works.extend(itertools.chain.from_iterable(paginator))
    time.sleep(SLEEP_BETWEEN_SOURCES)

In [None]:
# Collect each work's abstract through key access on the work object.
# NOTE(review): presumably pyalex builds 'abstract' on access from the
# inverted index, so it is not a column of the frame by default - confirm.
abstracts = []
for work in works:
    abstracts.append(work['abstract'])

# One row per work, with the abstract column added explicitly
df = pd.DataFrame(works)
df['abstract'] = abstracts

In [None]:
# Save the works as CSV and JSON under data/<field>.*
# Create the output directory first so a missing data/ folder does not make
# the write fail after the long download.
os.makedirs("data", exist_ok=True)
df.to_csv("data/" + field + ".csv", index=False)
df.to_json("data/" + field + ".json")