In [1]:
import itertools
import os
import time

import pandas as pd
import pyalex
from pyalex import Sources
from pyalex import Works

In [2]:
# Optional contact email for the OpenAlex polite pool; submit an empty line to skip
contact_email = input()
pyalex.config.email = contact_email

juho.paakkonen@helsinki.fi


In [3]:
# Directory that the output files of this notebook are written to
data_dir = input()
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create
# race. An empty answer is left alone: os.makedirs("") would raise
# FileNotFoundError, and an empty prefix already means "current directory".
if data_dir:
    os.makedirs(data_dir, exist_ok=True)

data/


## Get sources

In [4]:
# Name of the field; it selects the journal list stored at journals/<field>.csv
field = input()
journals = pd.read_csv(f"journals/{field}.csv")

techlaw


In [5]:
# Number of rows (journals) read from the CSV
journals.shape[0]

37

In [6]:
# Look up the OpenAlex sources of the journals through their ISSN.
# The ISSNs are sent in batches of ten, because one request for the whole list
# did not seem to return every result.
# NOTE(review): presumably .get() returns only a single page of results, which
# would explain the missing results -- confirm against the pyalex docs.
issn_column = list(journals['issn_l'])
chunks = [issn_column[start:start+10] for start in range(0, len(journals), 10)]

sources = []
for issn_batch in chunks:
    # "|" joins the ISSNs of one batch into a single filter value
    results = Sources().filter(issn="|".join(issn_batch)).get()
    sources.extend(results)

len(sources)

33

#### Check and drop duplicates

In [7]:
# Put the sources in a DataFrame so that duplicates can be inspected and dropped
sources = pd.DataFrame(data=sources)

In [8]:
# Show every row whose issn_l value occurs more than once
sources.loc[
    sources.duplicated(subset='issn_l', keep=False)
]

Unnamed: 0,id,issn_l,issn,display_name,host_organization,host_organization_name,host_organization_lineage,works_count,cited_by_count,summary_stats,...,country_code,societies,alternate_titles,abbreviated_title,type,x_concepts,counts_by_year,works_api_url,updated_date,created_date
5,https://openalex.org/S146518762,1757-7632,"[1757-7632, 1757-7640]",The Journal of Media Law,https://openalex.org/P4310320547,Taylor & Francis,[https://openalex.org/P4310320547],267,741,"{'2yr_mean_citedness': 0.0, 'h_index': 1, 'i10...",...,,[],[],,journal,[],"[{'year': 2023, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=primary_...,2023-04-25T21:47:25.571117,2016-06-24
6,https://openalex.org/S4210185934,1757-7632,"[1757-7640, 1757-7632]",Journal Of Media Law,https://openalex.org/P4310320547,Taylor & Francis,[https://openalex.org/P4310320547],266,741,"{'2yr_mean_citedness': 0.5217391304347826, 'h_...",...,GB,[],[],,journal,"[{'id': 'https://openalex.org/C17744445', 'wik...","[{'year': 2023, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=primary_...,2023-04-26T00:28:20.756622,2022-02-03


In [9]:
# For each issn_l keep only the last row, then count what is left
sources = sources.drop_duplicates(subset='issn_l', keep="last")
len(sources)

32

#### Check missing journals

In [10]:
# Convert the DataFrame back to a list of dicts, one dict per source
sources = sources.to_dict(orient='records')

In [11]:
# issn_l values of all sources retrieved so far
issn_list = [source['issn_l'] for source in sources]


def _not_found(issn):
    """Return True when no retrieved source has this value as its issn_l."""
    return issn not in issn_list


# Journals from the input list for which no source was retrieved
missing = journals[journals['issn_l'].apply(_not_found)]
missing

Unnamed: 0,issn_l,journal
3,0897-1277,Jurimetrics
7,2666-139X,Technology and Regulation
19,2736-4321,Journal of Cross-Disciplinary Research in Comp...
20,1543-8899,Journal of Telecommunications and High Technol...
28,1941-5788,Journal of Business and Technology Law


In [12]:
# Try to recover the journals that the ISSN lookup missed by searching on title
# NOTE(review): every search hit is appended to sources, so a loosely matching
# title could pull in unrelated sources -- worth checking the results by hand.
missing_titles = list(missing['journal'])

for title in missing_titles:
    results = Sources().filter(display_name={"search":title}).get()
    sources += results
    if results:
        # At least one hit: the journal no longer counts as missing
        missing = missing[missing['journal'] != title]

# Write the journals that are still missing to file. os.path.join keeps the
# file inside data_dir even when the directory was entered without a trailing
# separator (plain concatenation would produce e.g. "datatechlaw_missing.csv").
missing.to_csv(os.path.join(data_dir, field + "_missing.csv"), index=False)
missing

Unnamed: 0,issn_l,journal
7,2666-139X,Technology and Regulation
19,2736-4321,Journal of Cross-Disciplinary Research in Comp...
20,1543-8899,Journal of Telecommunications and High Technol...


In [13]:
# Report how many sources were collected for the field
print(f"Found {len(sources)} sources in total for {field}")

Found 34 sources in total for techlaw


## Get works

In [14]:
# Map the id of each source, with the OpenAlex URL prefix removed, to its display name
s_ids_names = {}
for source in sources:
    short_id = source['id'].replace('https://openalex.org/', '')
    s_ids_names[short_id] = source['display_name']

In [15]:
# Seconds to wait after each source (presumably to go easy on the API -- confirm)
PAUSE_BETWEEN_SOURCES = 10

# Collect the works of every source into one flat list
works = []
for s_id, s_name in s_ids_names.items():
    print("Getting source:", s_name)
    paginator = Works().filter(primary_location={"source" : {"id" : s_id}}).paginate(per_page=200)
    # The paginator yields pages of works; chain them into the flat list
    works.extend(itertools.chain.from_iterable(paginator))
    time.sleep(PAUSE_BETWEEN_SOURCES)

Getting source: Computer Law & Security Review
Getting source: Journal of Intellectual Property Law & Practice
Getting source: Computer law review international
Getting source: Artificial Intelligence and Law
Getting source: Internet policy review
Getting source: Journal Of Media Law
Getting source: Journal of High Technology Law
Getting source: Journal of Technology Law & Policy
Getting source: Industrial Law Journal
Getting source: European journal of risk regulation
Getting source: International Review of Law, Computers & Technology
Getting source: European Data Protection Law Review
Getting source: International Journal of Law and Information Technology
Getting source: International Data Privacy Law
Getting source: Journal of International Commercial Law and Technology
Getting source: Law, Innovation and Technology
Getting source: Journal of responsible technology
Getting source: European review of private law
Getting source: Behavioral Sciences & The Law
Getting source: Informatio

In [17]:
# One row per work
df = pd.DataFrame(data=works)
# The abstract is read from each work by key and stored as its own column
# NOTE(review): presumably 'abstract' is not a plain key that pd.DataFrame
# picks up by itself, hence the separate assignment -- confirm against pyalex.
abstracts = [work['abstract'] for work in works]
df['abstract'] = abstracts

In [21]:
# Save all works of the field as JSON. os.path.join keeps the file inside
# data_dir even when the directory was entered without a trailing separator
# (plain concatenation would produce e.g. "datatechlaw.json").
df.to_json(os.path.join(data_dir, field + ".json"))