## Setup and imports

In [17]:
%pip install pyarrow

distutils: /opt/conda/include/python3.8/UNKNOWN
sysconfig: /opt/conda/include/python3.8[0m
user = False
home = None
root = None
prefix = None[0m
Collecting pyarrow
  Downloading pyarrow-9.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.3 MB)
[K     |████████████████████████████████| 35.3 MB 586 kB/s eta 0:00:01
Installing collected packages: pyarrow
distutils: /opt/conda/include/python3.8/UNKNOWN
sysconfig: /opt/conda/include/python3.8[0m
user = False
home = None
root = None
prefix = None[0m
Successfully installed pyarrow-9.0.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import math
import json
from pathlib import Path
import pickle
import time

from IPython.display import display, clear_output
import numpy as np
import pandas as pd
import requests

%matplotlib inline

# Directory (relative to the working directory) that holds the notebook's data files.
DATA_DIR = Path('./data')

# Results requested per page -- 200 is the API max.
PER_PAGE = 200

# Comma-separated filter expression: journal articles published after 2010 and before 2018.
# NOTE(review): not referenced in the cells visible here -- presumably used by the
# data-collection queries; confirm before changing.
SHARED_FILTERS = "type:journal-article,publication_year:>2010,publication_year:<2018"

## Build core DataFrame

In [2]:
# Input of this section: the collated raw records, stored as JSON under DATA_DIR.
RAW_COLLATED_PATH = DATA_DIR.joinpath('raw_collated.json')
# Output of this section: the flattened core DataFrame, stored as Feather under DATA_DIR.
CORE_DF_PATH = DATA_DIR.joinpath('core_df.feather')

In [3]:
# Read the whole collated file as UTF-8 text and parse it into Python objects.
raw_records = json.loads(RAW_COLLATED_PATH.read_text(encoding='UTF-8'))

In [4]:
# Verify there are no duplicate ids -- the two counts printed below should be equal.
n_records = len(raw_records)
n_unique_ids = len({r['id'] for r in raw_records})
print(n_records)
print(n_unique_ids)
# Fail loudly instead of relying on someone eyeballing the two numbers.
assert n_records == n_unique_ids, 'raw_records contains duplicate ids'

113937
113937


In [5]:
# Show the first raw record: each key with the Python type and the value stored under it.
sample_record = raw_records[0]
for key in sample_record:
    value = sample_record[key]
    print(key, type(value), value, '\n')

id <class 'str'> https://openalex.org/W2737572559 

doi <class 'str'> https://doi.org/10.1016/j.scitotenv.2017.07.125 

title <class 'str'> Perceived usefulness of personal protective equipment in pesticide use predicts farmers' willingness to use it 

display_name <class 'str'> Perceived usefulness of personal protective equipment in pesticide use predicts farmers' willingness to use it 

publication_year <class 'int'> 2017 

publication_date <class 'str'> 2017-12-31 

ids <class 'dict'> {'openalex': 'https://openalex.org/W2737572559', 'doi': 'https://doi.org/10.1016/j.scitotenv.2017.07.125', 'mag': '2737572559', 'pmid': 'https://pubmed.ncbi.nlm.nih.gov/28755601'} 

host_venue <class 'dict'> {'id': 'https://openalex.org/V86852077', 'issn_l': '0048-9697', 'issn': ['0048-9697', '1879-1026'], 'display_name': 'Science of The Total Environment', 'publisher': 'Elsevier', 'type': 'publisher', 'url': 'https://doi.org/10.1016/j.scitotenv.2017.07.125', 'is_oa': False, 'version': None, 'license'

In [12]:
# Some fields we might use but that aren't included:
#  'doi' -- for these data just use OpenAlexID
#  'type' -- all are journal-article
#  'is_paratext' -- none are paratext
#  'title' -- can be nice to look at but not using directly in the core DF
#  'publication_date' -- just using publication year
#  'cited_by_api_url' -- this is equal to "https://api.openalex.org/works?filter=cites:"+id

# URL prefix stripped from OpenAlex identifiers to obtain the short ID (e.g. 'W2737572559').
_OPENALEX_URL_PREFIX = 'https://openalex.org/'


def flatten_raw_record(rec, concept_levels=(0, 1)):
    """Flatten one raw work record into a flat dict of scalar fields.

    Parameters
    ----------
    rec : dict
        A raw record. The keys read are 'id', 'doi', 'ids', 'is_retracted',
        'publication_year', 'cited_by_count', 'referenced_works',
        'host_venue' (with 'id' and 'display_name'), 'authorships',
        'abstract_inverted_index' and 'concepts' (each concept having
        'level', 'score' and 'display_name').
    concept_levels : iterable of int, optional
        Concept levels to summarise. For each level ``n`` the output gets a
        ``concept{n}`` field. Defaults to ``(0, 1)``.

    Returns
    -------
    dict
        Keys, in insertion order: 'id', 'is_retracted', 'publication_year',
        'cited_by_count', 'ref_length', 'journal_id', 'journal_name',
        'n_authors', 'abstract_indexed', 'has_doi', 'has_pmid', followed by
        one ``concept{n}`` key per requested level. Boolean-like fields are
        stored as 0/1 ints.

    Raises
    ------
    KeyError
        If ``rec`` lacks one of the keys listed above.
    """
    new = {}

    new['id'] = rec['id'].replace(_OPENALEX_URL_PREFIX, '')

    for k in ('is_retracted', 'publication_year', 'cited_by_count'):
        new[k] = int(rec[k])

    new['ref_length'] = len(rec['referenced_works'])

    venue = rec['host_venue']
    venue_id = venue['id']
    # A venue without an id yields None instead of raising AttributeError.
    new['journal_id'] = (
        None if venue_id is None else venue_id.replace(_OPENALEX_URL_PREFIX, '')
    )
    new['journal_name'] = venue['display_name']

    new['n_authors'] = len(rec['authorships'])
    new['abstract_indexed'] = int(rec['abstract_inverted_index'] is not None)

    new['has_doi'] = int(rec['doi'] is not None)
    new['has_pmid'] = int('pmid' in rec['ids'])

    # https://docs.openalex.org/about-the-data/concept
    # taking the top concept in each level specified into concept0, concept1, ... fields
    for level in concept_levels:
        field = f'concept{level}'
        r_concepts = [c for c in rec['concepts'] if c['level'] == level]
        if r_concepts:
            # Sort by score and take the highest score's name -- could take concept.id instead.
            # The stable sort means that among tied scores the later-listed concept wins.
            new[field] = sorted(r_concepts, key=lambda x: x['score'])[-1]['display_name']
        else:
            new[field] = None

    return new

# Flatten the first raw record and print one (field, value) pair per line.
example_items = flatten_raw_record(raw_records[0]).items()
print('\n'.join(str(pair) for pair in example_items))


('id', 'W2737572559')
('is_retracted', 1)
('publication_year', 2017)
('cited_by_count', 28)
('ref_length', 38)
('journal_id', 'V86852077')
('journal_name', 'Science of The Total Environment')
('n_authors', 3)
('abstract_indexed', 1)
('has_doi', 1)
('has_pmid', 1)
('concept0', 'Business')
('concept1', 'Toxicology')


In [13]:
# Flatten every raw record into one row of the core DataFrame.
core_df = pd.DataFrame([flatten_raw_record(rec) for rec in raw_records])

# Print the value distribution of every column as a quick sanity check.
# (The bare `core_df` expression that used to sit here was a no-op: only the
# last expression of a cell is displayed.)
for field in core_df.columns:
    print('\n')
    print(field)
    print(core_df[field].value_counts())



id
W2769345897    1
W4252342547    1
W2112961075    1
W1985691903    1
W1879530215    1
              ..
W2186453442    1
W1969110046    1
W2174429140    1
W2775570563    1
W2768304598    1
Name: id, Length: 113937, dtype: int64


is_retracted
0    104391
1      9546
Name: is_retracted, dtype: int64


publication_year
2017    17696
2015    17584
2016    17483
2014    16574
2012    16136
2013    14996
2011    13468
Name: publication_year, dtype: int64


cited_by_count
0      16666
1       7011
2       5909
3       5207
4       4861
       ...  
858        1
730        1
538        1
559        1
511        1
Name: cited_by_count, Length: 587, dtype: int64


ref_length
0      14992
28      2161
30      2160
25      2138
26      2103
       ...  
795        1
284        1
412        1
477        1
221        1
Name: ref_length, Length: 375, dtype: int64


journal_id
V202381698     436
V12644804      327
V140251998     284
V196734849     197
V68497187      190
              ... 
V4210200

In [14]:
# Summary statistics from describe(), transposed so each column of core_df is one row.
core_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
is_retracted,113937.0,0.083783,0.277064,0.0,0.0,0.0,0.0,1.0
publication_year,113937.0,2014.157684,1.975616,2011.0,2012.0,2014.0,2016.0,2017.0
cited_by_count,113937.0,20.989205,68.317896,0.0,2.0,9.0,23.0,9992.0
ref_length,113937.0,31.629725,31.869641,0.0,12.0,27.0,43.0,1355.0
n_authors,113937.0,4.935543,14.337369,0.0,2.0,4.0,6.0,2864.0
abstract_indexed,113937.0,0.870981,0.335222,0.0,1.0,1.0,1.0,1.0
has_doi,113937.0,0.971116,0.167482,0.0,1.0,1.0,1.0,1.0
has_pmid,113937.0,0.593319,0.491217,0.0,0.0,1.0,1.0,1.0


In [15]:
# Summary of every column (include='all'), one row per column;
# statistics that do not apply to a column are shown as blanks instead of NaN.
core_df.describe(include='all').transpose().fillna('')

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,113937.0,113937.0,W2769345897,1.0,,,,,,,
is_retracted,113937.0,,,,0.083783,0.277064,0.0,0.0,0.0,0.0,1.0
publication_year,113937.0,,,,2014.157684,1.975616,2011.0,2012.0,2014.0,2016.0,2017.0
cited_by_count,113937.0,,,,20.989205,68.317896,0.0,2.0,9.0,23.0,9992.0
ref_length,113937.0,,,,31.629725,31.869641,0.0,12.0,27.0,43.0,1355.0
journal_id,113937.0,3013.0,V202381698,436.0,,,,,,,
journal_name,113937.0,3010.0,PLOS ONE,436.0,,,,,,,
n_authors,113937.0,,,,4.935543,14.337369,0.0,2.0,4.0,6.0,2864.0
abstract_indexed,113937.0,,,,0.870981,0.335222,0.0,1.0,1.0,1.0,1.0
has_doi,113937.0,,,,0.971116,0.167482,0.0,1.0,1.0,1.0,1.0


In [18]:
# Persist the core DataFrame in Feather format, then confirm the file is on disk.
core_df.to_feather(CORE_DF_PATH)
written_path = CORE_DF_PATH.relative_to('.')
print(written_path, CORE_DF_PATH.exists())

data/core_df.feather True
