# Open Access versions of articles in Australian HASS journals

This is work-in-progress. I'm currently trying to work out why the Unpaywall API reports some articles as 'closed' when they are actually 'bronze'. This obviously affects some of the results below.

[How Unpaywall calculates OA status](https://support.unpaywall.org/support/solutions/articles/44001777288)

In [213]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import requests_cache
from tqdm.auto import tqdm
import pandas as pd
import altair as alt
import collections

s = requests_cache.CachedSession()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[ 502, 503, 504 ])
s.mount('https://', HTTPAdapter(max_retries=retries))
s.mount('http://', HTTPAdapter(max_retries=retries))

tqdm.pandas(desc="records")

  from pandas import Panel


In [86]:
email = 'tim@discontents.com.au'

In [223]:
def get_total_results(issn):
    '''
    Get the total number of articles in CrossRef for this journal.
    '''
    response = s.get(f'https://api.crossref.org/journals/{issn}/works/', params={'rows': 0})
    data = response.json()
    try:
        total_works = data['message']['total-results']
    except KeyError:
        total_works = 0
    return total_works

def get_title(record):
    '''
    Titles are in a list – join any values
    '''
    title = record.get('title')
    if isinstance(title, list):
        title = ' – '.join(title)
    return title

def harvest_works(issn):
    '''
    Harvest basic details (DOI, title, date) of articles from the journal with the supplied ISSN from CrossRef.
    '''
    harvested = 0
    works = []
    total_results = get_total_results(issn)
    params = {
        'rows': 100,
        'offset': 0
    }
    headers = {
        'User-Agent': f'Jupyter Notebook (mailto:{email})'
    }
    with tqdm(total=total_results) as pbar:
        while harvested <= total_results:
            params['offset'] = harvested
            response = s.get(f'https://api.crossref.org/journals/{issn}/works/', params=params, headers=headers)
            data = response.json()
            try:
                records = data['message']['items']
            except TypeError:
                print('TYPEERROR')
                print(data)
            else:
                for record in records:
                    try:
                        works.append({'doi': record.get('DOI'), 'title': get_title(record), 'year': record['issued']['date-parts'][0][0]})
                    except KeyError:
                        print('KEYERROR')
                        print(record)
            harvested += 100
            pbar.update(len(data['message']['items']))
    return works

def get_oa_status(doi):
    '''
    Get OA status of DOI from the Unpaywall API.
    '''
    response = s.get(f'https://api.unpaywall.org/v2/{doi}?email={email}')
    data = response.json()
    return data['oa_status']

def create_scale(df):
    '''
    Set colour range to match the OA status types.
    '''
    scale = []
    colours = collections.OrderedDict()
    colours['hybrid'] = 'gold'
    colours['green'] = 'green'
    colours['bronze'] = 'brown'
    colours['closed'] = 'lightgrey'
    status_values = list(df['oa_status'].unique())
    for status, colour in colours.items():
        if status in status_values:
            scale.append(colour)
    return scale

def chart_oa_status(df, title):
    # Adding a numeric order column makes it easy to sort by oa_status
    df['order'] = df['oa_status'].replace({val: i for i, val in enumerate(['closed', 'bronze', 'green', 'hybrid'])})
    # Get colour values
    scale = create_scale(df)
    chart = alt.Chart(df).mark_bar().encode(
        x=alt.X('year:O', title='Year'),
        y=alt.Y('count():Q', title='Number of articles', axis=alt.Axis(tickMinStep=1)),
        color=alt.Color('oa_status:N', scale=alt.Scale(range=scale), legend=alt.Legend(title='OA type'), sort=alt.EncodingSortField('order', order='descending')),
        order='order',
        tooltip=[alt.Tooltip('count():Q', title='Number of articles'), alt.Tooltip('oa_status', title='OA type')]
    ).properties(title=title)
    display(chart)

## Australian Historical Studies

In [69]:
works_ahs = harvest_works('1031-461X')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1548.0), HTML(value='')))




In [70]:
df_ahs = pd.DataFrame(works)
df_ahs.shape

(1548, 3)

In [71]:
# Make sure there's no duplicates
df_ahs.drop_duplicates(inplace=True)
df_ahs.shape

(1548, 3)

In [72]:
df_ahs['title'].value_counts()[:25]

Editorial board                                                                                    36
Books                                                                                              35
Book notes                                                                                         30
In this issue                                                                                      20
In This Issue                                                                                      16
Notes on Contributors                                                                              16
Exhibitions                                                                                        12
Book reviews                                                                                       12
Book Notes                                                                                         10
Exhibition                                                                        

In [73]:
# Get rid of titles that appear more than once
df_ahs_unique = df_ahs.copy().drop_duplicates(subset='title', keep=False)
df_ahs_unique.shape

(1305, 3)

In [74]:
df_ahs_unique['oa_status']  = df_ahs_unique['doi'].progress_apply(get_oa_status)

HBox(children=(HTML(value='records'), FloatProgress(value=0.0, max=1305.0), HTML(value='')))




In [75]:
df_ahs_unique['oa_status'].value_counts()

closed    1236
green       36
bronze      28
hybrid       5
Name: oa_status, dtype: int64

In [85]:
df_ahs_unique['oa_status'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

closed    94.7%
green      2.8%
bronze     2.1%
hybrid     0.4%
Name: oa_status, dtype: object

In [224]:
chart_oa_status(df_ahs_unique, title='Australian Historical Studies')

## History Australia

In [40]:
works_ha = harvest_works('1449-0854')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1249.0), HTML(value='')))




In [42]:
df_ha = pd.DataFrame(works_ha)
df_ha.shape

(1249, 3)

In [49]:
df_ha.drop_duplicates(inplace=True)
df_ha.shape

(1249, 3)

In [46]:
df_ha.loc[df_ha['title'].isnull()]

Unnamed: 0,doi,title,year
101,10.2104/ha.2007.4.issue-2,,2007
214,10.2104/ha.2006.3.issue-1,,2006
758,10.2104/ha.2008.5.issue-2,,2008
867,10.2104/ha.2008.5.issue-3,,2008
994,10.2104/ha.2006.3.issue-2,,2006
1040,10.2104/ha.2007.4.issue-1,,2007


In [50]:
df_ha.dropna(subset=['title'], inplace=True)
df_ha.shape

(1243, 3)

In [45]:
df_ha['title'].value_counts()[:30]

From the President                                                                                                      46
From the Editors                                                                                                        35
AHA Honour Roll                                                                                                         15
Exhibition Reviews                                                                                                      14
AHA Calendar of Events                                                                                                  12
Book Reviews                                                                                                            11
From the Editor                                                                                                         10
AHA Prize and Award Winners                                                                                              9
Australian Histo

In [51]:
df_ha_unique = df_ha.copy().drop_duplicates(subset='title', keep=False)
df_ha_unique.shape

(1039, 3)

In [65]:
df_ha_unique['oa_status']  = df_ha_unique['doi'].progress_apply(get_oa_status)

HBox(children=(HTML(value='records'), FloatProgress(value=0.0, max=1039.0), HTML(value='')))




In [80]:
df_ha_unique['oa_status'].value_counts()

closed    986
green      27
bronze     25
hybrid      1
Name: oa_status, dtype: int64

In [84]:
df_ha_unique['oa_status'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

closed    94.9%
green      2.6%
bronze     2.4%
hybrid     0.1%
Name: oa_status, dtype: object

In [148]:
chart_oa_status(df_ha_unique, title='History Australia')

## Australian Journal of Politics and History

In [132]:
works_ajph = harvest_works('1467-8497')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1944.0), HTML(value='')))




In [133]:
df_ajph = pd.DataFrame(works_ajph)
df_ajph.shape

(1944, 3)

In [134]:
df_ajph.drop_duplicates(inplace=True)
df_ajph.shape

(1944, 3)

In [135]:
df_ajph.loc[df_ajph['title'].isnull()]

Unnamed: 0,doi,title,year
57,10.1111/ajph.2008.54.issue-4,,2008
63,10.1111/ajph.2009.55.issue-1,,2009
80,10.1111/ajph.2008.54.issue-3,,2008
86,10.1111/ajph.2000.46.issue-1,,2000
87,10.1111/ajph.2002.48.issue-1,,2002
...,...,...,...
1818,10.1111/ajph.v66.1,,2020
1868,10.1111/ajph.v66.4,,2020
1869,10.1111/ajph.v65.4,,2019
1907,10.1111/ajph.v66.2,,2020


In [136]:
df_ajph.dropna(subset=['title'], inplace=True)
df_ajph.shape

(1787, 3)

In [137]:
df_ajph['title'].value_counts()[:40]

Book Reviews                                                                                     106
Book Notes                                                                                        52
QUEENSLAND                                                                                        18
SOUTH AUSTRALIA                                                                                   17
VICTORIA                                                                                          17
TASMANIA                                                                                          17
Political Chronicles                                                                              16
WESTERN AUSTRALIA                                                                                 15
NEW SOUTH WALES                                                                                   15
BOOK REVIEWS                                                                               

In [138]:
df_ajph_unique = df_ajph.copy().drop_duplicates(subset='title', keep=False)
df_ajph_unique.shape

(1400, 3)

In [139]:
df_ajph_unique['oa_status']  = df_ajph_unique['doi'].progress_apply(get_oa_status)

HBox(children=(HTML(value='records'), FloatProgress(value=0.0, max=1400.0), HTML(value='')))




In [140]:
df_ajph_unique['oa_status'].value_counts()

closed    1340
bronze      36
green       22
hybrid       2
Name: oa_status, dtype: int64

In [141]:
df_ajph_unique['oa_status'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

closed    95.7%
bronze     2.6%
green      1.6%
hybrid     0.1%
Name: oa_status, dtype: object

In [147]:
chart_oa_status(df_ajph_unique, title='Australian Journal of Politics and History')

## Journal of Australian Studies

ISSN: '1444-3058'

In [110]:
works_jas = harvest_works('1444-3058')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2113.0), HTML(value='')))




In [111]:
df_jas = pd.DataFrame(works_jas)
df_jas.shape

(2113, 3)

In [112]:
df_jas.drop_duplicates(inplace=True)
df_jas.shape

(2113, 3)

In [113]:
df_jas.loc[df_jas['title'].isnull()]

Unnamed: 0,doi,title,year


In [114]:
df_jas.dropna(subset=['title'], inplace=True)
df_jas.shape

(2113, 3)

In [115]:
df_jas['title'].value_counts()[:30]

Editorial board                                                                                                                                         49
Notes on contributors                                                                                                                                   40
Notes                                                                                                                                                   32
Contributors                                                                                                                                            31
Notes on Contributors                                                                                                                                   28
Reviews                                                                                                                                                 28
Book reviews                                                          

In [116]:
df_jas_unique = df_jas.copy().drop_duplicates(subset='title', keep=False)
df_jas_unique.shape

(1732, 3)

In [117]:
df_jas_unique['oa_status']  = df_jas_unique['doi'].progress_apply(get_oa_status)

HBox(children=(HTML(value='records'), FloatProgress(value=0.0, max=1732.0), HTML(value='')))




In [118]:
df_jas_unique['oa_status'].value_counts()

closed    1632
green       71
bronze      26
hybrid       3
Name: oa_status, dtype: int64

In [119]:
df_jas_unique['oa_status'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

closed    94.2%
green      4.1%
bronze     1.5%
hybrid     0.2%
Name: oa_status, dtype: object

In [146]:
chart_oa_status(df_jas_unique, title='Journal of Australian Studies')

## Australian Archaeology

ISSN: 0312-2417

In [144]:
works_aa = harvest_works('0312-2417')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1484.0), HTML(value='')))




In [150]:
df_aa = pd.DataFrame(works_aa)
df_aa.shape

(1484, 3)

In [151]:
df_aa.drop_duplicates(inplace=True)
df_aa.shape

(1484, 3)

In [152]:
df_aa.loc[df_aa['title'].isnull()]

Unnamed: 0,doi,title,year


In [153]:
df_aa.dropna(subset=['title'], inplace=True)
df_aa.shape

(1484, 3)

In [154]:
df_aa['title'].value_counts()[:30]

Editorial                                                                                                                                                        57
Book Reviews                                                                                                                                                     34
Front Matter                                                                                                                                                     27
Thesis Abstracts                                                                                                                                                 26
Backfill                                                                                                                                                         23
editorial                                                                                                                                                         8
debitage        

In [155]:
df_aa_unique = df_aa.copy().drop_duplicates(subset='title', keep=False)
df_aa_unique.shape

(1268, 3)

In [156]:
df_aa_unique['oa_status']  = df_aa_unique['doi'].progress_apply(get_oa_status)

HBox(children=(HTML(value='records'), FloatProgress(value=0.0, max=1268.0), HTML(value='')))




In [157]:
df_aa_unique['oa_status'].value_counts()

closed    1057
green      188
bronze      21
hybrid       2
Name: oa_status, dtype: int64

In [158]:
df_aa_unique['oa_status'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

closed    83.4%
green     14.8%
bronze     1.7%
hybrid     0.2%
Name: oa_status, dtype: object

In [160]:
chart_oa_status(df_aa_unique, title='Australian Archaeology')

## Archives and Manuscripts

ISSN: 0157-6895

In [161]:
works_am = harvest_works('0157-6895')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=340.0), HTML(value='')))




In [162]:
df_am = pd.DataFrame(works_am)
df_am.shape

(340, 3)

In [163]:
df_am.drop_duplicates(inplace=True)
df_am.shape

(340, 3)

In [164]:
df_am.loc[df_am['title'].isnull()]

Unnamed: 0,doi,title,year


In [165]:
df_am.dropna(subset=['title'], inplace=True)
df_am.shape

(340, 3)

In [166]:
df_am['title'].value_counts()[:30]

Editorial                                                                                                                                                          14
Editorial Board                                                                                                                                                     3
Archival Anxiety and the Vocational Calling                                                                                                                         2
Corrigendum                                                                                                                                                         2
Records and Information Management                                                                                                                                  2
Unresolved issues: recordkeeping recommendations arising from Australian commissions of inquiry into the welfare of children in out-of-home care, 1997–2012         1
Disp

In [167]:
df_am_unique = df_am.copy().drop_duplicates(subset='title', keep=False)
df_am_unique.shape

(317, 3)

In [168]:
df_am_unique['oa_status']  = df_am_unique['doi'].progress_apply(get_oa_status)

HBox(children=(HTML(value='records'), FloatProgress(value=0.0, max=317.0), HTML(value='')))




In [169]:
df_am_unique['oa_status'].value_counts()

closed    243
bronze     46
green      26
hybrid      2
Name: oa_status, dtype: int64

In [170]:
df_am_unique['oa_status'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

closed    76.7%
bronze    14.5%
green      8.2%
hybrid     0.6%
Name: oa_status, dtype: object

In [172]:
chart_oa_status(df_am_unique, title='Archives and Manuscripts')

## Journal of the Australian Library and Information Association
Australian Academic and Research Libraries ISSN: 0004-8623

Journal of ALIA ISSN: 2475-0158

In [173]:
works_aarn = harvest_works('0004-8623')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1335.0), HTML(value='')))




In [190]:
df = pd.DataFrame(works_aarn)
df.loc[df['year'] == 2000].shape

(32, 3)

In [174]:
works_jalia = harvest_works('2475-0158')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=334.0), HTML(value='')))




In [176]:
df_jalia = pd.concat([pd.DataFrame(works_aarn), pd.DataFrame(works_jalia)]) 
df_jalia.shape

(1669, 3)

In [177]:
df_jalia.drop_duplicates(inplace=True)
df_jalia.shape

(1669, 3)

In [178]:
df_jalia.loc[df_jalia['title'].isnull()]

Unnamed: 0,doi,title,year


In [179]:
df_jalia.dropna(subset=['title'], inplace=True)
df_jalia.shape

(1669, 3)

In [182]:
df_jalia['title'].value_counts()[:40]

Publications Received                                                                                                        66
Book Reviews                                                                                                                 55
Editorial                                                                                                                    50
Front Matter                                                                                                                 26
Reviews                                                                                                                      26
Review Article                                                                                                               14
Conference Reports                                                                                                            8
Conference Report                                                                                       

In [192]:
df_jalia_unique = df_jalia.copy().drop_duplicates(subset='title', keep=False)
df_jalia_unique.shape

(1323, 5)

In [185]:
df_jalia_unique['oa_status']  = df_jalia_unique['doi'].progress_apply(get_oa_status)

HBox(children=(HTML(value='records'), FloatProgress(value=0.0, max=1323.0), HTML(value='')))




In [186]:
df_jalia_unique['oa_status'].value_counts()

closed    695
bronze    561
green      66
hybrid      1
Name: oa_status, dtype: int64

In [187]:
df_jalia_unique['oa_status'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

closed    52.5%
bronze    42.4%
green      5.0%
hybrid     0.1%
Name: oa_status, dtype: object

In [188]:
chart_oa_status(df_jalia_unique, title='Journal of the Australian Library and Information Association')

In [206]:
df_jalia_unique.loc[df_jalia_unique['year'] == 2000]

Unnamed: 0,doi,title,year,oa_status,order
667,10.1080/00048623.2000.10755120,Horst Kunze and Living History,2000,closed,0
687,10.1080/00048623.2000.10755130,Perfect One Day—Digital The Next: Challenges i...,2000,closed,0
697,10.1080/00048623.2000.10755137,Australian Library and Information Association...,2000,bronze,1
701,10.1080/00048623.2000.10755134,Documenting The Business of Government—Archiva...,2000,bronze,1
742,10.1080/00048623.2000.10755109,Library Provided Information and Clinical Deci...,2000,bronze,1
750,10.1080/00048623.2000.10755118,Improving Access for the Public to the Collect...,2000,closed,0
752,10.1080/00048623.2000.10755115,Errata:The Chinese Advertiser,2000,bronze,1
753,10.1080/00048623.2000.10755112,ALIA Vouchers and the GST,2000,closed,0
755,10.1080/00048623.2000.10755108,From Vision to Reality: The Evolution of a Lib...,2000,green,2
758,10.1080/00048623.2000.10755117,Passive Environmental Control for Small Cultur...,2000,bronze,1


## Labour History

ISSN: 0023-6942

In [191]:
works_lh = harvest_works('0023-6942')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2792.0), HTML(value='')))




In [198]:
df_lh = pd.DataFrame(works_lh)
df_lh.shape

(2792, 3)

In [199]:
df_lh.drop_duplicates(inplace=True)
df_lh.shape

(2792, 3)

In [200]:
df_lh.loc[df_lh['title'].isnull()]

Unnamed: 0,doi,title,year


In [201]:
df_lh.dropna(subset=['title'], inplace=True)
df_lh.shape

(2792, 3)

In [202]:
df_lh['title'].value_counts()[:30]

                                                                                                                280
Review                                                                                                           54
Front Matter                                                                                                     19
Back Matter                                                                                                      17
EDITORIAL                                                                                                         8
Editorial                                                                                                         8
Introduction                                                                                                      4
Notice Board                                                                                                      3
The Labor Government in the Second World War: A Memoir                  

In [204]:
df_lh_unique = df_lh.copy().drop_duplicates(subset='title', keep=False)
df_lh_unique.shape

(2375, 3)

In [205]:
df_lh_unique['oa_status']  = df_lh_unique['doi'].progress_apply(get_oa_status)

HBox(children=(HTML(value='records'), FloatProgress(value=0.0, max=2375.0), HTML(value='')))




In [207]:
df_lh_unique['oa_status'].value_counts()

closed    2229
green      146
Name: oa_status, dtype: int64

In [208]:
df_lh_unique['oa_status'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

closed    93.9%
green      6.1%
Name: oa_status, dtype: object

In [225]:
chart_oa_status(df_lh_unique, title='Labour History')