In [19]:
import pandas as pd
import time
import requests
import pickle

In [12]:
df = pd.read_csv("../csv/2019_crossref_plus_pubmed.csv", index_col=0)
df.shape

(24909, 9)

In [15]:
df.iloc[:50]

Unnamed: 0,pmid,doi,affiliations,abstract,authors,journal title,article title,num_times_cited,num_references
0,30872305,10.7861/clinmedicine.19-2-169,['Walton Centre NHS Foundation Trust'],Neuromyelitis optica spectrum disorder (NMOSD)...,"['Huda, Saif', 'Whittam, Dan', 'Bhojak, Manees...",Clinical Medicine,Neuromyelitis optica spectrum disorders,83.0,60.0
1,30609105,10.1111/ced.13891,['Sheffield Teaching Hospitals NHS Foundation ...,Erythromelalgia is a condition characterized b...,"['Mann, N.', 'King, T.', 'Murphy, R.']",Clinical and Experimental Dermatology,Review of primary and secondary erythromelalgia,24.0,29.0
3,30525757,10.21037/cco.2018.11.03,"['University Hospital Birmingham', 'University...",Mycosis fungoides (MF) represents the majority...,"['Lovgren, Marie-Louise', 'Scarisbrick, Julia ...",Chinese Clinical Oncology,Update on skin directed therapies in mycosis f...,15.0,0.0
4,30500591,10.1016/j.wneu.2018.11.176,['Derriford Hospital'],BACKGROUND: Malignant middle cerebral artery i...,"['Das, Suparna', 'Mitchell, Patrick', 'Ross, N...",World Neurosurgery,Decompressive Hemicraniectomy in the Treatment...,28.0,40.0
5,30657163,10.1002/14651858.CD006583.pub5,['Pennine Acute Hospitals NHS Trust'],Update of Cochrane Database Syst Rev. 2015 ...,"['Ahmad, Gaity', 'Baker, Jade', 'Finnerty, Joh...",Cochrane Database of Systematic Reviews,Laparoscopic entry techniques,20.0,125.0
6,30242142,10.1136/heartjnl-2017-312755,['Imperial College Healthcare NHS Trust'],The adverse consequences of stable coronary ar...,"['Al-Lamee, Rasha K', 'Nowbar, Alexandra N', '...",Heart,Percutaneous coronary intervention for stable ...,19.0,48.0
7,30888709,10.1002/14651858.CD011847.pub2,"['Nottinghamshire Healthcare NHS Trust', 'Bass...",Update of doi: 10.1002/14651858.CD011847.,"['Sinclair, Diarmid JM', 'Zhao, Sai', 'Qi, Fan...",Cochrane Database of Systematic Reviews,Electroconvulsive therapy for treatment-resist...,8.0,159.0
8,30572752,10.1177/0967772018821839,[],,"['Larner, AJ']",Journal of Medical Biography,Editorial,0.0,6.0
9,30651265,10.7861/clinmedicine.19-1-91,['Oxford University Hospitals NHS Foundation T...,Comment on Clin Med (Lond). 2018 Aug;18(4):...,"['Brierley, Charlotte K', 'Pavord, Sue']",Clinical Medicine,Response,0.0,0.0
10,30936704,10.2147/TCRM.S160327,[],Despite advances in the diagnosis and manageme...,"['Gruffydd-Jones, Kevin']",Therapeutics and Clinical Risk Management,<p>Unmet needs in asthma</p>,13.0,0.0


In [10]:
df.isnull().sum()

date_published                 117
authors                          0
journal_title                  117
article_title                  117
all_num_citations_crossref     117
all_num_references_crossref    117
dtype: int64

In [41]:
email_address = "yiwench@gmail.com"

def get_info_from_unpaywall(df = df, tempsavepath = "../csv/test_2019_unpaywall_temp.csv",
                            finalsavepath = "../csv/test_2019_unpaywall.csv"):
    start_time = time.ctime()
    all_date_published_upw = []
    all_is_oa = []
    all_oa_status = []
    all_oa_locations = []
    all_genre = []
    errors = []
    for i in range(50):
        doi = df.iloc[i].doi
        url = f"https://api.unpaywall.org/v2/{doi}?email=yiwench@gmail.com"
        response = requests.get(url)
        try:
            metadata = response.json()
            if response.ok == True:
                # get publication date
                pub_date = metadata.get("published_date", "NaN")
                all_date_published_upw.append(pub_date)
                # get is_oa
                is_oa = metadata.get("is_oa", "NaN")
                all_is_oa.append(is_oa)
                # get oa_status
                oa_status = metadata.get("oa_status", "NaN")
                all_oa_status.append(oa_status)
                # get oa_locations
                oa_locations = metadata.get("oa_locations", "NaN")
                all_oa_locations.append(oa_locations)
                # get genre
                genre = metadata.get("genre", "NaN")
                all_genre.append(genre)
                # get missing data if not available through crossref
                if df.iloc[i].journal_title == "NaN":
                    df.iloc[i].journal_title = metadata.get("journal_name", "NaN")
                if df.iloc[i].article_title == "NaN":
                    df.iloc[i].article_title = metadata.get("title", "NaN")
            else:
                all_date_published_upw.append("NaN")
                all_is_oa.append("NaN")
                all_oa_status.append("NaN")
                all_oa_locations.append("NaN")
                all_genre.append("NaN")
        except:
            errors.append([doi])
# save temporary files
        if i % 100 == 0:
            mini_df_dict = {"date_published_upw" : all_date_published_upw,
                            "is_oa" : all_is_oa,
                            "oa_status": all_oa_status,
                            "oa_locations": all_oa_locations,
                            "genre" : all_genre
                            }
            mini_df = pd.DataFrame(mini_df_dict)
            mini_df.to_csv(tempsavepath)
            print(f"{i} / {df.shape[0]} articles metadata obtained from unpaywall")
            
    unpaywall_df_dict = {"date_published_upw" : all_date_published_upw,
                            "is_oa" : all_is_oa,
                            "oa_status": all_oa_status,
                            "oa_locations": all_oa_locations,
                            "genre" : all_genre
                            }
    unpaywall_df = pd.DataFrame(unpaywall_df_dict)
    unpaywall_df.to_csv(finalsavepath)
    print(f"Finished. Start time: {start_time}. Finish time: {time.ctime()}")
    with open("errors.pkl", "wb") as f:
        pickle.dump(errors, f)
    return unpaywall_df

In [42]:
unpaywall_df = get_info_from_unpaywall()

0 / 24909 articles metadata obtained from unpaywall
Finished. Start time: Mon Apr 11 22:26:13 2022. Finish time: Mon Apr 11 22:26:23 2022


In [44]:
unpaywall_df

Unnamed: 0,date_published_upw,is_oa,oa_status,oa_locations,genre
0,2019-03-01,True,gold,"[{'updated': '2020-11-23T10:26:42.514123', 'ur...",journal-article
1,2019-01-04,True,bronze,"[{'updated': '2021-09-15T06:51:20.019047', 'ur...",journal-article
2,2019-02-01,True,gold,"[{'updated': '2021-01-09T06:49:47.406929', 'ur...",journal-article
3,2019-03-01,False,closed,[],journal-article
4,2019-01-18,True,green,"[{'updated': None, 'url': 'https://europepmc.o...",journal-article
5,2018-09-21,False,closed,[],journal-article
6,2019-03-19,True,green,"[{'updated': None, 'url': 'https://europepmc.o...",journal-article
7,2018-12-20,True,bronze,"[{'updated': '2021-02-20T17:17:50.654199', 'ur...",journal-article
8,2019-01-01,True,gold,"[{'updated': '2020-11-03T13:09:50.632493', 'ur...",journal-article
9,2019-03-01,True,gold,"[{'updated': '2022-04-11T21:26:20.123138', 'ur...",journal-article
