In [1]:
import pandas as pd
import numpy as np
import requests


In [2]:
df = pd.read_csv('../csv/2019_pubmed_data_parsed.csv', index_col=0)
df.head()

Unnamed: 0,pmid,doi,affiliations,abstract
0,30872305,10.7861/clinmedicine.19-2-169,['Walton Centre NHS Foundation Trust'],Neuromyelitis optica spectrum disorder (NMOSD)...
1,30609105,10.1111/ced.13891,['Sheffield Teaching Hospitals NHS Foundation ...,Erythromelalgia is a condition characterized b...
2,31198893,,"['Royal Wolverhampton NHS Trust', 'University ...",Objective. Corneal neovascularization is a sig...
3,30525757,10.21037/cco.2018.11.03,"['University Hospital Birmingham', 'University...",Mycosis fungoides (MF) represents the majority...
4,30500591,10.1016/j.wneu.2018.11.176,['Derriford Hospital'],BACKGROUND: Malignant middle cerebral artery i...


In [3]:
df.isnull().sum()

pmid               0
doi              165
affiliations       0
abstract        2795
dtype: int64

In [4]:
df.shape

(25074, 4)

In [5]:
no_doi = df["doi"].notna()
df = df[no_doi]
df.shape

(24909, 4)

In [6]:
count = 0
for i in range(df.shape[0]):
    df.iloc[i].doi
    count +=1
print(count)

24909


In [7]:

headers = {
    'User-Agent': 'NHS_OA (https://github.com/yiwen-h/nhs_oa/; mailto:yiwench@gmail.com)',
    'From': 'yiwench@gmail.com'
}

all_date_published = []
all_author_list = []
all_journal_title = []
all_article_title = []
all_num_citations_crossref = []
all_num_references_crossref = []


for i in range(df.shape[0]):
    doi = df.iloc[i].doi
    url = f"https://api.crossref.org/works/{doi}"
    response = requests.get(url, headers=headers)
    try:
        response = response.json()
        metadata = response.get('message')
        # get publication date
        date_as_list = metadata.get('published').get('date-parts')[0]
        try:
            pub_date = f"{date_as_list[0]}-{date_as_list[1]}"
        except:
            pub_date = f"{date_as_list[0]}"
        all_date_published.append(pub_date)
        # get authors
        author_list = []
        authors_metadata = metadata.get('author')
        for author in authors_metadata:
            author_name = f"{author.get('family')}, {author.get('given')}"
            author_list.append(author_name)
        all_author_list.append(author_list)
        # get journal title
        all_journal_title.append(metadata.get('container-title')[0])
        # get article title
        all_article_title.append(metadata.get('title')[0])
        # get num_citations
        all_num_citations_crossref.append(metadata.get("is-referenced-by-count"))
        # get num_references
        all_num_references_crossref.append(metadata.get("references-count"))
    except:
        all_date_published.append("NaN")
        all_author_list.append(["NaN"])
        all_journal_title.append("NaN")
        all_article_title.append("NaN")
        all_num_citations_crossref.append("NaN")
        all_num_references_crossref.append("NaN")

In [16]:
df['authors'] = all_author_list
df['journal title'] = all_journal_title
df['article title'] = all_article_title
df['num_times_cited'] = all_num_citations_crossref
df['num_references'] = all_num_references_crossref

In [17]:
df.to_csv("../csv/2019_crossref_plus_pubmed.csv")

In [9]:
print(len(all_date_published))
print(len(all_author_list))
print(len(all_journal_title))
print(len(all_article_title))
print(len(all_num_citations_crossref))
print(len(all_num_references_crossref))

24935
24909
24909
24909
24909
24909


In [13]:
len([x for x in all_date_published if x == "NaN"])

189

In [15]:
print(all_date_published)

['2019-3', '2019-1', '2019-2', '2019-3', '2019-1', '2018-9', '2019-3', '2018-12', '2019-1', '2019-3', '2019-3', '2019-1', '2019-1', '2019', '2019-4', '2019-2', '2019-1', '2019-1', '2019-3', '2019-1', '2019-2', '2019-7', '2019-5', '2019-1', '2019-1', '2019-1', '2019-11', '2019-3', '2019-12', '2019-7', '2019-1', '2019-3', '2019-3', '2019-3', '2019-2', '2019-1', '2019-8', '2019-6', '2020-1', '2019-3', '2019-7', '2019-2', '2019-7', '2019-4', '2019-2', '2019-5', '2019-1', '2019-2', '2019-1', '2019-1', '2019-5', '2019-3', '2018-12', '2019-2', '2019-9', '2019-4', '2019-2', '2019-2', '2019-1', '2019-3', '2019-1', '2018-5', '2019-3', '2019-3', '2019-11', '2019-1', '2019-2', '2019-1', '2019-3', '2019-4', '2019-5', '2019-3', '2019-2', '2019-6', '2019-3', '2019-4', '2019-3', '2018-11', '2019-3', '2019-1', '2018-12', '2018-12', '2019-1', '2019-1', '2019-10', '2019-1', '2019-4', '2019', '2018-3', '2019-2', '2019', '2019-1', '2019-1', '2019-1', '2019-5', '2019-3', '2019-2', '2019-3', '2019-5', '2019-

In [20]:
index = set(all_date_published)
for item in index:
    print(f"{item}: {all_date_published.count(item)}")

2020: 17
2017-9: 17
2018-6: 101
2017-11: 18
2019-5: 1828
2016-12: 3
2018-11: 397
2019-6: 1899
2016-4: 2
2020-7: 37
2021-3: 5
2018-12: 434
2018-3: 64
2019-9: 1756
2019-4: 1806
2017-4: 2
2021-8: 1
2021: 1
2018: 6
2017-6: 8
2019-1: 1756
2019-10: 1566
2017-12: 23
2016-10: 4
2020-2: 285
2019: 441
2018-8: 140
2019-11: 1633
2018-9: 210
2017-5: 7
2020-12: 11
2017-3: 2
2018-2: 31
2016-5: 2
2021-2: 3
2018-5: 75
2020-4: 141
2016-2: 1
2017-8: 7
2019-8: 1787
2018-10: 321
2016-1: 1
2019-12: 1562
2016-11: 1
2020-5: 89
2021-9: 1
2021-5: 1
2021-4: 1
2020-11: 18
2017-10: 13
2015-12: 1
2019-2: 1625
2020-1: 333
2019-3: 1803
2018-1: 23
NaN: 189
2020-6: 61
2018-4: 67
2017-7: 8
2021-1: 8
2020-3: 198
2018-7: 96
2020-8: 21
2020-10: 6
2016-8: 2
2019-7: 1932
2020-9: 20
2015-9: 1
2016-6: 1
2017-1: 5
