In [15]:
import pandas as pd
import os
from bokeh.charts import Bar, Donut, output_file, show
from bokeh.io import output_notebook
output_notebook()
from collections import defaultdict

os.chdir('C:/Users/Poto/Desktop')
d = pd.read_csv('pubmed82.csv')

In [2]:
d.columns

Index(['_id', 'author', 'date', 'journal', 'keywords', 'org', 'country',
       'title', 'url', 'Unnamed: 9'],
      dtype='object')

In [3]:
# Data preprocessing (Since the org data is quite messy, I only picked the firt 300 article information)
city = defaultdict(int)
city_list = set([])
for x in d['country'][:296]:
    # Get rid of '.'
    x = x.replace('.','')
    for y in x.split(';'):
        city[y.strip(' ')] += 1
        city_list.add(y.strip(' '))

In [4]:
# Sort first 300 cities and pick cities with the top appearance (appreace > 2 times)
sorted_city_list = sorted(city.items(), key=lambda x: x[1], reverse= True)
city_list_top = [x for x in sorted_city_list if x[1] > 2]

city_list_top

[('Boston, MA, USA', 40),
 ('Salt Lake City, UT, USA', 25),
 ('New York, NY, USA', 23),
 ('Rochester, MN, USA', 17),
 ('Houston, TX, USA', 17),
 ('Seattle, WA, USA', 15),
 ('Chicago, IL, USA', 15),
 ('Bethesda, MD, USA', 14),
 ('London, UK', 12),
 ('Nashville, TN, USA', 11),
 ('ORGANIZATION_NA', 11),
 ('Minneapolis, MN, USA', 10),
 ('Pittsburgh, PA, USA', 10),
 ('Philadelphia, PA, USA', 9),
 ('Beijing, China', 7),
 ('Ann Arbor, MI, USA', 7),
 ('Washington, DC, USA', 7),
 ('Stanford, CA, USA', 7),
 ('Los Angeles, CA, USA', 7),
 ('Atlanta, GA, USA', 6),
 ('Silver Spring, MD, USA', 6),
 ('Albany, NY, USA', 6),
 ('Cincinnati, OH, USA', 6),
 ('Birmingham, AL, USA', 6),
 ('Lebanon, NH, USA', 5),
 ('Sydney, Australia', 5),
 ('Baltimore, MD, USA', 5),
 ('Columbus, OH, USA', 5),
 ('Paris, France', 5),
 ('Charleston, SC, USA', 4),
 ('Hanover, NH, USA', 4),
 ('Palo Alto, CA, USA', 4),
 ('Cambridge, MA, USA', 4),
 ('Shenzhen, China', 4),
 ('Portland, OR, USA', 4),
 ('Cambridge, UK', 4),
 ('Marshfi

In [5]:
# Get selected city list
unique_list =[x[0] for x in city_list_top]

In [6]:
# Find relationship between each cities
# To avoid duplicate, cities will only list cities in an alphabetical order
relation = {}

for x in unique_list:
    relation[x] = defaultdict(int)

for x in d['country'][:296]:
    x = x.replace('.','')
    temp = []
    for y in x.split(';'):
        temp.append(y.strip())
    
    for z in temp:
        if z in unique_list:
            for m in temp:
                if m != z and m > z and m in unique_list:
                    relation[z][m] += 1

In [7]:
# Adapt a little bit above code to get all the cities with their cooperation cities
relat = {}

for x in unique_list:
    relat[x] = defaultdict(int)

for x in d['country'][:296]:
    x = x.replace('.','')
    temp = []
    for y in x.split(';'):
        temp.append(y.strip())
    
    for z in temp:
        if z in unique_list:
            for m in temp:
                if m != z and m in unique_list:
                    relat[z][m] += 1

In [8]:
# Calcualte how many cooperation cities each city has
temp = sorted(relat.items(), key = lambda x:len(x[1]), reverse = True)
for v in temp:
    print(v[0], len(v[1]))

Boston, MA, USA 25
New York, NY, USA 25
Seattle, WA, USA 23
Rochester, MN, USA 20
Salt Lake City, UT, USA 18
Chicago, IL, USA 18
Atlanta, GA, USA 16
Houston, TX, USA 15
Nashville, TN, USA 15
Philadelphia, PA, USA 15
Stanford, CA, USA 14
Bethesda, MD, USA 12
Minneapolis, MN, USA 11
Ann Arbor, MI, USA 11
Palo Alto, CA, USA 11
Cincinnati, OH, USA 10
Marshfield, WI, USA 10
Pittsburgh, PA, USA 9
Lebanon, NH, USA 9
Charleston, SC, USA 9
Detroit, MI, USA 8
Beijing, China 7
Birmingham, AL, USA 7
Baltimore, MD, USA 7
Cambridge, UK 7
Cleveland, OH, USA 7
Madrid, Spain 7
London, UK 6
Los Angeles, CA, USA 6
Sydney, Australia 6
Singapore 6
Portland, OR, USA 5
Seoul, Korea 5
West Haven, CT, USA 4
Denver, CO, USA 4
San Diego, CA, USA 4
Washington, DC, USA 3
Cambridge, MA, USA 3
Shanghai, China 3
Buffalo, NY, USA 3
Albany, NY, USA 2
Paris, France 2
Hanover, NH, USA 2
New Haven, CT, USA 2
Providence, RI, USA 2
Orsay, France 2
Austin, TX, USA 2
Taipei, Taiwan 2
Silver Spring, MD, USA 1
Columbus, OH, USA

In [9]:
# Get all the country names and their counts > 2
country_count = defaultdict(int)
count_list = [(x[0], len(x[1])) for x in temp if len(x[1]) > 2]

for x in count_list:
    cname = x[0].split(',')[-1:]
    temp_count = x[1]
    country_count[cname[0].strip()] += temp_count

country_count

defaultdict(int,
            {'Australia': 6,
             'China': 10,
             'Korea': 5,
             'Singapore': 6,
             'Spain': 7,
             'UK': 13,
             'USA': 357})

In [10]:
# Plot pie chart
data = pd.Series(list(dict(country_count).values()), index = list(dict(country_count).keys()))
pie_chart = Donut(data, title = 'Number of Papers Published by Countries in Clinical NLP')
show(pie_chart)

In [10]:
# Plot number of papers published each year
yearPub = [x.split()[0] for x in d['date']]

yearCount = defaultdict(int)
for y in yearPub:
    if y.isdigit():
        yearCount[y] += 1

countFrame = pd.DataFrame()
countFrame['Year'] = [int(x) for x in yearCount.keys()]
countFrame['Count'] = [int(x) for x in yearCount.values()]
countFrame

Unnamed: 0,Year,Count
0,2017,120
1,2016,135
2,2015,248
3,2014,167
4,2013,179
5,2012,113
6,2011,69
7,2010,81
8,2009,73
9,2008,141


In [14]:
# Plot bar chart
p = Bar(countFrame, 'Year', values='Count', title="Yearly Published Clinical NLP Papers")
show(p)

In [11]:
# Processing author name
author_name_all = [phrase.split(',') for phrase in d['author']]
author_list = []
for x in author_name_all:
    author_list += x

author_set = set()
author_collect = []
for y in author_list:
    edit = y.strip().replace('.', '')
    author_set.add(edit)
    author_collect.append(edit)

print('Numbers of Researchers:', len(author_set))

Numbers of Researchers: 4725


In [12]:
# Count appearance of each authors and pick top 30
author_count = defaultdict(int)
for x in author_collect:
    author_count[x] += 1

author30 = sorted(author_count.items(), key=lambda x: x[1], reverse=True)[:30]
author30

[('Xu H', 57),
 ('Friedman C', 55),
 ('Denny JC', 51),
 ('Liu H', 48),
 ('Chute CG', 41),
 ('Hripcsak G', 36),
 ('Chapman WW', 35),
 ('Melton GB', 26),
 ('Fiszman M', 24),
 ('Savova GK', 23),
 ('Sohn S', 22),
 ('Haug PJ', 21),
 ('Wang Y', 21),
 ('Johnson SB', 21),
 ('Zhou L', 20),
 ('South BR', 19),
 ('Elhadad N', 18),
 ('Solti I', 18),
 ('Rindflesch TC', 17),
 ('Pakhomov S', 17),
 ('Shen S', 17),
 ('Cimino JJ', 16),
 ('Jiang M', 16),
 ('Meystre SM', 16),
 ('Lingren T', 15),
 ('Demner-Fushman D', 15),
 ('Chen ES', 15),
 ('Zweigenbaum P', 15),
 ('Elkin PL', 15),
 ('Uzuner O', 15)]

In [13]:
authorFrame = pd.DataFrame()
authorFrame['Author'] = [x[0] for x in author30]
authorFrame['Count'] = [int(x[1]) for x in author30]
authorFrame

Unnamed: 0,Author,Count
0,Xu H,57
1,Friedman C,55
2,Denny JC,51
3,Liu H,48
4,Chute CG,41
5,Hripcsak G,36
6,Chapman WW,35
7,Melton GB,26
8,Fiszman M,24
9,Savova GK,23


In [17]:
# Plot bar chart
p = Bar(authorFrame, 'Author', values='Count', title="Number of Papers Published by Authors")
show(p)

In [19]:
# Without much data cleaning, it seems there are 276 different journals
print('Number of Journals:', len(set(d['journal'])))

Number of Journals: 278


In [20]:
# Pick journals by its number of appearance
jour = defaultdict(int)
for x in d['journal']:
    jour[x] += 1
sorted(jour.items(), key=lambda x: x[1], reverse=True)[:10]

[('Studies in health technology and informatics', 302),
 ('AMIA ... Annual Symposium proceedings. AMIA Symposium', 263),
 ('Journal of the American Medical Informatics Association : JAMIA', 201),
 ('Journal of biomedical informatics', 169),
 ('Methods of information in medicine', 41),
 ('International journal of medical informatics', 38),
 ('BMC medical informatics and decision making', 34),
 ('Proceedings. AMIA Symposium', 25),
 ('AMIA Joint Summits on Translational Science proceedings. AMIA Joint Summits on Translational Science',
  21),
 ('BMC bioinformatics', 21)]