# 1. Data Selection and Cleaning

In [1]:
# Import Python Packages required for analysis
import pandas as pd
import numpy as np
import plotly.express as px
import warnings 
warnings.filterwarnings('ignore')

In [1]:
1+2+3+6

12

In [2]:
#Download the file https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases/cord-19_2020-07-21.tar.gz
# and point to the path here
data = 'data/covid-19-literature/7-jul/2020-07-21'

In [3]:
# Load CSV file containing metadata
df_csv = pd.read_csv(data+'/metadata.csv')

In [4]:
df_csv.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472600.0,no-cc,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,,,,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,PMC59543,11668000.0,no-cc,Inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,,,,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
2,ejv2xln0,06ced00a5fc04215949aa72528f2eeaae1d58927,PMC,Surfactant protein-D and pulmonary host defense,10.1186/rr19,PMC59549,11668000.0,no-cc,Surfactant protein-D (SP-D) participates in th...,2000-08-25,"Crouch, Erika C",Respir Res,,,,document_parses/pdf_json/06ced00a5fc04215949aa...,document_parses/pmc_json/PMC59549.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
3,2b73a28n,348055649b6b8cf2b9a376498df9bf41f7123605,PMC,Role of endothelin-1 in lung disease,10.1186/rr44,PMC59574,11686900.0,no-cc,Endothelin-1 (ET-1) is a 21 amino acid peptide...,2001-02-22,"Fagan, Karen A; McMurtry, Ivan F; Rodman, David M",Respir Res,,,,document_parses/pdf_json/348055649b6b8cf2b9a37...,document_parses/pmc_json/PMC59574.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
4,9785vg6d,5f48792a5fa08bed9f56016f4981ae2ca6031b32,PMC,Gene expression in epithelial cells in respons...,10.1186/rr61,PMC59580,11686900.0,no-cc,Respiratory syncytial virus (RSV) and pneumoni...,2001-05-11,"Domachowske, Joseph B; Bonville, Cynthia A; Ro...",Respir Res,,,,document_parses/pdf_json/5f48792a5fa08bed9f560...,document_parses/pmc_json/PMC59580.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,


In [5]:
# Number of entries in the metadata file are 1,96,630
len(df_csv)

196630

In [6]:
# Select relevant columns that are needed for analysis 
df_csv = df_csv[['cord_uid','title','doi','abstract','publish_time','authors','journal','doi','pmcid','pubmed_id']]
df_csv.head()

Unnamed: 0,cord_uid,title,doi,abstract,publish_time,authors,journal,doi.1,pmcid,pubmed_id
0,ug7v899j,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,10.1186/1471-2334-1-6,PMC35282,11472600.0
1,02tnwd4m,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,Inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,10.1186/rr14,PMC59543,11668000.0
2,ejv2xln0,Surfactant protein-D and pulmonary host defense,10.1186/rr19,Surfactant protein-D (SP-D) participates in th...,2000-08-25,"Crouch, Erika C",Respir Res,10.1186/rr19,PMC59549,11668000.0
3,2b73a28n,Role of endothelin-1 in lung disease,10.1186/rr44,Endothelin-1 (ET-1) is a 21 amino acid peptide...,2001-02-22,"Fagan, Karen A; McMurtry, Ivan F; Rodman, David M",Respir Res,10.1186/rr44,PMC59574,11686900.0
4,9785vg6d,Gene expression in epithelial cells in respons...,10.1186/rr61,Respiratory syncytial virus (RSV) and pneumoni...,2001-05-11,"Domachowske, Joseph B; Bonville, Cynthia A; Ro...",Respir Res,10.1186/rr61,PMC59580,11686900.0


In [7]:
# Save the selected data in a new file
df_csv.to_csv(data+'/abstracts.csv',index=None)

In [8]:
# Read the selected data to see if it is saved properly 
df_csv = pd.read_csv(data+'/abstracts.csv')

In [9]:
# Removing articles without abstracts
print(len(df_csv))
df_csv = df_csv[~df_csv['abstract'].isnull()]
print(len(df_csv))

196630
140330


In [10]:
# Remvoning the section keywords from the abstracts
df_csv['abstract'] = df_csv['abstract'].apply(lambda x: 
                                          x.replace('BACKGROUND:','').replace('BACKGROUNDS:','').replace('OBJECTIVES:','')
                                          .replace('OBJECTIVE:','').replace('METHODS:','').replace('METHOD:','')
                                          .replace('RESULTS:','').replace('RESULT:','')
                                          .replace('CONCLUSION:','').replace('CONCLUSIONS:',''))

In [11]:
# Converting abstract to lower case
df_csv['abstract'] = df_csv['abstract'].apply(lambda x: x.lower())
# This removes the lines that contains text "this article is protected by copyright. all rights reserved"
df_csv['abstract'] = df_csv['abstract'].apply(lambda x: x.replace('this article is protected by copyright. all rights reserved',''))

In [12]:
# Saving the file
df_csv.to_csv(data+'/abstract_final.csv', index=None)

In [13]:
# Reading back to see if it is saved correctly
df = pd.read_csv(data+'/abstract_final.csv')
df.head()

Unnamed: 0,cord_uid,title,doi,abstract,publish_time,authors,journal,doi.1,pmcid,pubmed_id
0,ug7v899j,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,this retrospective chart review describes the...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,10.1186/1471-2334-1-6,PMC35282,11472600.0
1,02tnwd4m,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,10.1186/rr14,PMC59543,11668000.0
2,ejv2xln0,Surfactant protein-D and pulmonary host defense,10.1186/rr19,surfactant protein-d (sp-d) participates in th...,2000-08-25,"Crouch, Erika C",Respir Res,10.1186/rr19,PMC59549,11668000.0
3,2b73a28n,Role of endothelin-1 in lung disease,10.1186/rr44,endothelin-1 (et-1) is a 21 amino acid peptide...,2001-02-22,"Fagan, Karen A; McMurtry, Ivan F; Rodman, David M",Respir Res,10.1186/rr44,PMC59574,11686900.0
4,9785vg6d,Gene expression in epithelial cells in respons...,10.1186/rr61,respiratory syncytial virus (rsv) and pneumoni...,2001-05-11,"Domachowske, Joseph B; Bonville, Cynthia A; Ro...",Respir Res,10.1186/rr61,PMC59580,11686900.0


In [14]:
# Converting the timestamp string to date format, which Python can process
df['publish_time_new'] = pd.to_datetime(df['publish_time'], format='%Y-%m-%d')

In [15]:
# Removing articles that are published before 1 Jan 2020. The remaining articles are 40665
import datetime
df= df[df['publish_time_new']>'2020-01-01']
len(df)

40665

In [16]:
# Detecting and removing articles with abstracts written in other than English
# The left-over articles are 40078
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0
def langdet (x):
    try:
        return detect(x)
    except:
        return "NA"
df['lang'] = df['abstract'].apply(lambda x: langdet(x))
df = df[df['lang'].str.contains('en')]
df.to_csv(data+'/articles_clean_text_eng.csv', index=None)
len(df)

40078

In [17]:
# This code cleans the data for further analysis
# It removes wired characters, and punctuations
# It also lemmatize and finds word stems
# Finally it converts the clean abstract to individual words (tokens) and finds, unigrams, bigrams and trigrams
import re
import nltk
import string
from textblob import TextBlob
# nltk.download('stopwords')
# nltk.download('wordnet')
stopword = nltk.corpus.stopwords.words('english')
my_file = open("stopwords.txt", "r")
content = my_file.read().split('\n')
stopword.extend(content)
stopword = list(set(stopword))
stopword = [w.strip() for w in stopword]
stopword = set(stopword)
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()
from nltk import bigrams, trigrams

def removeWeirdChars(text):
    weridPatterns = re.compile("["u"\U0001F600-\U0001F64F"u"\U0001F300-\U0001F5FF"u"\U0001F680-\U0001F6FF"u"\U0001F1E0-\U0001F1FF"u"\U00002702-\U000027B0"u"\U000024C2-\U0001F251"u"\U0001f926-\U0001f937"u'\U00010000-\U0010ffff'
                               u"\u200d"u"\u2640-\u2642"u"\u2600-\u2B55"u"\u23cf"u"\u23e9"u"\u231a"u"\u3030"u"\ufe0f"u"\u2069"u"\u2066"u"\u200c"u"\u2068"u"\u2067""]+", flags=re.UNICODE)
    return weridPatterns.sub(r'', text)
def remove_punct(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    return text
def tokenization(text):
    text = text.split()#re.split('\W+', text)
    text = ','.join(set(text))
    return text
def remove_stopwords(text):
    text = [word.strip() for word in text.split() if word not in stopword]
    text = ' '.join(text)
    return text
def stemming(text):
    text = [ps.stem(word) for word in text.split()]
    text = ' '.join(text)
    return text

def lemmatizer(text):
    text = [wn.lemmatize(word) for word in text.split()]
    text = ' '.join(text)
    return text
def clean_text(text):
    text_lc = " ".join([word.lower() for word in text.split() if word not in string.punctuation]) # remove puntuation
    text_rc = re.sub('[0-9]+', '', text_lc)
    tokens = re.split('\W+', text_rc)    # tokenization
    text = [word for word in tokens if word not in stopword]  # remove stopwords and stemming
    text = ' '.join(text)
    return text


df['title'] = df['title'].apply(lambda x: x.lower())
df['clean_text'] = df['abstract'].apply(lambda x: clean_text(x))
df['unigram'] = df['clean_text'].apply(lambda x: tokenization(x))
df['bigram']  = df['unigram'].apply(lambda x: ','.join([st[0].strip()+" "+st[1].strip() for st in list(bigrams(x.split(',')))]))
df['trigram'] = df['unigram'].apply(lambda x: ','.join([st[0].strip()+" "+st[1].strip()+" "+st[2].strip() for st in list(trigrams(x.split(',')))]))

In [19]:
# Saving the clean and tokenized abstracts
df.to_csv(data+'/articles_clean_text_eng.csv', index=None)
len(df)

40078

In [20]:
# Through our manual checking, we found that there are still many non-relevant articles that exist in the dataset
# This code selects articles that are relevant to COVID-19 only
# The search terms are coronavirus, covid, 2019-ncov, sars-cov
# The search is applied to both title and abstract with OR condition
# There are 29714 articles with COVID-19 related terms
df = df[((df['abstract'].str.contains('coronavirus|covid|2019-ncov|sars-cov'))|
         (df['title'].str.contains('coronavirus|covid|2019-ncov|sars-cov')))]
len(df)

29714

In [21]:
# Saving the data
df.to_csv(data+'/Selected_articles_clean_text_eng.csv', index=None)
len(df)

29714

In [22]:
# Reading back the data
df = pd.read_csv(data+'/Selected_articles_clean_text_eng.csv')

In [23]:
# Removing Duplicates based on same title
has_dup = df.duplicated(subset ="title", keep=False)

In [24]:
# Sparating duplicate titles
print(len(df))
dup = df[has_dup]
df = df[~has_dup]
len(dup),len(df)

29714


(1124, 28590)

In [25]:
dup = dup.fillna('-999')

In [26]:
# Fill articles metdata missing fields with -999 to be searchable
dup1 = dup[~((dup['journal'].str.contains('-999'))|((dup['pmcid'].str.contains('-999')))
               |((dup['pubmed_id'].str.contains('-999')))|((dup['doi'].str.contains('-999'))))]
len(dup1), len(dup)

(351, 1124)

In [27]:
# The duplicate articles have 314 unique articles
dup1 = dup1.drop_duplicates(subset ="title", keep='first')
len(dup1)

314

In [28]:
# Combining the unique articles from duplicate entries and non-duplicate articles
# Total articles are now 28904, which forms our study data
df1 = pd.concat([df,dup1])
len(df1),len(df),len(dup1)

(28904, 28590, 314)

In [29]:
df1

Unnamed: 0,cord_uid,title,doi,abstract,publish_time,authors,journal,doi.1,pmcid,pubmed_id,publish_time_new,lang,clean_text,unigram,bigram,trigram
0,2jq626ye,therapeutic strategies in an outbreak scenario...,10.12688/f1000research.22211.2,a novel coronavirus (2019-ncov) originating in...,2020-02-07,"Kruse, Robert L.",F1000Res,10.12688/f1000research.22211.2,PMC7029759,32117569.0,2020-02-07,en,coronavirus ncov originating wuhan china prese...,"patients,genome,speed,breath,neutralizing,effo...","patients genome,genome speed,speed breath,brea...","patients genome speed,genome speed breath,spee..."
1,270msv5l,baricitinib - a januase kinase inhibitor - not...,10.1016/j.ijantimicag.2020.105967,• several studies suggested baricitinib as a p...,2020-04-04,"Praveen, D.; Chowdary, Puvvada Ranadheer; Aana...",Int J Antimicrob Agents,10.1016/j.ijantimicag.2020.105967,PMC7128600,32259575.0,2020-04-04,en,studies suggested baricitinib potential drug ...,"act,patients,epidemiological,neutropenia,close...","act patients,patients epidemiological,epidemio...","act patients epidemiological,patients epidemio..."
3,542e5h1u,coronavirus disease: challenges for psychiatry,10.1192/bjp.2020.86,coronavirus disease (covid-19) presents two ur...,2020-04-15,"Kelly, Brendan D.",,10.1192/bjp.2020.86,PMC7205546,32293555.0,2020-04-15,en,coronavirus disease covid presents urgent heal...,"patients,good,illness,coronavirus,present,fami...","patients good,good illness,illness coronavirus...","patients good illness,good illness coronavirus..."
4,72s8wpla,practical considerations when performing neuro...,10.1080/21646821.2020.1756132,"the coronavirus disease 2019, sars-cov-2 (the ...",2020-05-06,"Haines, Seline; Caccamo, Amy; Chan, Fonda; Gal...",Neurodiagn J,10.1080/21646821.2020.1756132,PMC7212538,31967948.0,2020-05-06,en,coronavirus disease sars cov covid led worldwi...,"patients,droplet,procedure,analysis,procedures...","patients droplet,droplet procedure,procedure a...","patients droplet procedure,droplet procedure a..."
5,yv4x8viu,extracorporeal life support organization coron...,10.1097/mat.0000000000001193,the extracorporeal life support organization (...,2020-05-12,"Shekar, Kiran; Badulak, Jenelle; Peek, Giles; ...",ASAIO J,10.1097/mat.0000000000001193,PMC7228451,32358233.0,2020-05-12,en,extracorporeal life support organization elso ...,"context,regularly,practice,developed,oxygenati...","context regularly,regularly practice,practice ...","context regularly practice,regularly practice ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29436,24e53f13,role of a habitat's air humidity in covid-19 m...,10.1016/j.scitotenv.2020.138763,abstract transient local over-dry environment ...,2020-05-22,"Biktasheva, Irina V.",Sci Total Environ,10.1016/j.scitotenv.2020.138763,PMC7242208,32492610,2020-05-22,en,abstract transient local dry environment contr...,"negatively,transient,propose,dry,data,humidity...","negatively transient,transient propose,propose...","negatively transient propose,transient propose..."
29462,jp0q6qyq,a unifying structural and functional model of ...,10.1371/journal.pbio.3000715,"zoonotic coronavirus (cov) infections, such as...",2020-06-08,"Snijder, Eric J.; Limpens, Ronald W. A. L.; de...",PLoS Biol,10.1371/journal.pbio.3000715,PMC7302735,32511245,2020-06-08,en,zoonotic coronavirus cov infections responsibl...,"transformed,remains,establish,assortment,specu...","transformed remains,remains establish,establis...","transformed remains establish,remains establis..."
29557,uiq70lz4,core outcome set for clinical trials of covid-...,10.3389/fphar.2020.00781,development of a core outcome set (cos) for c...,2020-05-25,"Qiu, Ruijin; Zhao, Chen; Liang, Tengxiao; Hao,...",Front Pharmacol,10.3389/fphar.2020.00781,PMC7265660,32574235,2020-05-25,en,development core outcome set cos clinical tria...,"patients,recovery,review,eligible,imaging,invi...","patients recovery,recovery review,review eligi...","patients recovery review,recovery review eligi..."
29626,l1269llu,seroprevalence of antibodies against sars-cov-...,10.1038/s41467-020-17318-x,health care workers (hcw) are a high-risk popu...,2020-07-08,"Garcia-Basteiro, Alberto L.; Moncunill, Gemma;...",Nat Commun,10.1038/s41467-020-17318-x,PMC7343863,32641730,2020-07-08,en,health care workers hcw high risk population a...,"covid,patients,high,population,participants,sa...","covid patients,patients high,high population,p...","covid patients high,patients high population,h..."


In [31]:
# Storing the clean finalized data into a separate file
df1 = df1.reset_index()
df1 = df1.drop(['doi.1'], axis=1)
df1.to_csv(data+'/Selected_articles_clean_text_eng_duplicate_removed.csv', index=None)

In [None]:
# End of Data Selection and Cleaning 

# 2. Data Analysis (topic clustering & data extraction)

In [46]:
#Import Python Packages
import pandas as pd
import numpy as np
import plotly.express as px
import warnings 
warnings.filterwarnings('ignore')

In [47]:
# Path to data
data = 'data/covid-19-literature/7-jul/2020-07-21'

In [48]:
# Loading cleaned and finalized data
df = pd.read_csv(data+'/Selected_articles_clean_text_eng_duplicate_removed.csv').fillna('') 

In [49]:
len(df)

28904

In [36]:
df.head()

Unnamed: 0,index,cord_uid,title,doi,abstract,publish_time,authors,journal,pmcid,pubmed_id,publish_time_new,lang,clean_text,unigram,bigram,trigram
0,0,2jq626ye,therapeutic strategies in an outbreak scenario...,10.12688/f1000research.22211.2,a novel coronavirus (2019-ncov) originating in...,2020-02-07,"Kruse, Robert L.",F1000Res,PMC7029759,32117569.0,2020-02-07,en,coronavirus ncov originating wuhan china prese...,"patients,genome,speed,breath,neutralizing,effo...","patients genome,genome speed,speed breath,brea...","patients genome speed,genome speed breath,spee..."
1,1,270msv5l,baricitinib - a januase kinase inhibitor - not...,10.1016/j.ijantimicag.2020.105967,• several studies suggested baricitinib as a p...,2020-04-04,"Praveen, D.; Chowdary, Puvvada Ranadheer; Aana...",Int J Antimicrob Agents,PMC7128600,32259575.0,2020-04-04,en,studies suggested baricitinib potential drug ...,"act,patients,epidemiological,neutropenia,close...","act patients,patients epidemiological,epidemio...","act patients epidemiological,patients epidemio..."
2,3,542e5h1u,coronavirus disease: challenges for psychiatry,10.1192/bjp.2020.86,coronavirus disease (covid-19) presents two ur...,2020-04-15,"Kelly, Brendan D.",,PMC7205546,32293555.0,2020-04-15,en,coronavirus disease covid presents urgent heal...,"patients,good,illness,coronavirus,present,fami...","patients good,good illness,illness coronavirus...","patients good illness,good illness coronavirus..."
3,4,72s8wpla,practical considerations when performing neuro...,10.1080/21646821.2020.1756132,"the coronavirus disease 2019, sars-cov-2 (the ...",2020-05-06,"Haines, Seline; Caccamo, Amy; Chan, Fonda; Gal...",Neurodiagn J,PMC7212538,31967948.0,2020-05-06,en,coronavirus disease sars cov covid led worldwi...,"patients,droplet,procedure,analysis,procedures...","patients droplet,droplet procedure,procedure a...","patients droplet procedure,droplet procedure a..."
4,5,yv4x8viu,extracorporeal life support organization coron...,10.1097/mat.0000000000001193,the extracorporeal life support organization (...,2020-05-12,"Shekar, Kiran; Badulak, Jenelle; Peek, Giles; ...",ASAIO J,PMC7228451,32358233.0,2020-05-12,en,extracorporeal life support organization elso ...,"context,regularly,practice,developed,oxygenati...","context regularly,regularly practice,practice ...","context regularly practice,regularly practice ..."


In [37]:
# Preparing abstract for clustering using TfidfVectorizer
# Parameters are set as follows max_df=0.90 and min_df=10
# It also performed steming if some words are missed out previously
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re
# nltk.download('punkt')
# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=10, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,2))


%time tfidf_matrix = tfidf_vectorizer.fit_transform(df['clean_text'].tolist()) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

CPU times: user 51.4 s, sys: 268 ms, total: 51.6 s
Wall time: 51.7 s
(28904, 42061)


In [None]:
# This code finds the possible number of clusters in the data using Elbow method
# The code produce 26 clusters
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from yellowbrick.cluster import KElbowVisualizer
from sklearn.decomposition import TruncatedSVD

pca = TruncatedSVD(n_components=200)
X = pca.fit_transform(tfidf_matrix)


# Instantiate the clustering model and visualizer
model = KMeans()
visualizer = KElbowVisualizer(model, k=(10,30))

visualizer.fit(X)        # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure

In [38]:
# We used 26 as input to KMean clustering as suggested by Elbow method that there are 26 clusters.
X = tfidf_matrix
from sklearn.cluster import KMeans
km = KMeans(n_clusters = 26, init = 'k-means++', random_state = 0)
km.fit(X)
predict = km.predict(X)

In [55]:
# Creating copies of cluster centroids
import copy
cent1, cent2= copy.deepcopy(km.cluster_centers_),copy.deepcopy(km.cluster_centers_)

In [69]:
# Preparing to delete the following clusters from the cluster assignments
# 5, 18, 19, 22, 23
cent= np.delete(cent1, [5, 18, 19, 22, 23], 0)
len(cent), len(cent1)

(21, 26)

In [70]:
# Modifying the output of KMean clustering to remove irrelevant clusters
# The remaning clusters are 21
km.cluster_centers_ = cent
km.n_clusters = 21

In [124]:
# Assigning the abstract to the new clusters
predict = km.predict(X)

In [125]:
df = pd.read_csv(data+'/Selected_articles_clean_text_eng_duplicate_removed_clustered.csv').fillna('') 

In [126]:
len(df),len(predict)

(28904, 28904)

In [127]:
# Assigning cluster numbers to abstracts 
df['cluster'] = pd.Series(predict, index = df.index)
df.head()

Unnamed: 0,index,cord_uid,title,doi,abstract,publish_time,authors,journal,pmcid,pubmed_id,publish_time_new,lang,clean_text,unigram,bigram,trigram,cluster
0,0,2jq626ye,therapeutic strategies in an outbreak scenario...,10.12688/f1000research.22211.2,a novel coronavirus (2019-ncov) originating in...,2020-02-07,"Kruse, Robert L.",F1000Res,PMC7029759,32117569.0,2020-02-07,en,coronavirus ncov originating wuhan china prese...,"patients,genome,speed,breath,neutralizing,effo...","patients genome,genome speed,speed breath,brea...","patients genome speed,genome speed breath,spee...",3
1,1,270msv5l,baricitinib - a januase kinase inhibitor - not...,10.1016/j.ijantimicag.2020.105967,• several studies suggested baricitinib as a p...,2020-04-04,"Praveen, D.; Chowdary, Puvvada Ranadheer; Aana...",Int J Antimicrob Agents,PMC7128600,32259575.0,2020-04-04,en,studies suggested baricitinib potential drug ...,"act,patients,epidemiological,neutropenia,close...","act patients,patients epidemiological,epidemio...","act patients epidemiological,patients epidemio...",18
2,3,542e5h1u,coronavirus disease: challenges for psychiatry,10.1192/bjp.2020.86,coronavirus disease (covid-19) presents two ur...,2020-04-15,"Kelly, Brendan D.",,PMC7205546,32293555.0,2020-04-15,en,coronavirus disease covid presents urgent heal...,"patients,good,illness,coronavirus,present,fami...","patients good,good illness,illness coronavirus...","patients good illness,good illness coronavirus...",17
3,4,72s8wpla,practical considerations when performing neuro...,10.1080/21646821.2020.1756132,"the coronavirus disease 2019, sars-cov-2 (the ...",2020-05-06,"Haines, Seline; Caccamo, Amy; Chan, Fonda; Gal...",Neurodiagn J,PMC7212538,31967948.0,2020-05-06,en,coronavirus disease sars cov covid led worldwi...,"patients,droplet,procedure,analysis,procedures...","patients droplet,droplet procedure,procedure a...","patients droplet procedure,droplet procedure a...",12
4,5,yv4x8viu,extracorporeal life support organization coron...,10.1097/mat.0000000000001193,the extracorporeal life support organization (...,2020-05-12,"Shekar, Kiran; Badulak, Jenelle; Peek, Giles; ...",ASAIO J,PMC7228451,32358233.0,2020-05-12,en,extracorporeal life support organization elso ...,"context,regularly,practice,developed,oxygenati...","context regularly,regularly practice,practice ...","context regularly practice,regularly practice ...",12


In [128]:
# Abstracts or artciles per cluster
df['cluster'].value_counts()

0     5393
12    5118
15    3313
18    1837
2     1823
4     1602
7     1141
9     1098
17     915
1      874
10     868
16     816
20     747
19     706
3      584
8      441
11     411
6      350
13     336
14     312
5      219
Name: cluster, dtype: int64

In [130]:
# Combining Cluster 7 and 2 to make one cluster
def recluster(x):
    if x==7:
        return 2
    return x
df['cluster'] = df['cluster'].apply(lambda x: recluster(x))

In [131]:
# Re numbering the cluster to be from 0 to 19
def renumber(x):
    if x>7:
        return x-1
    return x
df['cluster'] = df['cluster'].apply(lambda x: renumber(x))

In [132]:
df['cluster'].value_counts()

0     5393
11    5118
14    3313
2     2964
17    1837
4     1602
8     1098
16     915
1      874
9      868
15     816
19     747
18     706
3      584
7      441
10     411
6      350
12     336
13     312
5      219
Name: cluster, dtype: int64

In [133]:
# Stoing the clutering results
df.to_csv(data+'/Selected_articles_clean_text_eng_duplicate_removed_clustered.csv',index=None)

In [134]:
df

Unnamed: 0,index,cord_uid,title,doi,abstract,publish_time,authors,journal,pmcid,pubmed_id,publish_time_new,lang,clean_text,unigram,bigram,trigram,cluster
0,0,2jq626ye,therapeutic strategies in an outbreak scenario...,10.12688/f1000research.22211.2,a novel coronavirus (2019-ncov) originating in...,2020-02-07,"Kruse, Robert L.",F1000Res,PMC7029759,32117569.0,2020-02-07,en,coronavirus ncov originating wuhan china prese...,"patients,genome,speed,breath,neutralizing,effo...","patients genome,genome speed,speed breath,brea...","patients genome speed,genome speed breath,spee...",3
1,1,270msv5l,baricitinib - a januase kinase inhibitor - not...,10.1016/j.ijantimicag.2020.105967,• several studies suggested baricitinib as a p...,2020-04-04,"Praveen, D.; Chowdary, Puvvada Ranadheer; Aana...",Int J Antimicrob Agents,PMC7128600,32259575.0,2020-04-04,en,studies suggested baricitinib potential drug ...,"act,patients,epidemiological,neutropenia,close...","act patients,patients epidemiological,epidemio...","act patients epidemiological,patients epidemio...",17
2,3,542e5h1u,coronavirus disease: challenges for psychiatry,10.1192/bjp.2020.86,coronavirus disease (covid-19) presents two ur...,2020-04-15,"Kelly, Brendan D.",,PMC7205546,32293555.0,2020-04-15,en,coronavirus disease covid presents urgent heal...,"patients,good,illness,coronavirus,present,fami...","patients good,good illness,illness coronavirus...","patients good illness,good illness coronavirus...",16
3,4,72s8wpla,practical considerations when performing neuro...,10.1080/21646821.2020.1756132,"the coronavirus disease 2019, sars-cov-2 (the ...",2020-05-06,"Haines, Seline; Caccamo, Amy; Chan, Fonda; Gal...",Neurodiagn J,PMC7212538,31967948.0,2020-05-06,en,coronavirus disease sars cov covid led worldwi...,"patients,droplet,procedure,analysis,procedures...","patients droplet,droplet procedure,procedure a...","patients droplet procedure,droplet procedure a...",11
4,5,yv4x8viu,extracorporeal life support organization coron...,10.1097/mat.0000000000001193,the extracorporeal life support organization (...,2020-05-12,"Shekar, Kiran; Badulak, Jenelle; Peek, Giles; ...",ASAIO J,PMC7228451,32358233.0,2020-05-12,en,extracorporeal life support organization elso ...,"context,regularly,practice,developed,oxygenati...","context regularly,regularly practice,practice ...","context regularly practice,regularly practice ...",11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28899,29436,24e53f13,role of a habitat's air humidity in covid-19 m...,10.1016/j.scitotenv.2020.138763,abstract transient local over-dry environment ...,2020-05-22,"Biktasheva, Irina V.",Sci Total Environ,PMC7242208,32492610,2020-05-22,en,abstract transient local dry environment contr...,"negatively,transient,propose,dry,data,humidity...","negatively transient,transient propose,propose...","negatively transient propose,transient propose...",0
28900,29462,jp0q6qyq,a unifying structural and functional model of ...,10.1371/journal.pbio.3000715,"zoonotic coronavirus (cov) infections, such as...",2020-06-08,"Snijder, Eric J.; Limpens, Ronald W. A. L.; de...",PLoS Biol,PMC7302735,32511245,2020-06-08,en,zoonotic coronavirus cov infections responsibl...,"transformed,remains,establish,assortment,specu...","transformed remains,remains establish,establis...","transformed remains establish,remains establis...",19
28901,29557,uiq70lz4,core outcome set for clinical trials of covid-...,10.3389/fphar.2020.00781,development of a core outcome set (cos) for c...,2020-05-25,"Qiu, Ruijin; Zhao, Chen; Liang, Tengxiao; Hao,...",Front Pharmacol,PMC7265660,32574235,2020-05-25,en,development core outcome set cos clinical tria...,"patients,recovery,review,eligible,imaging,invi...","patients recovery,recovery review,review eligi...","patients recovery review,recovery review eligi...",8
28902,29626,l1269llu,seroprevalence of antibodies against sars-cov-...,10.1038/s41467-020-17318-x,health care workers (hcw) are a high-risk popu...,2020-07-08,"Garcia-Basteiro, Alberto L.; Moncunill, Gemma;...",Nat Commun,PMC7343863,32641730,2020-07-08,en,health care workers hcw high risk population a...,"covid,patients,high,population,participants,sa...","covid patients,patients high,high population,p...","covid patients high,patients high population,h...",10


In [135]:
# This code extracts top unigram, bigram and trigrams from each cluster 
# to be eveluated for labeling the clusters manually by experts
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import math
clusters = []
for i in range(20):
#     unigrames
    print('Topic '+str(i)+' :')
    terms = df[df['cluster']==i]['unigram'].tolist()
    stats = 'Topic '+str(i)+' stats: '+ str(round(len(terms)/len(df)*100,2))+"% ("+str(len(terms))+"/"+str(len(df))+")"
    print(stats)
    print(i)
    cnt = Counter([x.strip() for st in terms for x in st.split(',')])
    del cnt['']
    counter = cnt.most_common(100)
    uni = ', '.join([val[0] for val in counter])
    print('Topic'+str(i)+' top unigrams : '+uni)
    print('\n')
    
    #     bigrames
    terms = df[df['cluster']==i]['bigram'].tolist()
    cnt = Counter([x.strip() for st in terms for x in st.split(',')])
    del cnt['']
    counter = cnt.most_common(100)
    bi = ', '.join([val[0] for val in counter])
    print('Topic'+str(i)+' top bigrams : '+bi)
    print('\n')
    
    #     bigrames
    terms = df[df['cluster']==i]['trigram'].tolist()
    cnt = Counter([x.strip() for st in terms for x in st.split(',')])
    del cnt['']
    counter = cnt.most_common(100)
    tri = ', '.join([val[0] for val in counter])
    print('Topic'+str(i)+' top trigrams : '+tri)
    print('\n')
    clusters.append([stats,uni,bi,tri])
    
    print('\n\n\n\n\n')

Topic 0 :
Topic 0 stats: 18.66% (5393/28904)
0
Topic0 top unigrams : covid, pandemic, health, coronavirus, disease, public, data, world, spread, social, study, global, outbreak, countries, cases, measures, virus, sars, results, people, based, time, cov, china, research, related, risk, infection, impact, analysis, response, information, current, control, including, crisis, population, high, march, number, respiratory, severe, reported, paper, due, potential, community, economic, epidemic, future, transmission, significant, important, provide, article, system, human, emergency, methods, medical, case, level, care, caused, government, prevention, strategies, factors, effective, national, early, systems, online, clinical, evidence, international, deaths, present, acute, review, policy, challenges, wuhan, findings, effects, worldwide, states, higher, affected, role, confirmed, organization, work, support, syndrome, media, increased, abstract, country, studies


Topic0 top bigrams : crisis c

In [136]:
# Creating separate file for experts manual labeling of clusters
cluster_results = pd.DataFrame(clusters, columns=['Stats','Top100Unigrams','Top100Bigrams','Top100Trigrams'])
cluster_results

Unnamed: 0,Stats,Top100Unigrams,Top100Bigrams,Top100Trigrams
0,Topic 0 stats: 18.66% (5393/28904),"covid, pandemic, health, coronavirus, disease,...","crisis covid, health covid, coronavirus sars, ...","health crisis covid, clinical coronavirus sars..."
1,Topic 1 stats: 3.02% (874/28904),"covid, ct, patients, chest, disease, coronavir...","clinical coronavirus, ct covid, cases diagnosi...","clinical coronavirus conclusions, initial clin..."
2,Topic 2 stats: 10.25% (2964/28904),"covid, model, data, cases, number, epidemic, d...","study time, pandemic number, confirmed cases, ...","transmission confirmed cases, based study time..."
3,Topic 3 stats: 2.02% (584/28904),"ace, sars, cov, covid, angiotensin, enzyme, co...","clinical coronavirus, coronavirus sars, cov po...","cov potential disease, infection observed ace,..."
4,Topic 4 stats: 5.54% (1602/28904),"cov, sars, covid, pcr, positive, coronavirus, ...","diagnosis cov, clinical coronavirus, coronavir...","clinical coronavirus sars, cases diagnosis cov..."
5,Topic 5 stats: 0.76% (219/28904),"covid, transplant, recipients, patients, disea...","clinical coronavirus, coronavirus sars, transp...","clinical coronavirus sars, coronavirus sars pr..."
6,Topic 6 stats: 1.21% (350/28904),"covid, masks, mask, pandemic, protective, equi...","masks mask, methods health, covid significantl...","masks mask supply, methods health protective, ..."
7,Topic 7 stats: 1.53% (441/28904),"cancer, covid, patients, pandemic, treatment, ...","risk cancer, patient treatment, clinical coron...","risk breast cancer, risk date cancer, syndrome..."
8,Topic 8 stats: 3.8% (1098/28904),"covid, clinical, treatment, patients, trials, ...","clinical coronavirus, covid trials, cov potent...","clinical coronavirus sars, promising clinical ..."
9,Topic 9 stats: 3.0% (868/28904),"covid, social, distancing, pandemic, measures,...","distancing transmission, pandemic number, covi...","distancing transmission confirmed, distancing ..."


In [137]:
# Re-seting index
cluster_results['ClusterNumber'] = cluster_results.reset_index().index
cluster_results= cluster_results[['ClusterNumber','Stats','Top100Unigrams','Top100Bigrams','Top100Trigrams']]
cluster_results

Unnamed: 0,ClusterNumber,Stats,Top100Unigrams,Top100Bigrams,Top100Trigrams
0,0,Topic 0 stats: 18.66% (5393/28904),"covid, pandemic, health, coronavirus, disease,...","crisis covid, health covid, coronavirus sars, ...","health crisis covid, clinical coronavirus sars..."
1,1,Topic 1 stats: 3.02% (874/28904),"covid, ct, patients, chest, disease, coronavir...","clinical coronavirus, ct covid, cases diagnosi...","clinical coronavirus conclusions, initial clin..."
2,2,Topic 2 stats: 10.25% (2964/28904),"covid, model, data, cases, number, epidemic, d...","study time, pandemic number, confirmed cases, ...","transmission confirmed cases, based study time..."
3,3,Topic 3 stats: 2.02% (584/28904),"ace, sars, cov, covid, angiotensin, enzyme, co...","clinical coronavirus, coronavirus sars, cov po...","cov potential disease, infection observed ace,..."
4,4,Topic 4 stats: 5.54% (1602/28904),"cov, sars, covid, pcr, positive, coronavirus, ...","diagnosis cov, clinical coronavirus, coronavir...","clinical coronavirus sars, cases diagnosis cov..."
5,5,Topic 5 stats: 0.76% (219/28904),"covid, transplant, recipients, patients, disea...","clinical coronavirus, coronavirus sars, transp...","clinical coronavirus sars, coronavirus sars pr..."
6,6,Topic 6 stats: 1.21% (350/28904),"covid, masks, mask, pandemic, protective, equi...","masks mask, methods health, covid significantl...","masks mask supply, methods health protective, ..."
7,7,Topic 7 stats: 1.53% (441/28904),"cancer, covid, patients, pandemic, treatment, ...","risk cancer, patient treatment, clinical coron...","risk breast cancer, risk date cancer, syndrome..."
8,8,Topic 8 stats: 3.8% (1098/28904),"covid, clinical, treatment, patients, trials, ...","clinical coronavirus, covid trials, cov potent...","clinical coronavirus sars, promising clinical ..."
9,9,Topic 9 stats: 3.0% (868/28904),"covid, social, distancing, pandemic, measures,...","distancing transmission, pandemic number, covi...","distancing transmission confirmed, distancing ..."


In [138]:
# Storing file for experts to manually label clusters
cluster_results.to_csv(data+'/top_terms_in_clusters_new.csv',index=None)

In [139]:
# Each is also separated along the title, abstract and other data 
for i in range (20):
    terms = df[df['cluster']==i][['title','abstract','publish_time','authors','cluster']]
    terms.to_csv(data+'/cluster_new_'+str(i)+'.csv',index=None)

In [50]:
df = pd.read_csv(data+'/Selected_articles_clean_text_eng_duplicate_removed_clustered.csv')

In [51]:
len(df)

28904

# 3. Further Analysis

In [1]:
#Import Python Packages
import pandas as pd
import numpy as np
import plotly.express as px
import warnings 
warnings.filterwarnings('ignore')

In [2]:
# Path to data
data = 'data/covid-19-literature/7-jul/2020-07-21'

In [52]:
# Loading data
df = pd.read_csv(data+'/Selected_articles_clean_text_eng_duplicate_removed_clustered.csv').fillna('')
len(df)

28904

In [4]:
# Converting the publication date to Python date format
df['publish_time_new'] = pd.to_datetime(df['publish_time'], format='%Y-%m-%d')

In [5]:
# Finding week number and month of publication
df['week'] = df['publish_time_new'].dt.week
df['month'] = df['publish_time_new'].dt.month
# This to sum each category to find number of articles per week, and month etc
df['ones'] = 1

In [6]:
# Here we find types of publications
# Like Systematic review, Scoping review etc
print('Systematic review\t'+str(len(df[((df['abstract'].str.contains('systematic review|systematic literature review'))
                                        |(df['title'].str.contains('systematic review|systematic literature review')))])))

print('Meta-analysis\t'+str(len(df[((df['abstract'].str.contains('meta-analysis|metaanalysis'))
                                        |(df['title'].str.contains('meta-analysis|metaanalysis')))])))

print('Scoping review\t'+str(len(df[((df['abstract'].str.contains('scoping review|scoping literature review'))
                                        |(df['title'].str.contains('scoping review|scoping literature review')))])))

print('Randomised control trial\t'+str(len(df[((df['abstract'].str.contains('randomised control trial|randomized control trial|randomised controlled trial|randomized controlled trial|randomized clinical trial|randomised clinical trial'))
                                        |(df['title'].str.contains('randomised control trial|randomized control trial|randomised controlled trial|randomized controlled trial|randomized clinical trial|randomised clinical trial')))])))

print('Survey\t'+str(len(df[((df['abstract'].str.contains('survey'))
                                        |(df['title'].str.contains('survey')))])))

print('Case-control study\t'+str(len(df[((df['abstract'].str.contains('case-control study|case control study'))
                                        |(df['title'].str.contains('case-control study|case control study')))])))

print('Cohort study\t'+str(len(df[((df['abstract'].str.contains('cohort study'))
                                        |(df['title'].str.contains('cohort study')))])))

print('Case study\t'+str(len(df[((df['abstract'].str.contains('case study'))
                                        |(df['title'].str.contains('case study')))])))

Systematic review	733
Meta-analysis	480
Scoping review	79
Randomised control trial	362
Survey	1515
Case-control study	62
Cohort study	512
Case study	199


In [7]:
# Number of articles per day, week and month
time = df.groupby(['publish_time_new'])['ones'].describe()
time['sum'] = df.groupby(['publish_time_new'])['ones'].sum()
time.to_csv(data+'/results/daily_stats.csv')

week = df.groupby(['publish_time_new','week']).count().groupby(['week'])['ones'].describe()
week['sum'] = df.groupby(['publish_time_new','week']).count().groupby(['week'])['ones'].sum()
week.to_csv(data+'/results/weekly_stats.csv')

month = df.groupby(['publish_time_new','month']).count().groupby(['month'])['ones'].describe()
month['sum']= df.groupby(['publish_time_new','month']).count().groupby(['month'])['ones'].sum()
month.to_csv(data+'/results/monthly_stats.csv')

In [21]:
# from collections import Counter
# for i in range(20):
#     tf = df[df['cluster']==i]
#     label = 'cluster_'+str(i)
#     tf.groupby(['publish_time_new'])['ones'].count().to_csv(data+'/daily_numbers_'+label+'.csv')
#     tf.groupby(['week'])['ones'].count().to_csv(data+'/weekly_numbers_'+label+'.csv')
#     tf.groupby(['month'])['ones'].count().to_csv(data+'/monthly_numbers_'+label+'.csv')
#     tf.groupby(['publish_time_new'])['ones'].describe().to_csv(data+'/daily_stats_'+label+'.csv')
#     tf.groupby(['publish_time_new','week']).count().groupby(['week'])['index'].describe().to_csv(data+'/weekly_stats_'+label+'.csv')
#     tf.groupby(['publish_time_new','month']).count().groupby(['month'])['index'].describe().to_csv(data+'/monthly_stats_'+label+'.csv')

In [10]:
# Detail statistics of the publications
# Like weekly and monthly articles over all and per topic (cluster)
from collections import Counter
import copy
t,w,m=[],[],[]
for i in range(20):
    tf = df[df['cluster']==i]
    label = 'cluster_'+str(i)
#     tf.groupby(['publish_time_new'])['ones'].describe().to_csv(data+'/daily_stats_'+label+'.csv')
#     tf.groupby(['publish_time_new','week']).count().groupby(['week'])['index'].describe().to_csv(data+'/weekly_stats_'+label+'.csv')
#     tf.groupby(['publish_time_new','month']).count().groupby(['month'])['index'].describe().to_csv(data+'/monthly_stats_'+label+'.csv')
    
    
    time = tf.groupby(['publish_time_new'])['ones'].describe()
    time['sum'] = tf.groupby(['publish_time_new'])['ones'].sum()
    
    week = tf.groupby(['publish_time_new','week']).count().groupby(['week'])['ones'].describe()
    week['sum'] = tf.groupby(['publish_time_new','week']).count().groupby(['week'])['ones'].sum()
    
    month = tf.groupby(['publish_time_new','month']).count().groupby(['month'])['ones'].describe()
    month['sum']= tf.groupby(['publish_time_new','month']).count().groupby(['month'])['ones'].sum()
    t.append(copy.deepcopy(time.append(pd.Series(name=label))))
    w.append(copy.deepcopy(week.append(pd.Series(name=label))))
    m.append(copy.deepcopy(month.append(pd.Series(name=label))))

In [25]:
# Storing the stats 
# t.to_csv(data+'/results/time.csv')
tt = pd.concat(t)
ww = pd.concat(w)
mm = pd.concat(m)
tt.to_csv(data+'/results/daily_stats_all_cluster.csv')
ww.to_csv(data+'/results/weekly_stats_all_cluster.csv')
mm.to_csv(data+'/results/monthly_stats_all_cluster.csv')

In [12]:
def strdisp(y):
    st =''
    for x in y:
        st+=str(x)+'\n'
    return st

In [15]:
# Printing types of articles (liek Systematic review, Scoping review etc)
# Printing number of journals and top journals, over-all and per topic
# Printing number of authors and top authors, over-all and per topic 
text = '''Systematic review	733
Meta-analysis	480
Scoping review	79
Randomised control trial	362
Survey	1515
Case-control study	62
Cohort study	512
Case study	199\n\n\n\n'''
jour = df['journal'].tolist()
top_jour = Counter(jour)
auth = [x.strip() for st in df['authors'].tolist() for x in st.split(';')]
top_auth= Counter(auth)
text= text+'Number of journals:\t'+str(len(set(jour))-1)+'\n'
text= text+strdisp(top_jour.most_common(15))+'\n'
text= text+'Number of Authors:\t'+str(len(set(auth))-1)+'\n'
text= text+strdisp(top_auth.most_common(15))+'\n'
text=text+'**********************************\n'
for i in range(20):
    tf = df[df['cluster']==i]
    label = 'cluster_'+str(i)
    jour = tf['journal'].tolist()
    top_jour = Counter(jour)
    auth = [x.strip() for st in tf['authors'].tolist() for x in st.split(';')]
    top_auth= Counter(auth)
    text=text+label+'\n'
    text= text+'Number of journals:\t'+str(len(set(jour))-1)+'\n'
    text= text+strdisp(top_jour.most_common(15))+'\n'
    text= text+'Number of Authors:\t'+str(len(set(auth))-1)+'\n'
    text= text+strdisp(top_auth.most_common(15))+'\n'
    text=text+'**********************************\n'
print(text)

Systematic review	733
Meta-analysis	480
Scoping review	79
Randomised control trial	362
Survey	1515
Case-control study	62
Cohort study	512
Case study	199



Number of journals:	4034
('', 6286)
('bioRxiv', 1374)
('J Med Virol', 424)
('medRxiv', 215)
('Int J Environ Res Public Health', 175)
('Int J Infect Dis', 165)
('Clin Infect Dis', 160)
('Cureus', 148)
('Psychological trauma : theory, research, practice and policy', 138)
('Sci Total Environ', 131)
('medRxiv : the preprint server for health sciences', 124)
('J Eur Acad Dermatol Venereol', 121)
('Med Hypotheses', 120)
('Disaster medicine and public health preparedness', 93)
('Chaos Solitons Fractals', 91)

Number of Authors:	150684
('', 240)
('Wang, Wei', 59)
('Liu, Lei', 55)
('Zhang, Wei', 51)
('Wang, Jing', 42)
('Li, Yan', 41)
('Chen, Yu', 40)
('Wang, Ying', 40)
('Wang, X.', 38)
('Li, Wei', 37)
('Liu, Jun', 37)
('Zhang, Lei', 37)
('Liu, Y.', 37)
('Yuen, Kwok-Yung', 36)
('Chen, Wei', 36)

**********************************
cluster_0
Nu

In [17]:
# Storing the above printed data in the file as well
with open(data+"/results/Results.txt", "w") as text_file:
    text_file.write(text)

In [18]:
# Top journals
jour = df['journal'].tolist()
top_jour = Counter(jour)

In [26]:
# Top journals stored
top = pd.DataFrame(top_jour.most_common(4035))
top.columns=['Journal','count']
top.to_csv(data+'/results/Journals_List_sorted.csv',index=None)

In [8]:
# Loading data
df_a = pd.read_csv(data+'/Selected_articles_clean_text_eng_duplicate_removed_clustered_final.csv').fillna('-999')

In [5]:
# Loading raw data
df_f = pd.read_csv(data+'/Selected_articles_clean_text_eng_duplicate_removed_clustered.csv').fillna('')

In [10]:
df_a = df_a.drop(['index'], axis = 1) 
df_a.head()

Unnamed: 0,cord_uid,title,doi,abstract,publish_time,authors,journal,pmcid,pubmed_id,publish_time_new,lang,clean_text,unigram,bigram,trigram,cluster,week,month,ones,country
0,2jq626ye,therapeutic strategies in an outbreak scenario...,10.12688/f1000research.22211.2,a novel coronavirus (2019-ncov) originating in...,2020-02-07,"Kruse, Robert L.",F1000Res,PMC7029759,32117569.0,2020-02-07,en,coronavirus ncov originating wuhan china prese...,"patients,genome,speed,breath,neutralizing,effo...","patients genome,genome speed,speed breath,brea...","patients genome speed,genome speed breath,spee...",3,6,2,1,United States
1,270msv5l,baricitinib - a januase kinase inhibitor - not...,10.1016/j.ijantimicag.2020.105967,• several studies suggested baricitinib as a p...,2020-04-04,"Praveen, D.; Chowdary, Puvvada Ranadheer; Aana...",Int J Antimicrob Agents,PMC7128600,32259575.0,2020-04-04,en,studies suggested baricitinib potential drug ...,"act,patients,epidemiological,neutropenia,close...","act patients,patients epidemiological,epidemio...","act patients epidemiological,patients epidemio...",17,14,4,1,India
2,542e5h1u,coronavirus disease: challenges for psychiatry,10.1192/bjp.2020.86,coronavirus disease (covid-19) presents two ur...,2020-04-15,"Kelly, Brendan D.",-999,PMC7205546,32293555.0,2020-04-15,en,coronavirus disease covid presents urgent heal...,"patients,good,illness,coronavirus,present,fami...","patients good,good illness,illness coronavirus...","patients good illness,good illness coronavirus...",16,16,4,1,Ireland
3,72s8wpla,practical considerations when performing neuro...,10.1080/21646821.2020.1756132,"the coronavirus disease 2019, sars-cov-2 (the ...",2020-05-06,"Haines, Seline; Caccamo, Amy; Chan, Fonda; Gal...",Neurodiagn J,PMC7212538,31967948.0,2020-05-06,en,coronavirus disease sars cov covid led worldwi...,"patients,droplet,procedure,analysis,procedures...","patients droplet,droplet procedure,procedure a...","patients droplet procedure,droplet procedure a...",11,19,5,1,United States
4,yv4x8viu,extracorporeal life support organization coron...,10.1097/mat.0000000000001193,the extracorporeal life support organization (...,2020-05-12,"Shekar, Kiran; Badulak, Jenelle; Peek, Giles; ...",ASAIO J,PMC7228451,32358233.0,2020-05-12,en,extracorporeal life support organization elso ...,"context,regularly,practice,developed,oxygenati...","context regularly,regularly practice,practice ...","context regularly practice,regularly practice ...",11,20,5,1,Australia


In [15]:
# Merging two data files to find articles UIDs and the associated files
# We did not store these information previously, so we are doing it here
df = pd.merge(df_a, df_csv[['cord_uid','pdf_json_files']], on='cord_uid').fillna('-999')
df.head()

Unnamed: 0,cord_uid,title,doi,abstract,publish_time,authors,journal,pmcid,pubmed_id,publish_time_new,...,clean_text,unigram,bigram,trigram,cluster,week,month,ones,country,pdf_json_files
0,2jq626ye,therapeutic strategies in an outbreak scenario...,10.12688/f1000research.22211.2,a novel coronavirus (2019-ncov) originating in...,2020-02-07,"Kruse, Robert L.",F1000Res,PMC7029759,32117569.0,2020-02-07,...,coronavirus ncov originating wuhan china prese...,"patients,genome,speed,breath,neutralizing,effo...","patients genome,genome speed,speed breath,brea...","patients genome speed,genome speed breath,spee...",3,6,2,1,United States,document_parses/pdf_json/5a17ed3e4abf295f5820c...
1,270msv5l,baricitinib - a januase kinase inhibitor - not...,10.1016/j.ijantimicag.2020.105967,• several studies suggested baricitinib as a p...,2020-04-04,"Praveen, D.; Chowdary, Puvvada Ranadheer; Aana...",Int J Antimicrob Agents,PMC7128600,32259575.0,2020-04-04,...,studies suggested baricitinib potential drug ...,"act,patients,epidemiological,neutropenia,close...","act patients,patients epidemiological,epidemio...","act patients epidemiological,patients epidemio...",17,14,4,1,India,document_parses/pdf_json/04383994da0f349f314e5...
2,542e5h1u,coronavirus disease: challenges for psychiatry,10.1192/bjp.2020.86,coronavirus disease (covid-19) presents two ur...,2020-04-15,"Kelly, Brendan D.",-999,PMC7205546,32293555.0,2020-04-15,...,coronavirus disease covid presents urgent heal...,"patients,good,illness,coronavirus,present,fami...","patients good,good illness,illness coronavirus...","patients good illness,good illness coronavirus...",16,16,4,1,Ireland,document_parses/pdf_json/322478c3c7cc2498a9285...
3,72s8wpla,practical considerations when performing neuro...,10.1080/21646821.2020.1756132,"the coronavirus disease 2019, sars-cov-2 (the ...",2020-05-06,"Haines, Seline; Caccamo, Amy; Chan, Fonda; Gal...",Neurodiagn J,PMC7212538,31967948.0,2020-05-06,...,coronavirus disease sars cov covid led worldwi...,"patients,droplet,procedure,analysis,procedures...","patients droplet,droplet procedure,procedure a...","patients droplet procedure,droplet procedure a...",11,19,5,1,United States,document_parses/pdf_json/d049cc13cfc0854eef691...
4,72s8wpla,practical considerations when performing neuro...,10.1080/21646821.2020.1756132,"the coronavirus disease 2019, sars-cov-2 (the ...",2020-05-06,"Haines, Seline; Caccamo, Amy; Chan, Fonda; Gal...",Neurodiagn J,PMC7212538,31967948.0,2020-05-06,...,coronavirus disease sars cov covid led worldwi...,"patients,droplet,procedure,analysis,procedures...","patients droplet,droplet procedure,procedure a...","patients droplet procedure,droplet procedure a...",11,19,5,1,United States,-999


In [68]:
# Disambiguating and finding country of publication using author affiliation. 
# We used first author affiliation (if available) as country of publication
# If country cannot be found using first author affiliation, 
# we use next author until we find right country from the list of all countries 
import pycountry
pc = list(pycountry.countries)
def ret_cout(con, file):
    if con != '-999':
        return con
    if file == '-999':
        return '-999'
    for f in file.split(';'):
        jsn = json.load(open(data+'/'+f.strip(), 'r'))
        files.append(jsn)
        j = jsn['metadata']['authors']
        for c in range(len(j)):
            af = str(j[c]['affiliation'])
            for cnty in pc:
                if cnty.name in af or cnty.alpha_3 in af or cnty.alpha_2 in af:
                    return cnty.name
                if (hasattr(cnty, "common_name") and cnty.common_name in af) or (hasattr(cnty, "official_name") and cnty.official_name in af):
                    return cnty.name
    return '-999'
df['country'] = df.apply(lambda x: ret_cout(x.country, x.pdf_json_files), axis=1)                 

In [71]:
# The country is added back to the publication record and stored
df.to_csv(data+'/Selected_articles_clean_text_eng_duplicate_removed_clustered1.csv',index=None)

In [59]:
# Due to marging the duplicates were introduced again
# So we are removing them here again
df= pd.read_csv(data+'/Selected_articles_clean_text_eng_duplicate_removed_clustered1.csv')
has_dup = df.duplicated(subset ="title", keep=False)
print(len(df))
dup = df[has_dup]
df = df[~has_dup]
len(dup),len(df)
dup = dup.fillna('-999')
dup1 = dup[~((dup['journal'].str.contains('-999'))|((dup['pmcid'].str.contains('-999')))
               |((dup['pubmed_id'].str.contains('-999')))|((dup['doi'].str.contains('-999'))))]
len(dup1), len(dup)
dup1 = dup1.drop_duplicates(subset ="title", keep='first')
len(dup1)
df1 = pd.concat([df,dup1])
len(df1),len(df),len(dup1)
df.to_csv(data+'/Selected_articles_clean_text_eng_duplicate_removed_clustered1.csv',index=None)

28093


In [60]:
# Finding top countries
from collections import Counter
jour = df['country'].tolist()
top_jour = Counter(jour)
top_jour.most_common(len(top_jour))

[('-999', 11633),
 ('China', 2950),
 ('United States', 1357),
 ('Italy', 1157),
 ('Saudi Arabia', 978),
 ('India', 854),
 ('Canada', 671),
 ('United Kingdom', 525),
 ('Germany', 449),
 ('Australia', 403),
 ('France', 383),
 ('Spain', 360),
 ('Brazil', 349),
 ('Morocco', 259),
 ('Switzerland', 233),
 ('Turkey', 218),
 ('Israel', 217),
 ('Japan', 214),
 ('Singapore', 169),
 ('Netherlands', 165),
 ('Moldova, Republic of', 154),
 ('Belgium', 146),
 ('Cocos (Keeling) Islands', 142),
 ('Panama', 118),
 ('Argentina', 108),
 ('Hong Kong', 106),
 ('Greece', 106),
 ('Iran', 103),
 ('Pakistan', 100),
 ('Ireland', 97),
 ('Poland', 94),
 ('Colombia', 93),
 ('American Samoa', 89),
 ('Austria', 84),
 ('Bangladesh', 82),
 ('Sweden', 80),
 ('Denmark', 79),
 ('Egypt', 78),
 ('Mexico', 72),
 ('South Africa', 69),
 ('Taiwan, Province of China', 69),
 ('Chile', 68),
 ('Gabon', 62),
 ('New Caledonia', 62),
 ('United Arab Emirates', 54),
 ('Malaysia', 53),
 ('South Korea', 53),
 ('Armenia', 52),
 ('Eritrea',

In [4]:
# Finding top authors
df = pd.read_csv(data+'/Selected_articles_clean_text_eng_duplicate_removed_clustered1.csv').fillna('')
len(df)
from collections import Counter
auth = [x.strip() for st in df['authors'].tolist() for x in st.split(';')]
auth1 = Counter(auth)
top_authors = auth1.most_common(100)
top_authors

[('-999', 228),
 ('Wang, Wei', 59),
 ('Liu, Lei', 54),
 ('Zhang, Wei', 48),
 ('Wang, Jing', 42),
 ('Chen, Yu', 39),
 ('Wang, Ying', 39),
 ('Li, Yan', 39),
 ('Wang, X.', 38),
 ('Liu, Jun', 37),
 ('Liu, Y.', 37),
 ('Li, Wei', 36),
 ('Chen, Wei', 36),
 ('Zhang, Lei', 36),
 ('Wang, J.', 36),
 ('Zhang, Y.', 36),
 ('Yuen, Kwok-Yung', 34),
 ('Liu, Jing', 34),
 ('Wang, Y.', 34),
 ('Yang, Yang', 33),
 ('Zhang, X.', 33),
 ('Li, Y.', 33),
 ('Liu, Wei', 32),
 ('Wang, Hui', 32),
 ('Wang, Jian', 32),
 ('Zhang, Yan', 31),
 ('Zhang, Li', 31),
 ('Drosten, Christian', 31),
 ('Li, J.', 31),
 ('Wang, Lin', 30),
 ('Wang, Tao', 30),
 ('Chowell, Gerardo', 30),
 ('Liu, Yang', 29),
 ('Lippi, Giuseppe', 29),
 ('Zhang, Zheng', 29),
 ('Wang, Yan', 28),
 ('Li, Jing', 28),
 ('Chen, J.', 28),
 ('Chen, Y.', 28),
 ('Li, Li', 27),
 ('Chan, Jasper Fuk-Woo', 27),
 ('Baric, Ralph S.', 27),
 ('Li, Hui', 26),
 ('Lu, Hongzhou', 26),
 ('Zhang, Jun', 26),
 ('Liu, Ying', 26),
 ('Liu, Fang', 25),
 ('To, Kelvin Kai-Wang', 25),
 (

In [5]:
# Varifying top authors
def t_au(lst, au):
    x = lst.split(';')
    for a in x:
        if a.strip() in au:
            return True
    return False
len(df[df.apply(lambda x: t_au(x['authors'],'Wang, Wei'), axis=1)])

59

In [6]:
# Top authors found and recorded in file
st = []
for k, v in top_authors:
    if k!='-999':
        au = df[df.apply(lambda x: t_au(x['authors'],k), axis=1)]
        au = au[['title', 'authors','journal']]
        au['author'] = k
        st.append(au)
a = pd.concat(st)
a.to_csv(data+'/results/authors_m.csv')

In [7]:
# Top authors per topi found and recorded in the file
from collections import Counter
aall = []
for i in range(20):
    dff = df[df['cluster']==i]
    auth = [x.strip() for st in dff['authors'].tolist() for x in st.split(';')]
    auth1 = Counter(auth)
    top_authors = auth1.most_common(100)
    st = []
    for k, v in top_authors:
        if k!='-999':
            au = dff[dff.apply(lambda x: t_au(x['authors'],k), axis=1)]
            au = au[['title', 'authors','journal','cluster']]
            au['author'] = k
            st.append(au)
    a = pd.concat(st)
    aall.append(a)
au_all = pd.concat(aall)
au_all.to_csv(data+'/results/authors_per_cluster_m.csv')