In [None]:
import numpy as np
import json
import os
import string
from os import listdir
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import LatentDirichletAllocation as LDA
from scipy import sparse
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
import umap
import matplotlib.pyplot as plt
import seaborn as sns
from pyLDAvis import sklearn as sklearn_lda
import pickle 
import pyLDAvis
import gensim

In [None]:
# Data preprocessing - run only once

comm_files = listdir('comm_use_subset/comm_use_subset/')
noncomm_files = listdir('noncomm_use_subset/noncomm_use_subset/')
biorxiv_medrxiv_files = listdir('biorxiv_medrxiv/biorxiv_medrxiv/')
custom_license_files = listdir('custom_license/custom_license/')
comm_basePath = '~/Data/CORD-19-research-challenge/comm_use_subset/comm_use_subset'
noncomm_basePath = '~/Data/CORD-19-research-challenge/noncomm_use_subset/noncomm_use_subset'
biomed_basePath = '~/Data/CORD-19-research-challenge/biorxiv_medrxiv/biorxiv_medrxiv'
custom_basePath = '~/Data/CORD-19-research-challenge/custom_license/custom_license/'

print(len(comm_files))
print(len(noncomm_files))
print(len(biorxiv_medrxiv_files))
print(len(custom_license_files))

def preprocessJSON(json_infile, basepath):
    json_input = json.load(open(basepath + '/' + json_infile))
    json_body = pd.json_normalize(data=json_input,
                                 record_path='body_text',
                                 meta=[['metadata','title']])
    text_array = json_body.iloc[:,0].values
    text_df = pd.DataFrame({'main text':[' '.join(text_array)]},
                           index=[json_body.iloc[0,4]])
    return text_df

all_comm_main_text = [preprocessJSON(f, comm_basePath) for f in comm_files]
comm_main_df = pd.concat(all_comm_main_text)
comm_main_df.to_csv('comm_used_main_df.csv')

all_noncomm_main_text = [preprocessJSON(f, noncomm_basePath) for f in noncomm_files]
noncomm_main_df = pd.concat(all_noncomm_main_text)
noncomm_main_df.to_csv('noncomm_used_main_df.csv')

all_biomed_main_text = [preprocessJSON(f, biomed_basePath) for f in biorxiv_medrxiv_files]
biomed_main_df = pd.concat(all_biomed_main_text)
biomed_main_df.to_csv('biomed_main_df.csv')

all_custom_main_text = [preprocessJSON(f, custom_basePath) for f in custom_license_files]
custom_main_df = pd.concat(all_custom_main_text)
custom_main_df.to_csv('custom_main_df.csv')

We first read in pre-processed csv files containing each paper's title and the corresponding main text.

In [2]:
# Read in pre-processed csv files containing each paper's title and the corresponding main text.
wkdir = '~/Data/CORD-19-research-challenge/'
comm_used_corpus = pd.read_csv(wkdir + 'comm_used_main_df.csv', index_col=0)
noncomm_used_corpus = pd.read_csv(wkdir + 'noncomm_used_main_df.csv', index_col=0)
biomed_corpus = pd.read_csv(wkdir + 'biomed_main_df.csv', index_col=0)
custom_corpus = pd.read_csv(wkdir + 'custom_main_df.csv', index_col=0)
all_corpus = pd.concat([comm_used_corpus,
                        noncomm_used_corpus,
                        biomed_corpus,
                        custom_corpus])

all_corpus.isnull().values.any()
title = all_corpus.index.to_list()
title = [str(i) if title[i] != title[i] else title[i] for i in np.arange(len(title))]
all_corpus.index = title
print(all_corpus.shape)

(33375, 1)


There are a total of 33,375 papers in the CORD-19 dataset as of March 30, 2020.

In [3]:
# Helper function
def regex_func(text):
    if text:
        res = re.search('COVID | coronavirus | Coronavirus | SARS-Cov | SARS-nCov',text)
        return res
    else:
        return False

# Here we use a Porter stemmer to stem suffices from words 
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

We want to look at only publications that mention COVID/Coronavirus/SARS-Cov-2

In [4]:
regex_expr = '\\b(?:COVID|coronavirus|Coronavirus|SARS-Cov|SARS-nCov|sars|corona)'
all_corpus = all_corpus[all_corpus.index.str.contains(regex_expr) & all_corpus.loc[:,'main text'].str.contains(regex_expr)]

In [5]:
all_corpus

Unnamed: 0,main text
"Complete Genome Sequence of a Novel Swine Acute Diarrhea Syndrome Coronavirus, CH/FJWT/2018, Isolated in Fujian, China, in 2018",Alphacoronavirus in the family Coronaviridae a...
Prevalence of Korean cats with natural feline coronavirus infections,"Feline coronavirus (FCoV) is an enveloped, pos..."
Polymorphisms in the feline TNFA and CD209 genes are associated with the outcome of feline coronavirus infection,"Feline infectious peritonitis (FIP), a highly ..."
Novel Coronavirus and Astrovirus in Delaware Bay Shorebirds,Wild birds have been recognized as important r...
Characterization of an Immunodominant Epitope in the Endodomain of the Coronavirus Membrane Protein,Coronaviruses (CoVs) are clustered in the Coro...
...,...
Coronavirus in severe acute respiratory syndrome (SARS) Severe acute respiratory syndrome: identification of the etiological agent,Severe acute respiratory syndrome (SARS) is ca...
Relapsing Subacute Demyelina~:ing Encephalomyelitis in Rats during the Course of Coronavirus JHM Infection,pathogenesis are unknown. The most prominent e...
Middle East Respiratory Syndrome Coronavirus Transmission,M iddle East respiratory syndrome (MERS) coron...
Single particle assay of coronavirus membrane fusion with proteinaceous receptor-embedded supported bilayers,Coronaviruses are membrane-enveloped viruses t...


As seen above, after filtering for only literature containing COVID-19 related keywords in the main text, we are looking at a total of 3,036 papers. Next we tokenize the filtered text using the Porter stemmer and remove words that are too short or containing non-word characters. We end up with a bag of words representation of the corpus.

In [7]:
def processText(text):
    lower_text = text.lower()
    nonpunc_text = lower_text.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens,stemmer)
    # Remove words containing non-word characters
    return [word for word in tokens if re.match(r'[^\W\d]*$', word) and len(word) >= 2]

# corpus_list = [processText(all_corpus.iloc[i,0]) for i in range(int(all_corpus.count()))]

fn = 'COVID_corpus_list.txt'
### Run this code block only once and comment out the ###
### line generating corpus_list afterwards ##############
# f = open(fn,'w')
# for l in corpus_list:
#     f.write(' '.join(l) + '\n')
# f.close()
#########################################################
f = open(fn,'r')
corpus_list = f.readlines()
corpus_list = [l.strip().split() for l in corpus_list]
f.close()

We use the gensim package in Python to create a dictionary of the COVID-19 literature subset, and filter out terms that occur either too infrequently ('no_below') or too often ('no_above'). The filtered dictionary is then converted to gensim's bag of words object.

In [8]:
corpus_dict = gensim.corpora.Dictionary(corpus_list)

In [9]:
corpus_dict.filter_extremes(no_below=15, no_above=0.55, keep_n=100000)
corpus_bow = [corpus_dict.doc2bow(doc) for doc in corpus_list]

We first inspect the major sub-topics that emerge from COVID-19-related literature by a latent dirichlet allocation (LDA) model. The only parameter we provide the model is

- number of topics = 20

In [10]:
corpus_lda = gensim.models.LdaMulticore(corpus_bow, 
                                        num_topics=20, 
                                        id2word=corpus_dict,
                                        passes=4,
                                        workers=4)

In [11]:
# Save the model ot disk for loading later
from gensim.test.utils import datapath
temp_file = datapath("model")
corpus_lda.save(temp_file)

From a preliminary view of the 20 'topics' that emerged from the LDA model, we can begin to make sense of a few themes, including mechanisms of infection, the determination of the genetic sequence of the novel coronavirus, production of vaccines, structure and function of viral components, animal origin/transmission of the virus etc.. 

In [12]:
for idx, topic in corpus_lda.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.007*"camels" + 0.006*"transmission" + 0.005*"outbreak" + 0.005*"health" + 0.005*"bats" + 0.004*"epidemic" + 0.004*"bat" + 0.004*"countries" + 0.004*"China" + 0.003*"contact"
Topic: 1 
Words: 0.015*"patients" + 0.006*"patient" + 0.006*"transmission" + 0.005*"clinical" + 0.005*"SARS" + 0.004*"days" + 0.004*"should" + 0.004*"symptoms" + 0.004*"case" + 0.004*"detection"
Topic: 2 
Words: 0.008*"mice" + 0.007*"CNS" + 0.005*"days" + 0.004*"brain" + 0.004*"demyelination" + 0.004*"type" + 0.004*"al" + 0.003*"rats" + 0.003*"strain" + 0.003*"MHV"
Topic: 3 
Words: 0.012*"mice" + 0.007*"expression" + 0.005*"lung" + 0.005*"immune" + 0.005*"replication" + 0.005*"response" + 0.004*"Figure" + 0.004*"activity" + 0.004*"host" + 0.003*"SARS"
Topic: 4 
Words: 0.012*"cats" + 0.009*"FIP" + 0.005*"al" + 0.005*"SARS" + 0.005*"patients" + 0.005*"FCoV" + 0.004*"transmission" + 0.003*"coronaviruses" + 0.003*"antibody" + 0.003*"clinical"
Topic: 5 
Words: 0.007*"proteins" + 0.007*"al" + 0.006*"IB

To assess how well the LDA model performed, we calculate two metrics: the 'perplexity' of the model, where a more negative value indicates increased complexity (and thus arguably better); and a 'coherence' score, which looks at how similar high-scoring words in each given topic are to each other. Here, higher scores indicate better models. Our current version of the model has a coherence score of 0.45-0.5 (depending on random seed used), corresponding to a medium level of coherence - this is amenable to improvement. However, since this section is only a preliminary exploration of the dataset, we reserve further parameter tweaking for future analyses.

In [13]:
# Compute Perplexity
# a measure of how good the model is. lower the better.
print('\nPerplexity: ', corpus_lda.log_perplexity(corpus_bow))

# Compute Coherence Score
from gensim.models.coherencemodel import CoherenceModel
coherence_model_lda = CoherenceModel(model=corpus_lda, 
                                     texts=corpus_list, 
                                     dictionary=corpus_dict, 
                                     coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.989901154955004

Coherence Score:  0.4809912564891837


Now we utilize the pyLDAvis toolkit to effectively visualize topic distribution and the relative contribution of words in an interactive setting.

In [14]:
pyLDAvis.enable_notebook()
import pyLDAvis.gensim
vis = pyLDAvis.gensim.prepare(corpus_lda, corpus_bow, corpus_dict)
vis

Next we use cosine similarity and term frequency-inverse document frequency (TF-IDF) to identify documents that are closest matches to our subtask topics.

In brief, we read in the sub-task prompts (the knowledge topics we aim to discover from the CORD-19 dataset) and process them the same way as we did for generating the TF-IDF representation of the corpus. The vectors corresponding to individual documents (search topic or articles) can then be compared for similarity.

In [None]:
corpus_dict = {}
for i in range(int(all_corpus.count())):
    curr_text = all_corpus.iloc[i,0]
    lower_text = curr_text.lower()
    nonpunc_text = lower_text.translate(str.maketrans('', '', string.punctuation))
    # Remove words containing non-word characters and short words
    split_text = nonpunc_text.split()
    cleaned_text = ' '.join([word for word in split_text if re.match(r'[^\W\d]*$', word) and len(word) >= 2])
    corpus_dict[title[i]] = cleaned_text
    
def tokenizer(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems
    
tfidf = TfidfVectorizer(tokenizer=tokenizer, stop_words='english')
tfs = tfidf.fit_transform(corpus_dict.values())

In [17]:
# Read in the queries
in_fn = 'searchTerms.txt'
f = open(in_fn,'r')
queries = f.readlines()
f.close()
queries = [query.strip() for query in queries]

query_dict = {}
for i in range(len(queries)):
    curr_text = queries[i]
    lower_text = curr_text.lower()
    nonpunc_text = lower_text.translate(str.maketrans('', '', string.punctuation))
    # Remove words containing non-word characters and short words
    split_text = nonpunc_text.split()
    cleaned_text = ' '.join([word for word in split_text if re.match(r'[^\W\d]*$', word) and len(word) >= 2])
    query_dict[i] = cleaned_text

query_tfs = tfidf.transform(query_dict.values())

In [18]:
query_tfs.shape

(9, 84631)

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
cos_sim = [cosine_similarity(qtf, tfs).flatten() for qtf in query_tfs]

Here we display the titles of the 3 most similar document (article) in our filtered corpus to each search term.

In [None]:
top_article_indices = np.argsort(cos_sim,axis=1)
top_article_indices = [np.flip(tai) for tai in top_article_indices]
count = 0
for idx in top_article_indices:
    print(queries[count].strip(".")+":\n")
    for i in range(5):
        print(str(i+1) + '. ' + all_corpus.index[idx[i]] + '\n')
    print('===============================================================================\n')
    count += 1

Real-time tracking of whole genomes and a mechanism for coordinating the rapid dissemination of that information to inform the development of diagnostics and therapeutics and to track variations of the virus over time:

1. Trafficking motifs in the SARS-coronavirus nucleocapsid protein

2. A mathematical model for the spatiotemporal epidemic spreading of COVID19

3. Avian coronavirus infectious bronchitis virus susceptibility to botanical oleoresins and essential oils in vitro and in vivo

4. Epidemiology and Infection SARS to novel coronavirus -old lessons and new lessons

5. Potential Rapid Diagnostics, Vaccine and Therapeutics for 2019 Novel Coronavirus (2019-nCoV): A Systematic Review

===============================================================================

Access to geographic and temporal diverse sample sets to understand geographic distribution and genomic differences, and determine whether there is more than one strain in circulation. Multi-lateral agreements such as the Nagoya Protocol could be leveraged:

1. Identification of a 24-kDa Polypeptide Processed from the Coronavirus Infectious Bronchitis Virus 1a Polyprotein by the 3C-like Proteinase and Determination of Its Cleavage Sites

2. Evolution of the novel coronavirus from the ongoing Wuhan outbreak and modeling of its spike protein for risk of human transmission Citation

3. Prevalence and genetic diversity analysis of human coronaviruses among cross-border children

4. Detection of distinct MERS-Coronavirus strains in dromedary camels from Kenya, 2017

5. A framework for identifying regional outbreak and spread of COVID-19 from one-minute population-wide surveys

===============================================================================

Evidence that livestock could be infected (e.g., field surveillance, genetic sequencing, receptor binding) and serve as a reservoir after the epidemic appears to be over:

1. Tropical Medicine and Infectious Disease Potential Intermediate Hosts for Coronavirus Transmission: No Evidence of Clade 2c Coronaviruses in Domestic Livestock from Ghana

2. GENE The major subunit ClpG of Escherichia coli CS31A fibrillae as an expression vector for different combinations of two TGEV coronavirus epitopes

3. Middle East respiratory syndrome coronavirus infection in non-camelid domestic mammals

4. Surveillance of the Middle East respiratory syndrome (MERS) coronavirus (CoV) infection in healthcare workers after contact with confirmed MERS patients: incidence and risk factors of MERS-CoV seropositivity

5. BMC Microbiology Adaptive evolution of the spike gene of SARS coronavirus: changes in positively selected sites in different epidemic groups

===============================================================================

Evidence of whether farmers are infected, and whether farmers could have played a role in the origin:

1. Open Access RESEARCH The relationship between antibody status to bovine corona virus and bovine respiratory syncytial virus and disease incidence, reproduction and herd characteristics in dairy herds

2. Bovine respiratory syncytial virus and bovine coronavirus in Swedish organic and conventional dairy herds

3. Overview of Feline Coronavirus Infections

4. Nsp3 of coronaviruses: Structures and functions of a large multi-domain protein

5. Journal Pre-proof Inside China and COVID-19: Questions and answers

===============================================================================

Surveillance of mixed wildlife- livestock farms for SARS-CoV-2 and other coronaviruses in Southeast Asia:

1. Tropical Medicine and Infectious Disease Potential Intermediate Hosts for Coronavirus Transmission: No Evidence of Clade 2c Coronaviruses in Domestic Livestock from Ghana

2. Structural insights into coronavirus entry

3. A fast and simple one-step duplex PCR assay for canine distemper virus (CDV) and canine coronavirus (CCoV) detection

4. Identification of Putative Polymerase Gene Product in Cells Infected with Murine Coronavirus A59

5. COVID-19) infection

===============================================================================

Experimental infections to test host range for this pathogen:

1. Host resilience to emerging coronaviruses

2. The SARS-Coronavirus papain-like protease: Structure, function and inhibition by designed antiviral compounds HHS Public Access

3. Vignette for V13N1 issue Importance of blood cellular genomic profile in coronary heart disease

4. Effectiveness of an education health programme about Middle East respiratory syndrome coronavirus tested during travel consultations Public Health

5. Recent Progress in Studies of Arterivirus-and Coronavirus-Host Interactions

===============================================================================

Animal host(s) and any evidence of continued spill-over to humans:

1. Geographical structure of bat SARS-related coronaviruses

2. viruses Bats and Coronaviruses

3. IL-22 suppresses the infection of porcine enteric coronaviruses and rotavirus by activating STAT3 signal pathway

4. Clinical Medicine Editorial Initial Cluster of Novel Coronavirus (2019-nCoV) Infections in Wuhan, China Is Consistent with Substantial Human-to-Human Transmission

5. From SARS to COVID-19: A previously unknown SARS-related coronavirus (SARS-CoV-2) of pandemic potential infecting humans -Call for a One Health approach

===============================================================================

Socioeconomic and behavioral risk factors for this spill-over:

1. Carbohydrate-induced conformational changes strongly modulate the antigenicity of coronavirus TGEV glycoproteins S and M

2. Geographical structure of bat SARS-related coronaviruses

3. Clinical features of imported cases of coronavirus disease 2019 in Tibetan patients in the Plateau area

4. Epidemiological and clinical features of COVID-19 patients with and without pneumonia in Beijing, China

5. Epidemiology and Infection Passengers' destinations from China: low risk of Novel Coronavirus (2019-nCoV) transmission into Africa and South America

===============================================================================

Sustainable risk reduction strategies:

1. What needs to be done to control the spread of Middle East respiratory syndrome coronavirus?

2. Epidemiology and Infection Passengers' destinations from China: low risk of Novel Coronavirus (2019-nCoV) transmission into Africa and South America

3. Epidemiology and Infection Passengers' destinations from China: low risk of Novel Coronavirus (2019-nCoV) transmission into Africa and South America

4. Epidemiological and clinical features of COVID-19 patients with and without pneumonia in Beijing, China

5. Host susceptibility to severe COVID-19 and establishment of a host risk score: findings of 487 cases outside Wuhan

===============================================================================

As we can see above, most of the search topics were able to retrieve documents that are highly relevant to the question posed, for example article 2 for the first topic (inform the development of diagnostics and therapeutics and to track variations of the virus over time), articles 2,3 and 5 for the second topic (geographic distribution and genomic differences), and articles 1 and 3 for topic 5 (Surveillance of mixed wildlife-livestock farms).

Due to time limitations, we have not yet explored other metrics of similarity (e.g. simple term matching similarity / enforcing association rules / cosine similarity on lower-dimensional representation of the TF-IDF matrix / use other supervised methods for topic classification etc.) - we may be able to retrieve more relevant information on these topics with future improvements on the model/metrics used as well as continuous accumulation of new literature.