# Scientific Article Literature Clustering by NLP

Oscar Charles 2021

A natural language processing notebook, for use with abstract queries from https://europepmc.org/

## First Get some Data
Within europmc use the search term 

generic - "((antiviral) AND (resistance)) AND (mutation)" , optionally filter by year 

drug_specific - "((resistance)) AND ((mutation)) AND ((ribavirin) OR (favipiravir) OR (remdesivir) OR (EIDD-2801) OR (molnupiravir))
"
Then once results show go to / export citations / XML (abstracts)

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xml.etree.ElementTree as ET
#! python -m spacy download en_core_web_sm
#! pip install bokeh

In [2]:
# parse xml
dict_ = {'doi': [], 'title': [], 'abstract':[]}

tree = ET.parse('europepmc_drug_specific_2555.xml') # download from a search in epmc
root = tree.getroot()

for entry in root.iter('result'):
    if entry.find('doi') is None:
        continue
    else:
        doi = entry.find('doi').text
    
    if entry.find('title') is None:
        continue
    else:
        title = entry.find('title').text
        
    if entry.find('abstractText') is None:
        continue
    else:
        abstractText = entry.find('abstractText').text

    #print(doi, title)
    dict_['doi'].append(doi)
    dict_['title'].append(title)
    dict_['abstract'].append(abstractText)


In [3]:
# format the dict to a table
df = pd.DataFrame(dict_, columns=['doi', 'title','abstract'])
df.head()

Unnamed: 0,doi,title,abstract
0,10.1016/j.bioorg.2021.105574,Virtual screening and in vitro validation of n...,The COVID-19 pandemic caused by the SARS-CoV-2...
1,10.3390/v13122535,Influence of Ribavirin on Mumps Virus Populati...,Frequent mumps outbreaks in vaccinated populat...
2,10.3390/v13122508,Enhancing the Antiviral Potency of Nucleobases...,Broad-spectrum antiviral therapies hold promis...
3,10.3390/ph14121307,Probing In Silico the Benzimidazole Privileged...,Targeting the fusion (F) protein has been reco...
4,10.1134/s0026893321040105,Antiviral and Antimicrobial Nucleoside Derivat...,The emergence of new viruses and resistant str...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1922 entries, 0 to 1921
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   doi       1922 non-null   object
 1   title     1922 non-null   object
 2   abstract  1922 non-null   object
dtypes: object(3)
memory usage: 45.2+ KB


In [5]:
len(df)

1922

## Parse

In [6]:
#NLP 
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

In [7]:
import string

punctuations = string.punctuation
stopwords = list(STOP_WORDS)
stopwords[:10]

['or',
 'therefore',
 'whether',
 'other',
 '‘m',
 'however',
 'using',
 'everything',
 'by',
 'third']

In [8]:
# stop words are words that are removed, we want to append the virus as thats providing most clustering signal.
custom_stop_words = [
    'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure', 
    'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 
    'al.', 'Elsevier', 'PMC', 'CZI', 'www',
    "Hepatitis C", "Herpes", "Simplex 1", "Simplex 2", "Cytomegalovirus", "Coronavirus", "HCMV", "CMV", "HSV1", "HSV2", 
    "influenza A", "(H1N1)", "HIV-1", "HIV", "Hepatitis B", "HCV", "HBV",
    "SARS-COV-2", "COVID-19", "COVID", "human immunodeficiency virus", "HIV-RNA", "subtype", "genotype", "subtype",
    "avian influenza", "syncytial", # previous up to here
    "polio", "poliovirus", "chikungunya"
    
]


for w in custom_stop_words:
    if w not in stopwords:
        stopwords.append(w)

In [9]:
# Parser
parser = spacy.load("en_core_web_sm")
parser.max_length = 7000000

def call_tokenizer(df):
    df["processed_abstract"] = df["abstract"].apply(spacy_tokenizer)
    return df

def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    mytokens = " ".join([i for i in mytokens])
    return mytokens

In [10]:
from multiprocessing import  Pool

def parallelize_dataframe(df, func, n_cores=4):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [11]:
df["processed_abstract"] = df["abstract"].apply(spacy_tokenizer)

In [12]:
# this hangs and we don't need as only low thousands
#%time df = parallelize_dataframe(df=df, func=call_tokenizer, n_cores=4)

### Save

In [13]:
import pickle

pickle.dump(df, open("epmc.p", "wb" ))

## Vectorize

In [14]:
import pickle

df = pickle.load(open("epmc.p", "rb"))
#df = df.sample(600000)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
def vectorize(text, maxx_features):
    
    vectorizer = TfidfVectorizer(max_features=maxx_features)
    X = vectorizer.fit_transform(text)
    return X

In [16]:
text = df['processed_abstract'].values
%time X = vectorize(text, 2 ** 12)
X.shape

Wall time: 331 ms


(1922, 4096)

## Cluster

In [17]:
from sklearn.cluster import KMeans

In [18]:
k = 20
kmeans = KMeans(n_clusters=k, random_state=42)
%time y_pred = kmeans.fit_predict(X)
df['y'] = y_pred

Wall time: 14.4 s


In [19]:
pickle.dump(df, open("arxiv.p", "wb" ))

## Libraries for Plotting

In [20]:
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

## TSNE GPU

First reduce noise using PCA by extracting the first 20 principle components:

In [21]:
from sklearn.decomposition import PCA

pca = PCA(n_components=20, random_state=42)
%time X_embedded_pca= pca.fit_transform(X.toarray())
X_embedded_pca.shape

Wall time: 439 ms


(1922, 20)

In [22]:
from sklearn.manifold import TSNE
X_embedded_tsne = TSNE(n_components=2, learning_rate='auto',init='random').fit_transform(X_embedded_pca)
df['x_tsne'] = X_embedded_tsne[:,0] 
df['y_tsne'] = X_embedded_tsne[:,1]

#from tsnecuda import TSNE
#%time X_embedded_tsne = TSNE(n_components=2, learning_rate=30, n_iter=500000, verbose=1).fit_transform(X_embedded_pca)

In [23]:
# we want bokeh to treat y (cluster) as discrete not continuous 
df['y'] = df['y'].apply(str)
df.dtypes

doi                    object
title                  object
abstract               object
processed_abstract     object
y                      object
x_tsne                float32
y_tsne                float32
dtype: object

In [24]:
# identify which papers we know are of interest, then make them squares. maybe we need to hide the virus name from the model?
key_doi=["10.1371/journal.ppat.1009929",
"10.1128/mBio.00221-18",
"10.1128/JVI.01965-17",
"10.1073/pnas.1811345115",
"10.1093/jac/dku209",
"10.1128/AAC.01073-16",
"10.1073/pnas.1232294100",
"10.1128/JVI.02139-12",
"10.1016/j.antiviral.2013.07.008",
"10.1371/journal.ppat.1001163",
"10.1128/JVI.00289-14",
"10.1073/pnas.1111650108",
"10.1128/JVI.79.4.2346-2355.2005",
"10.1128/JVI.03594-13",
"10.1128/JVI.01297-08",
"10.1371/journal.ppat.1003877"]
df['key'] = df['doi'].isin(key_doi)
df[df['doi'].isin(key_doi)]
#df[df['doi'].str.contains("10.1371/journal.ppat.1009929")]

Unnamed: 0,doi,title,abstract,processed_abstract,y,x_tsne,y_tsne,key
75,10.1371/journal.ppat.1009929,In vitro selection of Remdesivir resistance su...,"Remdesivir (RDV), a broadly acting nucleoside ...",remdesivir rdv broadly act nucleoside analogue...,2,-39.124496,6.267173,True
583,10.1073/pnas.1811345115,The mechanism of resistance to favipiravir in ...,Favipiravir is a broad-spectrum antiviral that...,favipiravir broad spectrum antiviral promise t...,19,-16.171354,-35.878323,True
1143,10.1093/jac/dku209,Mutations in the chikungunya virus non-structu...,"<h4>Objectives</h4>T-705, also known as favipi...",h4 objectives</h4 t-705 know favipiravir small...,4,-8.024849,26.06115,True
1197,10.1371/journal.ppat.1003877,Alphavirus mutator variants present host-speci...,Arboviruses cycle through both vertebrates and...,arboviruse cycle vertebrate invertebrate requi...,4,0.446771,-35.364479,True
1253,10.1016/j.antiviral.2013.07.008,An increased replication fidelity mutant of fo...,In a screen for RNA mutagen-resistant foot-and...,screen rna mutagen resistant foot mouth diseas...,4,-2.317497,-36.037506,True
1476,10.1073/pnas.1111650108,Arbovirus high fidelity variant loses fitness ...,The error rate of RNA-dependent RNA polymerase...,error rate rna dependent rna polymerase rdrp a...,4,-1.226716,-35.700428,True
1560,10.1371/journal.ppat.1001163,Fidelity variants of RNA dependent RNA polymer...,"In a screen for RNA mutagen resistance, we iso...",screen rna mutagen resistance isolate high fid...,4,-0.76341,-34.913452,True
1760,10.1073/pnas.1232294100,A single mutation in poliovirus RNA-dependent ...,Ribavirin is a nucleotide analog that can be i...,ribavirin nucleotide analog incorporate viral ...,4,1.868214,-42.758049,True


In [25]:
# make a pretty plot
from bokeh.plotting import ColumnDataSource, figure, output_notebook, show, output_file, save
from bokeh.palettes import d3
import bokeh.models as bmo
#output_file("toolbar.html")
output_notebook()

TOOLTIPS = [
    ("index", "$index"),
    ("(x,y)", "($x, $y)"),
    ("doi", "@doi"),
    ("title", "@title"),
]

# use whatever palette you want...
palette = d3['Category20'][len(df['y'].unique())]
color_map = bmo.CategoricalColorMapper(factors=df['y'].unique(),
                                   palette=palette)

p = figure(width=800, height=800, tooltips=TOOLTIPS,
           title="Mouse over the dots")
p.scatter('x_tsne', 'y_tsne', size=5, source=df[df['key'] == False], color={'field': 'y', 'transform': color_map})
p.scatter('x_tsne', 'y_tsne', size=8, source=df[df['key'] == True], color="black", marker="square")
show(p)


In [52]:
# references
#https://github.com/MaksimEkin/arXiv-Literature-Clustering

In [56]:
df[df['doi'].str.contains("78-16")]



Unnamed: 0,doi,title,abstract,processed_abstract,y,x_tsne,y_tsne,key
866,10.1128/jvi.00078-16,Poliovirus Polymerase Leu420 Facilitates RNA R...,<h4>Unlabelled</h4>RNA recombination is import...,h4 unlabelled</h4 rna recombination important ...,13,2.265277,-31.028799,False


In [None]:
dois_to_ignore = []