In [2]:
import pandas as pd
from bio import Entrez
import re
import numpy as np
import time

import Levenshtein as lev
import nltk

ModuleNotFoundError: No module named 'bio'

# query search results from pubmed

excellent tutorial: http://biopython.org/DIST/docs/tutorial/Tutorial.html
probably PMC will be even more exciting than PubMed as it contains full text articles: https://www.nlm.nih.gov/bsd/difference.html


In [2]:
def search_pubmed(querystring):
    """
    send query to pubmed and get back results in xml
    the details of individual publications are retrieved later
    """
    Entrez.email = 'stefanie.tenberg@awk.ch' # should be always stated    
    handle = Entrez.esearch(db='pubmed',
                            retmode='xml', 
                            term=querystring,
                            usehistory="y")
    results = Entrez.read(handle)
    handle.close()
    return results

In [3]:
# query pubmed

# generate query here (if required): https://www.ncbi.nlm.nih.gov/pubmed/advanced
# example: ((Novartis[Affiliation]) AND Biologics) AND Lucentis  ADC OR (Antibody-drug conjugate)
querystring="covid 19"
search_results=search_pubmed(querystring)
count = int(search_results["Count"])
print("Found %i results" % count)

Found 96314 results


In [None]:
# fetch all publication details and write them to txt

batch_size = 250 #number of publications to fetch at a time
t0=time.time()

filepath="pubmed_data_query_"+querystring.replace(" ","_")+".txt" #save txt here
out_handle = open(filepath, "w")
for start in range(0, count, batch_size): #loop over all publications found
    end = min(count, start + batch_size)
    print("Going to download record %i to %i of %i" % (start + 1, end, count))
    print("time elapsed: %.0d s" % (time.time()-t0))
    fetch_handle = Entrez.efetch(
        db="pubmed",
        rettype="medline",
        retmode="text",
        retstart=start,
        retmax=batch_size,
        webenv=search_results["WebEnv"],
        query_key=search_results["QueryKey"],
    )
    data = fetch_handle.read()
    fetch_handle.close()
    out_handle.write(data)
out_handle.close()
print("done time elapsed: %.0d s" % (time.time()-t0))

# Parse txt file to retrieve title, keywords, abstract, author affiliations, ...

great parsing intro: https://www.vipinajayakumar.com/parsing-text-with-python/

In [None]:
if 'filepath' in locals():
    del filepath
filepath="pubmed_data_query_antibody_drug_conjugate.txt"

In [None]:
# read txt and inspect it
# with open(filepath) as file:
#     file_contents = file.read()
#     print(file_contents)
# file.close()

In [None]:
# define the regexp we are looking for
rx_dict = {
    'pub_PMID': re.compile(r'PMID- (?P<pub_PMID>\d+)\n'),
    'pub_title': re.compile(r'TI  - (?P<pub_title>.*)\n'),
    'pub_keywords': re.compile(r'OT  - (?P<pub_keywords>.*)\n'),
    'pub_abstract': re.compile(r'AB  - (?P<pub_abstract>.*)\n'),
    'pub_affiliation': re.compile(r'AD  - (?P<pub_affiliation>.*)\n'),
    'line_break': re.compile(r'      (?P<line_break>.*)\n'),
}

In [None]:
def _parse_line(line):
    """
    Do a regex search against all defined regexes and
    return the key and match result of the first matching regex
    
    https://www.vipinajayakumar.com/parsing-text-with-python/

    """

    for key, rx in rx_dict.items():
        match = rx.search(line)
        if match:
            return key, match
            
    # if there are no matches
    return None, None



def parse_file(filepath):
    """
    Parse text at given filepath

    Parameters
    ----------
    filepath : str
        Filepath for file_object to be parsed

    Returns
    -------
    data : pd.DataFrame
        Parsed data

    """
    cnt=0  
    data=[] # create an empty list to collect the data
    pub_keywords=[]
    last_key=[]
    
    # open the file and read through it line by line
    with open(filepath, 'r') as file_object:
        line = file_object.readline()
        while line:
            # at each line check for a match with a regex
            key, match = _parse_line(line)

            if key == 'pub_PMID': #first line of a new publication is always PMID
                if cnt>0:
                    data.append({'PMID': pub_PMID,\
                                 'title':pub_title,\
                                 'keywords':pub_keywords,\
                                 'abstract':pub_abstract,\
                                 'affiliation': list(set(pub_affiliation))\
                                })

                cnt+=1
                pub_keywords=[]
                pub_affiliation=[]
                pub_abstract=[]
                pub_title=[]
                pub_PMID = match.group('pub_PMID')
            elif key == 'pub_title':
                pub_title = match.group('pub_title')             
            elif key == 'pub_keywords':
                pub_keywords.append(match.group('pub_keywords').lower())
            elif key == 'pub_affiliation':
                pub_affiliation.append(match.group('pub_affiliation'))
            elif key == 'pub_abstract':
                pub_abstract=match.group('pub_abstract')
            elif (key == 'line_break'):
                if last_key:
                    nline=match.group('line_break')
                    if last_key == 'pub_keywords':
                        pub_keywords[-1]=pub_keywords[-1]+' '+nline.lower()
                    if last_key == 'pub_affiliation':
                        pub_affiliation[-1]=pub_affiliation[-1]+' '+nline
                    elif last_key == 'pub_title':
                        pub_title = pub_title + ' '+nline
                    elif last_key == 'pub_abstract':
                        pub_abstract = pub_abstract + ' '+nline
              
            if not(key == 'line_break'):
                last_key=key
            line = file_object.readline()

        # create a pandas DataFrame from the list of dicts
        
        # set the School, Grade, and Student number as the index
#         data.set_index(['TI'], inplace=True)
        # consolidate df to remove nans
#         data = data.groupby(level=data.index.names).first()
        # upgrade Score from float to integer
#         data = data.apply(pd.to_numeric, errors='ignore')
    return data

In [None]:
data = parse_file(filepath)

In [None]:
for k in data[0].keys():
    print("{}:\t {}".format(k,data[0][k]))

In [None]:
# uk=pd.Series([item for d in data[0:500] for item in d['keywords']]).value_counts().index
# print("%i unique keywords" % (len(uk)))
# data=data[0:500]

In [None]:
filepath[:-3]+'json'
import json
with open(filepath[:-3]+'json', 'w') as fp:
    json.dump(data, fp)

# clean data

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('punkt')

import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist, squareform

def clean_keyword_list(keywords):
    keywords=[k.strip("*") for k in keywords] #remove *
    keywords=[k.replace("-"," ") for k in keywords] #remove -
    keywords=[k for k in keywords if not(k.lower()=="none")] #remove "none"
    lemmatizer=WordNetLemmatizer()
    keywords_temp=[]
    for k in keywords: 
        keywords_temp.append( ' '.join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(k)])) #to singular
    keywords=keywords_temp
    keywords=[k if not(k=="adc") else "antibody drug conjugate" for k in keywords] #adc=antibody drug conjugate
    regex=re.compile('antibody drug conjugate (.*)')
    keywords=[k if not(re.match(regex,k)) else "antibody drug conjugate" for k in keywords] #remove "antibody drug conjugate ( adc )"
    regex=re.compile('adc(.*)antibody drug conjugate(.*)')
    keywords=[k if not(re.match(regex,k)) else "antibody drug conjugate" for k in keywords] #remove "antibody drug conjugate ( adc )"
    regex=re.compile('adc(.*)antibody drug conjugate(.*)')
    keywords=[re.sub(r"[^\w]", " ", k) for k in keywords] #remove "periods"
    keywords=[re.sub(' +', ' ', k) for k in keywords] #remove multiple spaces
    keywords=[re.sub(' +', ' ', k).strip() for k in keywords] #remove multiple spaces and the trailing space
    keywords=[k if not(k=="adc") else "antibody drug conjugate" for k in keywords] #adc=antibody drug conjugate
    keywords=list(set(keywords)) #remove duplicates
    return keywords

def group_similar_keywords(keyword_list):
    df_keywords=pd.DataFrame(data=keyword_list,columns=['keyword'])
    df_keywords['counts']=1
    df_keywords=df_keywords.groupby('keyword').count().sort_values(by='counts', ascending=False).reset_index()
    print(df_keywords.head(10))

    t0=time.time()
    print('calculating Levenshtein distances...')
    keyword_list_unique=df_keywords['keyword'].values
    dst=pdist(np.array(keyword_list_unique).reshape(-1,1), lambda x,y: 1-lev.ratio(x[0],y[0]))
    print('done, time elapsed: %i s' % (time.time()-t0))

    Z=sch.linkage(dst,'ward')
    cluster_assignments=sch.fcluster(Z,0.1, criterion='distance')
    freq=np.bincount(cluster_assignments)
    cluster_ids=np.arange(len(freq))[freq>1]

    df_keywordcluster=pd.DataFrame(data=[],columns=['grouped_keywords','master_keyword'])
    for cid in cluster_ids:
        temp_keywordlist=keyword_list_unique[[np.arange(len(keyword_list_unique))[cluster_assignments==cid]]]
        temp_counts=[df_keywords.loc[lambda x: x.keyword==kw,'counts'].values[0] for kw in temp_keywordlist]
        temp_winner_keyword=temp_keywordlist[np.array(temp_counts).argmin()]
        df_keywordcluster=df_keywordcluster.append(pd.DataFrame([[temp_keywordlist,\
                                                                  temp_winner_keyword]],\
                                                                columns=df_keywordcluster.columns), ignore_index=True)

    print(df_keywordcluster.to_string())#  
    df_keywordcluster=pd.DataFrame(df_keywordcluster['grouped_keywords'].to_list(), index=df_keywordcluster.master_keyword).stack().reset_index(name='old_keyword')
    df_keywordcluster=df_keywordcluster.drop(columns=['level_1'])
    
    return df_keywordcluster

def replace_similar_keywords(keywords, df_keyword_simplification):
    for cnt,k in enumerate(keywords):
        if df_keyword_simplification['old_keyword'].isin([k]).sum():
             keywords[cnt]=df_keyword_simplification['master_keyword'][df_keyword_simplification['old_keyword'].isin([k])].values[0]
#             print(df_keyword_simplification['master_keyword'][df_keyword_simplification['old_keyword'].isin([k])][0],df_keyword_simplification['old_keyword'][df_keyword_simplification['old_keyword'].isin([k])][0])
#             print(df_keyword_simplification['master_keyword'][df_keyword_simplification['old_keyword'].isin([k])].values[0],df_keyword_simplification['old_keyword'][df_keyword_simplification['old_keyword'].isin([k])].values[0],'\n')
#         keywords[cnt]=k
    return keywords



In [None]:
print('initially %d keywords' % len(np.unique([kw for d in data for kw in d['keywords']])))

for cnt,d in enumerate(data):
    for key in d:
        if key == "keywords":
            data[cnt][key]=clean_keyword_list(d[key])
            
print('after cleaning %d keywords' % len(np.unique([kw for d in data for kw in d['keywords']])))

In [None]:
keyword_list=[kw for d in data for kw in d['keywords']]
df_keyword_simplification=group_similar_keywords(keyword_list)

print('initially %d keywords' % len(np.unique([kw for d in data for kw in d['keywords']])))

for cnt,d in enumerate(data):
    for key in d:
        if key == "keywords":
            data[cnt][key]=replace_similar_keywords(d[key],df_keyword_simplification)
            
print('after keyword similarity check: %d keywords' % len(np.unique([kw for d in data for kw in d['keywords']])))

In [None]:
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist, squareform
# dendrogram = sch.dendrogram(sch.linkage(points, method='ward'))

In [None]:
for cnt in range(len(lev_ratio)-1):
     if lev.ratio(keyword_list[cnt],keyword_list[cnt+1])>0.9:
            print([cnt, keyword_list[cnt], keyword_list[cnt+1], lev.ratio(keyword_list[cnt],keyword_list[cnt+1]), lev.distance(keyword_list[cnt],keyword_list[cnt+1])])

In [None]:
pd.DataFrame(data=keyword_list, columns=['kw'])['kw'].value_counts()

In [None]:
df_keywords['keyword'].values

In [None]:
# sch.ward(lev_ratio)
t0=time.time()
d=pdist(np.array(keyword_list).reshape(-1,1), lambda x,y: 1-lev.ratio(x[0],y[0]))
print(time.time()-t0)

In [None]:
print(squareform(d))

In [None]:
print(keyword_list[56],keyword_list[89])

In [None]:
Z=sch.linkage(d,'ward')
# np.array(keyword_list)[sch.fcluster(Z,0.3, criterion='distance')]
a=sch.fcluster(Z,0.1, criterion='distance')

In [None]:
freq=np.bincount(a)
cluster_ids=np.arange(len(freq))[freq>1]
for cid in cluster_ids:
    print(np.array(keyword_list)[[np.arange(len(keyword_list))[a==cid]]])

In [None]:
np.array(keyword_list)[[1619,1988]]

In [None]:
import sklearn.cluster
import distance

words = "YOUR WORDS HERE".split(" ") #Replace this line
words = np.asarray(words) #So that indexing with a list will work
lev_similarity = -1*np.array([[distance.levenshtein(w1,w2) for w1 in words] for w2 in words])

affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed", damping=0.5)
affprop.fit(lev_similarity)
for cluster_id in np.unique(affprop.labels_):
    exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
    cluster = np.unique(words[np.nonzero(affprop.labels_==cluster_id)])
    cluster_str = ", ".join(cluster)
    print(" - *%s:* %s" % (exemplar, cluster_str))

In [None]:
import matplotlib.pyplot as plt
plt.figure()
dn=sch.dendrogram(Z)
plt.show()

In [None]:
lev.ratio('adc','antibody drug conjugate')
# words=np.asarray(a)
# for cluster_id in np.unique(affprop.labels_):
#     exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
#     cluster = np.unique(words[np.nonzero(affprop.labels_==cluster_id)])
#     cluster_str = ", ".join(cluster)
#     print(" - *%s:* %s\n" % (exemplar, cluster_str))
regex=re.compile('(.*) antibody drug conjugate')
regex=re.compile('adc(.*)antibody drug conjugate(.*)')
for item in keyword_list:
    if (re.match(regex,item)):
        print(item)

In [None]:
keyword_list

In [None]:
# for i in range(len(lev_ratio[0])):
for cnt1 in range(len(keyword_list)):
    for cnt2 in range(len(keyword_list)):
        if (1-lev_ratio[cnt1][cnt2]<0.1) and (1-lev_ratio[cnt1][cnt2]>0):
            print([keyword_list[cnt1],keyword_list[cnt2],1-lev_ratio[cnt1][cnt2]])
        if cnt2>20000:
            break
    if cnt1>500:
        break
        
    

# build list of keywords that appear together

In [None]:
link_list_filepath=filepath[0:-4]+"_keywordlinks.csv"
link_list_counted_filepath=filepath[0:-4]+"_keywordlinks_counted.csv"
count=len(data)
link_list=[]
cnt=0
for cnt, pubdata in enumerate(data): #loop through all publications and establish pairwise links of keywords
    keywords=pubdata['keywords']
    if not[]:
        keywords=clean_keyword_list(keywords)
        for i in np.arange(0,len(keywords)-1):
            for k in np.arange(i+1,len(keywords)):
                keywordpair_temp=[keywords[i].lower(),keywords[k].lower()]  #alphabetical order to ensure that pairs A/B\
                                                                            #do not show as B/A
                                                                            #lower case to avoid issues with inconsistent
                                                                            #capitalization
                keywordpair_temp.sort()
                keywordpair_temp.append(pubdata['PMID']) #append PMID to avoid double-countings at a later stage
                link_list.append(keywordpair_temp)
                
    if not(cnt%1000):
        print("pairwise linking of keywords: %i of %i done" % (cnt, count))
        
        
link_list_df=pd.DataFrame(data=link_list,columns=["keyword1","keyword2","PMID"])
link_list_df.to_csv(link_list_filepath)
link_list_df=link_list_df.drop_duplicates()
link_list_df=link_list_df.groupby(["keyword1","keyword2"]).count().reset_index() #count how often each pair appears
link_list_df=link_list_df.rename(columns={"PMID":"counts"})

print("established %i keyword links" % (len(link_list)))   
print("established %i unique keyword links" % (len(link_list_df)))
link_list_df.to_csv(link_list_counted_filepath)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
uniquekeyword_series=pd.Series([item for d in data for item in d['keywords']])
uk=uniquekeyword_series.value_counts()
maxcat=20
f=plt.figure(figsize=(16, 11))
ax=sns.barplot(x=uk.index[0:maxcat],y=uk.to_list()[0:maxcat])
for item in ax.get_xticklabels():
    item.set_rotation(90)
ax.set_title('Top20 keywords'+"\n"+filepath[0:-4])
ax.set_ylabel('counts')
f.savefig(filepath[0:-4]+"_topkeywords.png",bbox_inches='tight')

In [None]:
plt_df=link_list_df.copy()
plt_df['label']="["+plt_df['keyword1']+"] - ["+plt_df['keyword2'] + "]"
plt_df=plt_df.sort_values(by=['counts'], ascending=False)
plt_df['label']="["+plt_df['keyword1']+"] - ["+plt_df['keyword2'] + "]"
maxcat=20

f=plt.figure(figsize=(16, 11))
ax=sns.barplot(x=plt_df.iloc[0:maxcat]['label'],y=plt_df.iloc[0:maxcat]['counts'])
for item in ax.get_xticklabels():
    item.set_rotation(90)
ax.set_title('Top20 keyword pairs'+"\n"+filepath[0:-4])
ax.set_ylabel('counts')
f.savefig(filepath[0:-4]+"_topkeywordpairs.png",bbox_inches='tight')

In [None]:
data[0]['affiliation']

In [None]:
nltk.download('averaged_perceptron_tagger')
nltk.pos_tag(['German'])

In [None]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_lg
nlp=spacy.load('en_core_web_lg')

In [None]:
not(not([]))

In [None]:
for ind in range(50):
    print(ind)
    doc=data[ind]['affiliation'][0].split(',')
    doc=[item.strip() for item in doc]
    doc_temp=[]
    for cnt, d in enumerate(doc):
        if (cnt>0) & (d in ["inc.", "inc", "corp", "corp."]):
            doc_temp[cnt-1]=doc_temp[cnt-1]+" "+d
        else:
            doc_temp.append(d)
    
    doc = [nlp(item) for item in doc_temp]
    for d in doc:
        print([(X, X.text, X.label_) for X in d.ents])
    
    doc = [nlp(item) for item in data[ind]['affiliation'][0]]
    for d in doc:
        print([(X, X.text, X.label_) for X in d.ents])
        
    print(data[ind]['affiliation'][0].split(','))
    print(doc_temp)


In [None]:
for ind in range(50):
    doc=data[ind]['affiliation'][0].split(',')
    print(doc[0:2])