In [1]:
#pip install biopython
from Bio import Entrez

Abstract scrapping from first 100 search results

In [2]:
def search(query):
    Entrez.email = 'your.email@example.com'
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax='100', #100 search results
                            retmode='xml',
                            term=query)
    results = Entrez.read(handle)
    return results

def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = 'your.email@example.com'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    results = Entrez.read(handle)
    return results

if __name__ == '__main__':
    results = search('neurodegenerative diseases')
    id_list = results['IdList']
    papers = fetch_details(id_list)

Converting 100 abstracts to a dataframe

In [26]:
Abs=[]
for i, paper in enumerate(papers['PubmedArticle']):
    Abs.append(paper['MedlineCitation']['Article']['ArticleTitle'])
Abs=[item.lower() for item in Abs]
import pandas as pd
data=pd.DataFrame()
data['Abstract']=Abs
data.head(10)

Unnamed: 0,Abstract
0,molecular chaperones biochemistry and role in ...
1,microrna dysregulation in neurodegenerative di...
2,potential for therapeutic use of hydrogen sulf...
3,dietary inflammatory potential and the risk of...
4,"hypertension, diabetes and neurodegenerative d..."
5,targeting purinergic signaling and cell therap...
6,regulatory roles of the mir-200 family in neur...
7,regulation of autophagy in neurodegenerative d...
8,do microglial sex differences contribute to se...
9,exosome biomarkers revolutionize preclinical d...


Excluding the punctuation marks to clean the data

In [27]:
import string

data['cleaned'] = data['Abstract'].apply(lambda x:''.join([i for i in x if i not in string.punctuation]))

10 most frequently occuring words from the corpus

In [28]:
from collections import Counter
split_it = "".join(data.cleaned).split()
Counter = Counter(split_it) 
most_occur = Counter.most_common(10) 
most_occur

[('neurodegenerative', 100),
 ('in', 72),
 ('of', 68),
 ('and', 53),
 ('diseases', 34),
 ('the', 31),
 ('a', 25),
 ('for', 20),
 ('role', 9),
 ('to', 9)]

Excluding words from most_occur list

In [29]:
most_occur1=[a for a,b in most_occur]
data['cleaned']=data['cleaned'].str.replace('|'.join(most_occur1), '')
data['cleaned']

0                  moleculr chperones biochemistry     
1             microrn dysregultion     systemtic review
2     potentil  rpeutic use  hydrogen sulfide  oxidt...
3               dietry flmmry potentil   risk     dults
4     hypertension dibetes    is re  clicl lk throug...
                            ...                        
95    emergg   genetic ltertions ffectg exosome biol...
96                            hsp90  its cochperones   
97     neuropthologicl  clicl dignostic criteri  chr...
98    chemicl bsis  rective oxygen species rectivity...
99               michondril dynmics  key executioner   
Name: cleaned, Length: 100, dtype: object

In [30]:
most_occur1

['neurodegenerative',
 'in',
 'of',
 'and',
 'diseases',
 'the',
 'a',
 'for',
 'role',
 'to']

Matrix of vectors for each abstract

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

# Create the Document Term Matrix
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer = CountVectorizer()
sparse_matrix = count_vectorizer.fit_transform(data['cleaned'])

# OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
doc_term_matrix = sparse_matrix.todense()
df = pd.DataFrame(doc_term_matrix, 
                  columns=count_vectorizer.get_feature_names(), 
                  index=data['cleaned'])
df.head(2)

Unnamed: 0_level_0,19902016,2016,2020,3like,be,behvior,berbere,between,biluids,biochemistry,...,volvement,wdow,weldg,wht,with,world,wrd,xis,ykl40,αsynucle
cleaned,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
moleculr chperones biochemistry,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
microrn dysregultion systemtic review,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Converting corpus into 6 clusters using K-means clustering using cosine similarity as the distance measurement

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['cleaned'])

#cluster documents    
true_k = 6 #number of clusters
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=6, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

Top 3 unique words from each cluster which are close to the centroid of the cluster 

In [34]:
#print top terms per cluster clusters    
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print ("Cluster %d:" % i,)
    for ind in order_centroids[i, :3]:
        print(' %s' % terms[ind])
    print()
    print()

Top terms per cluster:
Cluster 0:
 dysfunction
 olfcry
 prote


Cluster 1:
 review
 stem
 cells


Cluster 2:
 moleculr
 key
 common


Cluster 3:
 oxidtive
 rpeutic
 stress


Cluster 4:
 potentil
 trget
 sleep


Cluster 5:
 erly
 chronic
 neuropthy


