### Count-based vector space models (BOW, TFIGF): losing word relationship and semantics

In [18]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import re
import random

pd.options.display.max_columns = 30
from IPython.core.interactiveshell import InteractiveShell
import plotly.graph_objs as go
#import chart_studio.plotly as py
#import cufflinks
import plotly.figure_factory as ff
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot
#cufflinks.go_offline()
#cufflinks.set_config_file(world_readable=True,theme='ggplot')

### Data are the search results using PubMed with a few key words¶
https://www.ncbi.nlm.nih.gov/pubmed/

In [2]:
with open('Pubmed.txt', 'r') as f:
    #type(f.read())
    resultstring=f.read()

In [3]:
publist=re.split("PMID- ", resultstring)

In [4]:
len(publist)

9015

### Extract PMID, abstract, title for each publication

In [5]:
PMID=[]
abstract=[]
title=[]
for i in range(len(publist)):
    if len(publist[i])> 50:# only select real pub record
        #print(publist[i])
        PMID.append(re.search(r'^(\d*)\n',publist[i]).group(1))
        abstractRaw=publist[i].replace("\n", " ")
        abstractRaw=abstractRaw.replace("This article is protected by copyright.", " ")
        abstractRaw=abstractRaw.replace("All rights reserved.", " ")
        abstractRaw=re.sub(r"CI\s*-\s*", " ",abstractRaw)
        ab=re.search(r'AB\s*-(.*?)FAU\s*-', abstractRaw)
        ti=re.search(r'TI\s*-(.*?)\.', abstractRaw)
        if ab!=None:
            abstract.append(ab.group(1))
        else:
            abstract.append('')
        if ti!=None:
            title.append(ti.group(1))
        else:
            title.append('')
       


### Combine title and abstract

In [6]:
data = {'PMID':PMID,'title':title, 'abstract':abstract} 
df = pd.DataFrame(data) 
df.abstract=df.abstract+df.title
df.head()

Unnamed: 0,PMID,title,abstract
0,31161676,Zn-incorporation with graphene oxide on Ti su...,The poor osseointegration and postoperative b...
1,31159286,Impact of Different Titanium Implant Thread D...,Threads of dental implants with healing chamb...
2,31154539,Systemic and local effects of radiotherapy: a...,OBJECTIVES: Evaluate the modulating effect of...
3,31151775,In vitro proinflammatory gene expression chan...,BACKGROUND: The aim of this in vitro study wa...
4,31151256,Sandblasted and Acid Etched Titanium Dental I...,The field of dental implantology has made pro...


### Text preprocessing and cleaning

In [7]:
#REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,\;\\\\]')#\\\\ is used to escape backslash
nltk.download('stopwords')
letteronly = re.compile('[^a-z #+_]') # only keep letters and a few symbol
STOPWORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wzhang\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [8]:
def clean_text(text):
    """
        text: a string  
        return: cleaned string
    """
    text = text.lower() # lowercase text
    text = letteronly.sub(' ', text) 
    ## stop words and length >1 removed at CountVectorize step
    #text = ' '.join(word for word in text.split() if word not in STOPWORDS and len(word)>1) # remove stopwors from text
    return text
    
df['abstractClean'] = df['abstract'].apply(clean_text)

### Save the processed data

In [43]:
from sklearn.externals import joblib
joblib.dump(df, 'Cleaneddf.pkl')


['Cleaneddf.pkl']

In [9]:
df['abstractClean'].values[:2]

array([' the poor osseointegration and postoperative bacterial infection are prominently       responsible for the failure of titanium  ti  based implant in clinic  to address        above issues  methacryloyl modified graphene oxide  goma  as zinc ions  zn  +           reservoir and release platform was fabricated on the ti substrates with cathode       electrophoresis deposition  epd   afterwards  phenylboronic acid  pba        functionalization methacryloyl gelatin  gelma pba  was reacting with goma through       in situ free radical polymerization to prepare go zn gelma pba coating  the       obtained coating was confirmed by scanning electron microscopy  sem   x ray       photoelectron spectroscopy  xps  and zn ions release property  respectively  in       vitro cellular experiments including cell activity  alkaline phosphatase  alp         collagen secretion  extracellular matrix  ecm  mineralization  osteogenic genes       and proteins  revealed that go zn gelma pba coating was 

### Visualize Token (vocabulary) Frequency Distribution Before Removing Stop Words

In [10]:
vec = CountVectorizer().fit(df['abstractClean'])
bag_of_words = vec.transform(df['abstractClean'])
bag_of_words.shape

(9014, 23825)

In [14]:
type(sum_words)

numpy.matrix

In [15]:
sum_words

matrix([[ 6, 24,  9, ...,  1,  2,  3]], dtype=int64)

In [13]:
sum_words = bag_of_words.sum(axis=0) 
sum_words.shape

(1, 23825)

In [16]:
vec.vocabulary_.items()



In [19]:
words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
words_freq[:5]

[('the', 123450),
 ('poor', 350),
 ('osseointegration', 9099),
 ('and', 73723),
 ('postoperative', 513)]

In [20]:
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_words(df['abstractClean'], 20)
df1 = pd.DataFrame(common_words, columns = ['abstractClean' , 'count'])
df1.head()

Unnamed: 0,abstractClean,count
0,the,123450
1,of,84245
2,and,73723
3,in,49281
4,to,38024


In [23]:
df1['count'].sort_values()[:5]

19     9198
18     9736
17     9766
16    10265
15    10321
Name: count, dtype: int64

In [22]:
df1.groupby('abstractClean').sum()['count'].sort_values()[:5]

abstractClean
is        9198
after     9736
study     9766
this     10265
that     10321
Name: count, dtype: int64

In [21]:
df1.groupby('abstractClean').sum()['count'].sort_values().iplot(kind='barh', yTitle='Count', linecolor='black', title='Top 20 words before removing stop words')


AttributeError: 'Series' object has no attribute 'iplot'

### Visualize Token (vocabulary) Frequency Distribution After Removing Stop Words

In [16]:
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer(stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_words(df['abstractClean'], 20)
df2 = pd.DataFrame(common_words, columns = ['abstractClean' , 'count'])
df2.groupby('abstractClean').sum()['count'].sort_values().iplot(kind='barh', yTitle='Count', linecolor='black', title='Top 20 words after removing stop words')

### Bigrams Frequency Distribution Before Removing Stop Word

In [17]:
def get_top_n_bigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_bigram(df['abstractClean'], 20)
df3 = pd.DataFrame(common_words, columns = ['abstractClean' , 'count'])
df3.groupby('abstractClean').sum()['count'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count', linecolor='black', title='Top 20 bigrams before removing stop words')

### Bigrams Frequency Distribution After Removing Stop Word

In [18]:
def get_top_n_bigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_bigram(df['abstractClean'], 20)
df4 = pd.DataFrame(common_words, columns = ['abstractClean' , 'count'])
df4.groupby('abstractClean').sum()['count'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count', linecolor='black', title='Top 20 bigrams After removing stop words')

### Trigrams Frequency Distribution Before Removing Stop Word

In [19]:
def get_top_n_trigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(3, 3)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_trigram(df['abstractClean'], 20)
df5 = pd.DataFrame(common_words, columns = ['abstractClean' , 'count'])
df5.groupby('abstractClean').sum()['count'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count', linecolor='black', title='Top 20 trigrams before removing stop words')

In [20]:
def get_top_n_trigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_trigram(df['abstractClean'], 20)
df6 = pd.DataFrame(common_words, columns = ['abstractClean' , 'count'])
df6.groupby('abstractClean').sum()['count'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count', linecolor='black', title='Top 20 trigrams after removing stop words')

### Abstract+title Length Distribution

In [21]:
df['word_count'] = df['abstract'].apply(lambda x: len(str(x).split()))
desc_lengths = list(df['word_count'])

print("Number of publications:",len(desc_lengths),
      "\nAverage word count", np.average(desc_lengths),
      "\nMinimum word count", min(desc_lengths),
      "\nMaximum word count", max(desc_lengths))

Number of publications: 9014 
Average word count 226.72842245396052 
Minimum word count 1 
Maximum word count 934


In [22]:
df['word_count'].iplot(
    kind='hist',
    bins = 50,
    linecolor='black',
    xTitle='word count',
    yTitle='count',
    title='Word Count Distribution')

In [23]:
df.set_index('PMID', inplace = True)

### Use linear_kernel to compare Tfidf vectors, equal to dot product

In [24]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(df['abstractClean'])
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [25]:
cosine_similarities[0]

array([1.00000000e+00, 1.92501046e-03, 4.08680011e-03, ...,
       7.69268338e-04, 2.76236677e-03, 4.45844797e-03])

In [26]:
indices = pd.Series(df.index)
indices[:5]

0    31161676
1    31159286
2    31154539
3    31151775
4    31151256
Name: PMID, dtype: object

In [27]:
def recommendations(PMID, cosine_similarities):
    
    recommended_pub = []
    notrecommended_pub = []
    # getting the index of the publication that matches the PMID
    idx = indices[indices == PMID].index[0]
    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending = False)
    # getting the indexes of the 10 most similar publications except itself
    top_10_indexes = list(score_series.iloc[1:11].index)
    bottom_10_indexes = list(score_series.iloc[:-10:-1].index)
    
    # populating the list with the PMID  of the top 10 matching publications
    for i in top_10_indexes:
        recommended_pub.append(list(df.index)[i]) 
    for i in bottom_10_indexes:
        notrecommended_pub.append(list(df.index)[i]) 
    return recommended_pub,notrecommended_pub

### Select top 10 and bottom 10 similar publications

In [28]:
recommendations('25619479',cosine_similarities) 

(['21682427',
  '26936559',
  '25262877',
  '30075648',
  '22916620',
  '20123866',
  '30678295',
  '29724028',
  '22916621',
  '24905878'],
 ['11813663',
  '29851432',
  '25390863',
  '7902448',
  '25764595',
  '9118279',
  '9206397',
  '9470600',
  '24678866'])

### Use cosine_similarities to compare tfidf vectors, same as lineal kernal (also called dot-product) since data is normalized
Cosine similarity only cares about angle difference, while dot product cares about angle and magnitude. If you normalize your data to have the same magnitude, the two are indistinguishable.

In [29]:
type(tfidf_matrix)

scipy.sparse.csr.csr_matrix

In [30]:
from sklearn.metrics.pairwise import cosine_similarity  

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(df['abstractClean'])
tfidf_matrix

<9014x1457328 sparse matrix of type '<class 'numpy.float64'>'
	with 3022034 stored elements in Compressed Sparse Row format>

In [31]:
cosine_similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_similarity
recommendations('25619479',cosine_similarity)

array([[1.00000000e+00, 1.92501046e-03, 4.08680011e-03, ...,
        7.69268338e-04, 2.76236677e-03, 4.45844797e-03],
       [1.92501046e-03, 1.00000000e+00, 2.18761935e-02, ...,
        9.73180593e-03, 3.52168900e-03, 1.26797034e-02],
       [4.08680011e-03, 2.18761935e-02, 1.00000000e+00, ...,
        2.92793931e-03, 8.60953121e-03, 9.74783959e-03],
       ...,
       [7.69268338e-04, 9.73180593e-03, 2.92793931e-03, ...,
        1.00000000e+00, 1.28736833e-03, 3.27100266e-03],
       [2.76236677e-03, 3.52168900e-03, 8.60953121e-03, ...,
        1.28736833e-03, 1.00000000e+00, 1.48272003e-02],
       [4.45844797e-03, 1.26797034e-02, 9.74783959e-03, ...,
        3.27100266e-03, 1.48272003e-02, 1.00000000e+00]])

(['21682427',
  '26936559',
  '25262877',
  '30075648',
  '22916620',
  '20123866',
  '30678295',
  '29724028',
  '22916621',
  '24905878'],
 ['11813663',
  '29851432',
  '25390863',
  '7902448',
  '25764595',
  '9118279',
  '9206397',
  '9470600',
  '24678866'])

### Review results- pick top one match 

In [32]:
df.loc['25619479']['abstract'] # original

' Dental implant stability, which is an important parameter for the surgical       outcome, can now be assessed using quantitative ultrasound. However, the       acoustical propagation in dental implants remains poorly understood. The       objective of this numerical study was to understand the propagation phenomena of        ultrasonic waves in cylindrically shaped prototype dental implants and to       investigate the sensitivity of the ultrasonic response to the surrounding bone       quantity and quality. The 10-MHz ultrasonic response of the implant was       calculated using an axisymetric 3D finite element model, which was validated by       comparison with results obtained experimentally and using a 2D finite difference        numerical model. The results show that the implant ultrasonic response changes       significantly when a liquid layer is located at the implant interface compared to       the case of an interface fully bounded with bone tissue. A dedicated model based 

In [33]:
# Very good match
df.loc['21682427']['abstract'] # top one match

' Osseointegration of dental implants remains poorly understood. The objective of       this numerical study is to understand the propagation phenomena of ultrasonic       waves in prototypes cylindrically shaped implants and to investigate the       sensitivity of their ultrasonic response to the surrounding bone biomechanical       properties. The 10 MHz ultrasonic response of the implant was calculated using a        finite difference numerical simulation tool and was compared to rf signals taken        from a recent experimental study by Mathieu et al. [Ultrasound Med. Biol. 37,       262-270 (2011a)]. Reflection and mode conversion phenomena were analyzed to       understand the origin of the different echoes and the importance of lateral wave        propagation was evidenced. The sensitivity of the ultrasonic response of the       implant to changes of (i) amount of bone in contact with the implant, (ii)       cortical bone thickness, and (iii) surrounding bone material propertie

In [34]:
# bottom one match 
df.loc['11813663']['abstract'] # not very good match. this PMID doesn't have abstract. the following string comes from title

' Teeth in a day'

Room to improve: corpus not big, need fine tune parameters

To do: to understand if there is any advantage of using vector embedding models compared with TfidfVector/cosine similarity. 