In [150]:
import numpy as np
import pandas as pd
import faiss                   # make faiss available
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sp
import collections
import math
import copy
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


## Topic 4: Efficient Vector Space Retrieval

In [128]:
corpus = pd.read_csv('nfcorpus/dev.docs', sep='\t', names=['ID', 'TEXT'])
corpus

Unnamed: 0,ID,TEXT
0,MED-118,alkylphenols human milk relations dietary habi...
1,MED-329,phosphate vascular toxin pubmed ncbi abstract ...
2,MED-330,dietary phosphorus acutely impairs endothelial...
3,MED-332,public health impact dietary phosphorus excess...
4,MED-334,differences total vitro digestible phosphorus ...
...,...,...
3188,MED-5367,relationship plasma carotenoids depressive sym...
3189,MED-5368,suicide mortality relation dietary intake num ...
3190,MED-5369,suicide mortality european union pubmed ncbi a...
3191,MED-5370,long chain omega num fatty acids intake fish c...


In [129]:
# create token list out of document
def tokenize(string):
    return string.split()

# apply term frequencies for each a single string (document)
def tf(string): 
    # create bag of words from the string
    bow = tokenize(string)
    
    tf_dict = {}
    for word in bow:
        if word in tf_dict:
            tf_dict[word] += 1
        else:
            tf_dict[word] = 1
            
    for word in tf_dict:
        tf_dict[word] = tf_dict[word] / len(bow)
    
    return tf_dict


In [130]:
# We then call our function on every doc and store all these tf dictionaries. 
tf_dict = {}
for index, row in corpus.iterrows():
    doc_dict = tf(row['TEXT'])
    tf_dict[index] = doc_dict

# test if tfDict was created correctly
#tf_dict[0]["alkylphenols"]
tf_dict
# alkylphenols for doc 0 : 0.008547008547008548

{0: {'alkylphenols': 0.008547008547008548,
  'human': 0.02564102564102564,
  'milk': 0.02564102564102564,
  'relations': 0.008547008547008548,
  'dietary': 0.017094017094017096,
  'habits': 0.017094017094017096,
  'central': 0.008547008547008548,
  'taiwan': 0.008547008547008548,
  'pubmed': 0.008547008547008548,
  'ncbi': 0.008547008547008548,
  'abstract': 0.008547008547008548,
  'aims': 0.008547008547008548,
  'study': 0.008547008547008548,
  'determine': 0.008547008547008548,
  'concentrations': 0.017094017094017096,
  'num': 0.13675213675213677,
  'nonylphenol': 0.008547008547008548,
  'np': 0.017094017094017096,
  'octylphenol': 0.008547008547008548,
  'op': 0.03418803418803419,
  'samples': 0.008547008547008548,
  'examine': 0.008547008547008548,
  'related': 0.008547008547008548,
  'factors': 0.008547008547008548,
  'including': 0.008547008547008548,
  'mothers': 0.017094017094017096,
  'demographics': 0.008547008547008548,
  'women': 0.008547008547008548,
  'consumed': 0.01709

In [131]:
# total number of documents in corpus
no_of_docs = len(corpus.index)
print(no_of_docs)

3193


In [132]:
# term - key, number of docs term occured in
def count_occurances():
    count_dict = {}
    for key in tf_dict:
        for key in tf_dict[key]:
            if key in count_dict:
                count_dict[key] += 1
            else:
                count_dict[key] = 1
    return count_dict

# test if count_occurances works
count_oc = count_occurances()
count_oc["alkylphenols"] # checked with Elina, good

# number of alkylphenols occurence in entire corpus = 7

7

In [133]:
# having total number of documents and number of occurances of each word in entire corpus we can calculate 
# idf for each term as log(total # of documents / # of documents with term in it)

# idf is calculated per each term, thus we create dictionary with term as a key and idf as a value
def idf():
    
    idf_dict = {}
    for key in count_oc:
        idf_dict[key] = math.log(no_of_docs/count_oc[key])
    return idf_dict

idf = idf()

# test if idf function works
idf["alkylphenols"]

# alkylphenols idf = 6.122806043659469

6.122806043659469

In [134]:
# cosntructing the final tf-idf dictionary; tf-idf is calculated as tf-idf(t, d) = tf(t, d) * idf(t)
# so for each key in tf dict we have to miltiply it with corresponsinf idf value

def tf_idf():
    d = copy.deepcopy(tf_dict)
    for doc, value in d.items():
        for word, value in d[doc].items():
            d[doc][word] = value * idf[word]
    return d

# test if tf_idf works
a = tf_idf()
print('Result from def:')
print(a[0]["alkylphenols"])

# excpected result for (term, doc) --> (alkylphenols, 0) =  0.008547008547008548 * 6.122806043659469 = 0.05
print('Manual result:')
idf["alkylphenols"] * tf_dict[0]["alkylphenols"]

# it works :)

Result from def:
0.05233167558683307
Manual result:


0.05233167558683307

In [135]:
a

{0: {'alkylphenols': 0.05233167558683307,
  'human': 0.04137224690704256,
  'milk': 0.07999887954709266,
  'relations': 0.04640734071025234,
  'dietary': 0.021178161627390717,
  'habits': 0.06081806301193047,
  'central': 0.029951863258520905,
  'taiwan': 0.04704074218165167,
  'pubmed': 0.002277836551005679,
  'ncbi': 0.002333928687010153,
  'abstract': 5.6398375726866955e-05,
  'aims': 0.03129911919188486,
  'study': 0.006299155752036891,
  'determine': 0.019018562185328398,
  'concentrations': 0.03431212433823531,
  'num': 0.017473699881981608,
  'nonylphenol': 0.0501836890203296,
  'np': 0.11041501333813132,
  'octylphenol': 0.05520750666906566,
  'op': 0.1938742194843218,
  'samples': 0.019661497409581282,
  'examine': 0.02389501396710287,
  'related': 0.018125480372086635,
  'factors': 0.014389075159974844,
  'including': 0.015694599796275722,
  'mothers': 0.08432858079975442,
  'demographics': 0.0458176580479707,
  'women': 0.015157693942522529,
  'consumed': 0.04191359136834549

In [139]:
# First we have to build TF-IDF matrix based on obtain dictionary. 
# Rows will correspond to docs in the corpus, while columns will represent unique words

#              word1       ...          wordn
#  doc1   tf_idf_value   ...      tf_idf_value
#  ...    tf_idf_value   ...      tf_idf_value
#  docn   tf_idf_value   ...      tf_idf_value
#

tf_idf_matrix = pd.DataFrame.from_dict(a, orient = 'index').fillna(0) # if word does not appear in doc we change NaN to
tf_idf_matrix = tf_idf_matrix.sort_index()
tf_idf_matrix.head(10)

Unnamed: 0,alkylphenols,human,milk,relations,dietary,habits,central,taiwan,pubmed,ncbi,...,tuscany,studies-depression,suicides,eurosave,self-inflicted,eurostat,upward,suicide-recording,scarcity,trim-and-fill
0,0.052332,0.041372,0.079999,0.046407,0.021178,0.060818,0.029952,0.047041,0.002278,0.002334,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001777,0.00182,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.028372,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.022663,0.0,0.0,0.0,0.001625,0.001665,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001549,0.001588,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.109472,0.0,0.007245,0.0,0.0,0.0,0.001559,0.001597,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002082,0.002133,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001945,0.001993,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.016298,0.0,0.0,0.0,0.0,0.0,0.0,0.002692,0.002758,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [206]:
# Now we have to compare docs by computing cosine similarity between each vector (row) in dataframe
# For that we need to obtain 1. vector magintude 2. dot product between two vectors

def vector_magnitude(v):
    return np.linalg.norm(v)

def dot_product(v1, v2):
    return np.dot(v1,v2)

# Creating cosine similarity table (should be 3193 x 3193)
def cosine_similarity(v1, v2):
    return dot_product(v1, v2)/ (vector_magnitude(v1) * vector_magnitude(v2))
print(tf_idf_matrix.iloc[0])
cosine_similarity(tf_idf_matrix.iloc[0],tf_idf_matrix.iloc[0])

alkylphenols         0.052332
human                0.041372
milk                 0.079999
relations            0.046407
dietary              0.021178
                       ...   
eurostat             0.000000
upward               0.000000
suicide-recording    0.000000
scarcity             0.000000
trim-and-fill        0.000000
Name: 0, Length: 26951, dtype: float64


0.9999999999999996

## Preclustering suggested in the lecture

In [174]:
#Set number of clusters at initialisation time
sqrt_n = round(math.sqrt(no_of_docs))

#we randomly select sqrt(N) documents from the corpus, which we call leaders
leaders = tf_idf_matrix.sample(sqrt_n)
leaders = leaders.sort_index()
leaders

Unnamed: 0,alkylphenols,human,milk,relations,dietary,habits,central,taiwan,pubmed,ncbi,...,tuscany,studies-depression,suicides,eurosave,self-inflicted,eurostat,upward,suicide-recording,scarcity,trim-and-fill
57,0.0,0.010084,0.0,0.0,0.0,0.0,0.0,0.0,0.001666,0.001707,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65,0.0,0.011525,0.0,0.0,0.0,0.0,0.0,0.0,0.001904,0.00195,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004676,0.004791,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
339,0.0,0.0,0.034285,0.0,0.013615,0.0,0.0,0.0,0.002929,0.003001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003135,0.003213,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
446,0.0,0.0,0.0,0.0,0.020394,0.0,0.0,0.0,0.001097,0.001124,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001731,0.001773,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
463,0.0,0.008232,0.0,0.0,0.012642,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001559,0.001597,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [202]:
# For every other document in the collection
# 1. Compute the similarities (cosine of the angle between TF-IDF vectors) with all leaders
# 2. Add the document to the cluster of the most similar leader

cluster_list = []

for i in range(sqrt_n):
    cluster_list.append([])

for i in range(no_of_docs):
    cosines = []
    for j in leaders.index:
        cosines.append(cosine_similarity(tf_idf_matrix.iloc[i],leaders.iloc[j]))
    m = max(cosines)
    index_of_max = [l for l, b in enumerate(cosines) if b == m]
    cluster_list[index_of_max[0]].append(i) #if there are two equal max values of cosine similarity use the smaller index by default
        

In [203]:
cluster_list

[[57,
  240,
  354,
  422,
  549,
  550,
  552,
  554,
  555,
  556,
  559,
  560,
  563,
  750,
  753,
  858,
  860,
  863,
  1366,
  1993,
  2351,
  2402,
  2533,
  2534,
  2788,
  2845],
 [37,
  38,
  40,
  64,
  65,
  66,
  67,
  156,
  216,
  595,
  597,
  598,
  625,
  628,
  661,
  824,
  848,
  873,
  995,
  1100,
  1205,
  1217,
  1218,
  1219,
  1260,
  1263,
  1281,
  1400,
  1421,
  1447,
  1468,
  1475,
  1476,
  1796,
  1978,
  1982,
  2055,
  2079,
  2093,
  2147,
  2328,
  2329,
  2330,
  2499,
  2503,
  2561,
  2617,
  2626,
  2670,
  2671,
  2953,
  2997,
  3054,
  3121],
 [241,
  243,
  249,
  1079,
  1084,
  1087,
  1096,
  1186,
  1391,
  1451,
  1545,
  1547,
  1551,
  1573,
  1582,
  2248,
  2260,
  2268,
  2334,
  2345,
  2421,
  2515,
  2644],
 [72,
  85,
  164,
  234,
  304,
  339,
  341,
  343,
  344,
  346,
  350,
  601,
  618,
  684,
  690,
  1013,
  1169,
  1431,
  1436,
  1450,
  1478,
  1501,
  1624,
  1625,
  1630,
  1782,
  1934,
  1981,
  2047,
  2150

In [200]:
# check of total docs (every doc should be included in exactly one cluster)
total = 0
for i in range(len(cluster_list)):
    total = total + len(cluster_list[i])

if total == no_of_docs:
    print('all docs are distributed to the clusters')

all docs are distributed to the clusters


In [209]:
#construct function, which uses query q(should be already in the vector form) as input, required similarity of the doc to be retrieved - threshold, and
#necessary number of documents to be retrieved - K (5 most similar docs in the cluster by default)

def ir_preclustering(q, threshold = 0, K = 5): 
    sim_to_leaders = [] #array of cosine similarities of q to leaders
    retrieved_docs = [] #array of the most similar docs to be returned by the function
    
    for i in leaders.index:
        sim_to_leaders.append(cosine_similarity(q,leaders.iloc[j]))
    m = max(sim_to_leaders)
    index_of_max = [l for l, b in enumerate(cosines) if b == m] #odinal number of most similar leader => use this cluster
    
    sim_to_docs = [] #array of cosine similarities of q to all docs in the chosen cluster
    for doc in cluster_list[index_of_max]:
        sim_to_docs.append(cosine_similarity(q,tf_idf_matrix.iloc[doc]))
        
    ins = np.argsort(sim_to_docs) #returns the indices that would sort an array of similarities to docs in accending order
    
    if threshold == 0: #proceed only with K
        for k in range(K):
            retrieved_docs.append(cluster_list[m][-k-1])

        df_retrieved_docs = tf_idf_matrix.iloc[retrieved_docs] #construct the dataframe of retrieved docs to be returned by the function
    
    else:
        if sim_to_docs[ins[0]] < threshold:
            print('no documents satisfy necessary level of threshold similarity')
            return None
        
        for sim in sim_to_docs:
            if sim >= threshold:
                retrieved_docs.append(cluster_list[m][sim_to_docs.index(sim)])
            if len(retrieved_docs) < K:
                print('number of documents that satisfy threshold similarity is less than required \(less than K\)')
            df_retrieved_docs = tf_idf_matrix.iloc[retrieved_docs]
        
    return df_retrieved_docs

## K-means clustering

In [145]:
#Set number of clusters at initialisation time
sqrt_n = round(math.sqrt(no_of_docs))

#Run the clustering algorithm
estimator = KMeans(n_clusters = sqrt_n)
model = estimator.fit(tf_idf_matrix)
model

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=57, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [147]:
#Generate cluster predictions and store in y_hat
y_hat = estimator.predict(tf_idf_matrix)

In [153]:
#Since we have 57 clusters, we are going to compare the query vector with 57 vectors of cluster centroids
#All of cluster centroids are stored in the attribute cluster_centers
centers = np.array(model.cluster_centers_)
centers

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 5.42101086e-20,  2.74772295e-03,  8.67361738e-19, ...,
        -3.38813179e-21, -5.08219768e-21, -3.38813179e-21],
       [ 5.42101086e-20,  2.99921934e-03,  1.30104261e-18, ...,
        -5.08219768e-21,  0.00000000e+00, -3.38813179e-21],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 2.68367567e-03,  6.25360100e-03,  9.70343357e-03, ...,
        -3.38813179e-21, -6.77626358e-21,  3.38813179e-21]])