In [85]:
import numpy as np
import pandas as pd
import faiss                   # make faiss available
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sp
import collections

## Topic 4: Efficient Vector Space Retrieval

Load data and vectorize using TfidfVectorizer
https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

TF-IDF weighting

In [86]:
corpus = pd.read_csv('nfcorpus/dev.docs', sep='\t', names=['ID', 'TEXT'])
corpus

Unnamed: 0,ID,TEXT
0,MED-118,alkylphenols human milk relations dietary habi...
1,MED-329,phosphate vascular toxin pubmed ncbi abstract ...
2,MED-330,dietary phosphorus acutely impairs endothelial...
3,MED-332,public health impact dietary phosphorus excess...
4,MED-334,differences total vitro digestible phosphorus ...
...,...,...
3188,MED-5367,relationship plasma carotenoids depressive sym...
3189,MED-5368,suicide mortality relation dietary intake num ...
3190,MED-5369,suicide mortality european union pubmed ncbi a...
3191,MED-5370,long chain omega num fatty acids intake fish c...


In [87]:
dataset = []

for i in corpus['TEXT']:
    a = []
    a.append(i)
    dataset.append(a)

print(len(dataset))
dataset

3193


[['alkylphenols human milk relations dietary habits central taiwan pubmed ncbi abstract aims study determine concentrations num nonylphenol np num octylphenol op num human milk samples examine related factors including mothers demographics dietary habits women consumed median amount cooking oil significantly higher op concentrations num ng/g consumed num ng/g num op concentration significantly consumption cooking oil beta num num fish oil capsules beta num num adjustment age body mass index bmi np concentration significantly consumption fish oil capsules beta num num processed fish products beta num num food pattern cooking oil processed meat products factor analysis strongly op concentration human milk num determinations aid suggesting foods consumption nursing mothers order protect infants np/op exposure num elsevier rights reserved '],
 ['phosphate vascular toxin pubmed ncbi abstract elevated phosphate levels advanced renal failure dysregulated calcium parathyroid hormone vitamin le

In [89]:
# calculate document frequency

DF = {}
for i in range(len(dataset)):
    doc = dataset[i][0] #string of doc
    a = []
    a = doc.split(" ")
    for w in a:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}
            
DF

{'alkylphenols': {0, 1262, 1263, 1270, 1271, 1277, 1280},
 'human': {0,
  9,
  11,
  12,
  16,
  17,
  33,
  37,
  46,
  48,
  53,
  54,
  55,
  57,
  58,
  59,
  65,
  78,
  80,
  90,
  91,
  93,
  94,
  99,
  102,
  103,
  112,
  122,
  123,
  139,
  144,
  147,
  149,
  154,
  155,
  160,
  161,
  186,
  187,
  190,
  191,
  218,
  223,
  228,
  229,
  230,
  234,
  239,
  241,
  249,
  255,
  256,
  259,
  261,
  265,
  276,
  312,
  316,
  324,
  340,
  342,
  345,
  346,
  347,
  348,
  351,
  355,
  376,
  377,
  386,
  387,
  398,
  407,
  408,
  410,
  459,
  462,
  463,
  469,
  475,
  480,
  481,
  483,
  487,
  506,
  509,
  526,
  532,
  534,
  536,
  539,
  548,
  550,
  551,
  552,
  553,
  555,
  558,
  573,
  577,
  583,
  585,
  587,
  592,
  595,
  596,
  598,
  599,
  602,
  614,
  616,
  618,
  623,
  626,
  627,
  640,
  642,
  643,
  645,
  646,
  647,
  649,
  650,
  651,
  653,
  654,
  663,
  665,
  667,
  670,
  675,
  680,
  684,
  687,
  688,
  690,
  698,


In [90]:
for i in DF:
    DF[i] = len(DF[i])

DF

{'alkylphenols': 7,
 'human': 636,
 'milk': 141,
 'relations': 14,
 'dietary': 925,
 'habits': 91,
 'central': 96,
 'taiwan': 13,
 'pubmed': 2446,
 'ncbi': 2430,
 'abstract': 3172,
 'aims': 82,
 'study': 1528,
 'determine': 345,
 'concentrations': 429,
 'num': 2810,
 'nonylphenol': 9,
 'np': 5,
 'octylphenol': 5,
 'op': 11,
 'samples': 320,
 'examine': 195,
 'related': 383,
 'factors': 593,
 'including': 509,
 'mothers': 23,
 'demographics': 15,
 'women': 542,
 'consumed': 275,
 'median': 120,
 'amount': 128,
 'cooking': 67,
 'oil': 104,
 'significantly': 771,
 'higher': 621,
 'ng/g': 28,
 'concentration': 246,
 'consumption': 715,
 'beta': 36,
 'fish': 212,
 'capsules': 26,
 'adjustment': 134,
 'age': 567,
 'body': 399,
 'mass': 240,
 'index': 289,
 'bmi': 108,
 'processed': 110,
 'products': 350,
 'food': 712,
 'pattern': 112,
 'meat': 278,
 'factor': 286,
 'analysis': 499,
 'strongly': 86,
 'determinations': 10,
 'aid': 19,
 'suggesting': 101,
 'foods': 347,
 'nursing': 13,
 'order'

In [77]:
del DF['']
total_vocab = [x for x in DF]
print(len(total_vocab))
print(total_vocab)

26951


In [106]:
tf_idf = {}
for i in range(len(dataset)):
    doc = dataset[i][0]
    a = []
    a = doc.split(" ")
    counter = collections.Counter(a)
    words_count = len(a)
    for token in np.unique(a):
        tf = counter[token]/words_count
        df = DF[token]
        idf = np.log(len(dataset)/(df+1))
        tf_idf[doc, token] = tf*idf

tf_idf

{('alkylphenols human milk relations dietary habits central taiwan pubmed ncbi abstract aims study determine concentrations num nonylphenol np num octylphenol op num human milk samples examine related factors including mothers demographics dietary habits women consumed median amount cooking oil significantly higher op concentrations num ng/g consumed num ng/g num op concentration significantly consumption cooking oil beta num num fish oil capsules beta num num adjustment age body mass index bmi np concentration significantly consumption fish oil capsules beta num num processed fish products beta num num food pattern cooking oil processed meat products factor analysis strongly op concentration human milk num determinations aid suggesting foods consumption nursing mothers order protect infants np/op exposure num elsevier rights reserved ',
  ''): -2.6536954252183527e-06,
 ('alkylphenols human milk relations dietary habits central taiwan pubmed ncbi abstract aims study determine concentra

In [64]:
print('dimensionality:') 
print(x.shape)
n, m = x.shape
print('\nvectors:')
print(x)

dimensionality:
(3193, 22019)

vectors:
  (0, 17333)	3.3356880518772125
  (0, 17593)	3.3324570312957658
  (0, 6427)	3.28520414644522
  (0, 7281)	3.0651422616684187
  (0, 10175)	5.11778561019353
  (0, 16367)	4.864336709383991
  (0, 14267)	4.3776814465458145
  (0, 13907)	6.429971999159699
  (0, 7919)	3.2168268490004834
  (0, 19434)	4.444056515490686
  (0, 595)	6.073297055220967
  (0, 5367)	6.671134055976587
  (0, 19253)	4.603121210120374
  (0, 912)	2.667112132047772
  (0, 7368)	3.3389295458013835
  (0, 12104)	3.4026026406625256
  (0, 14800)	4.332830880380462
  (0, 7916)	2.439666075337509
  (0, 16205)	6.405122543683322
  (0, 16179)	8.648194400823416
  (0, 2366)	4.368548962982541
  (0, 10096)	3.3820539724351377
  (0, 11988)	3.485533019993259
  (0, 2378)	3.014589982505587
  (0, 524)	2.624898072074517
  :	:
  (3192, 2020)	12.77053492020762
  (3192, 16106)	3.7411531599853767
  (3192, 5252)	23.961816048794514
  (3192, 20269)	4.8063494517336425
  (3192, 11666)	2.753671327252623
  (3192, 16498)	

In [65]:
x # ndarrays must be of numpy.float32, and not float64 for FAISS

<3193x22019 sparse matrix of type '<class 'numpy.float64'>'
	with 284752 stored elements in Compressed Sparse Row format>

In [66]:
x = x.astype('float32')

In [67]:
x.todense()


matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

Cosine ranking

use sklearn.metrics.pairwise.cosine_similarity(X, Y=None, dense_output=True)
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html

Similarity using FAISS

In [68]:
index = faiss.IndexFlatL2(m)   # build the index
print(index.is_trained)
index.add(x.todense())      # add vectors to the index
print(index.ntotal)

True
3193


# Implement a tiered index, with the configurable number of tiers

Break postings up into a hierarchy of lists: Most important --> Least important
Inverted index thus broken up into tiers of decreasing importance, based on TF

At query time use top tier unless it fails to yield K docs, if so drop to lower tiers

In [79]:
# obtain TF
inverse_idf = sp.diags(1/vectorizer.idf_,
                       offsets=0,
                       shape=(n, n),
                       format='csr',
                       dtype=np.float32).toarray()

tf = pd.DataFrame(inverse_idf*x, columns=features)
tf

Unnamed: 0,aa,aaa,aaas,aacr,aad,aap,aaph,aarp,aas,ab,...,zoophthora,zooplankton,zoxazolamine,zr,zu,zuccarini,zugesetztem,zusatzstoffe,zygote,zymography
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3189,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3190,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [83]:
# to form tiers with different ranges of tf:
#1. need to find max tf of term t_i
#2. find a range for tier j as (0, max tf(ti)/k) 

k = 2        #predefined number of tiers
maxtf = tf.max(axis = 0)
maxtf

aa              51.439777
aaa             50.255291
aaas             8.375882
aacr             7.123119
aad             46.096409
                  ...    
zuccarini        8.375882
zugesetztem      8.375882
zusatzstoffe     8.375882
zygote           8.375882
zymography       7.970417
Length: 22019, dtype: float32

In [82]:
# for each term t_i save Doc_IDs in corresponding tiers in nested dictionary

tiered = {}
for j in range(n):
    for i in range(m):
        for kk in range(k):
            a = []
            if tf[i][j] = 0:
                next i
            elif tf[i][j] < maxtf[j]/k:
                a.append(i)
            elif tf[i][j] < maxtf[j]/k:
                    tiered.update({k: []})
        
tiered

# for j in range(n):
    
#     for i in range(m):

TypeError: cannot convert dictionary update sequence element #0 to a sequence