In [2]:
import numpy as np
import pandas as pd
import scipy.spatial.distance as dist
from sympy import *
from itertools import product

In [3]:
%%javascript
// Override the output-area scroll check so it always answers "no";
// presumably this keeps long cell outputs from collapsing into a scroll box.
IPython.OutputArea.prototype._should_scroll = () => false;

<IPython.core.display.Javascript object>

In [4]:
class TF_IDF:
    """Step-by-step TF-IDF and cosine-similarity walkthrough for a small corpus.

    Every intermediate value is printed / rendered with ``display`` so the
    computation can be followed by hand. ``display`` is not imported here; the
    class relies on the one a notebook session makes available.

    Keyword arguments:
        method: ``'Cornell'`` selects the double-log weighting
            (``cal_using_cornell``); any other value, including omitting it,
            selects the augmented-frequency weighting (``cal_using_alternate``).
        tf_matrix: optional pre-built term-frequency ``DataFrame`` (rows are
            documents, columns are terms). When omitted it is built from the
            document keywords.
        rounding_digit: passed as the ``decimals`` argument of ``np.round``
            for every intermediate and final value.
        **documents: every other keyword is a document, given as
            ``name='whitespace separated text'``.

    Attributes set by the instance:
        tf_matrix: integer term-frequency matrix.
        tf_idf_matrix: TF-IDF weights, available after ``get_tf_idf`` /
            ``calculate`` has run.
    """

    # Keyword names that configure the calculator; all other keywords are documents.
    _RESERVED_KWARGS = ('method', 'tf_matrix', 'rounding_digit')

    def __init__(self, **kwargs):
        self.method = kwargs.get('method')
        self.tf_matrix = kwargs.get('tf_matrix')
        self.rounding_digit = kwargs.get('rounding_digit')
        self.document_dict = {
            name: text
            for name, text in kwargs.items()
            if name not in self._RESERVED_KWARGS
        }
        # `not DataFrame` raises ValueError (ambiguous truth value), so a
        # supplied matrix must be detected with an identity check instead.
        if self.tf_matrix is None:
            self.prepare_tf_matrix()

    def prepare_tf_matrix(self):
        """Build ``self.tf_matrix`` by counting whitespace-separated tokens.

        Rows follow the order of the documents; columns follow the order in
        which each distinct token is first seen across the documents.
        """
        doc_ids = list(self.document_dict)
        column_of = {}      # token -> column position, in first-seen order
        tokenized = []
        for text in self.document_dict.values():
            tokens = text.split()
            tokenized.append(tokens)
            for token in tokens:
                column_of.setdefault(token, len(column_of))

        # Single counting pass instead of re-scanning the document per token.
        counts = np.zeros((len(doc_ids), len(column_of)), dtype=int)
        for row, tokens in enumerate(tokenized):
            for token in tokens:
                counts[row, column_of[token]] += 1

        self.tf_matrix = pd.DataFrame(counts, index=doc_ids, columns=list(column_of))

    def get_tf_idf(self):
        """Fill ``self.tf_idf_matrix`` using the weighting chosen by ``self.method``."""
        if self.method == 'Cornell':
            self.cal_using_cornell()
        else:
            self.cal_using_alternate()

    # ------------------------------------------------------------------
    # Numeric helpers (no display side effects)
    # ------------------------------------------------------------------
    def _document_count(self):
        """Number of documents, read from the matrix so a supplied ``tf_matrix`` works."""
        return len(self.tf_matrix.index)

    def _document_frequency(self, col):
        """Number of documents in which term ``col`` occurs at least once."""
        return self.tf_matrix[col].astype(bool).sum()

    def _max_frequency(self, idx):
        """Highest term count inside document ``idx`` (the MaxFreq(d) of the formula)."""
        return self.tf_matrix.loc[idx].max()

    def _cornell_tf(self, freq):
        """Cornell TF: 0 for an absent term, else 1 + log10(1 + log10(freq)), rounded."""
        if freq == 0:
            return 0
        return np.round(1 + np.log10(1 + np.log10(freq)), self.rounding_digit)

    def _cornell_idf(self, col):
        """Cornell IDF: log10((1 + |d|) / |d_t|), rounded."""
        ratio = (1 + self._document_count()) / self._document_frequency(col)
        return np.round(np.log10(ratio), self.rounding_digit)

    def _alternate_tf(self, freq, max_freq):
        """Augmented TF: 0.5 + 0.5 * freq / max_freq, rounded.

        A document without any term (``max_freq == 0``) gets the baseline 0.5
        instead of a 0/0 division.
        """
        if max_freq == 0:
            return np.round(0.5, self.rounding_digit)
        return np.round(0.5 + (0.5 * freq) / max_freq, self.rounding_digit)

    def _alternate_idf(self, col):
        """Alternate IDF: 1 + log10(n / k), rounded."""
        ratio = self._document_count() / self._document_frequency(col)
        return np.round(1 + np.log10(ratio), self.rounding_digit)

    def _cosine_similarity(self, c1, c2):
        """Rounded cosine similarity of two rows of ``self.tf_idf_matrix``.

        Returns 0.0 when either vector has zero length, where the cosine is
        undefined.
        """
        # The TF-IDF frame is created without a dtype (object); cast for numeric work.
        vec1 = self.tf_idf_matrix.loc[c1].astype(float)
        vec2 = self.tf_idf_matrix.loc[c2].astype(float)
        denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if denominator == 0:
            return 0.0
        return np.round(vec1 @ vec2 / denominator, self.rounding_digit)

    # ------------------------------------------------------------------
    # Narrated calculations
    # ------------------------------------------------------------------
    def cal_using_cornell(self):
        """Compute TF-IDF with the Cornell weighting, showing each step.

        Stores the result in ``self.tf_idf_matrix``.
        """
        print('\nTF(Term frequency) is computed using the equation:')
        display(Symbol('TF(d,t) = 0'))
        print('if freq(d,t) = 0\n')
        display(Symbol('TF(d,t) = 1+log(1+log(freq(d,t)))'))
        print('otherwise')
        print('\n\nIDF(Inverse document frequency) is computed using the equation:')
        display(Eq(Symbol('IDF(t)') , log(Symbol('1+|d|')/Symbol('|d_{t}|'))))
        # Adjacent literals instead of a backslash continuation, which used to
        # leak the source indentation into the printed text.
        print('where d is the document collection, and d\u209C is the set of documents containing\n'
              'term t. If |d\u209C| << |d|, the term t will have a large IDF scaling factor\n\n')

        d = self._document_count()
        self.tf_idf_matrix = pd.DataFrame(index=self.tf_matrix.index, columns=self.tf_matrix.columns)
        for idx in self.tf_matrix.index:
            print(f'\nDocument {idx}:')
            for col in self.tf_matrix.columns:
                print(f'\nWord: {col}')
                freq = self.tf_matrix.loc[idx, col]
                display(Eq(Symbol('freq(d,t)'), freq))

                tf = self._cornell_tf(freq)
                if freq == 0:
                    display(Eq(Symbol('TF(d,t)'), 0))
                else:
                    display(Symbol(f'TF(d,t) = 1+log(1+log({freq})) = {tf}'))

                d_t = self._document_frequency(col)
                idf = self._cornell_idf(col)
                display(Eq(Symbol('IDF(t)') , log(Symbol(f'1+{d}')/Symbol(f'{d_t}'))))
                display(Eq(Symbol('IDF(t)') , idf))

                tf_idf = np.round(tf*idf, self.rounding_digit)
                display(Symbol(f'TF*IDF = {tf}*{idf} = {tf_idf}'))
                self.tf_idf_matrix.loc[idx, col] = tf_idf

            print('-------------------------------------------')

    def cal_using_alternate(self):
        """Compute TF-IDF with the augmented-frequency weighting, showing each step.

        Stores the result in ``self.tf_idf_matrix``.
        """
        print('\nTF(Term frequency) is computed using the equation:')

        display(Eq(Symbol('TF(d,t)') , 0.5 +  Symbol('0.5 * f(d,t)')/Symbol('MaxFreq(d)')))

        print('\n\nIDF(Inverse document frequency) is computed using the equation:')
        display(Eq(Symbol('IDF(t)') , 1 + log(Symbol('n')/Symbol('k'))))
        print('where n is the total no of docs, and k is the no of docs with term t appearing\n\n')

        n = self._document_count()
        self.tf_idf_matrix = pd.DataFrame(index=self.tf_matrix.index, columns=self.tf_matrix.columns)
        for idx in self.tf_matrix.index:
            print(f'\nDocument {idx}:')
            # MaxFreq(d) belongs to the document, so it is the row maximum and
            # is the same for every term of this document.
            max_freq = self._max_frequency(idx)
            for col in self.tf_matrix.columns:
                print(f'\nWord: {col}')
                freq = self.tf_matrix.loc[idx, col]
                display(Eq(Symbol('f(d,t)'), freq))

                tf = self._alternate_tf(freq, max_freq)
                display(Eq(Symbol('TF(d,t)') , 0.5 +  Symbol(f'0.5 * {freq}')/Symbol(f'{max_freq}')))
                display(Eq(Symbol('TF(d,t)') , tf))

                k = self._document_frequency(col)
                idf = self._alternate_idf(col)
                display(Eq(Symbol('IDF(t)') , 1 + log(Symbol(f'{n}')/Symbol(f'{k}'))))
                display(Eq(Symbol('IDF(t)') , idf))

                tf_idf = np.round(tf*idf, self.rounding_digit)
                display(Symbol(f'TF*IDF = {tf}*{idf} = {tf_idf}'))
                self.tf_idf_matrix.loc[idx, col] = tf_idf

            print('-------------------------------------------')

    def calculate(self):
        """Run the full walkthrough: TF matrix, TF-IDF matrix, document similarity."""
        print('Term Frequency Matrix:')
        display(self.tf_matrix)
        print(f'\nInitiating TF-IDF calculation using {self.method} method')

        self.get_tf_idf()

        print('\nTF-IDF Matrix:')
        display(self.tf_idf_matrix)

        self.print_doc_similarity()

    def print_doc_similarity(self):
        """Show the cosine similarity of every document pair and report the largest.

        Requires ``self.tf_idf_matrix`` to have been computed. Each unordered
        pair is visited once, ordered so that the first label compares lower
        than the second.
        """
        print('\n Document similarity:')

        labels = self.tf_idf_matrix.index
        doc_combs = [(i, j) for i, j in product(labels, labels) if i < j]

        max_sim = 0
        max_doc = None
        for c1, c2 in doc_combs:
            sim = self._cosine_similarity(c1, c2)
            display(Eq(Symbol(f'sim({c1},{c2})'), Symbol(f'{c1}.{c2}')/Symbol(f'|{c1}||{c2}|')))
            display(Eq(Symbol(f'sim({c1},{c2})'), sim))
            if max_sim < sim:
                max_sim = sim
                max_doc = (c1, c2)

        print(f'Max similarity is {max_sim} between documents {max_doc}')
        

In [7]:
# Toy corpus of five short documents used for the walkthrough below.
doc1, doc2, doc3, doc4, doc5 = (
    'central park in USA',
    'city park in Bangalore city',
    'central in Chennai city',
    'USA consulate in Chennai',
    'Bangalore Chennai Mumbai',
)

# Any method other than 'Cornell' (e.g. 'alternate') runs the
# augmented-frequency variant instead.
TF_IDF(
    D1=doc1,
    D2=doc2,
    D3=doc3,
    D4=doc4,
    D5=doc5,
    method='Cornell',
    rounding_digit=3,
).calculate()

Term Frequency Matrix:


Unnamed: 0,central,park,in,USA,city,Bangalore,Chennai,consulate,Mumbai
D1,1,1,1,1,0,0,0,0,0
D2,0,1,1,0,2,1,0,0,0
D3,1,0,1,0,1,0,1,0,0
D4,0,0,1,1,0,0,1,1,0
D5,0,0,0,0,0,1,1,0,1



Initiating TF-TDF calculation using Cornell method

TF(Term frequency) is computed using the equation:


TF(d,t) = 0

if freq(d,t) = 0



TF(d,t) = 1+log(1+log(freq(d,t)))

otherwise


IDF(Inverse document frequency) is computed using the equation:


Eq(IDF(t), log(1+|d|/|d_{t}|))

where d is the document collection, and dₜ is the set of documents containing                 
term t. If |dₜ| << |d|, the term t will have a large IDF scaling factor



Document D1:

Word: central


Eq(freq(d,t), 1)

TF(d,t) = 1+log(1+log(1)) = 1.0

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 1.0*0.477 = 0.477


Word: park


Eq(freq(d,t), 1)

TF(d,t) = 1+log(1+log(1)) = 1.0

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 1.0*0.477 = 0.477


Word: in


Eq(freq(d,t), 1)

TF(d,t) = 1+log(1+log(1)) = 1.0

Eq(IDF(t), log(1+5/4))

Eq(IDF(t), 0.176)

TF*IDF = 1.0*0.176 = 0.176


Word: USA


Eq(freq(d,t), 1)

TF(d,t) = 1+log(1+log(1)) = 1.0

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 1.0*0.477 = 0.477


Word: city


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 0*0.477 = 0.0


Word: Bangalore


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 0*0.477 = 0.0


Word: Chennai


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/3))

Eq(IDF(t), 0.301)

TF*IDF = 0*0.301 = 0.0


Word: consulate


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/1))

Eq(IDF(t), 0.778)

TF*IDF = 0*0.778 = 0.0


Word: Mumbai


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/1))

Eq(IDF(t), 0.778)

TF*IDF = 0*0.778 = 0.0

-------------------------------------------

Document D2:

Word: central


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 0*0.477 = 0.0


Word: park


Eq(freq(d,t), 1)

TF(d,t) = 1+log(1+log(1)) = 1.0

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 1.0*0.477 = 0.477


Word: in


Eq(freq(d,t), 1)

TF(d,t) = 1+log(1+log(1)) = 1.0

Eq(IDF(t), log(1+5/4))

Eq(IDF(t), 0.176)

TF*IDF = 1.0*0.176 = 0.176


Word: USA


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 0*0.477 = 0.0


Word: city


Eq(freq(d,t), 2)

TF(d,t) = 1+log(1+log(2)) = 1.114

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 1.114*0.477 = 0.531


Word: Bangalore


Eq(freq(d,t), 1)

TF(d,t) = 1+log(1+log(1)) = 1.0

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 1.0*0.477 = 0.477


Word: Chennai


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/3))

Eq(IDF(t), 0.301)

TF*IDF = 0*0.301 = 0.0


Word: consulate


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/1))

Eq(IDF(t), 0.778)

TF*IDF = 0*0.778 = 0.0


Word: Mumbai


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/1))

Eq(IDF(t), 0.778)

TF*IDF = 0*0.778 = 0.0

-------------------------------------------

Document D3:

Word: central


Eq(freq(d,t), 1)

TF(d,t) = 1+log(1+log(1)) = 1.0

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 1.0*0.477 = 0.477


Word: park


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 0*0.477 = 0.0


Word: in


Eq(freq(d,t), 1)

TF(d,t) = 1+log(1+log(1)) = 1.0

Eq(IDF(t), log(1+5/4))

Eq(IDF(t), 0.176)

TF*IDF = 1.0*0.176 = 0.176


Word: USA


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 0*0.477 = 0.0


Word: city


Eq(freq(d,t), 1)

TF(d,t) = 1+log(1+log(1)) = 1.0

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 1.0*0.477 = 0.477


Word: Bangalore


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 0*0.477 = 0.0


Word: Chennai


Eq(freq(d,t), 1)

TF(d,t) = 1+log(1+log(1)) = 1.0

Eq(IDF(t), log(1+5/3))

Eq(IDF(t), 0.301)

TF*IDF = 1.0*0.301 = 0.301


Word: consulate


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/1))

Eq(IDF(t), 0.778)

TF*IDF = 0*0.778 = 0.0


Word: Mumbai


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/1))

Eq(IDF(t), 0.778)

TF*IDF = 0*0.778 = 0.0

-------------------------------------------

Document D4:

Word: central


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 0*0.477 = 0.0


Word: park


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 0*0.477 = 0.0


Word: in


Eq(freq(d,t), 1)

TF(d,t) = 1+log(1+log(1)) = 1.0

Eq(IDF(t), log(1+5/4))

Eq(IDF(t), 0.176)

TF*IDF = 1.0*0.176 = 0.176


Word: USA


Eq(freq(d,t), 1)

TF(d,t) = 1+log(1+log(1)) = 1.0

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 1.0*0.477 = 0.477


Word: city


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 0*0.477 = 0.0


Word: Bangalore


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 0*0.477 = 0.0


Word: Chennai


Eq(freq(d,t), 1)

TF(d,t) = 1+log(1+log(1)) = 1.0

Eq(IDF(t), log(1+5/3))

Eq(IDF(t), 0.301)

TF*IDF = 1.0*0.301 = 0.301


Word: consulate


Eq(freq(d,t), 1)

TF(d,t) = 1+log(1+log(1)) = 1.0

Eq(IDF(t), log(1+5/1))

Eq(IDF(t), 0.778)

TF*IDF = 1.0*0.778 = 0.778


Word: Mumbai


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/1))

Eq(IDF(t), 0.778)

TF*IDF = 0*0.778 = 0.0

-------------------------------------------

Document D5:

Word: central


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 0*0.477 = 0.0


Word: park


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 0*0.477 = 0.0


Word: in


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/4))

Eq(IDF(t), 0.176)

TF*IDF = 0*0.176 = 0.0


Word: USA


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 0*0.477 = 0.0


Word: city


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 0*0.477 = 0.0


Word: Bangalore


Eq(freq(d,t), 1)

TF(d,t) = 1+log(1+log(1)) = 1.0

Eq(IDF(t), log(1+5/2))

Eq(IDF(t), 0.477)

TF*IDF = 1.0*0.477 = 0.477


Word: Chennai


Eq(freq(d,t), 1)

TF(d,t) = 1+log(1+log(1)) = 1.0

Eq(IDF(t), log(1+5/3))

Eq(IDF(t), 0.301)

TF*IDF = 1.0*0.301 = 0.301


Word: consulate


Eq(freq(d,t), 0)

Eq(TF(d,t), 0)

Eq(IDF(t), log(1+5/1))

Eq(IDF(t), 0.778)

TF*IDF = 0*0.778 = 0.0


Word: Mumbai


Eq(freq(d,t), 1)

TF(d,t) = 1+log(1+log(1)) = 1.0

Eq(IDF(t), log(1+5/1))

Eq(IDF(t), 0.778)

TF*IDF = 1.0*0.778 = 0.778

-------------------------------------------

TF-IDF Matrix:


Unnamed: 0,central,park,in,USA,city,Bangalore,Chennai,consulate,Mumbai
D1,0.477,0.477,0.176,0.477,0.0,0.0,0.0,0.0,0.0
D2,0.0,0.477,0.176,0.0,0.531,0.477,0.0,0.0,0.0
D3,0.477,0.0,0.176,0.0,0.477,0.0,0.301,0.0,0.0
D4,0.0,0.0,0.176,0.477,0.0,0.0,0.301,0.778,0.0
D5,0.0,0.0,0.0,0.0,0.0,0.477,0.301,0.0,0.778



 Document similarity:


Eq(sim(D1,D2), D1.D2/|D1||D2|)

Eq(sim(D1,D2), 0.349)

Eq(sim(D1,D3), D1.D3/|D1||D3|)

Eq(sim(D1,D3), 0.403)

Eq(sim(D1,D4), D1.D4/|D1||D4|)

Eq(sim(D1,D4), 0.313)

Eq(sim(D1,D5), D1.D5/|D1||D5|)

Eq(sim(D1,D5), 0.0)

Eq(sim(D2,D3), D2.D3/|D2||D3|)

Eq(sim(D2,D3), 0.427)

Eq(sim(D2,D4), D2.D4/|D2||D4|)

Eq(sim(D2,D4), 0.036)

Eq(sim(D2,D5), D2.D5/|D2||D5|)

Eq(sim(D2,D5), 0.27)

Eq(sim(D3,D4), D3.D4/|D3||D4|)

Eq(sim(D3,D4), 0.164)

Eq(sim(D3,D5), D3.D5/|D3||D5|)

Eq(sim(D3,D5), 0.124)

Eq(sim(D4,D5), D4.D5/|D4||D5|)

Eq(sim(D4,D5), 0.097)

Max similarity is 0.427 between documents ('D2', 'D3')
