# Implementing TFIDF 

In [54]:
import numpy as np
from scipy.sparse import csr_matrix
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from math import log10
from sklearn.preprocessing import normalize
import math

In [26]:
def fit(dataset):
    """
    input is list of reviews and output is dictionary with value for each key as its index in unique_words
    """
    unique_words =set() # creating a empty set of unique_words
    if isinstance(dataset, (list,)): # if the input is a list
        for row in dataset: # for each row in dataset
            for word in row.split(): # tokenizing each word in document
                if len(word) < 2: # we will consider a word if its length is greater tham 1
                    continue
                unique_words.add(word) # inserting the word in the set
        unique_words = sorted(list(unique_words)) # sorting the list of unique words
        vocab = {j:i for i, j in enumerate(unique_words)} # creating a dictionary with value for each key its index
        return vocab # returning the vocab
    else:
        print("Enter the list of Documents")

In [27]:
def DF(dataset,word):
    """
    input is the list of document and word for which we the function will return the Document frequencny , i.e. the number of ducuments
    in which the word occurs
    """
    if isinstance(dataset,(list,)):
        count= 0 # initializing the count with 0
        for row in dataset: # for each row in the dataset
            if word in row: # if the word in there in the row
                count = count+1 # increment the count
        return count
    else:
        print("Enter the list of documents")
def transform(dataset, vocab):
    """
     dataset-> list of strings
     vocab: dictionay with having unique words as key and the index as their count
     return-> the sparse matrix
    """
    columns = []  # creating a list to store the column index
    rows = [] # creating a list to store the row inde
    values = [] # creating the list to store the TFIDF value for each word in document
    N = len(dataset) # storing the number of documents
    if isinstance(dataset, (list,)):
        for idx, row in enumerate(dataset): # for each row in list
            word_freq = dict(Counter(row.split())) # creating a dictionary of word frequency of each list
            for word, freq in word_freq.items(): # for each item in the list
                if len(word) <2 : # considering those items whose len is greater than 1
                    continue
                column_index = vocab.get(word, -1) # getting the column index for each word
                if column_index!=-1: # if the word exist in the vocab
                    df = DF(dataset,word) # getting the document frequency for each word
                    tfidf = (freq/len(row.split()))*(1+np.log(1+N/1+df)) # calculating the TFIDF value for each word
                    columns.append(column_index) # storing index for the word
                    rows.append(idx) # storing the row number of the word
                    values.append(tfidf) # appending the tfidf value of the word
    return normalize(csr_matrix((values,(rows, columns)), shape=(len(dataset),len(vocab)))) # returing the nomalized sparse matrix
    
            


In [28]:
strings = ["the method of lagrange multipliers is the economists workhorse for solving optimization problems",
           "the technique is a centerpiece of economic theory but unfortunately its usually taught poorly"]
vocab = fit(strings)
print(list(vocab.keys()))
print(transform(strings, vocab).toarray())

['but', 'centerpiece', 'economic', 'economists', 'for', 'is', 'its', 'lagrange', 'method', 'multipliers', 'of', 'optimization', 'poorly', 'problems', 'solving', 'taught', 'technique', 'the', 'theory', 'unfortunately', 'usually', 'workhorse']
[[0.         0.         0.         0.24715587 0.27026754 0.27026754
  0.         0.24715587 0.24715587 0.24715587 0.27026754 0.24715587
  0.         0.24715587 0.24715587 0.         0.         0.54053508
  0.         0.         0.         0.24715587]
 [0.27128981 0.27128981 0.27128981 0.         0.         0.29665826
  0.27128981 0.         0.         0.         0.29665826 0.
  0.27128981 0.         0.         0.27128981 0.27128981 0.29665826
  0.27128981 0.27128981 0.27128981 0.        ]]


## Comparing results with TFIDF vectorizer

In [29]:
tf = TfidfVectorizer()
new_string = tf.fit_transform(strings)
print(tf.get_feature_names())
print(new_string.toarray())

['but', 'centerpiece', 'economic', 'economists', 'for', 'is', 'its', 'lagrange', 'method', 'multipliers', 'of', 'optimization', 'poorly', 'problems', 'solving', 'taught', 'technique', 'the', 'theory', 'unfortunately', 'usually', 'workhorse']
[[0.         0.         0.         0.28822557 0.28822557 0.20507486
  0.         0.28822557 0.28822557 0.28822557 0.20507486 0.28822557
  0.         0.28822557 0.28822557 0.         0.         0.41014973
  0.         0.         0.         0.28822557]
 [0.29464404 0.29464404 0.29464404 0.         0.         0.20964166
  0.29464404 0.         0.         0.         0.20964166 0.
  0.29464404 0.         0.         0.29464404 0.29464404 0.20964166
  0.29464404 0.29464404 0.29464404 0.        ]]


# Consider only n words with top IDF scores

In [52]:
def idf_new(dataset, word):
    """
    input-> dataset, word
    dataset->list of sentences
    word-> word for which i have to find the IDF value
    """
    count = 0 # initialzing the coount with zero
    for row in dataset: # for each row in the list
        if word in row: # if the word is present in the list
            count += 1 # increament the count
    val =1+ np.log((1+len(dataset)/(count+1))) #calculating the IDF value
    return val # returng the IDF value




def df_new(dataset, word):
    count = 0
    for row in dataset:
        if word in dataset:
            count+=1
    return count


def fit_new(dataset, n):
    """
    input-> dataset ,n
    dataset= list of strings
    n- number of top words based on IDF values
    return the vocab having top n words based on IDF values
    """
    unique_words = set() # creating a set of unique words
    tdf_values  = [] # creatng a list to store the values ofIDF for each word
    for row in dataset: # for each row in the dataset
        for word in row.split(): # for each word in the row
            if len(word)<2: # we will consider only those words for which the len is geater than 1
                continue
            unique_words.add(word) # inserting the word in the set
    unique_words = list(unique_words)
    
    #sorting the list of unique_words based on IDF values
    unique_words= sorted(unique_words, key = lambda x: idf_new(dataset, x), reverse=True)[:n]
    vocab = {j:i for i,j in enumerate(unique_words)}
    return vocab
    
    
def transform_new(dataset, vocab):
    """
     dataset-> list of strings
     vocab: dictionay with having unique words as key and the index as their count
     return-> the sparse matrix
    """
    columns = []  # creating a list to store the column index
    rows = [] # creating a list to store the row inde
    values = [] # creating the list to store the TFIDF value for each word in document
    N = len(dataset) # storing the number of documents
    if isinstance(dataset, (list,)):
        for idx, row in enumerate(dataset): # for each row in list
            word_freq = dict(Counter(row.split())) # creating a dictionary of word frequency of each list
            for word, freq in word_freq.items(): # for each item in the list
                if len(word) <2 : # considering those items whose len is greater than 1
                    continue
                column_index = vocab.get(word, -1) # getting the column index for each word
                if column_index!=-1: # if the word exist in the vocab
                    df = df_new(dataset,word) # getting the document frequency for each word
                    tfidf = (freq/len(row.split()))*(1+np.log(1+N/1+df)) # calculating the TFIDF value for each word
                    columns.append(column_index) # storing index for the word
                    rows.append(idx) # storing the row number of the word
                    values.append(tfidf) # appending the tfidf value of the word
    return normalize(csr_matrix((values,(rows, columns)), shape=(len(dataset),len(vocab)))) # returing the nomalized sparse matrix
        

In [53]:
strings = ["the method of lagrange multipliers is the economists workhorse for solving optimization problems",
           "the technique is a centerpiece of economic theory but unfortunately its usually taught poorly"]
vocab = fit_new(strings,10)
print(list(vocab.keys()))
print(transform_new(strings, vocab).toarray())

['solving', 'method', 'economists', 'poorly', 'centerpiece', 'unfortunately', 'taught', 'workhorse', 'its', 'but']
[[0.5        0.5        0.5        0.         0.         0.
  0.         0.5        0.         0.        ]
 [0.         0.         0.         0.40824829 0.40824829 0.40824829
  0.40824829 0.         0.40824829 0.40824829]]
