# TF-IDF

##### Term Frequency (TF): Number of times a word appears in a document divided by the total number of words in that document (computed separately for each document) - lower = the word is rarer within that document
##### Inverse Document Frequency (IDF): Log of the number of documents divided by the number of documents that contain the respective word (measures how rare the word is across ALL documents) - lower = the word is more common across documents, hence less informative
##### TF-IDF: TF × IDF - higher = the word is frequent in this document but rare across the other documents

In [1]:
#import packages
import pandas as pd
import numpy as np
import math
from ast import literal_eval

In [2]:
# Load the corpus and the tokenized word lists from the cleaned-data folder.
# The "tokenized" column is stored as text, so literal_eval parses each cell
# back into a Python object while the CSV is read.
# The "Unnamed: 0" column is dropped from both frames right after loading
# (presumably a saved index column - TODO confirm against the export step).
corpus = pd.read_csv("../data/cleaned data/corpus.csv").drop(columns="Unnamed: 0")
token = pd.read_csv(
    "../data/cleaned data/tokenized.csv",
    converters={"tokenized": literal_eval},
).drop(columns="Unnamed: 0")

# Deduplicate the tokens of every row. Each entry is a *list* of unique words
# (built through a set, so the word order inside a row is not guaranteed).
tokenized = [list(set(words)) for words in token["tokenized"]]

In [3]:
#define term frequency (tf) function
def tf(corpus, token_set):
    """Compute the term frequency of every word of a single document.

    Parameters
    ----------
    corpus : dict
        Maps each word to its count for this one document.
    token_set : sized collection
        Tokens of the document. Only its length is used, as the denominator.

    Returns
    -------
    dict
        Maps each word of ``corpus`` to ``count / len(token_set)``.
        If ``token_set`` is empty, every word gets 0.0 instead of the
        division raising ``ZeroDivisionError``.
    """
    n = len(token_set)

    # An empty document has no meaningful frequencies; report zeros rather
    # than dividing by zero.
    if n == 0:
        return {word: 0.0 for word in corpus}

    return {word: count / float(n) for word, count in corpus.items()}

#define inverse document frequency (idf) function
def idf(documents):
    """Compute the inverse document frequency of every word.

    Parameters
    ----------
    documents : list of dict
        One dict per document, mapping each word to its count in that
        document. A word counts as "present" in a document when its value
        is greater than 0.

    Returns
    -------
    dict
        Maps each word seen in any document to
        ``log(len(documents) / number_of_documents_containing_the_word)``.
        A word that is present in no document gets 0.0 (the ratio is
        undefined there, and its term frequency is zero anyway).
        An empty ``documents`` list yields an empty dict.
    """
    n = len(documents)
    if n == 0:
        return {}

    # Number of documents in which each word has a positive count.
    doc_freq = {}
    for document in documents:
        for word, val in document.items():
            # Register every word, not only those of the first document,
            # so words that first show up later do not raise KeyError.
            doc_freq.setdefault(word, 0)
            if val > 0:
                doc_freq[word] += 1

    idf_dict = {}
    for word, count in doc_freq.items():
        # Guard against division by zero for words absent from all documents.
        idf_dict[word] = math.log(n / float(count)) if count else 0.0

    return idf_dict

#define tf-idf function
def tf_idf(tf, idf):
    """Multiply each term frequency by the matching inverse document frequency.

    Parameters
    ----------
    tf : dict
        Word -> term frequency for one document.
    idf : dict
        Word -> inverse document frequency. Must contain every word of
        ``tf``; a missing word raises ``KeyError``.

    Returns
    -------
    dict
        Word -> ``tf[word] * idf[word]`` for every word of ``tf``.

    Note: inside this function the parameter names shadow the module-level
    ``tf`` and ``idf`` functions.
    """
    return {term: freq * idf[term] for term, freq in tf.items()}

#main function to execute all above
def main(corpus, tokenized):
    """Run the tf, idf and tf-idf steps over every row of ``corpus``.

    Parameters
    ----------
    corpus : pandas.DataFrame
        One row per document; each row is turned into a column -> value dict
        and handed to ``tf`` and ``idf``.
    tokenized : sequence
        ``tokenized[i]`` is passed to ``tf`` together with row ``i`` of
        ``corpus``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(tf_idf_frame, tf_frame, idf_frame)`` - the per-document tf-idf
        values, the per-document tf values, and a single-row frame
        (index 0) holding the idf values.
    """
    # One dict per row of the corpus frame.
    documents = [corpus.iloc[pos].to_dict() for pos in range(len(corpus))]

    # Term frequencies, pairing each row with the token list at the same position.
    tf_li = [tf(doc, tokenized[pos]) for pos, doc in enumerate(documents)]

    # The idf values are shared by all documents.
    idf_dict = idf(documents)

    tf_idf_li = [tf_idf(doc_tf, idf_dict) for doc_tf in tf_li]

    tf_idf_frame = pd.DataFrame(tf_idf_li)
    tf_frame = pd.DataFrame(tf_li)
    idf_frame = pd.DataFrame(idf_dict, index=[0])
    return tf_idf_frame, tf_frame, idf_frame


In [4]:
# Execute the pipeline; main returns the tf-idf, tf and idf dataframes, in that order.
tf_idf_df, tf_df, idf_df = main(corpus, tokenized)

In [5]:
# Sanity check: display the idf row(s) whose value for the word "abandon" is positive.
idf_df.loc[idf_df["abandon"].gt(0)]

Unnamed: 0,abandon,abandoned,abandoning,abandonment,abaut,abc,abd,abduct,abducted,abhorrance,...,zombified,zompire,zompires,zoning,zoom,zooming,zoso,zz,æons,èver
0,6.057954,4.186152,6.057954,6.751101,6.751101,6.057954,6.751101,6.751101,6.057954,6.751101,...,6.057954,6.751101,6.751101,6.751101,5.364807,6.751101,6.751101,6.751101,6.751101,6.751101


In [6]:
# Show the 15 largest tf-idf values for the 2nd dream (row index 1).
    # visualization methods to be considered
temp = tf_idf_df.iloc[1]
temp.sort_values(ascending=False).head(15)

figure             0.106467
blurry             0.100074
inhuman            0.077599
downcast           0.077599
lb                 0.077599
pump               0.077599
distinctiveness    0.077599
identify           0.069632
towering           0.069632
helpless           0.069632
pinned             0.064971
torso              0.064971
growl              0.064971
balance            0.061664
hint               0.057004
Name: 1, dtype: float64