In [1]:
import spacy
import pandas as pd
import math
from nltk.corpus import stopwords
import os
import glob
import codecs
nlp = spacy.load('en_core_web_lg')

# TF- IDF 

tf = (frequency of term in the doc/total number of terms in the doc) 

idf = ln(total number of docs/number of docs with term in it)


<b> S. Sareen and S. Sareen, “Process Text using TFIDF in Python,” Towards Data Science, 07-Aug-2018. [Online]. Available: https://towardsdatascience.com/tfidf-for-piece-of-text-in-python-43feccaa74f8. [Accessed: 08-May-2019]. <b>




In [2]:
#using spacy to perform tokenization of the text
def extract_words(text,fname=""):
    stop_words = set(stopwords.words('english'))
    words = []
    doc = nlp(text)
    for token in doc:
        words.append(token.text)
    non_stop_sentence = [w for w in words if not w in stop_words] 
    return(non_stop_sentence)

In [3]:
#function to count words in the text
def count_words(text):
    w = extract_words(text)
    return (len(w))

In [4]:
corpus_words = []
path = input("Enter Document directory ")
if path == "":
    path = "<give some relevant default path>"
    
os.chdir(path)

for filename in glob.glob("*.txt"):
    f = codecs.open(filename,'r','ISO-8859-1')
    file = f.read()
    len_w = count_words(file)
    corpus_words.append({'doc':filename,'count':len_w,'text':file})

Enter Document directory 


In [5]:
def get_word_frequency(text,docname):
    word_freq_dict={}
    words = extract_words(text)
    for word in words:
        word = word.lower()
        if word in word_freq_dict:
            word_freq_dict[word] += 1
        else:
            word_freq_dict[word] = 1
    temp ={'doc' : docname , 'freq_dict': word_freq_dict}
    return temp

In [6]:
word_count_list = []
for i in corpus_words:
    l = get_word_frequency(i['text'],i['doc'])
    word_count_list.append(l)

In [7]:
def calculate_TF_score(corpus_words,word_count_list):
    TF_scores =[]
    for i in range(0,len(word_count_list)):
        docname = word_count_list[i]['doc']
        for k in word_count_list[i]['freq_dict']:
            temp = {'doc': docname,
                   'TF_score': word_count_list[i]['freq_dict'][k]/corpus_words[i]['count'],
                   'key':k}
            TF_scores.append(temp)
        i += 1
    return TF_scores

In [8]:
def calculate_IDF_score(corpus_words,word_count_list):
    IDF_scores =[]
    for dict in word_count_list:
        docname = dict['doc']
        for k in dict['freq_dict'].keys():
            count = sum([k in temp['freq_dict'] for temp in word_count_list])
            temp = {'doc': docname,'IDF_score': math.log(len(corpus_words)/count),'key' : k}
            
            IDF_scores.append(temp)
    return IDF_scores

In [9]:
tf = calculate_TF_score(corpus_words,word_count_list)

In [10]:
idf = calculate_IDF_score(corpus_words,word_count_list)

In [11]:
tf_dataframe = pd.DataFrame(tf)
idf_dataframe = pd.DataFrame(idf)

In [12]:
tf_dataframe.head(10)

Unnamed: 0,TF_score,doc,key
0,0.001374,AbrahamLincoln.txt,family
1,9.2e-05,AbrahamLincoln.txt,inspection
2,0.000183,AbrahamLincoln.txt,she
3,0.000183,AbrahamLincoln.txt,chicago
4,9.2e-05,AbrahamLincoln.txt,obvious
5,9.2e-05,AbrahamLincoln.txt,matteson
6,0.000825,AbrahamLincoln.txt,popular
7,9.2e-05,AbrahamLincoln.txt,touched
8,0.000183,AbrahamLincoln.txt,recalled
9,9.2e-05,AbrahamLincoln.txt,mill


In [13]:
idf_dataframe.head(10)

Unnamed: 0,IDF_score,doc,key
0,0.223144,AbrahamLincoln.txt,family
1,2.302585,AbrahamLincoln.txt,inspection
2,1.609438,AbrahamLincoln.txt,she
3,0.628609,AbrahamLincoln.txt,chicago
4,2.70805,AbrahamLincoln.txt,obvious
5,3.401197,AbrahamLincoln.txt,matteson
6,0.567984,AbrahamLincoln.txt,popular
7,2.70805,AbrahamLincoln.txt,touched
8,1.455287,AbrahamLincoln.txt,recalled
9,2.302585,AbrahamLincoln.txt,mill


In [14]:
tfidf = pd.merge(tf_dataframe, idf_dataframe)

In [15]:
tfidf['TFIDF_score'] = tfidf['TF_score']*tfidf['IDF_score']

In [16]:
tfidf.head()

Unnamed: 0,TF_score,doc,key,IDF_score,TFIDF_score
0,0.001374,AbrahamLincoln.txt,family,0.223144,0.000307
1,9.2e-05,AbrahamLincoln.txt,inspection,2.302585,0.000211
2,0.000183,AbrahamLincoln.txt,she,1.609438,0.000295
3,0.000183,AbrahamLincoln.txt,chicago,0.628609,0.000115
4,9.2e-05,AbrahamLincoln.txt,obvious,2.70805,0.000248


In [17]:
tfidf.shape

(73182, 5)

In [18]:
tfidf.set_index("key", inplace=True)

In [19]:
tfidf.head()

Unnamed: 0_level_0,TF_score,doc,IDF_score,TFIDF_score
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
family,0.001374,AbrahamLincoln.txt,0.223144,0.000307
inspection,9.2e-05,AbrahamLincoln.txt,2.302585,0.000211
she,0.000183,AbrahamLincoln.txt,1.609438,0.000295
chicago,0.000183,AbrahamLincoln.txt,0.628609,0.000115
obvious,9.2e-05,AbrahamLincoln.txt,2.70805,0.000248


In [20]:
#testing for the TF-IDF
m = "at&t"
if m in tfidf.index:
    print(tfidf.loc[m])

      TF_score                  doc  IDF_score  TFIDF_score
key                                                        
at&t  0.000384         AppleInc.txt   1.609438     0.000618
at&t  0.046399             AT_T.txt   1.609438     0.074677
at&t  0.000688           Dallas.txt   1.609438     0.001107
at&t  0.000417              IBM.txt   1.609438     0.000672
at&t  0.000261  Rchardson_Texas.txt   1.609438     0.000420
at&t  0.000074            Texas.txt   1.609438     0.000119
