In [1]:
import pandas as pd
from collections import Counter
import math

In [2]:
class Preprocessor(object):
    def __init__(self):
        self.book_content = None
        
    def __str__(self):
        return self.book_content
    
    def clean(self):
        ## Below list let us keep only alphabets and numbers and spacer
        chars_numbers_space_tab = ['a','b','c','d','e','f','g','h','i','j','k','l','m',\
                                    'n','o','p','q','r','s','t','u','v','w','x','y','z',\
                                    '0','1','2','3','4','5','6','7','8','9', ' ', '\t']
        if not self.book_content:
            return 1
        else:
            total_corpus = []
            for char in self.book_content:
                if char in ['-','_', "\n", '—']: # If character is new line or dash or underscore then we add space
                    total_corpus.append(' ')
                elif char.lower() in chars_numbers_space_tab: ## We take only alphabets, numbers and space
                     total_corpus.append(char.lower())
            print('Total Character : %d'%len(total_corpus))
            return ''.join(total_corpus) 
            
    def read(self, text_name):
        self.book_content = open(text_name, encoding='utf-8').read()

In [3]:
class WordAnalyzer(object):
    def __init__(self):
        self.word_count = None
    
    def __str__(self):
        final_str = ''
        for key, val in self.word_count.items(): # @3 create big string with new line after each key: val
            final_str = final_str + '%s : %d\n'%(key, val)
        return final_str
            
    def analyse_words(self, book_text):
        # We only keep string which has some characters. Counter returns dictionary with count of each word as value and word as key.
        self.word_count = Counter([word.strip() for word in book_text.split(' ') if word.strip()]) 
    
    def get_word_frequency(self):
        freq_dictionary = {}
        total_words = sum(list(zip(*self.word_count.items()))[1]) ## We get count of total words in document
        for key, val in self.word_count.items(): 
            freq_dictionary[key] = val / total_words # Calculating freq
        return freq_dictionary

In [4]:
class IDF(object):
    def __init__(self):
        self.data = pd.DataFrame([])
        
    def load_frequency(self, book_frequency, book_title):
        if isinstance(self.data, pd.DataFrame):
            temp = self.data.T # We take transpose of original dataframe so words becomes index
            temp2 = pd.DataFrame([list(book_frequency.values())], columns = book_frequency.keys()) # We create new data frame for new book
            temp2['index'] = [book_title]
            temp2 = temp2.set_index('index')
            print(book_title + ' : ' + str(temp2.shape))
            out = temp.join(temp2.T, how='outer') ## We join both dataframe with outer join so that all words from both are kept with values
            self.data = out.T ## We then do transpose again to put word as column again.
            print('Total DF Shape : ',self.data.shape)
        else:
            data = pd.DataFrame([list(book_frequency.values())], columns = book_frequency.keys())
            data['index'] = [book_title]
            self.data = data.set_index('index')
            print(book_title + ' : ',str(self.data.shape))
        
    def getIDF(self, term):
        D = self.data.shape[0]
        #print(D)
        N = self.data[[term]].dropna(how='any').shape[0]
        #print(N)
        idf = 1 + math.log(D / (1+N))
        return idf

In [5]:
!ls

11-0_clean.txt	   1342-0.txt	      1952-0_freq.txt	   Book_analyzer.pdf
11-0_counts.txt    1661-0_clean.txt   1952-0.txt	   IDF.csv
11-0_freq.txt	   1661-0_counts.txt  84-0_clean.txt	   pg16328_clean.txt
11-0.txt	   1661-0_freq.txt    84-0_counts.txt	   pg16328_counts.txt
1342-0_clean.txt   1661-0.txt	      84-0_freq.txt	   pg16328_freq.txt
1342-0_counts.txt  1952-0_clean.txt   84-0.txt		   pg16328.txt
1342-0_freq.txt    1952-0_counts.txt  book_analyzer.ipynb  word_analyzer.py


In [61]:
idf = IDF()
for file_name in ['11-0.txt', '84-0.txt', '1342-0.txt', '1661-0.txt','1952-0.txt', 'pg16328.txt' ]:
    processor = Preprocessor()
    processor.read(file_name)
    cleaned_content = processor.clean()
    with open(file_name.split('.')[0] + '_clean.txt','w') as f:
        f.write(cleaned_content)
    word_analyzer = WordAnalyzer()
    word_analyzer.analyse_words(cleaned_content)
    with open(file_name.split('.')[0] + '_counts.txt','w') as f:
        f.write(str(word_analyzer))
    freq_dict = word_analyzer.get_word_frequency()
    with open(file_name.split('.')[0] + '_freq.txt','w') as f:
        f.write(str(freq_dict))
    idf.load_frequency(freq_dict, file_name)

Total Character : 155645
11-0.txt : (1, 3114)
Total DF Shape :  (1, 3114)
Total Character : 429829
84-0.txt : (1, 7375)
Total DF Shape :  (2, 8389)
Total Character : 681390
1342-0.txt : (1, 6690)
Total DF Shape :  (3, 10849)
Total Character : 559526
1661-0.txt : (1, 8295)
Total DF Shape :  (4, 14016)
Total Character : 49296
1952-0.txt : (1, 1794)
Total DF Shape :  (5, 14186)
Total Character : 279556
pg16328.txt : (1, 6413)
Total DF Shape :  (6, 16863)


In [62]:
idf.data

Unnamed: 0,0txt,0zip,1,10,100,1000,1005,1009,101,102,...,yrmenlaf,yrmenlafs,zeal,zealand,zero,zest,zeugma,zigzag,zinsser,zrich
11-0.txt,3.4e-05,3.4e-05,6.7e-05,,,,,,,,...,,,,3.4e-05,,,,3.4e-05,,
84-0.txt,1.3e-05,1.3e-05,7.7e-05,2.6e-05,,,,,,,...,,,5.1e-05,,,,,,,
1342-0.txt,8e-06,8e-06,2.4e-05,1.6e-05,,,,,,,...,,,,,,,,,,
1661-0.txt,9e-06,9e-06,2.8e-05,2.8e-05,2.8e-05,7.4e-05,,,,,...,,,,9e-06,1.8e-05,9e-06,,9e-06,,
1952-0.txt,0.000108,0.000108,0.000216,,,,,,,,...,,,,,,,,,,
pg16328.txt,,,0.001964,0.001219,0.000113,,2.3e-05,2.3e-05,2.3e-05,2.3e-05,...,4.5e-05,2.3e-05,,,,,2.3e-05,,2.3e-05,2.3e-05


In [63]:
def choice(term, documents):
    highest_doc, highest_tf_idf = None, 0
    if term in documents.data.columns:
        all_docs_with_term = documents.data[[term]].dropna(how='any')
        #print(all_docs_with_term)
        idf_val = documents.getIDF(term)
        print('IDF Val : %s'%str(idf_val))
        for file_name, tf in zip(all_docs_with_term.index, all_docs_with_term[term]):
            #print(tf)
            tf_idf = tf*idf_val
            if tf_idf > highest_tf_idf:
                highest_tf_idf = tf_idf
                highest_doc = file_name
    return highest_doc, highest_tf_idf

In [64]:
choice('abcs', idf)

(None, 0)

In [65]:
choice('announce', idf)

IDF Val : 1.4054651081081644


('1342-0.txt', 4.4873290330153794e-05)

In [66]:
idf.data.to_csv('idf.txt', sep=' ', na_rep='NaN', float_format="%.5f", line_terminator='/', index_label=idf.data.index)

In [67]:
((49 + 32)* 32)*4

10368

In [68]:
with open('idf.txt','w') as f:
    f.write(idf.data.to_string(justify='right' , line_width=80))

In [69]:
!cat idf.txt

                 0txt      0zip         1        10       100      1000  \
11-0.txt     0.000034  0.000034  0.000067       NaN       NaN       NaN   
84-0.txt     0.000013  0.000013  0.000077  0.000026       NaN       NaN   
1342-0.txt   0.000008  0.000008  0.000024  0.000016       NaN       NaN   
1661-0.txt   0.000009  0.000009  0.000028  0.000028  0.000028  0.000074   
1952-0.txt   0.000108  0.000108  0.000216       NaN       NaN       NaN   
pg16328.txt       NaN       NaN  0.001964  0.001219  0.000113       NaN   

                 1005      1009       101       102       103       104  \
11-0.txt          NaN       NaN       NaN       NaN       NaN       NaN   
84-0.txt          NaN       NaN       NaN       NaN       NaN       NaN   
1342-0.txt        NaN       NaN       NaN       NaN       NaN       NaN   
1661-0.txt        NaN       NaN       NaN       NaN       NaN       NaN   
1952-0.txt        NaN       NaN       NaN       NaN       NaN       NaN   
pg16328.tx

1342-0.txt        NaN       NaN       NaN       NaN       NaN       NaN   
1661-0.txt   0.000009  0.000018  0.000009       NaN  0.000018  0.000009   
1952-0.txt        NaN       NaN       NaN       NaN  0.000216       NaN   
pg16328.txt       NaN       NaN       NaN       NaN       NaN       NaN   

             crawling    crawls   crayons     crazy  creaking     cream  \
11-0.txt     0.000034       NaN       NaN  0.000034       NaN       NaN   
84-0.txt     0.000013       NaN       NaN       NaN  0.000013       NaN   
1342-0.txt        NaN       NaN  0.000008       NaN       NaN       NaN   
1661-0.txt        NaN       NaN       NaN       NaN  0.000009  0.000028   
1952-0.txt   0.000108  0.000108       NaN       NaN       NaN       NaN   
pg16328.txt       NaN       NaN       NaN       NaN       NaN       NaN   

              creases    create   created   creates  creating  creation  \
11-0.txt          NaN       NaN  0.000067       NaN  0.000134  0.000034   
84-0.txt

1661-0.txt    0.000074     0.000018      0.000009           NaN   0.000009   
1952-0.txt         NaN     0.000108           NaN           NaN        NaN   
pg16328.txt   0.000045          NaN           NaN           NaN        NaN   

             housemaids    houses     hovel    hovels     hover   hovered  \
11-0.txt            NaN  0.000034       NaN       NaN       NaN       NaN   
84-0.txt            NaN  0.000051  0.000179  0.000013  0.000013  0.000013   
1342-0.txt     0.000008  0.000032       NaN       NaN       NaN       NaN   
1661-0.txt          NaN  0.000120       NaN       NaN  0.000009       NaN   
1952-0.txt          NaN  0.000108       NaN       NaN       NaN       NaN   
pg16328.txt         NaN  0.000113       NaN       NaN       NaN       NaN   

             hovering    hovers       how   however      howl    howled  \
11-0.txt          NaN       NaN  0.002419  0.000706       NaN  0.000034   
84-0.txt          NaN  0.000013  0.001250  0.000421  0.000013

1661-0.txt     0.000009        NaN     0.000018  0.000129  0.000046  0.000037   
1952-0.txt          NaN        NaN     0.000216       NaN       NaN       NaN   
pg16328.txt         NaN   0.000045     0.000045  0.000113  0.000158       NaN   

             promising  promontory   promote  promoted  promoting  promotion  \
11-0.txt      0.000034         NaN       NaN       NaN   0.000067   0.000034   
84-0.txt      0.000013    0.000051       NaN       NaN   0.000026   0.000013   
1342-0.txt    0.000032         NaN  0.000032  0.000008   0.000016   0.000008   
1661-0.txt    0.000009         NaN       NaN       NaN   0.000018   0.000009   
1952-0.txt         NaN         NaN       NaN       NaN   0.000216   0.000108   
pg16328.txt        NaN    0.000068       NaN       NaN   0.000045   0.000023   

               prompt  prompted  promptly  promptness     prone  pronounce  \
11-0.txt          NaN       NaN       NaN         NaN       NaN        NaN   
84-0.txt          NaN    

1342-0.txt        NaN       NaN        NaN        NaN   0.000008        NaN   
1661-0.txt   0.000028       NaN        NaN        NaN   0.000055   0.000018   
1952-0.txt        NaN       NaN        NaN        NaN        NaN        NaN   
pg16328.txt  0.000068  0.000068   0.000023   0.000023        NaN        NaN   

             travellers  travelling   travels  traverse  traversed  \
11-0.txt            NaN         NaN       NaN       NaN        NaN   
84-0.txt       0.000038    0.000013  0.000051  0.000026   0.000077   
1342-0.txt     0.000032    0.000048       NaN       NaN        NaN   
1661-0.txt     0.000009    0.000018  0.000009       NaN        NaN   
1952-0.txt          NaN         NaN       NaN       NaN        NaN   
pg16328.txt    0.000023         NaN       NaN       NaN        NaN   

             traversing   travler      tray  treacherous  treacherously  \
11-0.txt            NaN       NaN  0.000034          NaN            NaN   
84-0.txt       0.000013    