In [158]:
import pandas as pd
from collections import Counter
import math

In [159]:
class Preprocessor(object):
    def __init__(self):
        self.book_content = None
        
    def __str__(self):
        print(self.book_content)
    
    def clean(self):
        if not self.book_content:
            return 1
        else:
            total_corpus = []
            for char in self.book_content:
                if char in ['-','_', "\n", '—']:
                    total_corpus.append(' ')
                elif char not in [',','.','/','?','<','>',':',';','[',']','{','}',"\\",'"',"”", "“", "“"\
                                  "!","@","#","$","%","^","&","*","(",")", "'", "`", "’", "‘", "!", "£"]:
                     total_corpus.append(char.lower())
            print('Total Character : %d'%len(total_corpus))
            return ''.join(total_corpus) 
            
    def read(self, text_name):
        self.book_content = open(text_name, encoding='utf-8').read()

In [160]:
class WordAnalyzer(object):
    def __init__(self):
        self.word_count = None
    
    def __str__(self):
        final_str = ''
        for key, val in self.word_count.items():
            final_str = final_str + '%s : %d\n'%(key, val)
            
    def analyse_words(self, book_text):
        self.word_count = Counter([word.strip() for word in book_text.split(' ') if word.strip()])
    
    def get_word_frequency(self):
        freq_dictionary = {}
        total_words = sum(list(zip(*self.word_count.items()))[1])
        for key, val in self.word_count.items():
            freq_dictionary[key] = val / total_words
        return freq_dictionary

In [161]:
class IDF(object):
    def __init__(self):
        self.data = pd.DataFrame([])
        
    def load_frequency(self, book_frequency, book_title):
        if isinstance(self.data, pd.DataFrame):
            temp = self.data.T
            temp2 = pd.DataFrame([list(book_frequency.values())], columns = book_frequency.keys())
            temp2['index'] = [book_title]
            temp2 = temp2.set_index('index')
            print(temp2.shape)
            out = temp.join(temp2.T, how='outer')
            self.data = out.T
            print(self.data.shape)
        else:
            data = pd.DataFrame([list(book_frequency.values())], columns = book_frequency.keys())
            data['index'] = [book_title]
            self.data = data.set_index('index')
            print(self.data.shape)
        
    def getIDF(self, term):
        D = self.data.shape[0]
        #print(D)
        N = self.data[[term]].dropna(how='any').shape[0]
        #print(N)
        idf = 1 + math.log(D / (1+N))
        return idf

In [162]:
!ls

11-0.txt    84-0.txt		    EAadhaar.pdf		   Python
1342-0.txt  book_analyzer.ipynb     Links			   TCS
1661-0.txt  chromedriver	    pg16328.txt
1952-0.txt  dataset_websites-3.csv  phishing-url-prediction.ipynb


In [163]:
idf = IDF()
for file_name in ['11-0.txt','1342-0.txt','1661-0.txt','1952-0.txt', '84-0.txt']:
    processor = Preprocessor()
    processor.read(file_name)
    cleaned_content = processor.clean()
    #print(cleaned_content)
    word_analyzer = WordAnalyzer()
    word_analyzer.analyse_words(cleaned_content)
    freq_dict = word_analyzer.get_word_frequency()
    #print(freq_dict)
    idf.load_frequency(freq_dict, file_name)

Total Character : 155646
(1, 3115)
(1, 3115)
Total Character : 681391
(1, 6691)
(2, 7715)
Total Character : 559550
(1, 8297)
(3, 11759)
Total Character : 49296
(1, 1794)
(4, 11959)
Total Character : 429861
(1, 7376)
(5, 14192)


In [164]:
idf.data.head()

Unnamed: 0,0txt,0zip,1,10,100,1000,10th,11,1100,1115,...,youths,youve,zeal,zealand,zero,zest,zigzag,﻿,﻿project,﻿the
11-0.txt,3.4e-05,3.4e-05,6.7e-05,,,,,0.000101,,,...,,0.000235,,3.4e-05,,,3.4e-05,,3.4e-05,
1342-0.txt,8e-06,8e-06,2.4e-05,1.6e-05,,,,8e-06,,,...,8e-06,,,,,,,,,8e-06
1661-0.txt,9e-06,9e-06,2.8e-05,2.8e-05,2.8e-05,7.4e-05,9e-06,,9e-06,2.8e-05,...,,2.8e-05,,9e-06,1.8e-05,9e-06,9e-06,,,
1952-0.txt,0.000108,0.000108,0.000216,,,,,,,,...,,,,,,,,,,
84-0.txt,1.3e-05,1.3e-05,7.7e-05,2.6e-05,,,,2.6e-05,,,...,,,5.1e-05,,,,,1.3e-05,,


In [165]:
def choice(term, documents):
    all_docs_with_term = documents.data[[term]].dropna(how='any')
    #print(all_docs_with_term)
    idf_val = documents.getIDF(term)
    print('IDF Val : %s'%str(idf_val))
    highest_doc, highest_tf_idf = None, 0
    for file_name, tf in zip(all_docs_with_term.index, all_docs_with_term[term]):
        #print(tf)
        tf_idf = tf*idf_val
        if tf_idf > highest_tf_idf:
            highest_tf_idf = tf_idf
            highest_doc = file_name
    return highest_doc, highest_tf_idf

In [166]:
choice('announce', idf)

IDF Val : 1.2231435513142097


('1342-0.txt', 3.905217950764939e-05)