In [65]:
import pandas as pd
import numpy as np
import string
import os
import re
import pickle
from sortedcontainers import SortedDict, SortedList, SortedSet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from collections import Counter
import operator

In [66]:
def getListOfFiles(directory):
    '''
    Parameters:
        directory: type(string)
        
    returns: list of all files in directory with the full path of file
    '''
    
    list_of_files = []
    
    for file_path in os.listdir(directory):
        full_path = os.path.join(directory, file_path)
        if os.path.isfile(full_path):
            list_of_files.append(full_path)
    
    return list_of_files

In [67]:
def lowercase(data):
    '''
    Parameters:
        data: type(string)
    
    returns: lowercase of data
    '''
    
    return data.lower()

In [68]:
def perform_word_tokenize(corpus):
    '''
    Parameters:
        corpus: type(string)
    
    returns word-level tokenization of corpus
    '''
    
    return word_tokenize(corpus)

In [69]:
def remove_stopwords_from_tokens(tokens):
    '''
    Parameters:
        tokens: type(list)
        stopwords_set: type(set)
    
    returns: tokens without stopwords
    '''
    stopwords_set = set(stopwords.words('english'))
    tokens_sans_stopwords = [x for x in tokens if x not in stopwords_set]
    
    return tokens_sans_stopwords

In [70]:
def remove_punctuation_from_tokens(tokens):
    '''
    Parameters:
        tokens: type(list)
    
    returns: tokens without punctuation
    '''
    tokens_sans_punctuation = [x.translate(str.maketrans('', '', string.punctuation)) for x in tokens]
    
    return tokens_sans_punctuation

In [71]:
def remove_blank_space_tokens(tokens):
    '''
    Parameters:
        tokens: type(list)
    
    returns: tokens without blank tokens
    '''
    tokens_sans_blank_space = [x for x in tokens if x!='']
    
    return tokens_sans_blank_space

In [72]:
def preprocess(corpus):
    # Convert the text to lower case
    lowercase_corpus = lowercase(corpus)
    #print(len(lowercase_corpus))
    
    # Perform word tokenization (word_tokenize also takes care of whitespace)
    word_tokens = perform_word_tokenize(lowercase_corpus)
    #print(len(word_tokens))
    
    # Remove stopwords from tokens
    word_tokens_sans_stopwords = remove_stopwords_from_tokens(word_tokens)
    #print(len(word_tokens_sans_stopwords))
    
    # Remove punctuation marks from tokens
    word_tokens_sans_punctuation = remove_punctuation_from_tokens(word_tokens_sans_stopwords)
    #print(len(word_tokens_sans_punctuation))
    
    # Remove blank space tokens
    word_tokens_sans_blank_tokens = remove_blank_space_tokens(word_tokens_sans_punctuation)
    #print(len(word_tokens_sans_blank_tokens))
    
    return set(word_tokens_sans_blank_tokens)

In [73]:
# Get List of Files in Dataset
list_of_files = getListOfFiles("C:/Users/kadia/OneDrive/Desktop/IIITD/II Semester/IR/Assignment 1/Humor,Hist,Media,Food/Humor,Hist,Media,Food")
#print(list_of_files)

In [74]:
len(list_of_files)

1137

In [75]:
process = []
for i,filePath in enumerate (list_of_files):
        file = open(filePath, encoding="utf8", errors = "ignore")
        read = file.read()    
        file.close()
        
        sanitized_query = preprocess(read)  
        process.append(sanitized_query)

In [76]:
natural_term_frequency = []
for item in process:
    dictionary_tf ={}
    
    for word in item:
        if dictionary_tf.get(word) == None:
            dictionary_tf[word] = 1
        else:
            count = dictionary_tf.get(word)
            count += 1
            dictionary_tf[word] = count
    
    natural_term_frequency.append(dictionary_tf)

In [89]:
# Logarithmic Term Frequency = 1 + log(tf)

logarithmic_term_frequency = []

for item in natural_term_frequency:
    dictionary_ltf = {}
    allkeys = item.keys()
    
    for word in allkeys:
        natural_freq = item.get(word)
        log_freq = 1 + np.log(natural_freq)
        dictionary_ltf[word] = log_freq
        
    logarithmic_term_frequency.append(dictionary_ltf)
        

In [90]:
document_frequency = {}

for item in process:
    
    unique_terms = list(set(item))
    
    for word in unique_terms:
        
        if document_frequency.get(word) == None:
            document_frequency[word] = 1
        else:
            count = document_frequency.get(word)
            count += 1
            document_frequency[word] = count

In [91]:
print(len(document_frequency))
N = len(list_of_files)
print('Total Number of documents are',N)

83917
Total Number of documents are 1137


In [92]:
inverse_document_frequency = {}

keys_df = document_frequency.keys()

for item in keys_df:
    
    getvalue = document_frequency[item]
    in_doc_freq = np.log(5/getvalue)
    inverse_document_frequency[item] = in_doc_freq

In [93]:
list_return = []

for item in natural_term_frequency:
    dict_inside = {}
    allkeys = item.keys()
    
    for word in allkeys:
        current_freq = item[word]
        idf_for_word = inverse_document_frequency[word]
        update_freq = current_freq * idf_for_word
        dict_inside[word] = update_freq
    
    list_return.append(dict_inside)

In [94]:
#Function block for calculating TF-IDF

def calculate_tfidf(list_tf):
    list_return = []
    
    for item in list_tf:
        dict_inside = {}
        allkeys = item.keys()
        
        for word in allkeys:
            current_freq = item[word]
            idf_for_word = inverse_document_frequency[word]
            update_freq = current_freq * idf_for_word
            dict_inside[word] = update_freq
    
        list_return.append(dict_inside)
    # print(list_return)  
    return list_return

In [103]:
tfidf_n = calculate_tfidf(natural_term_frequency)
tfidf_log = calculate_tfidf(logarithmic_term_frequency)

In [104]:
def getRankdoc(variant_list,query_list):
    
    doc_rank = {}
    for i in range(N):
        doc_rank[i] = 0.0
        
    for query_word in query_list:
        count = -1
        for item in variant_list:
            count += 1
            if query_word in item :
                score = item[query_word]
                doc_rank[count] += score
                
    doc_rank = dict(sorted(doc_rank.items(),key=operator.itemgetter(1),reverse=True))   
    # print(doc_rank)        
    
    return doc_rank

In [105]:
N

1137

In [106]:
def topk(list_tfidf,query_list,k,string):
    
    
    list_tf = getRankdoc(list_tfidf,query_list)
    count = -1
    flag = 0
    list_append = []
    for item in list_tf.keys():
        count += 1
        if count == k:
            print('Top ',k,' Documents based on ',string,' tf-idf are',list_append)
            for i in list_append:
                print(list_of_files[i])
#                 print(titles[i])
            flag = 1
            break
        list_append.append(item)
        print(list_append)
    
    if flag == 0:
        print('Top ',k,' Documents based on ',string,' tf-idf are:',list_append)    
        for i in list_append:
            print(list_of_files[i])
            # print(titles[i])
            
    print()  

In [108]:
def execute_two(query,k):
    query_list =  preprocess(query)
   # query_list = query_afterPreprocess.split(" ")
    # query_list.remove('')
    # print(tfidf_n)
    topk(tfidf_n,query_list,k,'Natural')
    topk(tfidf_log,query_list,k,'Logarithmic')

In [109]:
getInputtwo = input('Enter the String   : ')
k = int(input('Enter how many closest matches required  : '))
print()
execute_two(getInputtwo,k)

Enter the String   : daily guests
Enter how many closest matches required  : 5

[1]
[1, 2]
[1, 2, 4]
[1, 2, 4, 5]
[1, 2, 4, 5, 8]
Top  5  Documents based on  Natural  tf-idf are [1, 2, 4, 5, 8]
C:/Users/kadia/OneDrive/Desktop/IIITD/II Semester/IR/Assignment 1/Humor,Hist,Media,Food/Humor,Hist,Media,Food\a-team
C:/Users/kadia/OneDrive/Desktop/IIITD/II Semester/IR/Assignment 1/Humor,Hist,Media,Food/Humor,Hist,Media,Food\abbott.txt
C:/Users/kadia/OneDrive/Desktop/IIITD/II Semester/IR/Assignment 1/Humor,Hist,Media,Food/Humor,Hist,Media,Food\acetab1.txt
C:/Users/kadia/OneDrive/Desktop/IIITD/II Semester/IR/Assignment 1/Humor,Hist,Media,Food/Humor,Hist,Media,Food\aclamt.txt
C:/Users/kadia/OneDrive/Desktop/IIITD/II Semester/IR/Assignment 1/Humor,Hist,Media,Food/Humor,Hist,Media,Food\acronym.txt

[1]
[1, 2]
[1, 2, 4]
[1, 2, 4, 5]
[1, 2, 4, 5, 8]
Top  5  Documents based on  Logarithmic  tf-idf are [1, 2, 4, 5, 8]
C:/Users/kadia/OneDrive/Desktop/IIITD/II Semester/IR/Assignment 1/Humor,Hist,Media,F

In [None]:
#daily music television show with guests