# Import Statements

In [1]:
import pandas as pd
import numpy as np
import string
import os
import pickle
import random
import math
import heapq
import operator
from tqdm import tqdm

In [2]:
from sortedcontainers import SortedDict, SortedList, SortedSet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Read From Files

In [3]:
def getListOfFiles(directory):
    '''
    Collect the regular files that sit directly inside a directory.

    Parameters:
        directory: type(string)

    returns: list of full paths (directory joined with the entry name), one
             per regular file, in the order os.listdir reports the entries;
             anything that is not a file (e.g. a sub-directory) is left out
             and never descended into
    '''
    candidate_paths = (os.path.join(directory, entry) for entry in os.listdir(directory))
    return [path for path in candidate_paths if os.path.isfile(path)]

# Preprocessing Functions

In [4]:
def lowercase(data):
    '''
    Case-normalisation step of the preprocessing pipeline.

    Parameters:
        data: type(string)

    returns: copy of data with every cased character converted to lower case
    '''
    lowered = data.lower()
    return lowered

In [5]:
def perform_word_tokenize(corpus):
    '''
    Split raw text into word-level tokens by delegating to nltk's word_tokenize.

    Parameters:
        corpus: type(string)

    returns: the token sequence produced by word_tokenize(corpus)
    '''
    tokens = word_tokenize(corpus)
    return tokens

In [6]:
def remove_stopwords_from_tokens(tokens, stopwords_set):
    '''
    Drop every token that is a member of the stop-word collection.

    Parameters:
        tokens: type(list)
        stopwords_set: type(set)

    returns: new list holding the surviving tokens in their original order
             (duplicates are kept)
    '''
    kept_tokens = []
    for token in tokens:
        if token in stopwords_set:
            # stop word: skip it
            continue
        kept_tokens.append(token)

    return kept_tokens

In [7]:
def remove_punctuation_from_tokens(tokens):
    '''
    Strip every character listed in string.punctuation from each token.

    Parameters:
        tokens: type(list)

    returns: new list with the same length and order as tokens, where each
             token has its punctuation characters deleted; a token made up
             only of punctuation becomes '' (it is not removed here)
    '''
    # The translation table does not depend on the token, so build it once
    # instead of once per token.
    punctuation_remover = str.maketrans('', '', string.punctuation)
    tokens_sans_punctuation = [token.translate(punctuation_remover) for token in tokens]

    return tokens_sans_punctuation

In [8]:
def remove_blank_space_tokens(tokens):
    '''
    Discard tokens that are exactly the empty string.

    Parameters:
        tokens: type(list)

    returns: new list with the non-empty tokens in their original order
    '''
    non_blank_tokens = []
    for token in tokens:
        if token != '':
            non_blank_tokens.append(token)

    return non_blank_tokens

In [9]:
def stemming(tokens):
    '''
    Reduce each token to its stem with nltk's PorterStemmer.

    Parameters:
        tokens: type(list)

    returns: list with the stemmed form of every token, in the same order
    '''
    porter = PorterStemmer()
    return list(map(porter.stem, tokens))

In [10]:
def preprocess(corpus, stopwords_set):
    '''
    Turn raw text into a cleaned token list.

    The steps run in this order: lower-casing, word tokenization, stop-word
    removal, punctuation stripping, removal of tokens left empty by the
    punctuation step.

    Parameters:
        corpus: type(string)
        stopwords_set: type(set)

    returns: list of cleaned tokens
    '''
    # Stop words are filtered before punctuation is stripped, so they are
    # matched against the tokens exactly as the tokenizer produced them.
    tokens = perform_word_tokenize(lowercase(corpus))
    tokens = remove_stopwords_from_tokens(tokens, stopwords_set)
    tokens = remove_punctuation_from_tokens(tokens)

    # stemming() is not part of this pipeline; the tokens are returned
    # unstemmed.
    return remove_blank_space_tokens(tokens)

In [11]:
def preprocess_documents(list_of_files, stopwords_set, pickle_path='Q1_tf_idf.pkl'):
    '''
    Preprocess every document and cache the resulting token lists on disk.

    Parameters:
        list_of_files: type(list)
        stopwords_set: type(set)
        pickle_path: type(string), file the token lists are pickled to
                     (defaults to 'Q1_tf_idf.pkl')

    returns: list with one token list per entry of list_of_files, in the
             same order; the same object is also pickled to pickle_path
    '''
    preprocessed_list_of_docs_tokens = []
    for doc_path in list_of_files:
        # 'with' closes the handle even when read() raises; undecodable
        # bytes are dropped rather than aborting the whole run.
        with open(doc_path, 'r', encoding='utf-8', errors='ignore') as doc_file:
            file_corpus = doc_file.read()
        preprocessed_list_of_docs_tokens.append(preprocess(file_corpus, stopwords_set))

    with open(pickle_path, 'wb') as pi_file:
        pickle.dump(preprocessed_list_of_docs_tokens, pi_file)

    return preprocessed_list_of_docs_tokens

# Helper Functions

In [12]:
def create_file_dictionary(list_of_files):
    '''
    Assign consecutive integer IDs (starting at 0) to the given file paths.

    Parameters:
        list_of_files: type(list)

    returns: file_dictionary with the integer position as key and the
             path_of_file at that position as value
    '''
    return dict(enumerate(list_of_files))

In [13]:
def create_2_way_mapping_of_term_and_doc(global_vocabulary):
    '''
    Build lookup tables between vocabulary terms and their integer positions.

    Parameter:
        global_vocabulary: type(list)

    returns: 2 dictionaries, term -> position in global_vocabulary and
             position -> term respectively
    '''
    ID_vs_term_dict = {position: term for position, term in enumerate(global_vocabulary)}
    term_vs_ID_dict = {term: position for position, term in enumerate(global_vocabulary)}

    return term_vs_ID_dict, ID_vs_term_dict

In [14]:
def get_total_words_in_doc(TF_IDF_doc_vs_term_matrix):
    '''
    Sum the matrix along axis 1.

    Parameters:
        TF_IDF_doc_vs_term_matrix: type(np.ndarray)

    returns: one sum per row of the matrix
    '''
    row_totals = np.sum(TF_IDF_doc_vs_term_matrix, axis=1)
    return row_totals

In [15]:
def get_max_freq_term_in_doc(matrix):
    '''
    Take the maximum of the matrix along axis 1.

    Parameters:
        matrix: type(np.ndarray)

    returns: the largest value found in each row of the matrix
    '''
    row_peaks = np.max(matrix, axis=1)
    return row_peaks

In [16]:
def calculate_IDF(inverted_index, total_documents):
    '''
    Compute a smoothed inverse document frequency for every indexed term.

    Parameters:
        inverted_index: mapping of term -> collection of documents containing it
        total_documents: number of documents in the collection

    returns: dictionary term -> log10(total_documents / (1 + number of
             entries stored for the term))
    '''
    return {
        term: np.log10(total_documents / (1 + len(inverted_index[term])))
        for term in inverted_index
    }

In [17]:
def Binary_weighing_TF_IDF(TF_IDF_doc_vs_term_matrix):
    '''
    Binary weighting: mark which cells hold a positive value.

    Parameters:
        TF_IDF_doc_vs_term_matrix: type(np.ndarray)

    returns: float array of the same shape holding 0.0 where the input cell
             is <= 0 and 1.0 everywhere else
    '''
    not_present = TF_IDF_doc_vs_term_matrix <= 0
    return np.where(np.logical_not(not_present), 1.0, 0.0)

In [18]:
def Raw_count_weighing_TF_IDF(TF_IDF_doc_vs_term_matrix):
    '''
    Raw count weighting: the term frequencies are used unchanged.

    Parameters:
        TF_IDF_doc_vs_term_matrix: type(np.ndarray)

    returns: raw count weighing scheme, as a new array with the same values
             and dtype as the input
    '''
    # Return a copy rather than the input itself: compute_variants scales the
    # returned array in place, which would otherwise overwrite the caller's
    # raw term counts.
    return np.array(TF_IDF_doc_vs_term_matrix, copy=True)

In [19]:
def Term_frequency_weighing_TF_IDF(TF_IDF_doc_vs_term_matrix, total_words_in_doc):
    '''
    Term frequency weighting: each count divided by the total for its row.

    Parameters:
        TF_IDF_doc_vs_term_matrix: type(np.ndarray)
        total_words_in_doc: type(np.array) with one total per row of the
                            matrix, or a single number applied to all cells

    returns: term frequency weighing scheme (float array, same shape as the
             matrix); cells whose total is 0 are set to 0.0
    '''
    counts_by_term = np.asarray(TF_IDF_doc_vs_term_matrix, dtype=float).T
    totals = np.asarray(total_words_in_doc, dtype=float)

    # A row with no tokens has a total of 0; plain division would give
    # 0/0 = NaN there, and NaN scores break the later ranking. Divide only
    # where the total is non-zero and leave the remaining cells at 0.
    frequency_by_term = np.zeros_like(counts_by_term)
    np.divide(counts_by_term, totals, out=frequency_by_term, where=(totals != 0))

    return frequency_by_term.T

In [20]:
def Log_Normalization_weighing_TF_IDF(TF_IDF_doc_vs_term_matrix):
    '''
    Log normalization weighting: log10(1 + value), applied element-wise.

    Parameters:
        TF_IDF_doc_vs_term_matrix: type(np.ndarray)

    returns: log normalization weighing scheme (same shape as the input)
    '''
    shifted_counts = 1 + TF_IDF_doc_vs_term_matrix
    return np.log10(shifted_counts)

In [21]:
def Double_Normalization_weighing_TF_IDF(TF_IDF_doc_vs_term_matrix, max_freq_term_in_doc):
    '''
    Double normalization weighting: 0.5 + 0.5 * (value / row maximum).

    Parameters:
        TF_IDF_doc_vs_term_matrix: type(np.ndarray)
        max_freq_term_in_doc: type(np.array) with one maximum per row of the
                              matrix, or a single number applied to all cells

    returns: double normalization weighing scheme (float array, same shape as
             the matrix); where the maximum is 0 the ratio is taken as 0, so
             those cells come out as 0.5
    '''
    counts_by_term = np.asarray(TF_IDF_doc_vs_term_matrix, dtype=float).T
    peaks = np.asarray(max_freq_term_in_doc, dtype=float)

    # A row with no tokens has a maximum of 0; plain division would give
    # 0/0 = NaN there. Divide only where the maximum is non-zero.
    ratio_by_term = np.zeros_like(counts_by_term)
    np.divide(counts_by_term, peaks, out=ratio_by_term, where=(peaks != 0))

    return 0.5 + 0.5 * ratio_by_term.T

In [22]:
def compute_variants(matrix, total_terms, max_freq, size_of_global_vocabulary, IDF, ID_vs_term_dict, isQuery):
    '''
    Compute the five TF weighting variants of matrix and scale each by IDF.

    Parameters:
        matrix: type(np.ndarray), term counts
        total_terms: totals passed on to the term frequency scheme
        max_freq: maxima passed on to the double normalization scheme
        size_of_global_vocabulary: number of term IDs to scale (0 .. size-1)
        IDF: dictionary term -> IDF value
        ID_vs_term_dict: dictionary term ID -> term
        isQuery: when true, term IDs index the first axis of each variant
                 (variant[i]); otherwise they index the second axis
                 (variant[:, i])

    returns: variants of TF-IDF in the order
             Binary, Raw_count, Term_frequency, Log_Normalization, Double_Normalization
    '''
    # All five TF schemes are evaluated before any IDF scaling begins.
    # NOTE: the scaling below happens in place on the arrays returned by the
    # scheme functions; if a scheme function hands back `matrix` itself, the
    # caller's matrix is scaled as well.
    Binary = Binary_weighing_TF_IDF(matrix)
    Raw_count = Raw_count_weighing_TF_IDF(matrix)
    Term_frequency = Term_frequency_weighing_TF_IDF(matrix, total_terms)
    Log_Normalization = Log_Normalization_weighing_TF_IDF(matrix)
    Double_Normalization = Double_Normalization_weighing_TF_IDF(matrix, max_freq)

    weighted_variants = (Binary, Raw_count, Term_frequency, Log_Normalization, Double_Normalization)

    for term_id in tqdm(range(size_of_global_vocabulary)):
        idf_weight = IDF[ID_vs_term_dict[term_id]]
        for variant in weighted_variants:
            if isQuery:
                variant[term_id] *= idf_weight
            else:
                variant[:, term_id] *= idf_weight

    return weighted_variants

In [23]:
def topN(N, TF_IDF, query_of_same_variant, file_dictionary):
    '''
    Score every document against the query and print the N best matches.

    Parameters:
        N: number of documents to display
        TF_IDF: type(np.ndarray), one row per document
        query_of_same_variant: query vector weighted with the same scheme
        file_dictionary: dictionary docID -> path of file

    The score of a document is the dot product of its row with the query
    vector. Documents are printed in descending score order, one
    'Score: ...  Document: ...' line each, followed by an empty line.
    '''
    scores = np.dot(TF_IDF, query_of_same_variant)

    score_by_document = {}
    for doc_id, score in enumerate(scores):
        score_by_document[file_dictionary[doc_id]] = score

    ranked = sorted(score_by_document.items(), key=lambda entry: entry[1], reverse=True)
    for document, score in ranked[:N]:
        print('Score: {}  Document: {}'.format(score, document))
    print()

# Main

In [24]:
def main():
    '''
    Interactive TF-IDF retrieval demo.

    Loads the cached token lists of the dataset, builds the document-term
    count matrix and the inverted index, derives five TF-IDF variants, reads
    one query from standard input and prints the top 5 documents under each
    variant.

    raises: ValueError when the cached token lists do not match the number of
            files found in the dataset directory
    '''
    from collections import Counter

    # create set of stop words for preprocessing
    stopwords_set = set(stopwords.words('english'))

    # Get List of Files in Dataset
    list_of_files = getListOfFiles('Dataset/Humor,Hist,Media,Food/')

    # create dictionary of file with docID (integer) as key and full_path of file as value
    file_dictionary = create_file_dictionary(list_of_files)
    total_documents = len(file_dictionary)

    # Token lists come from the cache written by
    # preprocess_documents(list_of_files, stopwords_set); call that instead to
    # regenerate the cache.
    # NOTE: unpickling can execute arbitrary code - only load a cache file
    # that was produced locally.
    with open('Q1_tf_idf.pkl', 'rb') as pi_file:
        preprocessed_list_of_docs_tokens = pickle.load(pi_file)

    # Row i of the matrix is reported as file_dictionary[i], so a cache built
    # from a different file list would attribute scores to the wrong files.
    if len(preprocessed_list_of_docs_tokens) != total_documents:
        raise ValueError(
            'Cached token lists ({}) do not match the number of files in the dataset ({}); '
            'regenerate Q1_tf_idf.pkl'.format(len(preprocessed_list_of_docs_tokens), total_documents)
        )

    # finding all distinct terms across all documents
    vocabulary_set = set()
    for doc_tokens in preprocessed_list_of_docs_tokens:
        vocabulary_set.update(doc_tokens)
    global_vocabulary = list(vocabulary_set)
    size_of_global_vocabulary = len(global_vocabulary)

    # term vs term-ID 2 way mapping
    term_vs_ID_dict, ID_vs_term_dict = create_2_way_mapping_of_term_and_doc(global_vocabulary)

    # doc-term matrix (for storing frequency of each word of global_vocabulary in doc)
    inverted_index = {}
    size_of_TF_IDF_matrix = (total_documents, size_of_global_vocabulary)
    TF_IDF_doc_vs_term_matrix = np.zeros(size_of_TF_IDF_matrix, dtype=float)

    for doc_id, doc_tokens in enumerate(tqdm(preprocessed_list_of_docs_tokens)):
        for term in doc_tokens:
            postings = inverted_index.setdefault(term, [])
            # documents are visited in increasing ID order, so checking the
            # last posting is enough to avoid listing a document twice
            if not postings or postings[-1] != doc_id:
                postings.append(doc_id)
            TF_IDF_doc_vs_term_matrix[doc_id, term_vs_ID_dict[term]] += 1

    # Both statistics must be taken before compute_variants runs, while the
    # matrix still holds plain counts.
    total_words_in_doc = get_total_words_in_doc(TF_IDF_doc_vs_term_matrix)
    max_freq_term_in_doc = get_max_freq_term_in_doc(TF_IDF_doc_vs_term_matrix)

    # Calculate IDF
    IDF = calculate_IDF(inverted_index, total_documents)

    Binary_tf_idf, Raw_count_tf_idf, Term_frequency_tf_idf, Log_Normalization_tf_idf, Double_Normalization_tf_idf = compute_variants(
        TF_IDF_doc_vs_term_matrix, total_words_in_doc, max_freq_term_in_doc,
        size_of_global_vocabulary, IDF, ID_vs_term_dict, False)

    query = input("Input query: ")
    sanitized_query = preprocess(query, stopwords_set)
    print("Sanitized query: ", sanitized_query)

    # A query made up only of stop words / punctuation leaves nothing to
    # score (and would make max() below fail on an empty sequence).
    if not sanitized_query:
        print('Query has no searchable terms after preprocessing.')
        return

    query_frequency = Counter(sanitized_query)

    # create query frequency vector; tokens outside the vocabulary are skipped
    query_frequency_vector = np.zeros((size_of_global_vocabulary, 1))
    for token, count in query_frequency.items():
        term_id = term_vs_ID_dict.get(token)
        if term_id is not None:
            query_frequency_vector[term_id] = count

    max_freq_token = max(query_frequency.values())
    query_binary, query_raw_count, query_term_frequency, query_log_normalization, query_double_normalization = compute_variants(
        query_frequency_vector, len(sanitized_query), max_freq_token,
        size_of_global_vocabulary, IDF, ID_vs_term_dict, True)

    print('Binary Scheme: Top 5 relevant documents are:')
    topN(5, Binary_tf_idf, query_binary, file_dictionary)

    print('Raw Count Scheme: Top 5 relevant documents are:')
    topN(5, Raw_count_tf_idf, query_raw_count, file_dictionary)

    print('Term Frequency Scheme: Top 5 relevant documents are:')
    topN(5, Term_frequency_tf_idf, query_term_frequency, file_dictionary)

    print('Log Normalization Scheme: Top 5 relevant documents are:')
    topN(5, Log_Normalization_tf_idf, query_log_normalization, file_dictionary)

    print('Double_Normalization Scheme: Top 5 relevant documents are:')
    topN(5, Double_Normalization_tf_idf, query_double_normalization, file_dictionary)

In [25]:
# Run the retrieval demo only when this file is executed directly, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

100%|█████████████████████████████████████████████████████████████████████████████| 1133/1133 [00:01<00:00, 746.62it/s]
100%|█████████████████████████████████████████████████████████████████████████| 82779/82779 [00:08<00:00, 10274.44it/s]


Input query: Demo taken successfully by students
Sanitized query:  ['demo', 'taken', 'successfully', 'students']


100%|█████████████████████████████████████████████████████████████████████████| 82779/82779 [00:01<00:00, 81052.48it/s]


Binary Scheme: Top 5 relevant documents are:
Score: [5.41009336]  Document: Dataset/Humor,Hist,Media,Food/basehead.txt
Score: [5.41009336]  Document: Dataset/Humor,Hist,Media,Food/comic_st.gui
Score: [5.41009336]  Document: Dataset/Humor,Hist,Media,Food/hackingcracking.txt
Score: [5.41009336]  Document: Dataset/Humor,Hist,Media,Food/vegan.rcp
Score: [4.88026362]  Document: Dataset/Humor,Hist,Media,Food/cooplaws

Raw Count Scheme: Top 5 relevant documents are:
Score: [49.33246593]  Document: Dataset/Humor,Hist,Media,Food/basehead.txt
Score: [25.47308373]  Document: Dataset/Humor,Hist,Media,Food/jason.fun
Score: [16.44535009]  Document: Dataset/Humor,Hist,Media,Food/practica.txt
Score: [14.21094867]  Document: Dataset/Humor,Hist,Media,Food/humor9.txt
Score: [11.53857608]  Document: Dataset/Humor,Hist,Media,Food/bw.txt

Term Frequency Scheme: Top 5 relevant documents are:
Score: [0.00649293]  Document: Dataset/Humor,Hist,Media,Food/liceprof.sty
Score: [0.00612377]  Document: Dataset/Humor