# Import Statements

In [1]:
import pandas as pd
import numpy as np
import string
import os
import pickle
import random
import math
import heapq
import operator
from tqdm import tqdm

In [2]:
from sortedcontainers import SortedDict, SortedList, SortedSet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Read From Files

In [3]:
def getListOfFiles(directory):
    '''
    Parameters:
        directory: type(string)
        
    returns: list of all files in directory with the full path of file
    '''
    
    list_of_files = []
    
    for file_path in os.listdir(directory):
        full_path = os.path.join(directory, file_path)
        if os.path.isfile(full_path):
            list_of_files.append(full_path)
    
    return list_of_files

# Preprocessing Functions

In [4]:
def lowercase(data):
    '''
    Parameters:
        data: type(string)
    
    returns: lowercase of data
    '''
    
    return data.lower()

In [5]:
def perform_word_tokenize(corpus):
    '''
    Parameters:
        corpus: type(string)
    
    returns word-level tokenization of corpus
    '''
    
    return word_tokenize(corpus)

In [6]:
def remove_stopwords_from_tokens(tokens, stopwords_set):
    '''
    Parameters:
        tokens: type(list)
        stopwords_set: type(set)
    
    returns: tokens without stopwords
    '''
    tokens_sans_stopwords = [x for x in tokens if x not in stopwords_set]
    
    return tokens_sans_stopwords

In [7]:
def remove_punctuation_from_tokens(tokens):
    '''
    Parameters:
        tokens: type(list)
    
    returns: tokens without punctuation
    '''
    tokens_sans_punctuation = [x.translate(str.maketrans('', '', string.punctuation)) for x in tokens]
    
    return tokens_sans_punctuation

In [8]:
def remove_blank_space_tokens(tokens):
    '''
    Parameters:
        tokens: type(list)
    
    returns: tokens without blank tokens
    '''
    tokens_sans_blank_space = [x for x in tokens if x!='']
    
    return tokens_sans_blank_space

In [9]:
def stemming(tokens):
    '''
    Parameters:
        tokens: type(list)
    
    retruns: tokens after stemming
    '''
    stemmer = PorterStemmer()
    stem_tokens = [stemmer.stem(x) for x in tokens]
    return stem_tokens

In [10]:
def lemmatize_tokens(tokens):
    '''
    Parameters:
        tokens: type(list)
    
    returns: returns tokens after lemmatization
    '''
    lemmatizer = WordNetLemmatizer()
    lemmatize_tokens = [lemmatizer.lemmatize(x) for x in tokens]
    #unique_lemmatize_tokens = list(dict.fromkeys(lemmatize_tokens))
    
    return lemmatize_tokens

In [11]:
def preprocess(corpus, stopwords_set):
    # Convert the text to lower case
    lowercase_corpus = lowercase(corpus)
    #print(len(lowercase_corpus))
    
    # Perform word tokenization (word_tokenize also takes care of whitespace)
    word_tokens = perform_word_tokenize(lowercase_corpus)
    #print(len(word_tokens))
    
    # Remove stopwords from tokens
    word_tokens_sans_stopwords = remove_stopwords_from_tokens(word_tokens, stopwords_set)
    #print(len(word_tokens_sans_stopwords))
    
    # Remove punctuation marks from tokens
    word_tokens_sans_punctuation = remove_punctuation_from_tokens(word_tokens_sans_stopwords)
    #print(len(word_tokens_sans_punctuation))
    
    # Remove blank space tokens
    word_tokens_sans_blank_tokens = remove_blank_space_tokens(word_tokens_sans_punctuation)
    #print(len(word_tokens_sans_blank_tokens))
    
    # Stem tokens
    word_tokens_final = stemming(word_tokens_sans_blank_tokens)
    
    # Lemmatize tokens
    #word_tokens_final = lemmatize_tokens(word_tokens_sans_blank_tokens)
    #print(len(word_tokens_final))
    
    return word_tokens_final

# Helper Functions

In [12]:
# def create_file_dictionary(list_of_files):
#     '''
#     Paramteres:
#         list_of_files: type(string)
    
#     returns: file_dictionary with integer key and path_of_file as value
#     '''
#     file_dictionary = {}
#     for i in range(len(list_of_files)):
#         file_dictionary[i] = list_of_files[i]
    
#     return file_dictionary

In [13]:
# def create_file_list(list_of_files):
#     '''
#     Paramteres:
#         list_of_files: type(string)
    
#     returns: file_list with path_of_file as values
#     '''
#     file_dictionary = []
#     for i in range(len(list_of_files)):
#         file_dictionary[i] = list_of_files[i]
    
#     return file_dictionary

In [14]:
def train_test_split_function(train_frac, list_of_files_class_c):
    '''
    Parameters:
        train_frac: type(float)
        
    returns: train test list in ratio train_frac:1-train_frac
    '''
    random.shuffle(list_of_files_class_c)
    train_size = int(train_frac*len(list_of_files_class_c))
    train_list_class_c = list_of_files_class_c[:train_size]
    test_list_class_c = list_of_files_class_c[train_size:]
    
    return train_list_class_c, test_list_class_c

In [15]:
def preprocess_documents(list_of_files, num_of_classes, stopwords_set):
    '''
    Parameters:
        list_of_files: type(list)
        num_of_classes: type(int)
        stopwords_set: type(set)
    
    returns: list of tokens obtained by preprocessing documents in all classes
    '''
    preprocessed_list_of_docs_tokens = []
    for c in tqdm(range(num_of_classes)):
        doc_tokens_class_c = []
        for doc_path in list_of_files[c]:
            file = open(doc_path, 'r', encoding='utf-8', errors='ignore')
            file_corpus = file.read()
            file.close()
            doc_tokens = preprocess(file_corpus, stopwords_set)
            doc_tokens_class_c.append(doc_tokens)
        preprocessed_list_of_docs_tokens.append(doc_tokens_class_c)
    
    pi_file = open('Q3_pkl_file', 'wb')
    pickle.dump(preprocessed_list_of_docs_tokens, pi_file)
    pi_file.close()
    
    return preprocessed_list_of_docs_tokens

In [16]:
def get_tf_icf_list(train_test_lists, num_of_classes):
    tf_list_of_dict = []
    for c in range(num_of_classes):
        tf_list_of_dict.append({})
    cf_dict = {}
    icf_dict = {}
    for c in range(num_of_classes):
        for doc_terms in train_test_lists[c][0]:
            for term in doc_terms:
                if(term in tf_list_of_dict[c]):
                    tf_list_of_dict[c][term]+=1
                else:
                    tf_list_of_dict[c][term]=1
    
    for tf_dict_class_c in tf_list_of_dict:
        for term in tf_dict_class_c:
            if(term in cf_dict):
                cf_dict[term]+=1
            else:
                cf_dict[term]=1
    
    for term in cf_dict:
        icf_dict[term] = math.log2(num_of_classes/cf_dict[term])
    
    tf_icf_list_of_dict = []
    for c in range(num_of_classes):
        tf_icf_list_of_dict.append({})
    for c in range(num_of_classes):
        for term in tf_list_of_dict[c]:
            tf_icf_list_of_dict[c][term] = tf_list_of_dict[c][term]*icf_dict[term]
    
    return tf_list_of_dict, icf_dict, tf_icf_list_of_dict

In [17]:
def Q3(train_test_lists, num_of_classes, top_k):
    # tf-icf
    tf_list_of_dict, icf_dict, tf_icf_list_of_dict = get_tf_icf_list(train_test_lists, num_of_classes)
    list_of_vocab_class_c = []
    for c in range(num_of_classes):
        vocab_class_c_temp = heapq.nlargest(top_k, tf_icf_list_of_dict[c].items(), key=operator.itemgetter(1))
        vocab_class_c = {}
        for tup in vocab_class_c_temp:
            vocab_class_c[tup[0]]=tup[1]
        list_of_vocab_class_c.append(vocab_class_c)
    
    # global vocabulary
    global_vocab = {}
    for c in range(num_of_classes):
        global_vocab = global_vocab | list_of_vocab_class_c[c].keys()
    
    # calculate priors
    num_docs_class_c = []
    total_docs = 0
    for c in range(num_of_classes):
        total_docs+=len(train_test_lists[c][0])
        num_docs_class_c.append(len(train_test_lists[c][0]))
    prior_class_c = [x/total_docs for x in num_docs_class_c]
    
    num_terms_class_c_vocab_c = []
    for c in range(num_of_classes):
        summation=0
        for term in global_vocab:
            if(term in tf_icf_list_of_dict[c].keys()):
                summation+=tf_list_of_dict[c][term]
        num_terms_class_c_vocab_c.append(summation)
    
    # testing
    confusion_matrix = np.zeros((num_of_classes,num_of_classes))
    total_predictions = 0
    for c in range(num_of_classes):
        for test_doc_tokens in train_test_lists[c][1]:
            posterior_class_c = []
            for cc in range(num_of_classes):
                posterior_class_c.append(prior_class_c[cc])
            for term in test_doc_tokens:
                for class_c in range(num_of_classes):
                    if(term not in global_vocab):
                        continue
                    temp = 1.0
                    if(term in list_of_vocab_class_c[class_c]):
                        temp = (1 + tf_list_of_dict[class_c][term])/(len(global_vocab) + num_terms_class_c_vocab_c[class_c])
                    else:
                        temp = 1/(len(global_vocab) + num_terms_class_c_vocab_c[class_c])
                    posterior_class_c[class_c] = posterior_class_c[class_c]*temp
            predicted_class = posterior_class_c.index(max(posterior_class_c))
            total_predictions+=1
            confusion_matrix[c][predicted_class]+=1
    print(confusion_matrix)
    
    return 100*np.trace(confusion_matrix)/total_predictions

# Main

In [18]:
def main():
    random.seed(0)
    num_of_classes = 5
    train_fractions = [0.5, 0.7, 0.8]
    
    # create set of stop words for preprocessing
    stopwords_set = set(stopwords.words('english'))
    
    # Get List of Files in Dataset
    list_of_files_class_0 = getListOfFiles('Dataset/20_newsgroups/comp.graphics')
    list_of_files_class_1 = getListOfFiles('Dataset/20_newsgroups/sci.med')
    list_of_files_class_2 = getListOfFiles('Dataset/20_newsgroups/talk.politics.misc')
    list_of_files_class_3 = getListOfFiles('Dataset/20_newsgroups/rec.sport.hockey')
    list_of_files_class_4 = getListOfFiles('Dataset/20_newsgroups/sci.space')
    list_of_files = [list_of_files_class_0, list_of_files_class_1, list_of_files_class_2, list_of_files_class_3, list_of_files_class_4]
    
    # Preprocess documents
    #preprocessed_list_of_docs_tokens = preprocess_documents(list_of_files, num_of_classes, stopwords_set)
    pi_file = open('Q3_pkl_file', 'rb')
    preprocessed_list_of_docs_tokens = pickle.load(pi_file)
    pi_file.close()
    
    top_k = int(input('Enter value of k: '))
    print()
    for train_frac in train_fractions:
        train_test_lists = []
        for class_num in range(num_of_classes):
            train_list_class_c, test_list_class_c = train_test_split_function(train_frac, preprocessed_list_of_docs_tokens[class_num])
            train_test_lists.append([train_list_class_c, test_list_class_c])
        print()
        accuracy = Q3(train_test_lists, num_of_classes, top_k)
        print('Accuracy for train split fraction ',train_frac, ' is ',accuracy,'%')

In [19]:
if __name__ == "__main__":
    main()

Enter value of k: 10


[[498.   1.   0.   0.   1.]
 [  6. 494.   0.   0.   0.]
 [  4.  33. 462.   0.   1.]
 [  0.   0.   0. 500.   0.]
 [ 11.   0.   0.   0. 489.]]
Accuracy for train split fraction  0.5  is  97.72 %

[[299.   1.   0.   0.   0.]
 [  1. 299.   0.   0.   0.]
 [  2.  22. 275.   0.   1.]
 [  0.   0.   0. 300.   0.]
 [  6.   1.   0.   0. 293.]]
Accuracy for train split fraction  0.7  is  97.73333333333333 %

[[200.   0.   0.   0.   0.]
 [  2. 197.   0.   1.   0.]
 [  2.  17. 181.   0.   0.]
 [  0.   0.   0. 200.   0.]
 [  2.   1.   0.   0. 197.]]
Accuracy for train split fraction  0.8  is  97.5 %
