In [None]:
import os
import math
import json
import sys
from collections import Counter
import pickle5 as pickle
from bs4 import BeautifulSoup
import nltk
import numpy as np
from nltk.corpus import stopwords

# Tokenizer: each maximal run of ASCII letters becomes one token, so digits,
# punctuation and whitespace are discarded at this stage.
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
 
# Lemmatizer: WordNet-based; used by cleanup_text on every kept token.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Count-Vectorizer & TfidfTransformer: shared module-level instances that are
# fitted on the training documents in a later cell.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
vectorizer = CountVectorizer()
tfidf = TfidfTransformer()

# Naive-Bayes models
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

# KNN classifiers
from sklearn.neighbors import KNeighborsClassifier

# F1 score, used to evaluate every classifier in this notebook
from sklearn.metrics import f1_score 

# Fetch the NLTK resources named below (WordNet data and the stop-word lists).
nltk.download('wordnet') 
nltk.download('stopwords')

In [None]:
# =====================================================================================
# Root directory of the corpus. Per the original note this is meant to be
# taken from sys.argv[1]; for now it is "<current working directory>/Dataset".
data_path = os.getcwd() + "/Dataset"
# =====================================================================================

# NLTK's English stop-word list, extended with project-specific extras
cust_stop_words = ["'s"]
stop_words = set(stopwords.words('english')) | set(cust_stop_words)

def cleanup_text(text):
    """
    Normalise one raw document into a space-separated string of lemmas.

    Input: Un-processed text (str)

    Output: Processed text (str) -- lower-cased, tokenized with the module
            level `tokenizer`, stripped of every token found in `stop_words`,
            and with each surviving token passed through `lemmatizer`.
    """
    lowered = text.lower()
    # Stop words are dropped *before* lemmatization, exactly one pass over the tokens
    kept = (word for word in tokenizer.tokenize(lowered) if word not in stop_words)
    return " ".join(map(lemmatizer.lemmatize, kept))

def _load_documents(folder):
    """Return the cleaned text of every regular file in `folder`, in sorted filename order."""
    documents = []
    # Sorted so that the document order (and everything derived from it) is
    # reproducible; os.listdir makes no ordering guarantee.
    for name in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, name)
        # Skip sub-directories (e.g. ".ipynb_checkpoints") instead of crashing in open()
        if not os.path.isfile(file_path):
            continue
        # Context manager guarantees the handle is closed even if decoding/cleanup fails
        with open(file_path, 'rb') as handle:
            raw = handle.read()
        # Undecodable bytes are replaced rather than aborting the whole load
        documents.append(cleanup_text(raw.decode(errors='replace')))
    return documents


def create_dataset(path):
    """
    Load and pre-process the two-class corpus stored under `path`.

    Expected layout:
        <path>/class1/train/*, <path>/class1/test/*
        <path>/class2/train/*, <path>/class2/test/*

    Input: path (str) -- root directory of the dataset. (The previous version
           ignored this argument and silently read the global `data_path`.)

    Output: tuple (train_arr, train_lbl, test_arr, test_lbl, n_class1_train)
        train_arr / test_arr -- lists of cleaned documents, all class-1
                                documents first, then all class-2 documents
        train_lbl / test_lbl -- matching lists of labels (1 or 2)
        n_class1_train       -- number of class-1 training documents, i.e. the
                                index at which class 2 starts in train_arr
    """
    class1_path = os.path.join(path, "class1")
    class2_path = os.path.join(path, "class2")

    class1_train = _load_documents(os.path.join(class1_path, "train"))
    class2_train = _load_documents(os.path.join(class2_path, "train"))
    class1_test = _load_documents(os.path.join(class1_path, "test"))
    class2_test = _load_documents(os.path.join(class2_path, "test"))

    # Labels are derived from the documents actually read, so they can never
    # get out of step with the arrays (e.g. when a directory entry is skipped).
    train_arr = class1_train + class2_train
    train_lbl = [1]*len(class1_train) + [2]*len(class2_train)
    test_arr = class1_test + class2_test
    test_lbl = [1]*len(class1_test) + [2]*len(class2_test)

    return train_arr, train_lbl, test_arr, test_lbl, len(class1_train)

def feature_selection(x_traintf, y_train, x_testtf, k_best):
    """
    Keep the `k_best` features sharing the most mutual information with the labels.

    The selector is fitted on the training matrix only and then applied to
    both matrices, so no information from the test split leaks into the choice.

    Input: x_traintf -- training feature matrix, shape (n_train, n_features)
           y_train   -- training labels
           x_testtf  -- test feature matrix with the same columns
           k_best    -- number of features to keep (int) or the string 'all'

    Output: (x_train, x_test) restricted to the selected columns
    """
    # Depending on the scikit-learn release, SelectKBest either raises or only
    # warns when k exceeds the number of available features. Clamping keeps
    # "top 10000" usable on a small vocabulary in both cases.
    if isinstance(k_best, str):
        k = k_best
    else:
        k = min(k_best, x_traintf.shape[1])

    selector = SelectKBest(mutual_info_classif, k=k)
    selector.fit(x_traintf, y_train)
    x_train = selector.transform(x_traintf)
    x_test = selector.transform(x_testtf)

    return x_train, x_test

def naive_bayes(x_traintf, y_train, x_testtf, y_test):
    """
    Evaluate Multinomial and Bernoulli Naive Bayes on the top-k selected features.

    For each k in [1, 10, 100, 1000, 10000] the k best features are selected
    with `feature_selection`, both models are trained on the reduced training
    matrix and scored on the reduced test matrix with micro-averaged F1.

    Input: x_traintf, x_testtf -- train / test feature matrices
           y_train, y_test     -- matching label sequences

    Output: report (str) -- a "Multinomial Naive Bayes f1-scores" section
            followed by a "Bernoulli Naive Bayes f1-scores" section, one
            "Top <k>-features = <score>" line per k in each.
    """
    k_best = [1,10,100,1000,10000]
    mnb = MultinomialNB()
    bnb = BernoulliNB()

    mnb_lines, bnb_lines = [], []
    for entry in k_best:
        # The selected features depend only on the data and k, not on the
        # model, so the expensive mutual-information pass runs once per k and
        # its result is shared by both classifiers.
        x_train, x_test = feature_selection(x_traintf, y_train, x_testtf, entry)
        for model, lines in ((mnb, mnb_lines), (bnb, bnb_lines)):
            # Both models accept the selector's output as-is; no dense copy is made.
            model.fit(x_train, y_train)
            yhat = model.predict(x_test)
            f1 = f1_score(y_test, yhat, average='micro')
            lines.append("Top " + str(entry) + "-features = " + str(round(f1,5)) + "\n")

    text = "Multinomial Naive Bayes f1-scores \n" + "".join(mnb_lines)
    text = text + "\n" + "Bernoulli Naive Bayes f1-scores \n" + "".join(bnb_lines)
    text = text + "\n"

    return text
    
def rocchio(x_train, y_train, x_test, y_test, n_class1):
    """
    Two-class Rocchio (nearest-centroid) classifier with a rejection margin.

    A test document is assigned to class 1 (resp. 2) only when it is closer to
    that class centroid than to the other by more than the margin `b`;
    otherwise it is labelled 0 (undecided). Micro-averaged F1 is reported for
    each b in [0, 0.01, 0.05, 0.1].

    Input: x_train  -- dense training matrix (n_train, n_features) whose first
                       `n_class1` rows are the class-1 documents and whose
                       remaining rows are the class-2 documents
           y_train  -- training labels (only its length is used)
           x_test   -- dense test matrix (n_test, n_features)
           y_test   -- test labels
           n_class1 -- number of class-1 rows at the top of x_train

    Output: report (str) -- header line followed by one "b-<margin> = <score>" line per margin
    """
    b = [0,0.01,0.05,0.1]
    centroid = np.zeros((2,x_train.shape[1]))
    centroid[0] = np.sum(x_train[:n_class1], axis=0)/n_class1
    centroid[1] = np.sum(x_train[n_class1:], axis=0)/(len(y_train)-n_class1)

    # Euclidean distance of every test row to each centroid. These do not
    # depend on the margin, so they are computed once (vectorised) instead of
    # once per row per margin.
    dist_1 = np.linalg.norm(x_test - centroid[0], axis=1)
    dist_2 = np.linalg.norm(x_test - centroid[1], axis=1)

    text = "Rocchio Classifier f1-scores \n"
    for entry in b:
        # np.select takes the first matching condition, mirroring the original
        # if / elif / else; the result is a 1-D integer label vector.
        yhat = np.select(
            [dist_1 < (dist_2 - entry), dist_2 < (dist_1 - entry)],
            [1, 2],
            default=0,
        )
        f1 = f1_score(y_test, yhat, average='micro')
        text = text + "b-" + str(entry) + " = " + str(round(f1,5)) + "\n"
    text = text + "\n"

    return text

def knn(x_train, y_train, x_test, y_test):
    """
    Evaluate k-nearest-neighbour classifiers for k = 1, 10 and 50.

    Input: x_train, y_train -- training features and labels
           x_test, y_test   -- test features and labels

    Output: report (str) -- header line followed by one
            "<k>-neighbours = <micro-F1>" line per k
    """
    report = ["KNN Classifier f1-scores \n"]
    for n_neighbours in (1, 10, 50):
        classifier = KNeighborsClassifier(n_neighbors=n_neighbours)
        classifier.fit(x_train, y_train)
        predicted = classifier.predict(x_test)
        score = f1_score(y_test, predicted, average='micro')
        report.append(str(n_neighbours) + "-neighbours = " + str(round(score,5)) + "\n")

    return "".join(report)

In [None]:
# Cleaned train/test documents and labels for both classes; n_class1 is the
# number of class-1 training documents (they come first in train_arr)
train_arr, train_lbl, test_arr, test_lbl, n_class1 = create_dataset(data_path)

# Raw term counts -- the vocabulary is learnt from the training documents only
train_mat = vectorizer.fit_transform(train_arr)
test_mat = vectorizer.transform(test_arr)

# Tf-idf re-weighting -- fitted on the training counts only
train_tfmat = tfidf.fit_transform(train_mat)
test_tfmat = tfidf.transform(test_mat)

In [None]:
# Naive Bayes is evaluated on the raw count matrices
nb_report = naive_bayes(train_mat, train_lbl, test_mat, test_lbl)
print(nb_report)
# Rocchio is evaluated on dense tf-idf matrices
rocchio_report = rocchio(train_tfmat.toarray(), train_lbl, test_tfmat.toarray(), test_lbl, n_class1)
print(rocchio_report)
# print(knn(train_tfmat, train_lbl, test_tfmat, test_lbl))