# Bayes Theorem Report CS361 A3
#### UPI: ytia165, ID: 402799865
*****


It should explain and motivate the chosen representation & data preprocessing, then the method extensions, their implementation (including the implementation of the standard Naive Bayes) and the performance (training and validation) results for the standard and extended Naive Bayes method; Please comment how did you perform model evaluation/validation.
You can summarize results using tables (or plots), but all results have to be explained descriptively as well.

# Code for Classifier Based on Text Inputs
*****

In [1]:
import csv

import numpy as np

### Reading in CSV files

In [2]:
classes = []
train_text = []
with open("trg.csv") as input_csv:
    reader = csv.reader(input_csv, delimiter=",", quotechar='"')
    next(reader)
    for row in reader:
        classes.append(row[1])
        train_text.append(row[2])
        
# change categorical classes to numeric class
unique_classes = sorted(set(classes))
class_to_id = {x: unique_classes.index(x) for x in unique_classes}
id_to_class = dict([(value, key) for key, value in class_to_id.items()])
classes_numeric = np.array([class_to_id[x] for x in classes])

### Bayes Theorem and Validation functions 

In [3]:
def train_test_split(x, y, test_split):
    '''
    Returns training and test set 

            Parameters (in this instance):
                    x (list/numpy.ndarry): A list of explanatory variables
                    y (list/numpy.ndarry): A list of response variables 
                    test_split (int): The proprotion cutoff of test to train sets

            Returns:
                    x[train], x[test] (numpy.ndarry): explanatory training and test set
                    y[train], y[test] (numpy.ndarry): response training and test set
    '''
    n_test = int(test_split * len(x))
    x, y = np.array(x), np.array(y)
    perm = np.random.default_rng(seed = 402799865).permutation(len(x))
    test, train = perm[:n_test], perm[n_test:]
    
    return x[train], x[test], y[train], y[test]

In [4]:
def class_prob(y_train):
    '''Returns the probability of a class (format: dictionary)'''
    unique, counts = np.unique(y_train, return_counts=True)
    dicY_counts = dict(zip(unique, counts))
    dicY_prob = {}
    for key, value in dicY_counts.items():
        dicY_prob[key] = value/sum(dicY_counts.values())
        
    return dicY_prob

In [5]:
def dictionary(x_train, y_train, stopwords = []):
    '''
    Returns all given words and words subsetted by class

            Parameters (in this instance):
                    x_train (numpy.ndarry): A list of explanatory variables
                    y_train (numpy.ndarry): A list of response variables 
                    stopwords (list): Words to not include in the dictionary

            Returns:
                    dictWordsFull (dictionary): All words 
                    listdict (dictionaries in list): All words subsetted by class
    '''
    dictWordsFull = {}
    listdict = [{} for _ in range(len(unique_classes))]
    for array in range(0, len(x_train)-1):
        for word in x_train[array].split(): 
            if word not in stopwords:
                if word not in dictWordsFull:
                    dictWordsFull[word] = 1  
                else:
                    dictWordsFull[word] += 1

                if word not in listdict[y_train[array]]:
                    listdict[y_train[array]][word] = 1
                else:
                    listdict[y_train[array]][word] += 1
                    
    return dictWordsFull, listdict

In [22]:
def dictionary_prob_log(dictWordsFull, listdict):
    '''Returns the probability of a word appearing given the class (format: dictionaries in list)'''
    listdictprob = [{} for _ in range(len(unique_classes))]
    for key in dictWordsFull:
        for classi in range(0,len(unique_classes)):
            if key not in listdict[classi]:
                listdictprob[classi][key] = 1/(sum(listdict[classi].values())+len(dictWordsFull))
            else:
                listdictprob[classi][key] = (listdict[classi][key]+1)/(sum(listdict[classi].values())+len(dictWordsFull))
                
    return listdictprob

In [21]:
def class_calculate(words, listdictprob, dicY_prob):
    '''Returns the most likely class based on Baysean Probability'''
    classes = [[prob] for prob in dicY_prob.values()]
    for word in words.split():
        for classi in range(0,len(unique_classes)):
            if word in listdictprob[classi]:
                classes[classi].append(listdictprob[classi][word])
    
    maxSum = []
    for lst in classes:
        maxSum.append(np.prod(lst))

    return maxSum.index(max(maxSum))

In [8]:
def test_accuracy(x_test, listdictprob, dicY_prob, y_test):
    '''Returns accuracy based on x and y test sets'''
    test_array = []
    for array in x_test:
        test_class = class_calculate(array, listdictprob, dicY_prob)
        test_array.append(test_class)
    test_array_class = np.array([id_to_class[c] for c in test_array])
    
    return sum(1 for x,y in zip(test_array,y_test) if x == y) / len(test_array)

In [9]:
def classi_identifier_test(string, classi, test_split = 0.2):
    '''
    Returns accuracy of model

            Parameters (in this instance):
                    string (list/numpy.ndarry): A list of explanatory variables
                    classi (list/numpy.ndarry): A list of response variables 

            Returns:
                    accuracy (int): The accuracy of the model
    '''
    x_train, x_test, y_train, y_test = train_test_split(string, classi, test_split)
    dicY_prob = class_prob(y_train)
    dictWordsFull, listdict = dictionary(x_train, y_train)
    listdictprob = dictionary_prob_log(dictWordsFull, listdict)
    accuracy = test_accuracy(x_test, listdictprob, dicY_prob, y_test)
    
    return accuracy

In [10]:
print(classi_identifier_test(train_text, classes_numeric))

0.94


In [11]:
def classi_identifier_test_kfold(string, classi, fold = 10):
    '''Returns accuracy of model based on k-fold cross validation'''
    accuracy_lst = []
    perm = np.random.default_rng(seed = 402799865).permutation(len(string))
    chunks = [perm[x:x+int(len(perm)/fold)] for x in range(0, len(perm), int(len(perm)/fold))]
    string, classi = np.array(string), np.array(classi)
    
    for i in range(0,fold):
        training_lst = chunks[:i] + chunks[i+1:]
        training_lst_concat = [j for i in training_lst for j in i]
        string_train, classi_train = string[training_lst_concat], classi[training_lst_concat]
        string_test, classi_test = string[chunks[i]], classi[chunks[i]]
        
        dicY_prob = class_prob(classi_train)
        dictWordsFull, listdict = dictionary(string_train, classi_train)
        listdictprob = dictionary_prob_log(dictWordsFull, listdict)
        accuracy = test_accuracy(string_test, listdictprob, dicY_prob, classi_test)
        accuracy_lst.append(accuracy)
        
    return accuracy_lst, np.mean(accuracy_lst)

In [23]:
print(classi_identifier_test_kfold(train_text, classes_numeric))

([0.2075, 0.185, 0.2175, 0.18, 0.1875, 0.1925, 0.185, 0.17, 0.19, 0.195], 0.191)


# Training with Entire Dataset

In [13]:
abstracts = []
with open("tst.csv") as input_csv:
    reader = csv.reader(input_csv, delimiter=",", quotechar='"')
    next(reader)
    for row in reader:
        abstracts.append(row[1])

In [14]:
def class_output(xpreds, listdictprob, dicY_prob):
    '''Class Identifier but for full dataset'''
    lst = []
    for array in xpreds:
        class_pred = class_calculate(array, listdictprob, dicY_prob)
        lst.append(class_pred)
    lst2 = np.array([id_to_class[c] for c in lst])
    
    return lst2

In [15]:
def classi_identifier_full(string, classi, xpreds, to_csv = False, stopw = []):
    '''MAIN Class Identifier but for full dataset'''
    dicY_prob = class_prob(classi)
    dictWordsFull, listdict = dictionary(string, classi, stopwords)
    listdictprob = dictionary_prob_log(dictWordsFull, listdict)
    output = class_output(xpreds, listdictprob, dicY_prob)
    if to_csv == True:
        np.savetxt("ytia165_CS361_A3_PREDICTIONS2.csv", output, delimiter =", ", fmt ='% s')
        
    return output

In [16]:
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

In [17]:
first_try = classi_identifier_full(train_text, classes_numeric, abstracts, to_csv = False)

### Incorporating stopwords

In [18]:
second_try = classi_identifier_full(train_text, classes_numeric, abstracts, to_csv = False, stopw = stopwords)

### Complement Naive Bayes

In [19]:
def dictionary_prob_complement(dictWordsFull, listdict):
    '''Returns the probability of a word appearing given the class (format: dictionaries in list)'''
    listdictprob = [{} for _ in range(len(unique_classes))]
    for key in dictWordsFull:
        for classi in range(0,len(unique_classes)):
            listdictprob[classi][key] = 1/((dictWordsFull[key]-listdict[classi][key]+1)/(sum(dictWordsFull.values())-sum(listdict[classi].values())))
                
    return listdictprob

In [20]:
def class_calculate_mult(words, listdictprob, dicY_prob):
    '''Returns the most likely class based on Baysean Probability'''
    classes = [[prob] for prob in dicY_prob.values()]
    for word in words.split():
        for classi in range(0,len(unique_classes)):
            if word in listdictprob[classi]:
                classes[classi].append(listdictprob[classi][word])
    
    minSum = []
    for lst in classes:
        minSum.append(np.prod(lst))

    return minSum.index(min(minSum))