In [None]:
# Root folder of the BBC corpus; it is handed to sklearn's load_files() when the
# dataset is read (load_files is documented to treat each sub-folder as a class).
DATA_DIR = "./BBC/"

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score 

In [None]:
# Read every document under DATA_DIR into memory. Files are decoded as latin1
# and undecodable bytes are replaced rather than raising.
data = load_files(
    DATA_DIR,
    encoding="latin1",
    decode_error="replace",
)

In [None]:
# Count how many documents belong to each class label.
labels, counts = np.unique(data.target, return_counts=True)
# Maps class name -> number of documents of that class in the whole dataset.
class_dict = dict(zip(np.array(data.target_names)[labels], counts))

# Use the class names as bar positions so the chart is readable on its own and
# keeps working whatever the number of classes is (no hard-coded [1..5]).
class_names = [str(data.target_names[label]) for label in labels]

fig, ax = plt.subplots()
ax.bar(class_names, counts)
ax.set_title("Distribution of BBC Dataset")
ax.set_xlabel("classes")
ax.set_ylabel("number of instances")

fig.savefig("BBC-distribution.pdf")

In [None]:
# Bag-of-words representation of the whole corpus (English stop words removed).
vectorizer = CountVectorizer(stop_words="english", decode_error="ignore")
term_matrix = vectorizer.fit_transform(data["data"])
# NOTE: densifying the document-term matrix is memory hungry; it is kept only
# because the DataFrame below is built from it.
term_array = term_matrix.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer the replacement and fall back only on old installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per document, one column per vocabulary term.
df = pd.DataFrame(data=term_array, columns=feature_names)


In [None]:
# 80/20 split of the raw documents and their labels. random_state=None means
# the split differs from one kernel run to the next.
split_parts = train_test_split(
    data.data,
    data.target,
    train_size=0.8,
    test_size=0.2,
    random_state=None,
)
x_train, x_test, y_train, y_test = split_parts

In [None]:

# Opening in "w" mode creates the report file or empties an existing one,
# which is what the previous append-then-truncate(0) sequence achieved.
result_file = open("bbc-performance.txt", "w")


def Write(text):
    """Append ``text`` to the shared report file ``result_file``.

    Parameters
    ----------
    text : str
        Text to write verbatim (no newline is added).

    Raises
    ------
    SystemExit
        With exit status 1 if the write fails for any reason. The error is
        printed first and the file handle is closed so it does not leak.
    """
    try:
        result_file.write(text)
    except Exception as e:
        print("ERROR occurred when writing to file.")
        print(e)
        try:
            # Release the handle before aborting; closing an already closed
            # file is a no-op.
            result_file.close()
        finally:
            # Non-zero status: a bare sys.exit() would signal success.
            sys.exit(1)


In [None]:
def Test(test_num, x_train, x_test, y_train, y_test, smoothing):
    """Train a Multinomial Naive Bayes classifier and write its report.

    The report (sections a-k) is emitted through ``Write``: confusion matrix,
    classification report, accuracy / F1 scores, class priors, vocabulary size,
    word-token statistics and the log-probability of two chosen words.

    Parameters
    ----------
    test_num : int
        Sequence number of the run, used only in the report header.
    x_train, x_test : sequence of str
        Raw training / test documents.
    y_train, y_test : array-like of int
        Class labels aligned with ``x_train`` / ``x_test``.
    smoothing : float
        Value passed as ``alpha`` to ``MultinomialNB``. ``0`` means "use
        scikit-learn's default alpha".

    Notes
    -----
    * The vocabulary is built from the full corpus held in the module-level
      ``data``; class names are read from ``data.target_names``.
    * All word-token statistics (g-j) use the raw counts the model saw during
      ``fit`` (i.e. the training documents). They are independent of the
      smoothing value, and the fitted model is never modified.
    """
    # Fit the vocabulary only; the dense document-term array is not needed.
    vectorizer = CountVectorizer(stop_words="english", decode_error="ignore")
    vectorizer.fit(data["data"])
    # vocabulary_ maps term -> column index; it exists in every scikit-learn
    # version, unlike get_feature_names(), which was removed in 1.2.
    vocabulary = vectorizer.vocabulary_
    vocab_size = len(vocabulary)

    if smoothing != 0:
        classification = MultinomialNB(alpha=smoothing)
        model_label = "MultinomialNB with smoothing value " + str(smoothing) + ", try "
    else:
        classification = MultinomialNB()
        model_label = "MultinomialNB Default values, try "
    classification.fit(vectorizer.transform(x_train), y_train)

    y_pred = classification.predict(vectorizer.transform(x_test))

    # Row i of the model's per-class arrays belongs to classes_[i].
    class_names = [data.target_names[label] for label in classification.classes_]
    # Raw (unsmoothed) term counts per class, as accumulated during fit.
    token_counts = classification.feature_count_

    Write("\n *******************************************************************************************************************  \n a) \n " + model_label + str(test_num) +
     "\n Here is the confusion matrix: \n b) \n" + str(confusion_matrix(y_test, y_pred)) + "\n c) \n Here is the classification report: \n" + str(classification_report(y_test, y_pred)) + "\n")
    Write(" d) \n accuracy: " + str(accuracy_score(y_test, y_pred)) + "\n F1 macro average : " + str(f1_score(y_test, y_pred, average="macro")) + "\n F1 weighted average : " + str(f1_score(y_test, y_pred, average="weighted")))

    Write("\n e)")
    # Report the priors the classifier actually learned from the training labels.
    priors = np.exp(classification.class_log_prior_)
    for name, prior_prob in zip(class_names, priors):
        Write("\n Prior probability of " + name + ": " + str(prior_prob))

    Write("\n f) \nThe size of the vocabulary is: " + str(vocab_size))

    Write("\n g)")
    tokens_per_class = token_counts.sum(axis=1)
    for name, class_total in zip(class_names, tokens_per_class):
        Write("\nNumber of word tokens in the class " + name + ": " + str(class_total))

    Write("\n h)")
    total_wt = token_counts.sum()
    Write("\nNumber of word tokens in the entire corpus: " + str(total_wt))

    Write("\n i)")
    # Vocabulary terms never seen in a class, as a share of the vocabulary.
    zeros_per_class = (token_counts == 0).sum(axis=1)
    for name, nb_0 in zip(class_names, zeros_per_class):
        perc_0 = (nb_0 / vocab_size) * 100
        Write("\nNumber of words with frequency 0 in class " + name + ": " + str(int(nb_0)) + "\nPercentage: " + str(perc_0) + "%")

    Write("\n j)")
    # A word has corpus frequency 1 when its counts summed over all classes
    # equal 1 (not when a single class happens to contain it once).
    corpus_counts = token_counts.sum(axis=0)
    nb_1 = int((corpus_counts == 1).sum())
    perc_1 = (nb_1 / vocab_size) * 100
    Write("\nNumber of words with frequency 1 in the entire corpus: " + str(nb_1) + "\nPercentage: " + str(perc_1) + "%")

    def _word_log_prob(word):
        """Return log P(word) = log sum_c P(c) * P(word | c) as text."""
        column = vocabulary.get(word)
        if column is None:
            return "not in vocabulary"
        # log-sum-exp over classes; averaging the log-probabilities themselves
        # would not yield a log-probability.
        joint = classification.class_log_prior_ + classification.feature_log_prob_[:, column]
        return str(np.logaddexp.reduce(joint))

    Write("\n k) \n2 selected favorite words: woman and man. \nLog-prob of woman: " + _word_log_prob("woman") + "\nLog-prob of man: " + _word_log_prob("man"))
    
# (try number, smoothing) for each run. Tries 1 and 2 use identical settings on
# the same split; a smoothing of 0 selects MultinomialNB's default alpha.
runs = [(1, 0), (2, 0), (3, 0.0001), (4, 0.9)]
try:
    for test_num, smoothing in runs:
        Test(test_num, x_train, x_test, y_train, y_test, smoothing)
finally:
    # Always flush and release the report file, even if a run fails midway.
    result_file.close()
    