In [124]:
from sklearn.datasets import fetch_20newsgroups
from collections import Counter
import numpy as np
import pandas as pd
å

In [125]:
cats = [
    'comp.sys.ibm.pc.hardware', 
    'comp.sys.mac.hardware', 
    'rec.sport.baseball', 
    'rec.sport.hockey', 
    'talk.politics.guns'    
]

newsgroups_train = fetch_20newsgroups(subset='train', categories=cats, remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', categories=cats, remove=('headers', 'footers', 'quotes'))
print(newsgroups_train.target_names)

['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'rec.sport.baseball', 'rec.sport.hockey', 'talk.politics.guns']


In [126]:
# Create a list of stop words 
stop_words_list_filepath = './stopwords_long.txt'
stop_words_list = []

with open(stop_words_list_filepath, 'r') as f:
    stop_words_list = [word.strip() for word in f.readlines()]

In [127]:
# Remove escape sequences, remove punctuation, strip leading white spaces

import string

def remove_punctuation(input_string): 
    translator=str.maketrans(" ", " ", string.punctuation)
    return input_string.translate(translator)

def remove_escapes(input_string):
    escapes = ''.join([chr(char) for char in range(1,32)])
    translator = str.maketrans("", "", escapes)
    return input_string.translate(translator)

def remove_stop_words(input_string, stop_words_list): 
    tokens_list = input_string.split(' ')
    filtered_tokenized_list = [word for word in tokens_list if not word in stop_words_list
                              and len(word)>0
                              and not word.isnumeric()]
    return filtered_tokenized_list

def tokenize_text(raw_text, stop_words_list): 
    cleaned_text = remove_punctuation(remove_escapes(raw_text.strip().lower().replace('\n', ' ')))
    tokens_list = remove_stop_words(cleaned_text, stop_words_list)
    return tokens_list

In [128]:
def create_vocab(data, stop_words_list):
    # input is a list of text / strings
    vocab = []
    word_freq_dict = {}
    for d in data:
        curr_list = tokenize_text(d, stop_words_list)
        for word in curr_list:
            if word not in vocab:
                vocab.append(word)
            if word not in word_freq_dict:
                word_freq_dict[word] = 0
            word_freq_dict[word]+=1
    wf_counter = Counter(word_freq_dict)
    return wf_counter

def create_bernoulli_vector(raw_text, vocab, stop_words_list):
    bernoulli_vector = np.zeros((len(vocab),1)) # vector for the document (|V| x 1)
    tokens_list = tokenize_text(raw_text, stop_words_list) # tokenize the text
    for w in tokens_list:
        if w in vocab:
            bernoulli_vector[vocab.index(w)] = 1
    return bernoulli_vector

In [131]:
 class BClassifier:
    #naive Bayes Bernoulli Classifier (Question set 2)
    def __init__(self,data, targets, k, stop_words_list):
        self.k = k

        self.wf_counter = create_vocab(data, stop_words_list)
        self.most_common_k_words = self.wf_counter.most_common(k)
        self.vocab = [w for w, f in self.most_common_k_words]
        self.bvs = []
        self.targets = []
        i = 0
        for d in data:
            v = create_bernoulli_vector(d, self.vocab, stop_words_list)
            if v.sum() > 0:
                self.targets.append(targets[i])
                self.bvs.append(v)
            i = i + 1
        self.class_frequencies = Counter(self.targets)
        self.num_classes = len(self.class_frequencies)
        self.class_priors = [self.class_frequencies[class_idx] / float(self.num_classes) for class_idx in range(self.num_classes)]
        size_vocab = len(self.vocab)
        self.thetas = np.zeros((size_vocab, self.num_classes), float)
    
        for vec, tgt in zip(self.bvs, self.targets):
            self.thetas[:, tgt] += np.squeeze(vec)
        # laplace smoothing
        self.thetas = self.thetas + 1
        for i in range(self.num_classes):
            self.thetas[:,i] = self.thetas[:, i] / (self.class_frequencies[i] + 2)
    
    #Question 2B -- predict function
    def predict(self, doc, stop_words_list):
    
        tbv = create_bernoulli_vector(doc, list(self.vocab), stop_words_list)
        scores = np.zeros(self.num_classes)
        
        for class_idx in range(self.num_classes):
            prior = self.class_priors[class_idx]
            scores[class_idx] = np.log(prior)
            #t0 = datetime.datetime.now()
            #print('class %d @ %s' % (class_idx, t0.timestamp()))
            #dscores = np.sum(np.multiply(tbv_c, np.log(1.0-pbi))) + np.sum(np.multiply(tbv, np.log(pbi)))
            #scores[class_idx] += dscores
            for word_idx in range(len(self.vocab)):
                if tbv[word_idx] != 1:
                    scores[class_idx] = scores[class_idx] +np.log(1.0-self.thetas[word_idx,class_idx])
                else:
                    scores[class_idx] = scores[class_idx] + np.log(self.thetas[word_idx, class_idx])
        #t1 = datetime.datetime.now()
        #print('class %d @ %s' % (class_idx, t1.timestamp()))
        c = np.argmax(scores)
        return c
    
    

In [132]:
x = BClassifier(newsgroups_train.data, newsgroups_train.target, 10, stop_words_list)

In [133]:
ks = [10,100,1000, 10000]
ks = [10,100,1000, 10000]
bnbcs = {}
for k in ks:
    bnbcs[k]= BClassifier(newsgroups_train.data, newsgroups_train.target, k, stop_words_list)


In [113]:
print('Bernoulli Classifier Question 2A')
bnbc10 = bnbcs[10]
print(bnbc10.num_classes)
print(bnbc10.most_common_k_words)
for t in range(bnbc10.num_classes):
    print(t)
    df = pd.DataFrame(columns=['word', 'class_likelihood'])
    df['word'] = bnbc10.vocab
    df['class_likelihood'] = bnbc10.thetas[:,t]
    tops = df.sort_values(by='class_likelihood', ascending=False).head(10)
    print('Topic ', cats[t])
    display(tops)



Bernoulli Classifier Question 2A
5
[('drive', 713), ('team', 631), ('well', 630), ('time', 630), ('people', 620), ('game', 619), ('year', 591), ('gun', 499), ('play', 432), ('games', 428)]
0
Topic  comp.sys.ibm.pc.hardware


Unnamed: 0,word,class_likelihood
0,drive,0.510288
3,time,0.345679
2,well,0.213992
4,people,0.135802
6,year,0.057613
5,game,0.032922
9,games,0.028807
8,play,0.020576
1,team,0.004115
7,gun,0.004115


1
Topic  comp.sys.mac.hardware


Unnamed: 0,word,class_likelihood
0,drive,0.433333
2,well,0.280952
3,time,0.27619
4,people,0.228571
6,year,0.104762
8,play,0.052381
9,games,0.038095
5,game,0.02381
1,team,0.004762
7,gun,0.004762


2
Topic  rec.sport.baseball


Unnamed: 0,word,class_likelihood
6,year,0.402332
5,game,0.311953
1,team,0.285714
2,well,0.276968
9,games,0.271137
3,time,0.25656
8,play,0.180758
4,people,0.12828
0,drive,0.020408
7,gun,0.005831


3
Topic  rec.sport.hockey


Unnamed: 0,word,class_likelihood
1,team,0.44686
5,game,0.398551
8,play,0.299517
2,well,0.2657
9,games,0.253623
3,time,0.243961
6,year,0.243961
4,people,0.115942
0,drive,0.009662
7,gun,0.004831


4
Topic  talk.politics.guns


Unnamed: 0,word,class_likelihood
4,people,0.525
7,gun,0.440625
2,well,0.384375
3,time,0.29375
6,year,0.15625
0,drive,0.0375
5,game,0.03125
1,team,0.01875
8,play,0.01875
9,games,0.009375


In [114]:
def test_precision_recall(ytrue,ypred,average = None):
    from sklearn import metrics
    if average is not None:
        precision = metrics.precision_score(ytrue,ypred,average=average)
        recall    = metrics.recall_score(ytrue,ypred,average=average)
    else:
        precision = metrics.precision_score(ytrue,ypred,average=None)
        recall    = metrics.recall_score(ytrue,ypred,average=None)
    return (precision,recall)

def test_conf_matrix(ytrue,ypred):
        from sklearn.metrics import confusion_matrix
        return confusion_matrix(ytrue, ypred)

In [115]:
class PerfMetrics:
    def __init__(self,predicted_labels, true_labels, num_classes):
        self.ypred = predicted_labels
        self.ytrue = true_labels
        self.N = num_classes
        self.cm = self.calculate_confusion_matrix()
    
    def calculate_confusion_matrix(self):
        confusion_matrix = np.zeros((self.N, self.N), dtype=int)
        for pred_label, true_label in zip(self.ypred, self.ytrue):
            confusion_matrix[true_label, pred_label] += 1
        return confusion_matrix
    
    def precision_recall(self,average):
            if average is None:
                return self.precision_recall()
            elif average == 'micro':
                return self.micro_average_precision_recall()
            elif average == 'macro':
                return self.macro_average_precision_recall()            
            elif average == 'weighted':
                return self.weighted_average_precision_recall()
            else:
                raise ValueError('bad average')
                
    def precision_recall(self):
        precision = np.zeros(self.N)
        recall = np.zeros(self.N)
        for class_idx in range(self.N):
            true_positive = self.cm[class_idx, class_idx]
            false_positive = np.sum(self.cm[:, class_idx]) - true_positive
            false_negative = np.sum(self.cm[class_idx, :]) - true_positive
            precision[class_idx] = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
            recall[class_idx] = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0

        return precision, recall

    def micro_average_precision_recall(self):
        
        total_true_positives = np.sum(np.diag(self.cm))
        total_false_positives = np.sum(self.cm.sum(axis=0) - np.diag(self.cm))
        total_false_negatives = np.sum(self.cm.sum(axis=1) - np.diag(self.cm))

        micro_precision = total_true_positives / (total_true_positives + total_false_positives) if (total_true_positives + total_false_positives) > 0 else 0
        micro_recall = total_true_positives / (total_true_positives + total_false_negatives) if (total_true_positives + total_false_negatives) > 0 else 0

        return micro_precision, micro_recall

    def macro_average_precision_recall(self,precision, recall):
        macro_precision = np.mean(precision)
        macro_recall = np.mean(recall)
        return macro_precision, macro_recall

    def weighted_macro_average_precision_recall(self,precision, recall):
        class_counts = np.bincount(self.ytrue, minlength=self.N)
        weights = class_counts / np.sum(class_counts)

        weighted_macro_precision = np.sum(precision * weights)
        weighted_macro_recall = np.sum(recall * weights)
        return weighted_macro_precision, weighted_macro_recall



In [116]:
print('Bernoulli Classifier Question 2C')
num_test_docs = len(newsgroups_test.data)
for k,bnbc in bnbcs.items():
    print('Vocab Size', k)
    predicted_labels = []
    right=0
    for i in range(num_test_docs):
        test_doc = newsgroups_test.data[i]
        true_label = newsgroups_test.target[i]
        predicted_label = bnbc.predict(test_doc, stop_words_list)
        predicted_labels.append(predicted_label)
        if predicted_label == true_label:
            right+=1
        if False and (i % 20 == 0 and i > 0):
            print('%d right %d / %d done %f f: %f'  % (i,right, num_test_docs, i/num_test_docs, right/i))
    print('got right %d / %d %f' %  (right, num_test_docs, 1.0*right/num_test_docs))
    if False:
        print('sklearn test conf matrix %d' % (k))
        print(test_conf_matrix(newsgroups_test.target, predicted_labels))
        avgs = [None,'micro','macro', 'weighted']
        for a in avgs:
            (precision,recall) = test_precision_recall(newsgroups_test.target, predicted_labels,a)
            print('sklearn k %d a %s' % (k, a))
            print('sklearn precision')
            print(precision)
            print('sklearn recall')
            print(recall)
    pf = PerfMetrics(predicted_labels, newsgroups_test.target, bnbc.num_classes)
    print('pf confusion matrix %d' %(k))
    print(pf.cm)
    for a in avgs:
        (precision,recall) = test_precision_recall(newsgroups_test.target, predicted_labels,a)
        print('Ariel k %d a %s' % (k, a))
        print('Ariel precision')
        print(precision)
        print('Ariel recall')
        print(recall)
    


Bernoulli Classifier Question 2C
Vocab Size 10
got right 753 / 1937 0.388745
pf confusion matrix 10
[[331  12   4   4  41]
 [326   3   6   6  44]
 [158   2  78 129  30]
 [119   1  63 192  24]
 [206   1   5   3 149]]
Ariel k 10 a None
Ariel precision
[0.29035088 0.15789474 0.5        0.5748503  0.51736111]
Ariel recall
[0.84438776 0.00779221 0.19647355 0.48120301 0.40934066]
Ariel k 10 a micro
Ariel precision
0.38874548270521425
Ariel recall
0.38874548270521425
Ariel k 10 a macro
Ariel precision
0.4080914049094793
Ariel recall
0.3878394362781969
Ariel k 10 a weighted
Ariel precision
0.4082559274390201
Ariel recall
0.38874548270521425
Vocab Size 100
got right 1185 / 1937 0.611771
pf confusion matrix 100
[[221 147   8   0  16]
 [ 55 297  21   2  10]
 [  5  74 242  49  27]
 [  4  53 127 204  11]
 [  6  93  41   3 221]]
Ariel k 100 a None
Ariel precision
[0.75945017 0.44728916 0.55125285 0.79069767 0.7754386 ]
Ariel recall
[0.56377551 0.77142857 0.60957179 0.5112782  0.60714286]
Ariel k 100

In [144]:
class MClassifier:
    
    def __init__(self,data, targets, k, stop_words_list):

        self.k = k
        vocab = create_vocab(data, stop_words_list)
        most_common_k_words = vocab.most_common(k)
        self.vocab = [w for w, f in most_common_k_words]
        class_frequencies = Counter(targets)
        self.num_classes = len(class_frequencies)
        self.class_priors = [class_frequencies[class_idx] / float(self.num_classes) for class_idx in range(self.num_classes)]        

    def fit(self, data, targets,stop_words_list):
        # Create empty dictionary where the keys will be class indices and values are empty Counter objects.
        # This will store word counts for each class.
        word_counts_per_class = {class_idx: Counter() for class_idx in range(self.num_classes)}
        # Iterate through each document with its corresponding label. Tokenize each doc and then 
        # update the word counts after tokenizing
        for doc, label in zip(data, targets):
            tokens_list = tokenize_text(doc, stop_words_list)
            word_counts_per_class[label].update(tokens_list)

        self.class_word_probs = {class_idx: [] for class_idx in range(self.num_classes)}
        for class_idx in range(self.num_classes):
            total_word_count_in_class = sum(word_counts_per_class[class_idx].values())
            for word in self.vocab:
                wc = word_counts_per_class[class_idx][word]
                self.class_word_probs[class_idx].append( (wc + 1) / (total_word_count_in_class + len(self.vocab)) )

    def predict0(self,test_doc, stop_words_list):
        tokens_list = tokenize_text(test_doc, stop_words_list)
        test_document_bow = Counter(tokens_list)
        
        class_likelihoods = [0.0] * self.num_classes
        for class_idx in range(self.num_classes):
            class_likelihood = 0.0
            j = 0
            for word_prob in self.class_word_probs[class_idx]:
                word = self.vocab[j]
                word_count_in_test_doc = test_document_bow[word]
                class_likelihood += word_count_in_test_doc * np.log(word_prob)
                j=j+1
            class_likelihoods[class_idx] = class_likelihood
        predicted_topic_idx = np.argmax(class_likelihoods)
        return predicted_topic_idx

    def predict(self,test_doc, stop_words_list):
        tokens_list = tokenize_text(test_doc, stop_words_list)
        word_index = []
        for word in tokens_list:
            if word in self.vocab:
                word_index.append(self.vocab.index(word))
        #test_document_bow = Counter(tokens_list)        
        scores = np.log(self.class_priors)
        for class_idx in range(self.num_classes):
            ss = 0.0
            j = 0
            #for word_prob in self.class_word_probs[class_idx]:
            for wi in word_index:    
                word_prob = self.class_word_probs[class_idx][wi]
                ss += np.log(word_prob)
                j=j+1
            scores[class_idx] += ss
        predicted_topic_idx = np.argmax(scores)
        return predicted_topic_idx
    


In [145]:
ms = [10,100,1000, 10000]
ms = [10,100,1000,10000]
mcs = {}
for m in ms:
    mcs[m]= MClassifier(newsgroups_train.data, newsgroups_train.target, m, stop_words_list)
    mcs[m].fit(newsgroups_train.data, newsgroups_train.target, stop_words_list)


In [146]:
print('Multinomial Classifier Question 3B')
mc10 = mcs[10]
print(mc10.num_classes)
for t in range(mc10.num_classes):
    print(t)
    df = pd.DataFrame(columns=['word', 'class_likelihood'])
    df['word'] = mc10.vocab
    #print(mc10.class_word_probs[t])
    df['class_likelihood'] = mc10.class_word_probs[t]
    tops = df.sort_values(by='class_likelihood', ascending=False).head(10)
    print('Topic ', cats[t])
    display(tops)



Multinomial Classifier Question 3B
5
0
Topic  comp.sys.ibm.pc.hardware


Unnamed: 0,word,class_likelihood
0,drive,0.015196
3,time,0.004176
2,well,0.001978
4,people,0.001413
6,year,0.000565
5,game,0.000502
9,games,0.000251
8,play,0.00022
1,team,3.1e-05
7,gun,3.1e-05


1
Topic  comp.sys.mac.hardware


Unnamed: 0,word,class_likelihood
0,drive,0.008185
2,well,0.003187
3,time,0.003069
4,people,0.002322
6,year,0.001062
8,play,0.000708
9,games,0.00063
5,game,0.000236
1,team,3.9e-05
7,gun,3.9e-05


2
Topic  rec.sport.baseball


Unnamed: 0,word,class_likelihood
6,year,0.010058
5,game,0.00703
1,team,0.006669
9,games,0.005371
2,well,0.005227
3,time,0.004326
8,play,0.002992
4,people,0.002055
0,drive,0.000252
7,gun,7.2e-05


3
Topic  rec.sport.hockey


Unnamed: 0,word,class_likelihood
1,team,0.009152
5,game,0.008239
8,play,0.006703
9,games,0.005333
6,year,0.003964
3,time,0.003237
2,well,0.003134
4,people,0.001619
0,drive,8.3e-05
7,gun,4.2e-05


4
Topic  talk.politics.guns


Unnamed: 0,word,class_likelihood
7,gun,0.010329
4,people,0.008006
2,well,0.004044
3,time,0.00307
6,year,0.00168
0,drive,0.000311
5,game,0.000207
1,team,0.000166
8,play,0.000124
9,games,6.2e-05


In [147]:
print('Multinomial Classifier Question 3C')
num_test_docs = len(newsgroups_test.data)
for k,mc in mcs.items():
    print('\nVocab Size', k)
    predicted_labels = []
    right=0
    for i in range(num_test_docs):
        test_doc = newsgroups_test.data[i]
        true_label = newsgroups_test.target[i]
        predicted_label = mc.predict(test_doc, stop_words_list)
        predicted_labels.append(predicted_label)
        if predicted_label == true_label:
            right+=1
        if False and (i % 20 == 0 and i > 0):
            print('%d right %d / %d done %f f: %f'  % (i,right, num_test_docs, i/num_test_docs, right/i))
    print('vocab %d got right %d / %d %f' %  (k, right, num_test_docs, 1.0*right/num_test_docs))
    if False:
        print('sklearn confusion matrix %d' % (k))
        print(test_conf_matrix(newsgroups_test.target, predicted_labels))
        avgs = [None,'micro','macro', 'weighted']
        for a in avgs:
            (precision,recall) = test_precision_recall(newsgroups_test.target, predicted_labels,a)
            print('sklearn k %d a %s' % (k, a))
            print('sklearn precision')
            print(precision)
            print('sklearn recall')
            print(recall)
    pf = PerfMetrics(predicted_labels, newsgroups_test.target, bnbc.num_classes)
    print('Ariel confusion matrix %d' %(k))
    print(pf.cm)
    for a in avgs:
        (precision,recall) = test_precision_recall(newsgroups_test.target, predicted_labels,a)
        print('Ariel k %d a %s' % (k, a))
        print('Ariel precision')
        print(precision)
        print('Ariel recall')
        print(recall)
    
    
    
    
    
 

Multinomial Classifier Question 3C

Vocab Size 10
vocab 10 got right 618 / 1937 0.319050
Ariel confusion matrix 10
[[ 85   4  46 240  17]
 [ 48   1  53 268  15]
 [  3   1 153 227  13]
 [  0   0 127 259  13]
 [  0   0  61 183 120]]
Ariel k 10 a None
Ariel precision
[0.625      0.16666667 0.34772727 0.22005098 0.6741573 ]
Ariel recall
[0.21683673 0.0025974  0.38539043 0.64912281 0.32967033]
Ariel k 10 a micro
Ariel precision
0.31905007743933916
Ariel recall
0.31905007743933916
Ariel k 10 a macro
Ariel precision
0.4067204439650098
Ariel recall
0.3167235404381481
Ariel k 10 a weighted
Ariel precision
0.40289519474105784
Ariel recall
0.31905007743933916

Vocab Size 100
vocab 100 got right 1148 / 1937 0.592669
Ariel confusion matrix 100
[[257  87  16  29   3]
 [101 201  22  51  10]
 [ 12  11 268  94  12]
 [  9   7 150 222  11]
 [ 24  15  58  67 200]]
Ariel k 100 a None
Ariel precision
[0.63771712 0.62616822 0.52140078 0.47948164 0.84745763]
Ariel recall
[0.65561224 0.52207792 0.67506297 0.55

In [None]:
# Q 4 


# I would choose the Multinomial model, as the accuracy is slightly higher. In addition, Bermoulli isn't really meant for 
# working with non biniomial data, such as document classifier. Had this have been an email classifier where the only classifications
# would be email or spam, the binomial model would have been more appropiate.


# Q 5

# The performance of the Multinomial model is much better than the bernoulli model. Beroulli can still be used for document classification 
# and its performance isn't a lot worse. 


