In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn import metrics

In [2]:
class MultinomialNaiveBayes():
  '''
  Multinomial Naive Bayes classifier with additive smoothing.

  Attributes set by fit():
    N, W: number of training documents and vocabulary size
    classes_: sorted unique labels seen in y (num_classes, )
    class_counts: number of training documents per class (num_classes, )
    log_class_priors: log P(C) (num_classes, )
    word_conditional_log_probs: log P(w|C) (num_classes, num_words)
  '''

  def __init__(self, alpha=0.01):
    # smoothing constant added to every per-class word count
    self.alpha = alpha


  def predict(self, X):
    '''
    Return the most likely class for every row of X.

    X: dense (N', W) array with the same W columns that were used in fit()
    returns: (N', ) array of labels taken from self.classes_
    '''
    # (N', W) . (W, num_classes) + (num_classes, ) -> (N', num_classes)
    log_likelihood = np.dot(X, self.word_conditional_log_probs.T) + self.log_class_priors
    best_class_index = np.argmax(log_likelihood, axis=1)

    # classes_ only exists after fit(); if the probabilities were populated by
    # calling the get_* helpers directly, fall back to plain column indices.
    classes = getattr(self, 'classes_', None)
    if classes is None:
      return best_class_index
    # translate column indices back to the labels the caller trained with
    return classes[best_class_index]


  def get_class_priors(self, y_encoded):
    '''
    Compute log P(C) from a one-hot label matrix.

    y_encoded: (N, num_classes) indicator matrix, one column per class
    '''
    # how many documents belong to each class (num_classes, )
    self.class_counts = np.sum(y_encoded, axis=0)

    # log class priors
    self.log_class_priors = np.log(self.class_counts) - np.log(self.class_counts.sum())


  def get_word_conditional_log_probs(self, X, y_encoded):
    '''
    Compute smoothed log P(w|C).

    X: dense (N, W) array of per-document word counts/weights
    y_encoded: (N, num_classes) indicator matrix, one column per class
    '''
    # for each class, how many times did a particular word occur (num_classes, num_words)
    word_counts_per_class = np.dot(y_encoded.T, X)

    # smoothen word_counts_per_class (num_classes, num_words)
    wcpc_laplace = word_counts_per_class + self.alpha

    # for each class, how many words occurred in total across all documents (num_classes, 1)
    total_wcpc = wcpc_laplace.sum(axis=1).reshape(-1, 1)

    # P(w/C) (num_classes, num_words)
    self.word_conditional_log_probs = np.log(wcpc_laplace) - np.log(total_wcpc)


  def fit(self, X, y):
    '''
    Estimate class priors and per-class word probabilities.

    N: number of documents
    W: number of words in vocabulary

    X: dense (N, W) array of per-document word counts/weights
    y: N class labels (any values np.unique can sort)
    returns: self
    raises: ValueError if X has no rows or len(y) != N
    '''
    self.N, self.W = X.shape
    labels = np.asarray(y).reshape(-1)
    if self.N == 0:
      raise ValueError('cannot fit on an empty training set')
    if labels.shape[0] != self.N:
      raise ValueError('X has %d rows but y has %d labels' % (self.N, labels.shape[0]))

    # One column per distinct label, including the two-class and one-class
    # cases, so priors and conditionals always have num_classes rows.
    self.classes_, class_index = np.unique(labels, return_inverse=True)
    y_encoded = np.zeros((self.N, self.classes_.size), dtype=int)
    y_encoded[np.arange(self.N), class_index] = 1

    self.get_class_priors(y_encoded)
    self.get_word_conditional_log_probs(X, y_encoded)
    return self

## Bag of Words

In [3]:
# Load both splits of the 20 Newsgroups corpus.
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

# Raw term counts; the vocabulary is learned from the training split only
# and capped at 50,000 features.
vectorizer = CountVectorizer(max_features=50000)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors_test = vectorizer.transform(newsgroups_test.data)

# The classifier needs dense input, hence the .toarray() calls.
# fit() returns the estimator itself, so construction and training chain.
clf = MultinomialNaiveBayes(alpha=.005).fit(
    vectors.toarray(),
    newsgroups_train.target,
)
pred = clf.predict(vectors_test.toarray())

# Held-out performance.
test_f1 = metrics.f1_score(newsgroups_test.target, pred, average='macro')
print('F1 Score : ', test_f1)
test_accuracy = np.mean(pred == newsgroups_test.target)
print('Accuracy : ', test_accuracy)

F1 Score :  0.7819812580122694
Accuracy :  0.7993892724375996


In [4]:
# Score the fitted model on the same documents it was trained on.
pred = clf.predict(vectors.toarray())
train_f1 = metrics.f1_score(newsgroups_train.target, pred, average='macro')
print('F1 Score : ', train_f1)
train_accuracy = np.mean(pred == newsgroups_train.target)
print('Accuracy : ', train_accuracy)

F1 Score :  0.9660375041002223
Accuracy :  0.9681810146720877


## TF-IDF

In [5]:
# Load both splits of the 20 Newsgroups corpus.
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

# TF-IDF weights with default settings; the vocabulary and IDF statistics
# are learned from the training split only.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors_test = vectorizer.transform(newsgroups_test.data)

# The classifier needs dense input, hence the .toarray() calls.
# fit() returns the estimator itself, so construction and training chain.
clf = MultinomialNaiveBayes(alpha=.005).fit(
    vectors.toarray(),
    newsgroups_train.target,
)
pred = clf.predict(vectors_test.toarray())

# Held-out performance.
test_f1 = metrics.f1_score(newsgroups_test.target, pred, average='macro')
print('F1 Score : ', test_f1)
test_accuracy = np.mean(pred == newsgroups_test.target)
print('Accuracy : ', test_accuracy)

F1 Score :  0.8270729497003412
Accuracy :  0.8325809877854488


In [6]:
# Score the fitted model on the same documents it was trained on.
pred = clf.predict(vectors.toarray())
train_f1 = metrics.f1_score(newsgroups_train.target, pred, average='macro')
print('F1 Score : ', train_f1)
train_accuracy = np.mean(pred == newsgroups_train.target)
print('Accuracy : ', train_accuracy)

F1 Score :  0.9978309568664306
Accuracy :  0.9977903482411172
