In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.preprocessing import LabelBinarizer
from sklearn import metrics

In [2]:
class MultinomialNaiveBayes():
  '''
  Multinomial Naive Bayes classifier with additive smoothing.

  Parameters
  ----------
  alpha : float, default 0.01
      Smoothing constant added to every per-class word count before
      normalising, so words unseen in a class do not get log(0).

  Attributes set by `fit`
  -----------------------
  N, W : int
      Number of training documents and number of words (columns of X).
  classes_ : ndarray (num_classes,)
      Sorted unique labels found in y; column k of every per-class array
      below refers to classes_[k].
  class_counts : ndarray (num_classes,)
      Number of training documents per class.
  log_class_priors : ndarray (num_classes,)
      log P(C).
  word_conditional_log_probs : ndarray (num_classes, num_words)
      log P(w | C).
  '''

  def __init__(self, alpha=0.01):
    self.alpha = alpha


  def fit(self, X, y):
    '''
    Estimate class priors and per-class word distributions.

    X: dense array (N, W) of non-negative word counts / weights
    y: array-like (N,) of class labels (any sortable dtype)

    N: number of documents
    W: number of words

    Returns self so calls can be chained.
    Raises ValueError if y does not have one label per row of X.
    '''
    self.N, self.W = X.shape

    # Sorted unique labels plus, for each document, the index of its label.
    self.classes_, class_index = np.unique(np.asarray(y).ravel(), return_inverse=True)
    if class_index.shape[0] != self.N:
      raise ValueError(
        'X has %d rows but y has %d labels' % (self.N, class_index.shape[0])
      )

    # One-hot encode by hand: one column per class, including the two-class
    # case (where sklearn's LabelBinarizer would emit a single column and the
    # model would silently collapse to one class).
    y_encoded = np.zeros((self.N, self.classes_.shape[0]), dtype=np.int64)
    y_encoded[np.arange(self.N), class_index.ravel()] = 1

    self.get_class_priors(y_encoded)
    self.get_word_conditional_log_probs(X, y_encoded)
    return self


  def predict(self, X):
    '''
    Return the most probable class label for every row of X.

    X: dense array (N', W) with the same columns used in `fit`.
    '''
    # (N', W) . (W, num_classes) + (num_classes,) -> (N', num_classes)
    log_likelihood = np.dot(X, self.word_conditional_log_probs.T) + self.log_class_priors
    best = np.argmax(log_likelihood, axis=1)

    # Map column indices back to the original labels. If the two helper
    # methods were called directly instead of `fit`, no label mapping exists,
    # so fall back to returning the raw column indices.
    classes = getattr(self, 'classes_', None)
    return best if classes is None else classes[best]


  def get_class_priors(self, y_encoded):
    '''
    Compute log P(C) from a one-hot label matrix (num_documents, num_classes).
    '''
    # how many documents belong to each class (num_classes, )
    self.class_counts = np.sum(y_encoded, axis=0)

    # log class priors
    self.log_class_priors = np.log(self.class_counts) - np.log(self.class_counts.sum())


  def get_word_conditional_log_probs(self, X, y_encoded):
    '''
    Compute smoothed log P(w | C) from X (num_documents, num_words) and a
    one-hot label matrix (num_documents, num_classes).
    '''
    # for each class, how many times did a particular word occur (num_classes, num_words)
    word_counts_per_class = np.dot(y_encoded.T, X)

    # smooth word_counts_per_class (num_classes, num_words)
    wcpc_laplace = word_counts_per_class + self.alpha

    # for each class, smoothed total of all word occurrences (num_classes, 1)
    total_wcpc = wcpc_laplace.sum(axis=1).reshape(-1, 1)

    # log P(w|C) (num_classes, num_words)
    self.word_conditional_log_probs = np.log(wcpc_laplace) - np.log(total_wcpc)

In [3]:
# Load both splits and build bag-of-words counts, capped at 10,000 vocabulary entries.
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
vectorizer = CountVectorizer(max_features=10000)
# The vocabulary is learned from the train split only; the test split is merely transformed.
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors_test = vectorizer.transform(newsgroups_test.data)

# Fit on the dense train counts (fit returns the classifier itself), then score the test split.
clf = MultinomialNaiveBayes(alpha=0.005).fit(vectors.toarray(), newsgroups_train.target)
pred = clf.predict(vectors_test.toarray())
print('F1 Score : ', metrics.f1_score(y_true=newsgroups_test.target, y_pred=pred, average='macro'))
print('Accuracy : ', np.mean(pred == newsgroups_test.target))

F1 Score :  0.7520107714903224
Accuracy :  0.7720392989909719


In [5]:
# Score the training split as well, to compare against the test-split numbers.
# The result gets its own name so that `pred` keeps holding the test-split
# predictions, which the later confusion-matrix cell compares against
# newsgroups_test.target (overwriting `pred` here makes that cell fail on a
# top-to-bottom run because the lengths no longer match).
pred_train = clf.predict(vectors.toarray())
print('F1 Score : ', metrics.f1_score(newsgroups_train.target, pred_train, average='macro'))
print('Accuracy : ', (pred_train == newsgroups_train.target).mean())

F1 Score :  0.9113979028339518
Accuracy :  0.9244299098462082


In [4]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [8]:
# (documents, vocabulary size) of the train and test document-term matrices
a = vectors.shape
b = vectors_test.shape
(a, b)

((11314, 10000), (7532, 10000))

In [9]:
# total number of documents across the train and test splits
sum(shape[0] for shape in (a, b))

18846

In [14]:
# Lookup table: class index -> newsgroup name
pd.Series(data=newsgroups_train.target_names)

0                  alt.atheism
1                comp.graphics
2      comp.os.ms-windows.misc
3     comp.sys.ibm.pc.hardware
4        comp.sys.mac.hardware
5               comp.windows.x
6                 misc.forsale
7                    rec.autos
8              rec.motorcycles
9           rec.sport.baseball
10            rec.sport.hockey
11                   sci.crypt
12             sci.electronics
13                     sci.med
14                   sci.space
15      soc.religion.christian
16          talk.politics.guns
17       talk.politics.mideast
18          talk.politics.misc
19          talk.religion.misc
dtype: object

In [35]:
# Number of training documents per class index.
y = pd.Series(newsgroups_train.target)
df = pd.DataFrame(y.value_counts()).reset_index()
df.columns = ['class', 'number of documents']

# One bar per class, labelled with its document count and coloured by class index.
# (The earlier px.bar(...) call was dropped: its figure was neither displayed nor kept.)
fig = go.Figure(
    data=[
        go.Bar(
            x=df['class'],
            y=df['number of documents'],
            text=df['number of documents'],
            textposition='auto',
            marker={'color': df['class']},
        )
    ]
)
fig.update_layout(
    # force a tick at every integer class index
    xaxis=dict(tickmode='linear', tick0=0, dtick=1),
    title='Number of documents per class (train split)',
)
fig.show()

In [45]:
# Number of test documents per class index.
y = pd.Series(newsgroups_test.target)
df = pd.DataFrame(y.value_counts()).reset_index()
df.columns = ['class', 'number of documents']

# One bar per class, labelled with its document count and coloured by class index.
# (The earlier px.bar(...) call was dropped: its figure was neither displayed nor kept.)
fig = go.Figure(
    data=[
        go.Bar(
            x=df['class'],
            y=df['number of documents'],
            text=df['number of documents'],
            textposition='auto',
            marker={'color': df['class']},
        )
    ]
)
fig.update_layout(
    # force a tick at every integer class index
    xaxis=dict(tickmode='linear', tick0=0, dtick=1),
    title='Number of documents per class (test split)',
    template='plotly',
)
fig.show()

In [15]:
from sklearn.metrics import confusion_matrix
import plotly.graph_objects as go

labels = newsgroups_train.target_names

# confusion_matrix expects (y_true, y_pred): row i = true class, column j = predicted class.
# `pred` must hold the test-split predictions here (same length as newsgroups_test.target).
# Passing `labels` pins the matrix to one row/column per entry of target_names,
# even if some class never shows up in either array.
cm = confusion_matrix(newsgroups_test.target, pred, labels=list(range(len(labels))))

# Heatmap draws z[i][j] at (x=x[j], y=y[i]), so the y axis is the true class
# and the x axis is the predicted class, matching the axis titles below.
data = go.Heatmap(z=cm, y=labels, x=labels)

# Print each count on its own cell: cm[i][j] belongs at x=labels[j], y=labels[i].
annotations = [
    {
        "x": labels[j],
        "y": labels[i],
        "font": {"color": "white"},
        "text": str(value),
        "xref": "x1",
        "yref": "y1",
        "showarrow": False
    }
    for i, row in enumerate(cm)
    for j, value in enumerate(row)
]
layout = {
    "title": 'Newsgroup 20 Multinomial Naive Bayes predictions confusion matrix',
    "xaxis": {"title": "Predicted value"},
    "yaxis": {"title": "Real value"},
    "annotations": annotations,
    "width": 950,
    "height": 750,
}
fig = go.Figure(data=data, layout=layout)
fig.show()

## TF-IDF

In [25]:
# Same pipeline as the count-based run, but with TF-IDF weights over the full vocabulary.
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
vectorizer = TfidfVectorizer()
# The vocabulary and IDF weights are learned from the train split only.
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors_test = vectorizer.transform(newsgroups_test.data)

# Fit on the dense train matrix (fit returns the classifier itself), then score the test split.
clf = MultinomialNaiveBayes(alpha=0.005).fit(vectors.toarray(), newsgroups_train.target)
pred = clf.predict(vectors_test.toarray())
print('F1 Score : ', metrics.f1_score(y_true=newsgroups_test.target, y_pred=pred, average='macro'))
print('Accuracy : ', np.mean(pred == newsgroups_test.target))

F1 Score :  0.8270729497003412
Accuracy :  0.8325809877854488


In [26]:
labels = newsgroups_train.target_names

# confusion_matrix expects (y_true, y_pred): row i = true class, column j = predicted class.
# `pred` must hold the test-split predictions here (same length as newsgroups_test.target).
# Passing `labels` pins the matrix to one row/column per entry of target_names,
# even if some class never shows up in either array.
cm = confusion_matrix(newsgroups_test.target, pred, labels=list(range(len(labels))))

# Heatmap draws z[i][j] at (x=x[j], y=y[i]), so the y axis is the true class
# and the x axis is the predicted class, matching the axis titles below.
data = go.Heatmap(z=cm, y=labels, x=labels)

# Print each count on its own cell: cm[i][j] belongs at x=labels[j], y=labels[i].
annotations = [
    {
        "x": labels[j],
        "y": labels[i],
        "font": {"color": "white"},
        "text": str(value),
        "xref": "x1",
        "yref": "y1",
        "showarrow": False
    }
    for i, row in enumerate(cm)
    for j, value in enumerate(row)
]
layout = {
    "title": 'Newsgroup 20 Multinomial Naive Bayes predictions confusion matrix',
    "xaxis": {"title": "Predicted value"},
    "yaxis": {"title": "Real value"},
    "annotations": annotations,
    "width": 950,
    "height": 750,
}
fig = go.Figure(data=data, layout=layout)
fig.show()