<a href="https://colab.research.google.com/github/b-kchr/basic-ml-course/blob/master/05_Naive_Bayes/Solution_for_Lesson_05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

In [2]:
len(newsgroups_train.data)

11314

In [3]:
newsgroups_train.target, newsgroups_train.target_names

(array([7, 4, 4, ..., 3, 1, 8]),
 ['alt.atheism',
  'comp.graphics',
  'comp.os.ms-windows.misc',
  'comp.sys.ibm.pc.hardware',
  'comp.sys.mac.hardware',
  'comp.windows.x',
  'misc.forsale',
  'rec.autos',
  'rec.motorcycles',
  'rec.sport.baseball',
  'rec.sport.hockey',
  'sci.crypt',
  'sci.electronics',
  'sci.med',
  'sci.space',
  'soc.religion.christian',
  'talk.politics.guns',
  'talk.politics.mideast',
  'talk.politics.misc',
  'talk.religion.misc'])

In [4]:
newsgroups_train.data[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [5]:
def remove_tokens(token_list, text):
    for token in token_list:
        text = text.replace(token, '')
    return text

In [6]:
from string import punctuation
preprocessed_text = [remove_tokens(punctuation, text) for text in newsgroups_train.data]

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
num_word = 30000
vectorizer = CountVectorizer(max_features=num_word)
train_data = vectorizer.fit_transform(preprocessed_text).toarray()
train_data.shape

(11314, 30000)

In [8]:
import numpy as np
classes,class_freq = np.unique(newsgroups_train.target,return_counts=True)
prior_prob = class_freq/np.sum(class_freq)
np.sum(prior_prob)

1.0

In [9]:
word_label_frequency = np.zeros((len(newsgroups_train.target_names),train_data.shape[1]))

for i in range(len(newsgroups_train.target)):
  word_label_frequency[newsgroups_train.target[i]] += train_data[i].clip(max=1)

word_label_frequency

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 5.,  0.,  0., ..., 10.,  2.,  1.],
       [ 2.,  0.,  1., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  4.,  1., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [10]:
alpha = 0.01
cond_prob = np.array([(word_label_frequency[i] + alpha)/(class_freq[i] + num_word * alpha) for i in range(len(class_freq))])
print(cond_prob.shape)

(20, 30000)


In [11]:
def find_label(data):
  data = np.array(data.toarray()).flatten()
  indices = np.where(data > 0)
  indices_0 = np.where(data==0)
  mx_val = -1e18
  result_class = 0
  for i,class_val in enumerate(cond_prob):
    prod = np.log(prior_prob[i]) + np.sum(np.log(class_val[indices])) + np.sum(np.log(1 - class_val[indices_0]))
    if prod >= mx_val:
      result_class = i
      mx_val = prod
  return result_class

In [12]:
preprocessed_test_text = [remove_tokens(punctuation, text) for text in newsgroups_test.data]
test_data = vectorizer.transform(preprocessed_test_text)

In [13]:
pred = []
from tqdm import tqdm
for text in tqdm(test_data):
    pred.append(find_label(text))

7532it [01:29, 83.71it/s]


In [14]:
from sklearn import metrics
metrics.accuracy_score(pred, newsgroups_test.target)

0.7825278810408922

In [15]:
word_label_frequency = np.zeros((len(newsgroups_train.target_names),train_data.shape[1]))

for i in range(len(newsgroups_train.target)):
  word_label_frequency[newsgroups_train.target[i]] += train_data[i]

word_label_frequency
print(word_label_frequency)

[[ 0.  0.  0. ...  0.  0.  0.]
 [18.  0.  0. ... 10.  2.  1.]
 [ 4.  0.  1. ...  0.  0.  0.]
 ...
 [ 0.  4.  1. ...  0.  0.  0.]
 [ 1.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]]


In [16]:
num_word_in_classes = np.sum(word_label_frequency,axis=1)
print(num_word_in_classes)

[149997. 110358.  93067. 103648.  92609. 147439.  69568. 117770. 106393.
 115963. 152844. 203722. 107589. 155324. 155745. 203120. 178699. 255412.
 187744. 120767.]


In [17]:
log_cond_prob = np.array([np.log(word_label_frequency[i] + alpha) - np.log(num_word_in_classes[i] + num_word * alpha) for i in range(len(class_freq))])
print(log_cond_prob.shape)

(20, 30000)


In [18]:
def find_label_multinom(data):
  data = np.array(data.toarray()).flatten()
  indices = np.where(data > 0)
  indices_0 = np.where(data==0)
  mx_val = -1e18
  result_class = 0
  for i,class_val in enumerate(log_cond_prob):
    prod = np.log(prior_prob[i]) + np.sum(class_val[indices]) + np.sum(np.log(1 - np.exp(class_val[indices_0])))
    if prod >= mx_val:
      result_class = i
      mx_val = prod
  return result_class

In [19]:
pred = []
for text in tqdm(test_data):
    pred.append(find_label_multinom(text))
metrics.accuracy_score(pred, newsgroups_test.target)

7532it [02:52, 43.56it/s]


0.8203664365374402