<a href="https://colab.research.google.com/github/mpily/basic-ml-course/blob/Lesson_05/Solution5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

In [2]:
len(newsgroups_train.data)


11314

In [3]:
newsgroups_train.target, newsgroups_train.target_names


(array([7, 4, 4, ..., 3, 1, 8]),
 ['alt.atheism',
  'comp.graphics',
  'comp.os.ms-windows.misc',
  'comp.sys.ibm.pc.hardware',
  'comp.sys.mac.hardware',
  'comp.windows.x',
  'misc.forsale',
  'rec.autos',
  'rec.motorcycles',
  'rec.sport.baseball',
  'rec.sport.hockey',
  'sci.crypt',
  'sci.electronics',
  'sci.med',
  'sci.space',
  'soc.religion.christian',
  'talk.politics.guns',
  'talk.politics.mideast',
  'talk.politics.misc',
  'talk.religion.misc'])

In [4]:
newsgroups_train.data[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [5]:
def remove_tokens(token_list, text):
    for token in token_list:
        text = text.replace(token, '')
    return text

In [6]:
from string import punctuation
preprocessed_text = [remove_tokens(punctuation, text) for text in newsgroups_train.data]

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
num_word = 30000
vectorizer = CountVectorizer(max_features=num_word)
train_data = vectorizer.fit_transform(preprocessed_text).toarray()
print(train_data.shape)

(11314, 30000)


In [8]:

classes,class_freq = np.unique(newsgroups_train.target,return_counts=True)
prior_prob = class_freq/np.sum(class_freq)
np.sum(prior_prob)

1.0

In [9]:
count = 0
for i,b in enumerate(train_data[0]):
  if b:
    count += 1

print(count)


82


In [10]:
word_label_frequency = np.zeros((len(newsgroups_train.target_names),train_data.shape[1]))

for i in range(len(newsgroups_train.target)):
  word_label_frequency[newsgroups_train.target[i]] += train_data[i].clip(max=1)

word_label_frequency
print(word_label_frequency)

[[ 0.  0.  0. ...  0.  0.  0.]
 [ 5.  0.  0. ... 10.  2.  1.]
 [ 2.  0.  1. ...  0.  0.  0.]
 ...
 [ 0.  4.  1. ...  0.  0.  0.]
 [ 1.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]]


The conditional probability is computed by dividing the number of documents which has word and label to the number of documents with label . However, if there is no document which has word and label in training data, the probability will be zero, which is undesirable(Recall that the probability is given by product so if one value is zero then the whole product is zero... So a test that comes with an extra word will still fail even though it is most likely to be in a given class).
To handle this problem, we can apply Laplace smoothing, then conditional probability will be computed as following :
$$p(x_{i} = 1|c) = \frac{N_{ic} + \alpha}{N_{c} + |V|\alpha}$$

In [11]:
alpha = 0.01
cond_prob = np.array([(word_label_frequency[i] + alpha)/(class_freq[i] + num_word * alpha) for i in range(len(class_freq))])
print(cond_prob.shape)

(20, 30000)


Some optimisation:


Observe that the rows of the matrix correspond to different classes and the columns to different words.
The task is thus to multiply all the elements in the row where the corresponding word in data is not zero.
We can use the fact that numpy multiplication is optimized. For all values $i$ in data where $data[i]$ is 0 convert to 1/



In [12]:
def find_label(data):
  data = np.array(data.toarray()).flatten()
  indices = np.where(data > 0)
  indices_0 = np.where(data==0)
  #print(indices)
  #print(data.shape)
  mx_val = -1e18
  result_class = 0
  for i,class_val in enumerate(cond_prob):
    prod = np.log(prior_prob[i]) + np.sum(np.log(class_val[indices])) + np.sum(np.log(1 - class_val[indices_0]))
    if prod >= mx_val:
      result_class = i
      mx_val = prod
  return result_class      


In [13]:
preprocessed_test_text = [remove_tokens(punctuation, text) for text in newsgroups_test.data]
test_data = vectorizer.transform(preprocessed_test_text)

In [14]:
print(test_data.toarray())
count = 0
for i,b in enumerate(test_data.toarray()[0]):
  if b:
    count += 1

print(count)


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
75


In [15]:
pred = []
from tqdm import tqdm
for text in tqdm(test_data):
    pred.append(find_label(text))

7532it [01:24, 88.63it/s]


In [16]:
from sklearn import metrics
print(newsgroups_test.target)
print(pred)
metrics.accuracy_score(pred, newsgroups_test.target)

[ 7  5  0 ...  9  6 15]
[7, 1, 0, 17, 0, 13, 6, 2, 5, 1, 2, 1, 17, 8, 15, 3, 4, 1, 6, 16, 17, 6, 17, 14, 3, 13, 11, 7, 7, 3, 5, 1, 4, 2, 14, 1, 9, 4, 6, 1, 17, 1, 8, 1, 11, 1, 14, 3, 11, 11, 8, 8, 8, 9, 6, 9, 10, 17, 16, 14, 8, 10, 17, 18, 3, 18, 18, 13, 0, 9, 8, 6, 10, 19, 3, 7, 16, 7, 18, 9, 6, 18, 17, 4, 12, 10, 16, 15, 8, 3, 14, 12, 16, 15, 16, 15, 9, 3, 3, 16, 2, 10, 14, 15, 3, 16, 10, 4, 14, 12, 8, 3, 17, 12, 8, 14, 9, 5, 9, 17, 12, 4, 4, 5, 9, 13, 16, 8, 3, 1, 16, 11, 6, 13, 10, 2, 5, 1, 3, 12, 10, 14, 7, 7, 10, 5, 10, 12, 0, 13, 14, 4, 15, 4, 6, 14, 18, 2, 10, 1, 11, 17, 9, 2, 12, 8, 2, 16, 3, 7, 1, 7, 7, 8, 4, 12, 10, 18, 10, 18, 4, 9, 1, 0, 3, 16, 8, 4, 8, 14, 18, 11, 13, 16, 3, 15, 16, 13, 3, 19, 15, 7, 4, 6, 15, 12, 9, 12, 16, 8, 2, 14, 14, 6, 16, 2, 1, 10, 2, 19, 7, 16, 11, 6, 14, 2, 7, 15, 5, 7, 18, 1, 2, 17, 15, 4, 15, 9, 0, 7, 3, 7, 8, 10, 14, 12, 14, 12, 11, 8, 15, 6, 4, 11, 7, 17, 7, 8, 10, 3, 4, 2, 5, 8, 1, 12, 18, 5, 13, 12, 8, 16, 16, 8, 4, 15, 5, 3, 11, 3, 15, 10,

0.7825278810408922

In [17]:
word_label_frequency = np.zeros((len(newsgroups_train.target_names),train_data.shape[1]))

for i in range(len(newsgroups_train.target)):
  word_label_frequency[newsgroups_train.target[i]] += train_data[i]

word_label_frequency
print(word_label_frequency)

[[ 0.  0.  0. ...  0.  0.  0.]
 [18.  0.  0. ... 10.  2.  1.]
 [ 4.  0.  1. ...  0.  0.  0.]
 ...
 [ 0.  4.  1. ...  0.  0.  0.]
 [ 1.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]]


In [18]:
num_word_in_classes = np.sum(word_label_frequency,axis=1)
print(num_word_in_classes)

[149997. 110358.  93067. 103648.  92609. 147439.  69568. 117770. 106393.
 115963. 152844. 203722. 107589. 155324. 155745. 203120. 178699. 255412.
 187744. 120767.]


In [19]:
log_cond_prob = np.array([np.log(word_label_frequency[i] + alpha) - np.log(num_word_in_classes[i] + num_word * alpha) for i in range(len(class_freq))])
print(log_cond_prob.shape)

(20, 30000)


In [41]:
def find_label_multinom(data):
  data = np.array(data.toarray()).flatten()
  indices = np.where(data > 0)
  indices_0 = np.where(data==0)
  #print(indices)
  #print(data.shape)
  mx_val = -1e18
  result_class = 0
  for i,class_val in enumerate(log_cond_prob):
    prod = np.log(prior_prob[i]) + np.sum(class_val[indices]) + np.sum(np.log(1 - np.exp(class_val[indices_0])))
    if prod >= mx_val:
      result_class = i
      mx_val = prod
  return result_class  


In [21]:
pred = []
for text in tqdm(test_data):
    pred.append(find_label_multinom(text))
metrics.accuracy_score(pred, newsgroups_test.target)


7532it [02:39, 47.22it/s]


0.8203664365374402

Some Ideas to improve accuracy:

1.  Get rid of articles and prepositions.
2.  Get rid of words that are common in many classes (Say if a word exists in over 80% of classes). 
