In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
import numpy as np
from scipy.sparse import find

# Feel free to import any standard libraries that you may need to complete the program.

**Step 1:** Fetch the dataset for the three aforementioned categories using the scikit-learn library.

In [2]:
# The three newsgroups this notebook classifies between.
categories = ['talk.religion.misc', 'comp.graphics', 'sci.space']
num_categories = len(categories)

# Fetch the train and test subsets with identical options (same categories,
# shuffled with a fixed seed); only the subset name differs.
data_train, data_test = (
    fetch_20newsgroups(subset=split, categories=categories, shuffle=True, random_state=42)
    for split in ('train', 'test')
)

# Class labels for the training and testing documents.
y_train = data_train.target
y_test = data_test.target

In [3]:
# Total number of documents in train and test datasets

# Number of documents in each split, taken from the length of its label array.
num_train, num_test = (len(split.target) for split in (data_train, data_test))

print("Dataset contatins \n {} train documents, \n {} test documents.".format(num_train, num_test))

Dataset contatins 
 1554 train documents, 
 1034 test documents.


Now, let's print a sample document to understand the dataset better.

Fill in the cell below to print contents of the first document from "train" subset. Also, print its corresponding class label name(category).

**Hint:** Use "data_train.data" variable

In [4]:
# Show the raw text of the first training document. As a bare last expression
# the notebook displays its repr, so line breaks appear as "\n".
data_train.data[0]

"From: nicho@vnet.IBM.COM (Greg Stewart-Nicholls)\nSubject: Re: Biosphere II\nReply-To: nicho@vnet.ibm.com\nDisclaimer: This posting represents the poster's views, not those of IBM\nNews-Software: UReply 3.1\nX-X-From: nicho@vnet.ibm.com\n            <1q1kia$gg8@access.digex.net>\nLines: 18\n\nIn <1q1kia$gg8@access.digex.net> Pat writes:\n>In article <19930408.043740.516@almaden.ibm.com> nicho@vnet.ibm.com writes:\n>>In <1q09ud$ji0@access.digex.net> Pat writes:\n>>>Why is everyone being so critical of B2?\n>> Because it's bogus science, promoted as 'real' science.\n>It seems to me, that it's sorta a large engineering project more\n>then a science project.\n  Bingo.\n>B2 is not bench science,  but rather a large scale attempt to\n>re-create a series of micro-ecologies.   what's so eveil about this?\n Nothing evil at all. There's no actual harm in what they're doing, only\nhow they represent it.\n\n -----------------------------------------------------------------\n .sig files are like s

In [5]:
 # Label of the first training document: its integer index together with the
 # category name that index maps to in data_train.target_names.
 y_train[0], data_train.target_names[y_train[0]]

1

**Step2:** Remove stop words and create count vectors for the train and test datasets.

   We use the CountVectorizer method to extract features (counts for each word). Note that words from both training and testing data are needed to build the count table.
   
   *Documentation:*  http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [6]:
# Build a single vocabulary (English stop words removed) from the train and
# test documents together, as required by the step description above.
vectorizer = CountVectorizer(stop_words='english').fit(data_train.data + data_test.data)

# Word-count matrices for both splits, expressed over that shared vocabulary.
x_train, x_test = (vectorizer.transform(docs) for docs in (data_train.data, data_test.data))

In [8]:
# Vocabulary size |V|: the number of columns of the count matrix.
V = x_train.shape[-1]

In [9]:
# For each class label 0, 1, 2: the np.where result (a one-element tuple of
# index arrays) locating the training documents with that label.
class0, class1, class2 = (np.where(y_train == label) for label in range(3))

# 2. Building the classifier

In [10]:
# Split the training count matrix into one sub-matrix per class, selecting the
# rows found for labels 0, 1 and 2.
x0, x1, x2 = (x_train[rows] for rows in (class0, class1, class2))

In [11]:
# N_c for each class: the total number of word occurrences over all training
# documents of that class, i.e. the sum of every entry of the class's count
# matrix.
#
# The sparse matrices are summed directly. This avoids materialising a dense
# (documents x vocabulary) array per class, and yields plain scalars, so that
# expressions such as (count + alpha) / (NC[j] + V * alpha) evaluate to scalars
# instead of one-element arrays.
Z0 = x0.sum()
Z1 = x1.sum()
Z2 = x2.sum()

# NC[j] is the total word count N_c of class j.
NC = [Z0, Z1, Z2]

In [12]:
c = np.zeros((num_categories,x_train.shape[1]))


# Row 0 of c: per-word totals N_0i, i.e. how many times word i occurs in the
# class-0 training documents. Rows 1 and 2 are filled the same way in the
# following cells; only row 0 is written here.
# Row 0: one sparse column sum replaces a Python loop over every vocabulary
# column. np.asarray(...).ravel() flattens the 1 x V result into a length-V row.
c[0] = np.asarray(x0.sum(axis=0)).ravel()
   

In [13]:
# Row 1 of c: per-word totals N_1i over the class-1 training documents.
# A single sparse column sum replaces the per-column Python loop;
# np.asarray(...).ravel() flattens the 1 x V result into a length-V row.
c[1] = np.asarray(x1.sum(axis=0)).ravel()

In [14]:
# Row 2 of c: per-word totals N_2i over the class-2 training documents.
# A single sparse column sum replaces the per-column Python loop;
# np.asarray(...).ravel() flattens the 1 x V result into a length-V row.
c[2] = np.asarray(x2.sum(axis=0)).ravel()

Now, let's build a Multinomial Naive Bayes classifier that takes feature vector from the test data as input and classifies as one of the three classes ('talk.religion.misc','comp.graphics','sci.space').

Complete the training function MultiNB_train() in the cell below to train a Multinomial Naive Bayes classifier that takes "x_train", "y_train" and "alpha" as inputs and returns the likelihood probability matrix "theta" and the prior distribution "prior" on the document category.

"prior" is a vector of length equal to num_categories where the $i$-th element is defined as
$$ prior (i) = \frac{\text{Number of train documents with category } i}{\text{Total number of train documents}} $$

"theta" ($\theta$) is the  matrix with the $(c,i)$th element defined by

 $$ \theta(c,i) = P(w_i/c) =  \frac{N_{ci} + \alpha }{N_c + |V| \alpha}$$
 
 where,
 * $P(w_i/c)$ refers to the probability of seeing the $i$th word in the vocabulary given that class type is $c$.
 * $N_{ci}$ refers to the total number of times the word  $i$ appeared in the training documents of class type $c$.
 * $N_c$ is the total number of words in the documents of type $c$
    $$N_c = \sum_{d \in T[c]} N_{cd}$$
    where $T[c]$ refers to the set of training documents of type $c$ and $N_{cd}$ is the number of words in document $d$ of that class.
 * $|V|$ is the size of the vocabulary.
 * $\alpha$ is the laplace smoothing parameter

**Note:** Do **NOT** use scikit-learn's built-in "MultinomialNB" class. Write your own code to build the classifier. You may use standard libraries like "numpy", "scipy", etc. to perform operations on matrices/arrays. 

Feel free to break your code into multiple functions or cells.

In [15]:
def MultiNB_train(x_train, y_train, alpha):
    """Train a Multinomial Naive Bayes model from word counts.

    Everything is computed from the arguments, so the function no longer
    depends on notebook-level variables prepared in earlier cells.

    Parameters
    ----------
    x_train : sparse matrix or array, shape (n_documents, vocabulary_size)
        Word counts; entry (d, i) is the count of word i in document d.
    y_train : array-like of non-negative ints, shape (n_documents,)
        Class label of each document.
    alpha : float
        Laplace smoothing parameter.

    Returns
    -------
    theta : numpy.ndarray, shape (n_classes, vocabulary_size)
        theta[c, i] = (N_ci + alpha) / (N_c + |V| * alpha).
    prior : numpy.ndarray, shape (n_classes,)
        prior[c] = (# documents of class c) / (total # documents).

    n_classes is max(y_train) + 1, as determined by np.bincount.
    """
    y_train = np.asarray(y_train)
    docs_per_class = np.bincount(y_train)
    prior = docs_per_class / y_train.shape[0]

    n_classes = docs_per_class.shape[0]
    vocab_size = x_train.shape[1]

    theta = np.zeros((n_classes, vocab_size))
    for label in range(n_classes):
        rows = np.flatnonzero(y_train == label)
        # N_ci for every word i at once: column sums over this class's documents.
        # np.asarray(...).ravel() flattens the 1 x V result a sparse sum returns.
        word_counts = np.asarray(x_train[rows].sum(axis=0)).ravel()
        # N_c is the total of those per-word counts.
        theta[label] = (word_counts + alpha) / (word_counts.sum() + vocab_size * alpha)

    return (theta, prior)

Now, let us train the model to learn the likelihood parameters $\theta$

In [61]:
# Fit the model on the training data with smoothing parameter alpha = 0.58,
# a value inside the 0.4-0.7 range that the last cell of this notebook reports
# as good.
theta, prior = MultiNB_train(x_train,y_train,alpha = 0.58)

Complete the classifier function MultiNB_classify() below that takes in features of one test sample (one row from x_test) and returns the predicted class "pred_class" $\in \{0,1,2\}$. 

In [17]:
def MultiNB_classify(x_test_sample, theta, prior):
    """Predict the class of one document with a Multinomial Naive Bayes model.

    The score of class c is
        log prior[c] + sum_i count_i * log theta[c, i]
    summed over the words that occur in the document. Works for any number
    of classes (one row of ``theta`` per class).

    Parameters
    ----------
    x_test_sample : sparse row (e.g. ``x_test.getrow(i)``) or array-like
        Word counts of a single document, length vocabulary_size.
    theta : array-like, shape (n_classes, vocabulary_size)
        Per-class word likelihoods.
    prior : array-like, shape (n_classes,)
        Class prior probabilities.

    Returns
    -------
    pred_class : int
        Index of the class with the highest score (first one on ties).
    """
    # Accept a SciPy sparse row as well as a dense array-like.
    if hasattr(x_test_sample, "toarray"):
        x_test_sample = x_test_sample.toarray()
    counts = np.asarray(x_test_sample, dtype=float).ravel()
    theta = np.asarray(theta, dtype=float)
    prior = np.asarray(prior, dtype=float)

    # Only words that occur in the document contribute. Restricting to them
    # also keeps a zero likelihood of an absent word from producing
    # 0 * log(0) = nan.
    present = np.flatnonzero(counts > 0)
    log_likelihood = np.log(theta[:, present]).dot(counts[present])

    pred_class = np.argmax(np.log(prior) + log_likelihood)
    return pred_class
    
   

   

Let us test our classifier on the first sample of testing dataset.

In [62]:
# Sanity check: classify the first test document and show the prediction next
# to its true label.
first_sample = x_test.getrow(0)
pred_class = MultiNB_classify(first_sample, theta, prior)

print("predicted class:", pred_class, sep="")
print("actual class:", y_test[0], sep="")

predicted class:0
actual class:0


# 3. Evaluating the classifier

The code below runs your classifier on every data sample from the testing dataset and stores the predictions in "y_pred".

In [63]:
# Predicted class of every test document, in test-set order. Built with a
# comprehension so the cell only binds y_pred and does not overwrite the
# pred_class value shown by the previous cell.
y_pred = [
    MultiNB_classify(x_test.getrow(row), theta=theta, prior=prior)
    for row in range(num_test)
]

The following cell evaluates your result by comparing it with the test labels.

In [64]:
# Fraction of test documents whose predicted class matches the true label.
score = metrics.accuracy_score(y_test, y_pred)
print("accuracy: {:0.3f}".format(score))

# Per-class precision / recall / F1 report.
report = metrics.classification_report(y_test, y_pred)
print(report)

accuracy: 0.958
             precision    recall  f1-score   support

          0       0.95      0.96      0.96       389
          1       0.96      0.95      0.96       394
          2       0.97      0.96      0.96       251

avg / total       0.96      0.96      0.96      1034



Find the classification error (1-score) over the test set for various values of the smoothing parameter α and by trial and error find a good value of α.

In [56]:
# Report the smoothing-parameter range judged to work well.
# NOTE(review): the range is a hard-coded string; no cell visible in this
# notebook computes the test error for different alpha values -- presumably it
# was found by manually re-running the training cell; confirm and consider
# adding the sweep so the conclusion is reproducible.
print('A good alpha value is between 0.4 to 0.7')

A good alpha value is between 0.4 to 0.7
