In [1]:
import pandas as pd #for dataframe
import numpy as np #for creating numpy arrays

In [2]:
# Read the training split from disk into a DataFrame.
df_train = pd.read_csv("train.csv", sep=",")
df_train.head()  # preview the first five rows

Unnamed: 0,subject,lines,date,newsgroups,path,messageid,apr,organization,gmt,would,...,bay,sleep,structure,matt,tear,sig,explanation,auto,sending,y
0,1,1,1,1,1,1,1,1,1,3,...,0,0,0,0,0,0,0,0,0,talk.politics.misc
1,1,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,talk.religion.misc
2,1,1,1,1,1,1,1,1,1,2,...,0,0,0,0,0,0,0,0,0,sci.electronics
3,1,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,misc.forsale
4,1,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,sci.electronics


In [3]:
# Read the held-out test split from disk into a DataFrame.
df_test = pd.read_csv("test.csv", sep=",")
df_test.head()  # preview the first five rows

Unnamed: 0,subject,lines,date,newsgroups,path,messageid,apr,organization,gmt,would,...,bay,sleep,structure,matt,tear,sig,explanation,auto,sending,y
0,1,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,comp.windows.x
1,1,1,1,1,1,1,2,1,2,0,...,0,0,0,0,0,0,0,0,0,sci.med
2,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,talk.politics.guns
3,1,1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,misc.forsale
4,1,1,1,1,1,1,1,1,1,2,...,0,0,0,0,0,0,0,0,0,rec.sport.hockey


In [4]:
# scikit-learn's multinomial naive Bayes, used as the reference implementation.
from sklearn.naive_bayes import MultinomialNB

# alpha=1.0 (Laplace smoothing) and fit_prior=True are the library defaults,
# written out so the smoothing used by the reference model is visible here.
clf = MultinomialNB(alpha=1.0, fit_prior=True)
clf

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [5]:
#separating x and y; the label is the last column of each frame
# "Unnamed: 0" is not a word count: it is the row number that was written into
# the CSV (0, 1, 2, ... in the head() previews above), so it must not reach the
# classifiers as a feature. errors="ignore" keeps the cell working if the files
# are ever re-exported without that column.
x_train = df_train.iloc[:, :-1].drop(columns="Unnamed: 0", errors="ignore")
y_train = df_train.iloc[:, -1]
x_test = df_test.iloc[:, :-1].drop(columns="Unnamed: 0", errors="ignore")
y_test = df_test.iloc[:, -1]
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((14997, 2000), (14997,), (5000, 2000), (5000,))

In [6]:
#fit function for the hand-written naive bayes classifier
def fit(x_train,y_train):
    """Collect the per-class word totals used by the naive Bayes predictor.

    Parameters
    ----------
    x_train : pandas.DataFrame
        One row per data point and one column per vocabulary word; each cell
        is the count of that word in that data point.
    y_train : pandas.Series
        Class label of every row of ``x_train``. It is applied as a boolean
        mask through ``.loc``, so it has to share ``x_train``'s index.

    Returns
    -------
    dict
        ``result["total_data"]`` is the number of training rows. For every
        class label ``c``, ``result[c]`` maps each column position ``i`` to
        the summed count of that column over the rows of class ``c``, and also
        holds ``"totalClassSum"`` (the sum over all columns) and
        ``"classCount"`` (the number of rows of class ``c``).
    """
    # The data-set size does not depend on the class, so it is stored once up
    # front; this also makes the key present when there is no training data.
    result = {"total_data": len(y_train)}
    for c in set(y_train): #one pass per unique class label
        in_class = (y_train == c) #boolean mask of the rows belonging to this class
        # A single column-wise sum replaces one pandas call per vocabulary word.
        word_sums = x_train.loc[in_class].sum(axis=0).values
        class_stats = dict(enumerate(word_sums)) #column position -> summed count
        class_stats["totalClassSum"] = word_sums.sum() #total number of words in the class
        class_stats["classCount"] = int(in_class.sum()) #number of data points in the class
        result[c] = class_stats
    return result

In [7]:
#log-score of one class for a single data point
def findProbability(x,dictionary,current_class):
    """Return the unnormalised log-probability of ``current_class`` for ``x``.

    Parameters
    ----------
    x : indexable
        Word counts of one data point, addressed by vocabulary position.
    dictionary : dict
        Statistics in the layout produced by ``fit``: ``"total_data"`` plus
        one entry per class holding per-position word sums,
        ``"totalClassSum"`` and ``"classCount"``.
    current_class : hashable
        Key of the class to score.

    Returns
    -------
    float
        Log prior of the class plus one Laplace-smoothed log-likelihood term
        for every vocabulary position whose count in ``x`` is non-zero.
    """
    class_stats = dictionary[current_class]
    # log prior: log(rows of this class) - log(all rows)
    log_score = np.log(class_stats["classCount"]) - np.log(dictionary["total_data"])
    # every key except "totalClassSum" and "classCount" is a vocabulary position
    vocab_size = len(class_stats) - 2
    # Laplace-smoothed denominator: identical for every word, so its log is taken once
    log_denominator = np.log(class_stats["totalClassSum"] + vocab_size)
    for word_idx in range(vocab_size):
        if x[word_idx] == 0:
            continue #words absent from the data point contribute nothing
        # NOTE(review): the term is added once per present word and is not
        # multiplied by the word's count in x - confirm this is intended.
        log_score = log_score + (np.log(class_stats[word_idx] + 1) - log_denominator)
    return log_score
        

In [8]:
#picks the most likely class for a single data point
def predictSinglePoint(x,dictionary):
    """Return the class whose ``findProbability`` score for ``x`` is highest.

    Parameters
    ----------
    x : indexable
        Word counts of one data point.
    dictionary : dict
        Statistics holding one entry per class next to the ``"total_data"``
        bookkeeping entry.

    Returns
    -------
    The key of the best-scoring class; when several classes share the top
    score the one that comes first in ``dictionary`` wins. ``-1`` is returned
    when ``dictionary`` contains no class at all.
    """
    # "total_data" is stored alongside the classes but is not a class itself
    candidates = (label for label in dictionary.keys() if label != "total_data")
    # max() only replaces its current pick on a strictly greater score, so ties
    # resolve to the earliest class exactly like a manual ">" scan
    return max(
        candidates,
        key=lambda label: findProbability(x, dictionary, label),
        default=-1,
    )

In [9]:
#predict function for the NB classifier
def predict(x_test,dictionary):
    """Predict a class for every row of ``x_test``.

    Parameters
    ----------
    x_test : pandas.DataFrame, numpy.ndarray or iterable of rows
        One row of word counts per data point.
    dictionary : dict
        Trained statistics, passed unchanged to ``predictSinglePoint``.

    Returns
    -------
    list
        The predicted class of each row, in row order.
    """
    # Iterating a DataFrame yields its column labels, not its rows, so pandas
    # input is converted to its underlying row array first. Anything without
    # ``to_numpy`` (ndarray, list of rows, ...) is iterated exactly as before.
    rows = x_test.to_numpy() if hasattr(x_test, "to_numpy") else x_test
    return [predictSinglePoint(row, dictionary) for row in rows]

In [10]:
#train the hand-written naive bayes model, then label every test row with it
dictionary = fit(x_train=x_train, y_train=y_train)
# predict() walks its input row by row, so it is handed the raw array of the frame
y_pred = predict(x_test=x_test.values, dictionary=dictionary)


In [11]:
#per-class report and confusion matrix for the hand-written naive bayes predictions
from sklearn.metrics import classification_report, confusion_matrix

# one print call; sep="\n" puts the matrix on the line after the report
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

                          precision    recall  f1-score   support

             alt.atheism       0.76      0.80      0.78       233
           comp.graphics       0.79      0.77      0.78       253
 comp.os.ms-windows.misc       0.91      0.81      0.86       249
comp.sys.ibm.pc.hardware       0.86      0.87      0.86       240
   comp.sys.mac.hardware       0.79      0.93      0.85       236
          comp.windows.x       0.91      0.82      0.86       240
            misc.forsale       0.76      0.89      0.82       261
               rec.autos       0.87      0.91      0.89       269
         rec.motorcycles       0.86      0.96      0.91       284
      rec.sport.baseball       0.96      0.98      0.97       248
        rec.sport.hockey       0.99      0.97      0.98       231
               sci.crypt       0.94      0.88      0.91       233
         sci.electronics       0.82      0.86      0.84       244
                 sci.med       0.91      0.89      0.90       256
         

In [12]:
#train sklearn's multinomial naive bayes and label the test rows with it
# fit() returns the estimator itself, so training and prediction chain directly.
# This overwrites y_pred, which until now held the hand-written model's predictions.
y_pred = clf.fit(x_train, y_train).predict(x_test)
y_pred

array(['comp.windows.x', 'sci.med', 'talk.politics.guns', ...,
       'rec.motorcycles', 'alt.atheism', 'rec.sport.hockey'], dtype='<U24')

In [13]:
#accuracy, per-class report and confusion matrix for sklearn's multinomial naive bayes
# one print call; sep="\n" keeps each result on its own line
print(
    clf.score(x_test, y_test),
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

0.8404
                          precision    recall  f1-score   support

             alt.atheism       0.72      0.79      0.76       233
           comp.graphics       0.77      0.74      0.75       253
 comp.os.ms-windows.misc       0.84      0.80      0.82       249
comp.sys.ibm.pc.hardware       0.80      0.84      0.82       240
   comp.sys.mac.hardware       0.78      0.92      0.84       236
          comp.windows.x       0.90      0.81      0.86       240
            misc.forsale       0.75      0.87      0.80       261
               rec.autos       0.88      0.91      0.89       269
         rec.motorcycles       0.89      0.95      0.92       284
      rec.sport.baseball       0.96      0.96      0.96       248
        rec.sport.hockey       0.96      0.96      0.96       231
               sci.crypt       0.96      0.88      0.91       233
         sci.electronics       0.84      0.86      0.85       244
                 sci.med       0.91      0.88      0.90       256
  