In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Load the training table from CSV and preview the first rows.
# NOTE(review): the column names look like tokens and the values like per-document
# occurrence counts — confirm against whatever produced train_text.csv.
df = pd.read_csv("train_text.csv",delimiter=",")
df.head()

Unnamed: 0.1,Unnamed: 0,--,'AX,``,'s,cantaloupe.srv.cs.cmu.edu,n't,Subject,Date,Newsgroups,...,dog,ADL,century,talk.origins,flame,direction,2.5,wide,division,wire
0,0,26,0,91,29,2,43,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,3,4,1,5,1,1,1,...,0,0,1,0,0,0,0,0,0,0
2,2,0,0,2,3,2,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,3,1,0,1,0,2,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,7,5,1,5,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Remove the stray 'Unnamed: 0' column that came in with the CSV.
# drop(..., errors='ignore') returns a new frame and is a no-op when the column is
# already gone, so this cell can be re-run safely (a bare `del df[...]` raises
# KeyError on the second run).
df = df.drop('Unnamed: 0', axis=1, errors='ignore')

In [4]:
# Convert the training frame to a plain NumPy array and show it.
data = df.values
data

array([[26,  0, 91, ...,  0,  0,  0],
       [ 0,  0,  3, ...,  0,  0,  0],
       [ 0,  0,  2, ...,  0,  0,  0],
       ..., 
       [ 2,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  1, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]], dtype=int64)

In [5]:
# Read the training labels as integers, then flatten the single-column table
# into a 1-D label vector in one step.
df_target = pd.read_csv("train_text_targets.csv",delimiter=",", dtype= int)
y_train = df_target.values.ravel()

In [6]:
# x_train is just another name for the `data` array (no copy is made).
x_train = data
# Sanity check: the number of feature rows must equal the number of labels.
x_train.shape, y_train.shape

((19597, 2000), (19597,))

In [7]:
def fit(x_train, y_train):
    """Accumulate per-class feature totals for the Naive Bayes scorer.

    Parameters
    ----------
    x_train : array-like of shape (n_samples, n_features)
        Feature values (e.g. word counts), one row per sample.
    y_train : array-like of shape (n_samples,)
        Class label of each row of ``x_train``.

    Returns
    -------
    dict
        ``result[c][j]`` is the sum of feature column ``j - 1`` over all rows
        labelled ``c`` (feature keys are 1-based), and
        ``result[c]["total_count"]`` is the sum of those per-feature totals.
    """
    # Coerce to arrays so plain lists work too: with a list, `y_train == c` is a
    # single bool rather than a row mask, which silently selects no rows.
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)

    result = {}
    # np.unique yields the distinct labels in sorted, reproducible order.
    for current_class in np.unique(y_train):
        current_class_rows = (y_train == current_class)
        # One vectorised reduction instead of a Python loop over every column.
        column_sums = x_train[current_class_rows].sum(axis=0)
        class_counts = {j: count for j, count in enumerate(column_sums, start=1)}
        class_counts["total_count"] = column_sums.sum()
        result[current_class] = class_counts
    return result

In [9]:
def probability(dictionary, x, current_class, weight_by_count=False):
    """Return the Laplace-smoothed log-likelihood of ``x`` under ``current_class``.

    Each feature that is non-zero in ``x`` contributes
    ``log((count_j + 1) / (total_count + num_features))``. No class prior is
    added; the score is the likelihood term only.

    Parameters
    ----------
    dictionary : dict
        Model in the layout produced by ``fit``: ``dictionary[c][j]`` is the
        total for 1-based feature ``j`` and ``dictionary[c]["total_count"]`` is
        the class-wide total.
    x : sequence
        Feature values of one sample; ``x[j - 1]`` belongs to feature ``j``.
    current_class : hashable
        Key of the class to score against.
    weight_by_count : bool, default False
        When False every present feature counts once (presence-only scoring,
        the historical behaviour). When True each contribution is multiplied by
        ``x[j - 1]``, which is the standard multinomial likelihood.

    Returns
    -------
    float
        Log-likelihood; ``0.0`` when ``x`` has no non-zero feature.
    """
    class_counts = dictionary[current_class]
    # Every key except "total_count" is a feature index.
    num_features = len(class_counts) - 1
    # The smoothed denominator is the same for every feature, so take its log once.
    log_denominator = np.log(class_counts["total_count"] + num_features)

    # Sums of logs start at 0 (log of probability 1), not at 1.
    output = 0.0
    for j in range(1, num_features + 1):
        occurrences = x[j - 1]
        if occurrences != 0:
            log_word_probability = np.log(class_counts[j] + 1) - log_denominator
            if weight_by_count:
                log_word_probability = occurrences * log_word_probability
            output = output + log_word_probability
    return output

In [10]:
def predictSinglePoint(dictionary, x):
    """Return the class whose ``probability`` score for ``x`` is highest.

    Parameters
    ----------
    dictionary : dict
        Model keyed by class label. A top-level ``"total_data"`` entry, if
        present, is treated as metadata and skipped.
    x : sequence
        Feature values of the single sample to classify.

    Returns
    -------
    hashable
        The best-scoring class key; on ties the class encountered first wins.

    Raises
    ------
    ValueError
        If ``dictionary`` holds no class entries.
    """
    best_class = None
    best_p = None
    for current_class in dictionary:
        # Only string keys can be the metadata entry; the isinstance guard also
        # avoids comparing numeric (e.g. NumPy integer) labels with a string.
        if isinstance(current_class, str) and current_class == "total_data":
            continue
        p_current_class = probability(dictionary, x, current_class)
        # Strict '>' keeps the earliest class when scores are equal.
        if best_p is None or p_current_class > best_p:
            best_p = p_current_class
            best_class = current_class
    if best_p is None:
        raise ValueError("dictionary contains no classes to predict from")
    return best_class

In [11]:
def predict(dictionary, x_test):
    """Classify every row of ``x_test`` with ``predictSinglePoint``.

    Returns a list holding one predicted class per row, in row order.
    """
    return [predictSinglePoint(dictionary, row) for row in x_test]

In [12]:
# Load the test features and drop the stray 'Unnamed: 0' column.
df_test = pd.read_csv("test_text.csv",delimiter=",")
df_test = df_test.drop('Unnamed: 0',axis=1)

# Read the integer test labels and attach them as a new "classes" column so
# features and labels travel together in one frame.
df_test_target = pd.read_csv("test_text_targets.csv",delimiter=",", dtype = int)
df_test["classes"] = df_test_target.values

In [13]:
from sklearn.utils import shuffle
# Seed the shuffle so the preview is reproducible across kernel restarts.
# NOTE(review): this rebinds `df`, which held the training frame until now, and
# the shuffled copy is only previewed — the evaluation reads `df_test` directly.
df = shuffle(df_test, random_state=0)

In [14]:
# Preview the first rows of the shuffled test frame.
df.head()

Unnamed: 0,--,'AX,``,'s,cantaloupe.srv.cs.cmu.edu,n't,Subject,Date,Newsgroups,Lines,...,ADL,century,talk.origins,flame,direction,2.5,wide,division,wire,classes
331,1,0,2,2,2,3,1,1,1,1,...,0,0,0,0,0,0,0,0,0,6
38,0,0,1,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,2
285,0,0,1,1,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,5
7,0,0,1,2,1,3,1,1,1,1,...,0,0,1,0,0,0,0,0,0,1
60,0,0,0,0,2,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,2


In [15]:
# Split the test frame into features and labels. The "classes" label column was
# appended last, so slice relative to the end instead of hard-coding the number
# of features.
# A dedicated name is used because rebinding `data` (which the training cell
# reads) would make an out-of-order re-run of that cell train on test rows.
test_data = df_test.values
x_test = test_data[:, :-1]
y_test = test_data[:, -1]

x_test.shape, y_test.shape

((400, 2000), (400,))

In [16]:
# Build the per-class count model from the training data.
dictionary = fit(x_train,y_train)
# NOTE(review): the bare expression below displays the whole nested dict (every
# feature total for every class), which makes for a very long cell output.
dictionary

{1: {1: 14889,
  2: 0,
  3: 8645,
  4: 5689,
  5: 4162,
  6: 5297,
  7: 2988,
  8: 2959,
  9: 2951,
  10: 2954,
  11: 2942,
  12: 2938,
  13: 3139,
  14: 2583,
  15: 2731,
  16: 2705,
  17: 3801,
  18: 1480,
  19: 3040,
  20: 2671,
  21: 2749,
  22: 3199,
  23: 1779,
  24: 2448,
  25: 1904,
  26: 1683,
  27: 1632,
  28: 2729,
  29: 1297,
  30: 1860,
  31: 1632,
  32: 871,
  33: 1171,
  34: 1936,
  35: 817,
  36: 638,
  37: 1292,
  38: 1147,
  39: 930,
  40: 1107,
  41: 1115,
  42: 1225,
  43: 843,
  44: 1047,
  45: 1153,
  46: 857,
  47: 791,
  48: 686,
  49: 2092,
  50: 1246,
  51: 1312,
  52: 1229,
  53: 994,
  54: 875,
  55: 1166,
  56: 642,
  57: 0,
  58: 698,
  59: 3978,
  60: 296,
  61: 1084,
  62: 798,
  63: 710,
  64: 303,
  65: 646,
  66: 698,
  67: 1071,
  68: 564,
  69: 646,
  70: 554,
  71: 680,
  72: 460,
  73: 940,
  74: 521,
  75: 324,
  76: 683,
  77: 510,
  78: 497,
  79: 388,
  80: 418,
  81: 904,
  82: 605,
  83: 1479,
  84: 424,
  85: 798,
  86: 1266,
  87: 258,
  8

In [17]:
# Classify the test rows with the hand-written model, then print scikit-learn's
# per-class precision/recall/F1 report and the confusion matrix.
y_pred = predict(dictionary, x_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

             precision    recall  f1-score   support

          1       0.96      0.83      0.89        60
          2       0.98      0.56      0.71       100
          3       0.37      1.00      0.54        20
          4       0.90      0.95      0.93        80
          5       0.78      0.91      0.84        80
          6       0.81      0.80      0.81        60

avg / total       0.87      0.81      0.81       400

[[50  0  1  0  0  9]
 [ 1 56 22  4 17  0]
 [ 0  0 20  0  0  0]
 [ 0  0  3 76  0  1]
 [ 1  1  4  0 73  1]
 [ 0  0  4  4  4 48]]


COMPARISON: This implementation gives better accuracy on this test set than scikit-learn's `MultinomialNB` (the scikit-learn run itself is not shown in this notebook).