Import neccessary libraries

In [1]:
import matplotlib.pyplot as plt
import copy
import numpy as np
import pandas as pd
from nltk.tokenize import TweetTokenizer
import string
import pickle
import os
import math
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
from nltk.corpus import stopwords

Get stop-words and puctuations

In [2]:
stopWords = set(stopwords.words('english'))
punctuation = [' ', '!', '(', ')', '()', '-', '[', ']', '[]', '{}', '{', '}', ';', ':', '\', ', '<', '>', '.', '/', '?', '@', '#', '$', '%', '^', '&', '*', '_', '~']

In [3]:
X, Y = [], []
directory = os.listdir('20_newsgroups')

for folder in directory:
    for file in os.listdir('20_newsgroups/' + folder):
        with open('20_newsgroups/' + folder + '/' + file, "r") as f_open:
            X.append( (f_open.read()) )
            Y.append(folder)

In [4]:
from sklearn import model_selection
xTrain, xTest, yTrain, yTest = model_selection.train_test_split(X, Y, test_size = 0.25, random_state = 1)

Create a dictionary of words present in the text

In [17]:
dictionary = {}
for i in range(len(xTrain)):
    for word in xTrain[i].split():
        lowWord = word.strip(string.punctuation).lower()
        if len(lowWord) < 2:
            continue
        if lowWord in dictionary:
            dictionary[lowWord] += 1
        else:
            dictionary[lowWord] = 1

Remove stop-words and puctuations

In [18]:
for word in punctuation:
    if word in dictionary:
        del dictionary[word]
for word in stopWords:
    if word in dictionary:
        del dictionary[word]

Remove less frequent words

In [20]:
cutOff = 100
wordFeature = []
for word in dictionary.keys():
    if dictionary[word] >= cutOff:
        wordFeature.append(word)

In [21]:
len(wordFeature)

3716

First we create a training dataframe and a testing dataframe
Save them in a txt file

In [23]:
trainDataframe = np.zeros((len(xTrain), len(wordFeature)))
for i in range(len(xTrain)):
    for word in xTrain[i].split():
        lowWord = word.strip(string.punctuation).lower()
        if lowWord in wordFeature:
            trainDataframe[i][wordFeature.index(lowWord)] += 1

In [25]:
pickle_out = open('trainDataframe.txt', 'wb')
pickle.dump(trainDataframe, pickle_out)
pickle_out.close()

In [28]:
testDataframe = np.zeros( (len(xTest), len(wordFeature)))
for i in range(len(xTest)):
    for word in xTest[i].split():
        lowWord = word.strip(string.punctuation).lower()
        if lowWord in wordFeature:
            testDataframe[i][wordFeature.index(lowWord)] += 1

In [29]:
pickle_out = open('testDataframe.txt', 'wb')
pickle.dump(testDataframe, pickle_out)
pickle_out.close()

Check for GaussianNB results

In [30]:
clf = GaussianNB()
clf.fit(trainDataframe, yTrain)
yPredGauss = clf.predict(testDataframe)

In [31]:
print(classification_report(yTest, yPredGauss))

                          precision    recall  f1-score   support

             alt.atheism       0.67      0.87      0.76       261
           comp.graphics       0.63      0.69      0.66       248
 comp.os.ms-windows.misc       0.81      0.74      0.77       253
comp.sys.ibm.pc.hardware       0.72      0.68      0.70       260
   comp.sys.mac.hardware       0.76      0.83      0.80       266
          comp.windows.x       0.89      0.85      0.87       265
            misc.forsale       0.77      0.73      0.75       252
               rec.autos       0.77      0.78      0.78       223
         rec.motorcycles       0.87      0.90      0.88       293
      rec.sport.baseball       0.89      0.90      0.90       245
        rec.sport.hockey       0.91      0.93      0.92       247
               sci.crypt       0.86      0.91      0.89       248
         sci.electronics       0.75      0.80      0.78       239
                 sci.med       0.88      0.85      0.87       236
         

# Own Naive Bayes Classifier class

In [7]:
class ownNaiveBayes:
    def __init__(self):
        self.dictn = {}
        self.cls = []
        
    def fit(self, xTrain, yTrain):
        self.cls = set(yTrain)
        for cl in self.cls:
            self.dictn[cl] = {}
            for k in range(len(xTrain[0])):
                self.dictn[cl][k] = 0
            self.dictn[cl]['classCnt'] = 0
            self.dictn[cl]['totWordCnt'] = 0
        self.dictn['totalCnt'] = len(yTrain)
        
        for classIndex in range(len(xTrain)):
            for wordIndex in range(len(xTrain[0])):
                self.dictn[yTrain[classIndex]][wordIndex] += xTrain[classIndex][wordIndex]
                self.dictn[yTrain[classIndex]]['totWordCnt'] += xTrain[classIndex][wordIndex]
            self.dictn[yTrain[classIndex]]['classCnt'] += 1
        
        return
    
    def getAnsSinglePoint(self, xSingle):
        ans = ""
        maxProb = -10000000
        
        for cl in self.cls:
            curProb = np.log(self.dictn[cl]['classCnt']) - np.log(self.dictn['totalCnt']) 
            #  self.dictn[cl]['classCnt']/self.dictn['totalCnt']
            for i in range(len(xSingle)):
                curProb += ( np.log(self.dictn[cl][i] + 1) - np.log(self.dictn[cl]['totWordCnt'] + len(xSingle)) ) * xSingle[i]
            if(curProb > maxProb):
                maxProb = curProb
                ans = cl
        
        return ans
    
    def predict(self, xTest):
        yPredTest = []
        for i in range(len(xTest)):
            yPredTest.append(self.getAnsSinglePoint(xTest[i]))
        return yPredTest
              
    
    

In [6]:
pickle_in = open("trainDataframe.txt", 'rb')
trainDataframe = pickle.load(pickle_in)
pickle_in = open("testDataframe.txt", 'rb')
testDataframe = pickle.load(pickle_in)

In [8]:
clf = ownNaiveBayes()
clf.fit(trainDataframe, yTrain)

In [9]:
yPredOwn = clf.predict(testDataframe)

In [10]:
print(classification_report(yTest, yPredOwn))

                          precision    recall  f1-score   support

             alt.atheism       0.73      0.84      0.78       261
           comp.graphics       0.76      0.82      0.79       248
 comp.os.ms-windows.misc       0.87      0.84      0.85       253
comp.sys.ibm.pc.hardware       0.82      0.84      0.83       260
   comp.sys.mac.hardware       0.89      0.91      0.90       266
          comp.windows.x       0.93      0.83      0.88       265
            misc.forsale       0.81      0.89      0.85       252
               rec.autos       0.86      0.89      0.87       223
         rec.motorcycles       0.92      0.96      0.94       293
      rec.sport.baseball       0.96      0.96      0.96       245
        rec.sport.hockey       0.98      0.97      0.98       247
               sci.crypt       0.95      0.90      0.92       248
         sci.electronics       0.83      0.90      0.86       239
                 sci.med       0.97      0.90      0.94       236
         