In [59]:
#importing necessary packages
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn import model_selection
from sklearn.datasets import fetch_20newsgroups
import os
import pandas as pd
import numpy as np

In [24]:
#saving urls info in tar file
import urllib.request
urllib.request.urlretrieve ("https://archive.ics.uci.edu/ml/machine-learning-databases/20newsgroups-mld/20_newsgroups.tar.gz", "file.tar.gz")

('file.tar.gz', <http.client.HTTPMessage at 0x220c941db00>)

In [26]:
#extracting contents of tar file
import tarfile
tar = tarfile.open("file.tar.gz")
tar.extractall()
tar.close()

In [27]:
#generating list of stopwords

from nltk.corpus import stopwords
import string
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)
stops

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

In [29]:
directory = [f for f in os.listdir('./20_newsgroups')]
directory

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [32]:
#add all words and their counts to dictionary 'words'
words = {}

for d in directory:
    files = os.listdir('./20_newsgroups/' + d)
    
    for f in files:
        path = './20_newsgroups/' + d + '/' + f
        text = open(path, 'r', errors='ignore').read()
        for word in text.split():
            word = word.lower()
            #only alphabetical words valid
            n = len(word)
            if (word.isalpha() == False and word[0:n-1].isalpha() == False):  
                continue
            if (word[n-1] == ':'):
                continue
            if (word[n-1].isalpha() == False):
                word = word[0:n-1]
            if word not in stops:
                if word in words:
                    words[word] += 1
                else:
                    words[word] = 1
        

In [33]:
#sort words according to frequency
import operator
sorted_x= sorted(words.items(), key=operator.itemgetter(1))
sorted_x.reverse()
words2 = sorted_x

In [39]:
new_words = words2[0:3002]

In [40]:
#convert new words list back to dictionary

words_dict = {}
for t in new_words:
    words_dict[t[0]] = t[1]
    
#words_dict


In [41]:
#delete irrelevant words

del(words_dict['gmt'])
del(words_dict['apr'])
#words_dict

In [49]:
alphas=['A', 'B', 'C']
df = pd.DataFrame(columns=alphas)
arr = np.zeros(3)
ser = pd.Series(arr, index = alphas)
print(ser)
df = df.append(ser, ignore_index=True)
df

A    0.0
B    0.0
C    0.0
dtype: float64


Unnamed: 0,A,B,C
0,0.0,0.0,0.0


In [50]:
#converting dictionary and frequencies to data frame

word_list = list(words_dict.keys())
df = pd.DataFrame(columns = word_list)     #dataframe for storing dictionary of words
Y = []
i = 0
for d in directory:
    files = os.listdir('./20_newsgroups/' + d)
    
    for f in files:
        arr = np.zeros(len(word_list))
        ser = pd.Series(arr, index = word_list)   #need to convert to series to append to datafame
        df = df.append(ser, ignore_index=True)
        path = './20_newsgroups/' + d + '/' + f
        text = open(path,  'r', errors='ignore').read()
        for word in text.split():                 #for each word, increase count if present in word list
            if word.lower() in word_list:
                df.loc[len(df) - 1, word.lower()] += 1
                
        Y.append(i)
        
    i = i+1
                

In [51]:
set(Y)
y = np.array(Y)
X = df.values

In [52]:
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, y)

In [53]:

#function to fit training examples
def fit(X_train, Y_train):
    result = {}         #dictionary to store counts of each word for particular class
    class_values = set(Y_train)   #possible class values
    for current_class in class_values:
        result[current_class] = {}
        result["total_data"] = len(Y_train)   
        current_class_rows = (Y_train == current_class)   #choose only rows which belong to current class
        X_train_current = X_train[current_class_rows]    #pick X and Y for these rows
        Y_train_current = Y_train[current_class_rows]
        num_words = len(word_list)           
        result[current_class]["total_count"] = 0    #total number of words belonging to current class
        for j in range(num_words):
            result[current_class][word_list[j]] = (X_train_current[:, j]).sum()    #add frequencies of word j in current class
            result[current_class]["total_count"] += (X_train_current[:, j]).sum() 
            
                
    return result


def probability(dictionary, x, current_class):
    output = np.log(dictionary[current_class]["total_count"]) - np.log(dictionary["total_data"])
    for j in range(len(word_list)):

        num = dictionary[current_class][word_list[j]] + 1   #count of word in current class
        den = dictionary[current_class]["total_count"] + len(word_list)   #count of all words in current_class
        current_word_probablity = np.log(num) - np.log(den)  #divide - use log
        #for current testing example, multiply probability with number of times the word appears and add to answer
        output = output + current_word_probablity*x[j]
    return output

def predictSinglePoint(dictionary, x):
    classes = dictionary.keys()
    best_p = -1000
    best_class = -1
    first_run = True
    for current_class in classes:
        if (current_class == "total_data"):
            continue
        #find class which maximises total probability for testing example
        p_current_class = probability(dictionary, x, current_class)
        if (first_run or p_current_class > best_p):
            best_p = p_current_class
            best_class = current_class
        first_run = False
    return best_class

#main function to predict class for given testing dataset and dictionary computed in fit.
def predict(dictionary, X_test):
    y_pred = []
    for x in X_test:
        #for each testing example
        x_class = predictSinglePoint(dictionary, x)
        y_pred.append(x_class)
    return y_pred



In [54]:
#fit training examples and predict for test
dict_train = fit(X_train, Y_train)
y_pred = predict(dict_train, X_test)

In [56]:
y_pred

[5,
 18,
 1,
 17,
 8,
 13,
 19,
 12,
 2,
 11,
 17,
 15,
 19,
 7,
 15,
 14,
 5,
 0,
 0,
 10,
 4,
 9,
 8,
 17,
 16,
 19,
 4,
 16,
 11,
 13,
 0,
 2,
 15,
 18,
 11,
 8,
 12,
 15,
 16,
 2,
 17,
 1,
 9,
 8,
 5,
 18,
 17,
 1,
 13,
 19,
 2,
 7,
 3,
 12,
 15,
 16,
 12,
 3,
 18,
 19,
 12,
 7,
 3,
 7,
 3,
 9,
 5,
 8,
 12,
 10,
 2,
 1,
 15,
 11,
 18,
 16,
 16,
 18,
 16,
 16,
 12,
 12,
 7,
 9,
 3,
 1,
 5,
 8,
 3,
 2,
 13,
 9,
 19,
 0,
 1,
 16,
 5,
 3,
 13,
 13,
 6,
 15,
 15,
 1,
 10,
 16,
 16,
 10,
 19,
 2,
 9,
 16,
 7,
 16,
 10,
 13,
 13,
 16,
 17,
 19,
 8,
 10,
 17,
 17,
 10,
 0,
 4,
 4,
 1,
 16,
 7,
 18,
 0,
 9,
 17,
 18,
 1,
 12,
 3,
 10,
 13,
 2,
 15,
 9,
 6,
 4,
 18,
 11,
 5,
 0,
 18,
 6,
 18,
 0,
 12,
 10,
 0,
 16,
 0,
 2,
 7,
 15,
 16,
 18,
 16,
 7,
 11,
 9,
 0,
 2,
 4,
 17,
 12,
 15,
 16,
 19,
 12,
 2,
 9,
 19,
 13,
 9,
 17,
 3,
 9,
 11,
 13,
 17,
 9,
 7,
 6,
 15,
 6,
 10,
 7,
 7,
 11,
 4,
 18,
 13,
 1,
 19,
 13,
 15,
 5,
 16,
 13,
 6,
 16,
 0,
 4,
 1,
 15,
 18,
 13,
 6,
 2,
 18,
 2,
 6,
 

In [57]:
#accuracucy for self-coded classifier
accuracy_score(Y_test, y_pred)

0.767

In [65]:
print(classification_report(Y_test, y_pred))
print(confusion_matrix(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.72      0.67       236
           1       0.67      0.66      0.67       242
           2       0.65      0.74      0.69       231
           3       0.68      0.68      0.68       259
           4       0.75      0.75      0.75       246
           5       0.87      0.73      0.80       252
           6       0.78      0.77      0.77       264
           7       0.77      0.82      0.79       231
           8       0.82      0.88      0.85       234
           9       0.91      0.91      0.91       277
          10       0.92      0.90      0.91       271
          11       0.89      0.90      0.89       226
          12       0.70      0.76      0.72       249
          13       0.89      0.81      0.85       263
          14       0.86      0.86      0.86       240
          15       0.78      0.84      0.81       262
          16       0.69      0.84      0.76       270
          17       0.85    

In [62]:
#USING SKlearn INBUILT CLASSIFIER

clf = MultinomialNB()
clf.fit(X_train, Y_train)
y_pred2 = clf.predict(X_test)
accuracy_score(Y_test, y_pred2)

0.7656

In [None]:
#Thus accuracy for self coded classifier is very slightly higher than multinomial NB Classifier.
#As we can see below, classification report is also nearly same.

In [66]:
print(classification_report(Y_test, y_pred2))
print(confusion_matrix(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.72      0.67       236
           1       0.68      0.66      0.67       242
           2       0.65      0.77      0.70       231
           3       0.68      0.68      0.68       259
           4       0.74      0.74      0.74       246
           5       0.89      0.73      0.80       252
           6       0.75      0.77      0.76       264
           7       0.77      0.81      0.79       231
           8       0.80      0.89      0.84       234
           9       0.89      0.91      0.90       277
          10       0.93      0.88      0.91       271
          11       0.91      0.89      0.90       226
          12       0.68      0.76      0.71       249
          13       0.90      0.81      0.86       263
          14       0.88      0.86      0.87       240
          15       0.78      0.83      0.80       262
          16       0.69      0.84      0.76       270
          17       0.87    