## Importing the Libraries

In [0]:
import numpy as np
from os import listdir
from os.path import isfile, join
import string
import pandas as pd
import os
from nltk.corpus import stopwords

## Loading the dataset

In [2]:
import urllib.request
urllib.request.urlretrieve ("https://archive.ics.uci.edu/ml/machine-learning-databases/20newsgroups-mld/20_newsgroups.tar.gz", "a.tar.gz")
urllib.request.urlretrieve ("http://archive.ics.uci.edu/ml/machine-learning-databases/20newsgroups-mld/mini_newsgroups.tar.gz", "b.tar.gz")

('b.tar.gz', <http.client.HTTPMessage at 0x7f46f6f3cf60>)

In [0]:
import tarfile
tar = tarfile.open("a.tar.gz")
tar2 = tarfile.open("b.tar.gz")
tar.extractall()
tar2.extractall()
tar.close()
tar2.close()

In [0]:
stop_words = set(stopwords.words('english'))
block_wrds = ['sender:','subject:','writes:','references:','organization:','from:','date:','>i','22','|>','>>','reply-to:','xref:','newsgroups:','>in','>the','message-id:','lines:','path:','re:','--','sender:','last','better','never','every','even','two','good','used','first','need','going','must','really','might','well','without','made','give','look','try','far','less','seem','new','make','many','way','since','using','take','help','thanks','send','free','may','see','much','want','find','would','one','like','get','use','also','could','say','us','go','please','said','set','got','sure','come','lot','seems','able','anything','put']

## Creating the BOW (Bag of Words) Model

In [0]:
dictionary = {}
count=0
for file in os.listdir("mini_newsgroups"): # making the features list by finding the frequency of each word in the docs 
    for files in os.listdir("mini_newsgroups/"+file):
        f = open("mini_newsgroups/"+file+"/"+files,'r',errors='ignore')
        message = f.read()
        message = message.split()
        k =1
        for i in message:
            count +=1
            if(len(i) > 1):
                if not i.lower() in stop_words:
                    if not i.lower() in block_wrds:
                        if(i.lower() in dictionary.keys()):
                            dictionary[i.lower()] = dictionary[i.lower()] +1
                        else:
                            dictionary[i.lower()] = 1


        f.close()

In [0]:
#sorted_vocab
import operator
sorted_vocab = sorted(dictionary.items(), key= operator.itemgetter(1), reverse= True)   # sort the vocab based on frequency

## Extracting features

In [0]:
feature_names = []
for i in range(len(sorted_vocab)):
    if(sorted_vocab[1000][1] <= sorted_vocab[i][1]):
        feature_names.append(sorted_vocab[i][0])

In [11]:
print("Number of top features =",len(feature_names))

Number of top features = 1007


## Creating the dataset

In [12]:
df = pd.DataFrame(columns=feature_names)
count=0
for file in os.listdir("20_newsgroups"):
    for files in os.listdir("20_newsgroups/"+file):
        count=count+1
        df.loc[len(df)] = np.zeros(len(feature_names))
        f = open("20_newsgroups/"+file+"/"+files,'r',errors='ignore')
        message = f.read()
        message = message.split()
        k =0
        for i in message:
            if(i.lower() in df.columns):
                df[i.lower()][len(df)-1] += 1
        f.close()
print("Total files present = ",count)

Total files present =  19997


## Labelling the output

In [0]:
y=[]
i=0
count=0
for file in os.listdir("20_newsgroups"):
    for files in os.listdir("20_newsgroups/"+file):
        count+=1
        y.append(i)
    i=i+1
    
set(y)
x = df.values

## Splitting the data for INBUILT MNB CLASSIFIER

In [0]:
from sklearn import model_selection
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size = 0.25, random_state = 0)

In [0]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

In [18]:
from sklearn.metrics import classification_report, f1_score, confusion_matrix
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.85      0.90      0.87       220
           1       0.83      0.70      0.76       300
           2       0.91      0.92      0.91       248
           3       0.85      0.83      0.84       247
           4       0.88      0.80      0.83       260
           5       0.88      0.68      0.77       311
           6       0.83      0.91      0.87       239
           7       0.96      0.97      0.96       268
           8       0.85      0.89      0.87       270
           9       1.00      0.92      0.96       265
          10       0.77      0.69      0.73       259
          11       0.83      0.86      0.84       228
          12       0.87      0.81      0.84       262
          13       0.86      0.93      0.89       237
          14       0.44      0.54      0.49       200
          15       0.84      0.88      0.86       243
          16       0.83      0.88      0.85       236
          17       0.93    

## Self Implemented MNB

In [0]:
df['out'] = y
Y = df['out']
X = df.iloc[:,:-1]

In [0]:
from sklearn import model_selection
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size = 0.25, random_state = 0)

In [0]:
def fit(X_train,Y_train):
    result = {}
    result["total_data"] = len(Y_train)
    output_classes = set(Y_train)
    for current_class in output_classes:
        result[current_class] = {}
        
        current_class_rows = (Y_train == current_class)
        X_train_current = X_train[current_class_rows]
        Y_train_current = Y_train[current_class_rows]
        
        sum = 0
        for j in (feature_names):
            result[current_class][j] = X_train_current[j].sum()
            sum += result[current_class][j]
            
        result[current_class]["total_count"] = sum
            
    return result        
        

In [0]:
def probability(dictionary, row, current_class):
    output = np.log(dictionary[current_class]["total_count"]) - np.log(dictionary["total_data"])
    
    for index,count in row.iteritems():
        
        feature_name = index
        feature_count = count
        
        num = dictionary[current_class][feature_name] + 1                          #  LAPLACE CORRECTION
        den = dictionary[current_class]['total_count'] + len(feature_names)
        
        current_word_prob = np.log(num) - np.log(den)
        for i in range(int(count)):
          output += current_word_prob        
    return output

In [0]:
def predict_single_row(dictionary, row):
    classes = dictionary.keys()
    best_class = -1
    best_p = -10
    first_run = True
    
    for current_class in classes:
      
        if current_class == 'total_data':
            continue
        p_current_class = probability(dictionary,row,current_class)
        if (first_run or p_current_class > best_p):     # comapare each classes and find the best one
            best_p = p_current_class
            best_class = current_class
        first_run = False
    return best_class


In [0]:
def predict(dictionary,x_test):
    Y_pred = []
    for j in x_test.iterrows():
    
        x_class = predict_single_row(dictionary,j[1]) # pass each document (row) to the predict_single_row function
        Y_pred.append(x_class)

    return Y_pred

In [0]:
dictionary = fit(x_train,y_train)

In [26]:
y_pred = predict(dictionary,x_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.86      0.88      0.87       226
           1       0.84      0.71      0.77       302
           2       0.92      0.92      0.92       248
           3       0.86      0.83      0.85       247
           4       0.89      0.81      0.84       259
           5       0.87      0.70      0.78       298
           6       0.84      0.90      0.87       242
           7       0.97      0.97      0.97       269
           8       0.85      0.89      0.87       271
           9       1.00      0.92      0.96       265
          10       0.78      0.69      0.73       262
          11       0.83      0.85      0.84       229
          12       0.87      0.81      0.84       261
          13       0.86      0.93      0.89       240
          14       0.44      0.55      0.49       195
          15       0.84      0.88      0.86       242
          16       0.83      0.88      0.85       236
          17       0.93    