In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from collections import Counter

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

In [3]:
# Words ignored when building the vocabulary: NLTK's English stop words
# (de-duplicated through a set) followed by extra tokens - presumably
# newsgroup header fields and quoting/punctuation noise seen in this corpus.
stop_words = list(set(stopwords.words("english"))) + [
    " ", "i", ">",
    "subject:", "from:", "lines:", "organisation:", "|", "-",
    "would", "(1)", "re:", "organization:", "--",
]

In [4]:
# Root folder of the training corpus; getFeatures and prepareTrainDataset
# treat every sub-folder as one class. The default is the original
# machine-specific location; set NB_TRAIN_DIR to run the notebook elsewhere
# without editing this cell.
directories = os.environ.get(
    "NB_TRAIN_DIR",
    "/home/vishal_rfx/Desktop/Machine learning codes/Text classification using naive bayes/train",
)

In [5]:
def getFeatures(directories,k,stop_words):
    """Select the ``k`` most frequent non-stop-word tokens of a corpus.

    ``directories`` is a folder whose sub-folders each hold the documents of
    one class. Every document is read as ISO-8859-1, split on whitespace and
    lower-cased; tokens found in ``stop_words`` are ignored.

    Parameters
    ----------
    directories : str
        Path of the corpus root folder.
    k : int or None
        Number of top words to keep (``None`` keeps every word, as
        ``Counter.most_common`` does).
    stop_words : iterable of str
        Lower-case tokens to exclude from the vocabulary.

    Returns
    -------
    numpy.ndarray
        The selected words, most frequent first.
    """
    # A set gives constant-time membership tests; the list passed in by the
    # notebook would be scanned linearly for every single token.
    excluded = set(stop_words)
    vocabulary = Counter()
    for class_name in os.listdir(directories):
        class_dir = os.path.join(directories, class_name)
        for file_name in os.listdir(class_dir):
            # The context manager closes the handle even if reading fails.
            with open(os.path.join(class_dir, file_name), encoding="ISO-8859-1") as file_obj:
                tokens = file_obj.read().split()
            vocabulary.update(
                word for word in (token.lower() for token in tokens)
                if word not in excluded
            )
    return np.array([word for word, _ in vocabulary.most_common(k)])


In [6]:
def prepareTrainDataset(directories,features):
    """Build the bag-of-words training matrix and its label vector.

    Every sub-folder of ``directories`` is one class (the folder name is the
    label) and every file inside it is one document. Documents are read as
    ISO-8859-1, split on whitespace and lower-cased; only tokens present in
    ``features`` are counted.

    Parameters
    ----------
    directories : str
        Path of the folder holding one sub-folder per class.
    features : sequence of str
        Vocabulary words; position ``i`` becomes column ``i`` of the matrix.

    Returns
    -------
    x_dataset : numpy.ndarray
        One row of integer word counts per document.
    y_dataset : numpy.ndarray
        The class-folder name of each document, in the same row order.
    """
    # Word -> column lookup built once. The original scanned the whole
    # vocabulary with np.where for every token of every document.
    # setdefault keeps the first position should a word be listed twice,
    # matching the "first match" the np.where version used.
    column_of = {}
    for position, feature_word in enumerate(features):
        column_of.setdefault(str(feature_word), position)
    len_of_features = len(features)

    x_dataset = []
    y_dataset = []
    for class_name in os.listdir(directories):
        class_dir = os.path.join(directories, class_name)
        for file_name in os.listdir(class_dir):
            y_dataset.append(class_name)
            # The context manager closes the handle even if reading fails.
            with open(os.path.join(class_dir, file_name), encoding="ISO-8859-1") as file_obj:
                txt_list = file_obj.read().split()
            freq_data = np.zeros(len_of_features, dtype=int)
            for word in txt_list:
                column = column_of.get(word.lower())
                if column is not None:
                    freq_data[column] += 1
            x_dataset.append(freq_data)

    return np.array(x_dataset), np.array(y_dataset)
                
                    
                    
                
    

In [46]:
# Vocabulary: the 10000 most frequent non-stop-word tokens found under the training folder.
features = getFeatures(directories,10000,stop_words)

In [47]:
# Sanity check: one entry per selected vocabulary word.
features.shape

(10000,)

In [48]:
# Training data: x holds per-document word counts (one column per feature), y the class-folder names.
x,y = prepareTrainDataset(directories,features)

In [9]:
# One label per training document.
y.shape

(11314,)

In [10]:
# Rows = training documents, columns = vocabulary words.
# NOTE(review): the output recorded below this cell shows 2000 columns although
# the vocabulary built above has 10000 entries; it looks like a stale result from
# an earlier run (the execution counts are out of order) - re-run to confirm.
x.shape

(11314, 2000)

In [29]:
def fit(x,y,features):
    """Collect the counts the naive Bayes classifier needs from the training data.

    Parameters
    ----------
    x : numpy.ndarray
        Word-count matrix, one row per document and one column per feature.
    y : numpy.ndarray
        Class label of each row of ``x``.
    features : sequence of str
        Vocabulary words, aligned with the columns of ``x``.

    Returns
    -------
    dict
        ``count["total_documents"]`` is the number of training documents.
        For every class label, ``count[label]`` is a dict holding
        ``"total_documents"`` (documents of that class), one entry per
        feature word (occurrences of the word in that class) and
        ``"total_words"`` (sum of those occurrences).
    """
    count = {"total_documents": len(y)}
    for label in np.unique(y):
        rows_of_label = np.where(y==label)[0]
        # Column totals over this class's documents, all features at once.
        occurrences = x[rows_of_label].sum(axis=0)

        per_class = {"total_documents": len(rows_of_label)}
        words_seen = 0
        for column, word in enumerate(features):
            word_total = occurrences[column]
            per_class[word] = word_total
            words_seen += word_total
        per_class["total_words"] = words_seen
        count[label] = per_class

    return count
            
            
            
        

In [49]:
# Per-class document and word counts gathered from the training matrix.
count = fit(x,y,features)


In [31]:
# Inspect the counts stored for one class.
# NOTE(review): this displays the whole dictionary (one entry per vocabulary word
# plus the totals); consider showing only a small slice.
count["alt.atheism"]

{'total_documents': 480,
 'writes:': 535,
 'one': 551,
 '|>': 666,
 'article': 399,
 'like': 259,
 'x': 11,
 'people': 483,
 'get': 136,
 'nntp-posting-host:': 232,
 ':': 446,
 'university': 156,
 'know': 211,
 '>>': 420,
 '*': 72,
 'think': 360,
 'use': 81,
 'new': 81,
 'also': 145,
 'could': 212,
 '1': 16,
 "i'm": 157,
 "max>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'": 0,
 'good': 141,
 'may': 145,
 'even': 225,
 'many': 244,
 'make': 172,
 'two': 88,
 'see': 180,
 'distribution:': 66,
 'much': 135,
 'time': 144,
 '2': 18,
 'first': 102,
 'it.': 150,
 'want': 91,
 'need': 77,
 'anyone': 80,
 'us': 163,
 'way': 149,
 'used': 83,
 'go': 76,
 '0': 3,
 'say': 268,
 'world': 113,
 'really': 113,
 'going': 74,
 'since': 119,
 'something': 170,
 'right': 84,
 '.': 48,
 'still': 94,
 'system': 121,
 'find': 89,
 'computer': 38,
 'using': 45,
 'believe': 222,
 'take': 123,
 'please': 65,
 'reply-to:': 28,
 'must': 204,
 'said': 159,
 'might': 115,
 "i've": 62,
 'last': 32,
 "ca

In [14]:
def predict(X,count,features):
    """Classify every row of ``X``.

    Each row is passed to ``predictOne`` together with ``count`` and
    ``features``; the resulting labels are returned as a numpy array in the
    same order as the rows.
    """
    return np.array([predictOne(row,count,features) for row in X])

        
        

In [15]:
def predictOne(x,count,features):
    """Return the class whose ``probability`` score is highest for document ``x``.

    The keys of ``count`` other than ``"total_documents"`` are the candidate
    classes. When several classes share the best score, the one that comes
    first in ``count`` wins; when ``count`` holds no class at all, -1 is
    returned.
    """
    candidates = [label for label in count.keys() if label != "total_documents"]
    return max(
        candidates,
        key=lambda label: probability(count,x,label,features),
        default=-1,
    )
            
            
        
            
            

In [21]:
def probability(count,x,curr_class,features):
    """Log-score of ``curr_class`` for one document (multinomial naive Bayes).

    The score is ``log(class documents / all documents)`` plus, for every
    feature word, ``frequency * log((word count in class + 1) /
    (total words in class + k))`` where ``k = len(features)`` (add-one
    smoothing).

    Parameters
    ----------
    count : dict
        Counts as produced by ``fit``.
    x : sequence of numbers
        Word frequencies of the document, aligned with ``features``.
    curr_class : hashable
        Key of the class inside ``count``.
    features : sequence of str
        Vocabulary words.

    Returns
    -------
    numpy.float64
        The unnormalised log-probability of the class given the document.
    """
    class_counts = count[curr_class]
    p = np.log(class_counts["total_documents"])-np.log(count["total_documents"])
    k = len(features)
    # The smoothing denominator is identical for every word of this class,
    # so it is computed once instead of once per word occurrence.
    log_denominator = np.log(class_counts["total_words"]+k)
    # Words that do not occur in the document contribute nothing; visit only
    # the non-zero entries instead of looping over the whole vocabulary.
    for i in np.flatnonzero(np.asarray(x)[:k]):
        p_word = np.log(class_counts[features[i]]+1)-log_denominator
        # One multiplication replaces adding p_word `frequency` times; the old
        # countdown loop also never terminated for negative or fractional values.
        p += x[i]*p_word

    return p
        
    
        
    

In [17]:
def prepareTestDataset(directories,features):
    """Build the bag-of-words test matrix and the vector of true labels.

    Every sub-folder of ``directories`` is one class (the folder name is the
    label) and every file inside it is one document. Documents are read as
    ISO-8859-1, split on whitespace and lower-cased; only tokens present in
    ``features`` are counted.

    Parameters
    ----------
    directories : str
        Path of the folder holding one sub-folder per class.
    features : sequence of str
        Vocabulary words; position ``i`` becomes column ``i`` of the matrix.

    Returns
    -------
    x_test : numpy.ndarray
        One row of integer word counts per document.
    y_actual : numpy.ndarray
        The class-folder name of each document, in the same row order.
    """
    # Word -> column lookup built once. The original scanned the whole
    # vocabulary with np.where for every token of every document.
    # setdefault keeps the first position should a word be listed twice,
    # matching the "first match" the np.where version used.
    column_of = {}
    for position, feature_word in enumerate(features):
        column_of.setdefault(str(feature_word), position)
    len_of_features = len(features)

    x_test = []
    y_actual = []
    for class_name in os.listdir(directories):
        class_dir = os.path.join(directories, class_name)
        for file_name in os.listdir(class_dir):
            y_actual.append(class_name)
            # The context manager closes the handle even if reading fails.
            with open(os.path.join(class_dir, file_name), encoding="ISO-8859-1") as file_obj:
                txt_list = file_obj.read().split()
            freq_data = np.zeros(len_of_features, dtype=int)
            for word in txt_list:
                column = column_of.get(word.lower())
                if column is not None:
                    freq_data[column] += 1
            x_test.append(freq_data)

    return np.array(x_test), np.array(y_actual)

In [33]:
# Root folder of the test corpus; prepareTestDataset treats every sub-folder
# as one class. The default is the original machine-specific location; set
# NB_TEST_DIR to run the notebook elsewhere without editing this cell.
test_dir = os.environ.get(
    "NB_TEST_DIR",
    "/home/vishal_rfx/Desktop/Machine learning codes/Text classification using naive bayes/test",
)

In [50]:
# Test data, encoded with the vocabulary selected from the training corpus.
x_test,y_actual = prepareTestDataset(test_dir,features)

In [51]:
# Predicted class label for every test document.
y_pred = predict(x_test,count,features)

In [52]:
# Number of test documents whose predicted label equals the true label.
(y_pred == y_actual).sum()

5804

In [26]:
# Total number of predictions (one per test document); the denominator for the accuracy.
len(y_pred)

7532

In [None]:
# Print each predicted label next to the true label, one test document per line.
for position, actual_label in enumerate(y_actual):
    print(y_pred[position], actual_label)

In [None]:
# Distinct labels the classifier actually predicted.
np.unique(y_pred)

In [53]:
# Per-class precision, recall and F1 on the test set (true labels first, predictions second).
print(classification_report(y_actual,y_pred))

                          precision    recall  f1-score   support

             alt.atheism       0.64      0.70      0.67       319
           comp.graphics       0.58      0.76      0.65       389
 comp.os.ms-windows.misc       0.73      0.65      0.69       394
comp.sys.ibm.pc.hardware       0.63      0.70      0.67       392
   comp.sys.mac.hardware       0.70      0.81      0.75       385
          comp.windows.x       0.91      0.67      0.77       395
            misc.forsale       0.79      0.81      0.80       390
               rec.autos       0.79      0.86      0.83       396
         rec.motorcycles       0.82      0.92      0.87       398
      rec.sport.baseball       0.85      0.87      0.86       397
        rec.sport.hockey       0.94      0.85      0.89       399
               sci.crypt       0.91      0.85      0.88       396
         sci.electronics       0.69      0.66      0.67       393
                 sci.med       0.86      0.75      0.80       396
         

In [None]:
# Size of the stop-word list (it can contain duplicates, since extra tokens are appended to the de-duplicated NLTK words).
len(stop_words)

In [None]:
# Sorts the shared stop_words list in place, for easier reading only:
# getFeatures uses the list purely for membership tests, so order does not matter.
stop_words.sort()

In [None]:
# Display the full stop-word list.
stop_words

In [None]:
def log(a):
    """Return the natural logarithm of ``a`` (thin wrapper around ``np.log``).

    ``a`` may be a scalar or an array-like; the result has the type
    ``np.log`` produces for that input.
    """
    # The previous version stored np.log(a) in an unused local and returned
    # the input unchanged.
    return np.log(a)

In [None]:
from sklearn