In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [2]:
data='20_newsgroups'

In [3]:
categories= os.listdir(os.path.join(os.getcwd(),data))

In [4]:
categories

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
print(len(categories))

20


In [6]:
data_dict={}
for type in categories:
    data_dict[type]=[]
    for file in os.listdir(os.path.join(data,type)):
         with open(os.path.join(data,type,file),encoding='latin-1') as opened_file:
            data_dict[type].append(opened_file.read())
print(len(data_dict[categories[1]]))

1000


In [7]:
folders=[i for i in os.listdir(data)]

In [8]:
files = []
for name in folders:
    path = os.path.join(data, name)
    files.append([f for f in os.listdir(path)])

In [9]:
x = []
for fo in range(len(folders)):
    for fi in files[fo]:
        x.append(os.path.join(data, os.path.join(folders[fo], fi)))
len(x)

19997

In [10]:
y = []
for name in folders:
    path = os.path.join(data, name)
    num_of_files= len(os.listdir(path))
    for i in range(num_of_files):
        y.append(name)
len(y)

19997

In [11]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.5)

In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from string import punctuation


In [13]:
def preprocessing(text):
    tokens=word_tokenize(text)
    stop_words=set(stopwords.words('english'))
    punctuations = set(punctuation)
    stop_words.update(punctuations)
    stop_words.add('subject')  
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word.lower()) for word in tokens if word.lower() not in stop_words]
    return tokens
    

In [14]:
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

In [15]:
def sentence_tokenize(line):
    from nltk import word_tokenize
    tokens = word_tokenize(line)
    tokens = preprocessing(tokens)
    tokens = remove_stopwords(tokens)
    
    return tokens

In [16]:
def remove_metadata(lines):
    for i in range(len(lines)):
        if(lines[i] == '\n'):
            start = i+1
            break
    return lines[start:]

In [17]:
def tokenize_and_preprocess(path):
    with open(path, 'r') as f:
        text_lines = f.readlines()
        text_lines = remove_metadata(text_lines)
        doc_words = [preprocessing(line) for line in text_lines]
    return doc_words

In [18]:
def flatten(lst):
    flatten_list = [j for i in lst for j in i]
    return flatten_list

In [20]:
X_tokens = []

for document in x_train:
        X_tokens.append(flatten(tokenize_and_preprocess(document)))

In [21]:
import numpy as np
np_X_tokens = np.asarray(flatten(X_tokens))

In [22]:
words, counts = np.unique(np_X_tokens, return_counts=True)
len(words)

120120

In [23]:
freq, wrds = (list(i) for i in zip(*(sorted(zip(counts, words), reverse=True))))

In [24]:
freq_words = []
no_words = []
for f in sorted(np.unique(freq), reverse=True):
    freq_words.append(f)
    no_words.append(freq.count(f))

In [25]:
n=5000
features=wrds[0:n]

In [26]:
dictionary_train = {}
doc_num = 1
for doc_words in X_tokens:
    dictionary_train[doc_num]={}
    for word in doc_words:
        if word in dictionary_train[doc_num]:
            dictionary_train[doc_num][word]+=1
        else:
            dictionary_train[doc_num][word]=1
    doc_num+=1

In [27]:
X_train = []
for k in dictionary_train.keys():
    row = []
    for f in features:
        if(f in dictionary_train[k].keys()):
            row.append(dictionary_train[k][f]) 
        else:
            row.append(0)
    X_train.append(row)

In [28]:
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [29]:
Y_tokens = []
for document in x_test:
        Y_tokens.append(flatten(tokenize_and_preprocess(document)))

In [30]:
dictionary_test = {}
doc_num = 1
for doc_words in Y_tokens:
    dictionary_test[doc_num]={}
    for word in doc_words:
        if word in dictionary_test[doc_num]:
            dictionary_test[doc_num][word]+=1
        else:
            dictionary_test[doc_num][word]=1
    doc_num+=1

In [31]:
X_test = []
for k in dictionary_test.keys():
    row = []
    for f in features:
        if(f in dictionary_test[k].keys()):
            row.append(dictionary_test[k][f]) 
        else:
            row.append(0)
    X_test.append(row)

In [32]:
X_test = np.asarray(X_test)
y_test = np.asarray(y_test)

In [33]:
def naive_bayes_classifier(X_train, y_train):
    result = {"TOTAL_DATA":len(y_train)}
    classes, counts = np.unique(y_train, return_counts=True)    
    for i in range(len(classes)):
        curr_class = classes[i]
        result[curr_class] = {}
        X_tr_curr = X_train[y_train == curr_class]
        total_count=counts[i]
        for j in range(X_train.shape[1]):
            feature_name=features[j]
            feature_sum=np.sum(X_tr_curr[:,j])
            result[curr_class][feature_name] = feature_sum 
        result[curr_class]["TOTAL_COUNT"] = total_count
    return result

In [34]:
def probablity(dictionary_train, x, curr_class):
    output = np.log(dictionary_train[curr_class]["TOTAL_COUNT"]) - np.log(dictionary_train["TOTAL_DATA"])
    num_words = len(x)
    for j in range(num_words):
        if(x[j] in dictionary_train[curr_class].keys()):
            xj = x[j]
            count_curr_class_equal_xj = dictionary_train[curr_class][xj] + 1
            count_curr_class = dictionary_train[curr_class]["TOTAL_COUNT"] + len(dictionary_train[curr_class].keys())
            curr_xj_prob = np.log(count_curr_class_equal_xj) - np.log(count_curr_class)
            output = output + curr_xj_prob
        else:
            continue 
    return output

In [35]:
def predictunit(dictionary_train, x):
    classes = dictionary_train.keys()
    best_p = -10000
    best_class = -1
    for curr_class in classes:
        if(curr_class == "TOTAL_DATA"):
            continue
        p_curr_class = probablity(dictionary_train, x, curr_class)
        if(p_curr_class > best_p):
            best_p = p_curr_class
            best_class = curr_class
            
    return best_class

In [36]:
def predict(dictionary_train, X_test):
    Y_pred = []
    for x in X_test:
        y_predicted = predictunit(dictionary_train, x)
        Y_pred.append(y_predicted)
    
    return Y_pred

In [37]:
model = naive_bayes_classifier(X_train, y_train)

In [38]:
X_test = []

for key in dictionary_test.keys():
    X_test.append(list(dictionary_test[key].keys()))

In [39]:
prediction = predict(model, X_test)

In [40]:
prediction = np.asarray(prediction)

In [41]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
accuracy_score(y_test, prediction)

0.5192519251925193

In [42]:
print(confusion_matrix(y_test, prediction))

[[320   0   0   0   0   1   0   0   0   0   0   2   0   1   0  32   3  36
   71  29]
 [  0 319   6   4   0  56   0   0   0   0   0  31   0   3   2   4   1  13
   41   1]
 [  2  87  99  39   0 155   0   0   0   0   0  60   0   2   1   0   5   4
   48   1]
 [  1  55  15 245   0  38   4   0   0   0   0  60   1   5   3   3   2   5
   46   0]
 [  0  75  13  77  92  29   2   0   0   0   0  94   1   5   1   1   3  11
   69   0]
 [  0  40   5   1   0 379   1   0   0   0   0  17   0   1   1   0   2   0
   20   1]
 [  1  57  11  53   3  30 109  13   1   1   0  45   9   8   7   1   6  32
  117   1]
 [  0   9   1   0   0   5   2 120   1   1   0  19   0   5  11   0  11  54
  268   0]
 [  0   2   1   2   0   6   1   4 125   0   0  10   0   4   5   1  23  38
  295   0]
 [  0   7   1   0   0   2   0   0   0 265  20   4   0   2   1   2  12  31
  161   1]
 [  1   1   1   0   0   1   0   0   0   2 367   5   0   0   0   2   4   6
  123   1]
 [  1   2   1   0   0   1   0   0   0   0   0 410   0   1   0   0

In [43]:
print(classification_report(y_test, prediction))

                          precision    recall  f1-score   support

             alt.atheism       0.68      0.65      0.66       495
           comp.graphics       0.43      0.66      0.53       481
 comp.os.ms-windows.misc       0.61      0.20      0.30       503
comp.sys.ibm.pc.hardware       0.56      0.51      0.53       483
   comp.sys.mac.hardware       0.97      0.19      0.32       473
          comp.windows.x       0.52      0.81      0.64       468
            misc.forsale       0.92      0.22      0.35       505
               rec.autos       0.88      0.24      0.37       507
         rec.motorcycles       0.98      0.24      0.39       517
      rec.sport.baseball       0.97      0.52      0.68       509
        rec.sport.hockey       0.95      0.71      0.81       514
               sci.crypt       0.43      0.82      0.56       501
         sci.electronics       0.81      0.09      0.17       487
                 sci.med       0.85      0.58      0.69       532
         