In [19]:
from nltk.tokenize import WordPunctTokenizer       # split words and punctuations
import string
from string import digits
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer

def preprocess(stringDoc):  # stringDoc is a list of lines from one doc
    
    def splitWord(words):       
        new_punctuations = (string.punctuation)
        table = str.maketrans('', '', new_punctuations)     # remove punctuations
        tk = WordPunctTokenizer() 
        new_word = tk.tokenize(words)
        new_word = [word.translate(table) for word in new_word]            # delete those punctuations
        return new_word 
    
    
    def preprocess(words):      # input is an array of words (array of strings)

        table1 = str.maketrans('', '', '\t')        # remove tabs
        words = [word.translate(table1) for word in words]    # translate func works with string    

        #there are enters at several lines, or at the end of a line when reading line by line => remove them
        table3 = str.maketrans('', '', '\n')
        words = [word.translate(table3) for word in words]

        res = []
        for word in words:
            tem = splitWord(word)
            for i in tem:
                res.append(i)
        words = res

        # remove spaces (maybe after translation, --- becomes space)
        words = [word for word in words if word]

        # remove numbers
        words = [word for word in words if not word.isdigit()]

        # remove number from word (ex: 3DDD)
        remove_digits = str.maketrans('', '', digits) 
        words = [word.translate(remove_digits) for word in words]

        # after removing numbers, there might be single characters or 2-letter-characters, remove them
        words = [word for word in words if len(word) > 2]

        # lowercase
        words = [word.lower() for word in words]

        # final step: remove left blank spaces 
        words = [word for word in words if word]

        return words
    
    def remove_metadata(listOfLines):
        for i in range(len(listOfLines)):
            if (listOfLines[i] == '\n'):
                break
        newList = listOfLines[i+1:]
        return newList
    
    def remove_stopwords(words):          # input is an array of words
        stop_words = set(stopwords.words('english'))
        number = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'hundred', 'thousand', '1st', '2nd', '3rd',
        '4th', '5th', '6th', '7th', '8th', '9th', '10th']
        for i in number:
            stop_words.add(i)
        new_words = [word for word in words if word not in stop_words]
        return new_words
    
    
 

    def sentence_tokenize(line):          # token a single sentence in form of a string, return an array of words
        ps = PorterStemmer()
        words = line.strip().split()      # strip() removes leading and trailing spaces, split() removes inbetween spaces, split into list of words 
        words = preprocess(words)         # after process, words are in lowercase => can check with stopwords
        words = remove_stopwords(words)
        words = [ps.stem(word) for word in words]
        return words
    
    def tokenize(listOfLines):           # token a whole document in a form of list of lines (strings)
        # firstly, remove metadata
        #newList = remove_metadata(listOfLines)             

        wordsOfDoc = []     # an array to store words of one document
        for line in listOfLines:
            words = sentence_tokenize(line)
            if (len(words) > 0):              # remove enter-character-line
                wordsOfDoc.append(words)
        return wordsOfDoc
    
    def flatten(list):
        res = []
        for i in list:
            for j in i: 
                res.append(j)
        return res
    
    result = flatten(tokenize(stringDoc))    # transform a document to a vector
    
    return result      # an array of cleaned words from one doc
            

In [146]:
from collections import defaultdict
import numpy as np

class NaiveBayes:
    
    def __init__(self, listClasses):
        self.classes = listClasses
    
    def addToBow(self, example, cat_idx):     # example is a list of words from one doc
        for word in example:
            self.bow_dicts[cat_idx][word] += 1
    
    def train(self, dataset, labels):

        self.dataset = dataset
        self.labels = labels
        self.bow_dicts = np.array([defaultdict(lambda:0) for index in range(self.classes.shape[0])])
        
        if not isinstance(self.labels, np.ndarray): 
            self.labels = np.array(self.labels)
            
        # create bow_dicts
        for cat_idx, cat in enumerate(self.classes):
            idx_of_cat_in_labels = np.where(self.labels == cat)[0]   # positions of doc belonging to cat
            cat_dataset = []
            for idx in idx_of_cat_in_labels:
                cat_dataset.append(dataset[idx])
            for example in cat_dataset:
                self.addToBow(example, cat_idx)
                
        
        prob_classes = np.zeros(self.classes.shape[0])    # pre prob P(c_i)
        cat_words_count = np.empty(self.classes.shape[0])  # total #of words of each category
        denoms = np.empty(self.classes.shape[0])
        
        for cat_idx, cat in enumerate(self.classes):
           
            # caculate P(c_i)
            prob_classes[cat_idx] = np.sum(self.labels == cat) / float(self.labels.shape[0])
            
            # caculate total counts of all the words of each class
            cat_words_count[cat_idx] = sum(list(self.bow_dicts[cat_idx].values()))
        
        #caculate denominator of P(word_j|c_i) for each class
        denoms = np.array([cat_words_count[cat_idx] + len(trimmed_uniqueWords) for cat_idx, cat in enumerate(self.classes)])
            
        self.prob_classes = prob_classes
        self.denoms = denoms
        
        
    def getExampleProb(self, example):     # example is a list of words of one test doc
        likelihood_prob = np.zeros(self.classes.shape[0])   # store log(P(t_j|c_i))
        post_prob = np.zeros(self.classes.shape[0])     # store post prob of each class
        for cat_idx, cat in enumerate(self.classes):
            for word in example:
                word_counts_in_cat = self.bow_dicts[cat_idx].get(word, 0) + 1
                word_prob = word_counts_in_cat / float(self.denoms[cat_idx])
                likelihood_prob[cat_idx] += np.log(word_prob)
            post_prob[cat_idx] = likelihood_prob[cat_idx] + np.log(self.prob_classes[cat_idx])
        
        return post_prob
    
    def test(self, testset):
        predictions = []
        for example in testset:
            post_prob = self.getExampleProb(example)
            predictions.append(self.classes[np.argmax(post_prob)])
        return np.array(predictions)

In [169]:
from os import listdir
from os.path import join

train_path = "D:\\movedFromC\\123\\20192\\PRJ2\\20news-bydate\\20news-bydate-train"
test_path = "D:\\movedFromC\\123\\20192\\PRJ2\\20news-bydate\\20news-bydate-test"

folders = [f for f in listdir(train_path)]     # labels

singleBOW = []
train_label = []
singleBOWTest = []
test_label = []
# read train data
for i in range(len(folders)):
    files = listdir(join(train_path, folders[i]))
    for j in range(len(files)):
        contentFile = open(join(join(train_path, folders[i]), files[j]), "r")
        singleBOW.append(preprocess(contentFile.readlines()))
        train_label.append(i)
        
# read test data
for i in range(len(folders)):
    files = listdir(join(test_path, folders[i]))
    for j in range(len(files)):
        contentFile = open(join(join(test_path, folders[i]), files[j]), "r")
        singleBOWTest.append(preprocess(contentFile.readlines()))
        test_label.append(i)

In [170]:
uniqueWords = []                       # a dictionary of whole docs
for i in range(len(singleBOW)):
    uniqueWords = set(uniqueWords).union(set(singleBOW[i]))


In [171]:
len(uniqueWords)

76336

In [172]:
dfs = dict.fromkeys(uniqueWords, 0)

for tem in singleBOW:
    for word in tem:
        dfs[word] += 1

In [238]:
df = 10

trimmed_uniqueWords = [word for word in uniqueWords if dfs[word] > df]

In [239]:
len(trimmed_uniqueWords)

12493

In [240]:
trimmed_singleBOW = []
for tem in singleBOW:
    trimmed_tem = [word for word in tem if dfs[word] > df]
    trimmed_singleBOW.append(trimmed_tem)

In [176]:
len(trimmed_singleBOW)

11314

In [241]:
import time
nb = NaiveBayes(np.unique(train_label))
t = time.time()
nb.train(trimmed_singleBOW, train_label)
train_time = time.time() - t

In [242]:
t = time.time()

predictions = nb.test(singleBOWTest)

accuracy = np.sum(predictions==test_label) / float(len(test_label))

accuracy * 100

test_time = time.time() - t

In [243]:
accuracy * 100

79.87254381306425

In [244]:
train_time

0.804133415222168

In [245]:
test_time

108.52934098243713

In [246]:
# join. for fit_transform
preprocessedData = []
for i in range(len(singleBOW)):
    preprocessedData.append(" ".join(singleBOW[i]))

In [247]:
preprocessedDataTest = []
for i in range(len(singleBOWTest)):
    preprocessedDataTest.append(" ".join(singleBOWTest[i]))

In [323]:
from sklearn.feature_extraction.text import CountVectorizer
tf_vectorizer = CountVectorizer(min_df = 11)

X_train_tf = tf_vectorizer.fit_transform(preprocessedData)



In [325]:
X_test_tf = tf_vectorizer.transform(preprocessedDataTest)

In [326]:
from sklearn.naive_bayes import MultinomialNB
t_lib = time.time()

naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, train_label)

train_time_lib = time.time() - t_lib

In [327]:
train_time_lib

0.07823395729064941

In [328]:
t_lib = time.time()
y_pred = naive_bayes_classifier.predict(X_test_tf)

accuracy = np.sum(y_pred==test_label) / float(len(test_label))

accuracy * 100

test_time_lib = time.time() - t_lib

In [329]:
accuracy * 100

80.6558682952735

In [330]:
test_time_lib

0.03663825988769531