In [2]:
# load test and train data
from sklearn.datasets import fetch_20newsgroups
data_train = fetch_20newsgroups(subset = 'train')
data_test = fetch_20newsgroups(subset = 'test')

In [12]:
print(data_train.data[3])
print(data_test.data[0])

From: jgreen@amber (Joe Green)
Subject: Re: Weitek P9000 ?
Organization: Harris Computer Systems Division
Lines: 14
Distribution: world
NNTP-Posting-Host: amber.ssd.csd.harris.com
X-Newsreader: TIN [version 1.1 PL9]

Robert J.C. Kyanko (rob@rjck.UUCP) wrote:
> abraxis@iastate.edu writes in article <abraxis.734340159@class1.iastate.edu>:
> > Anyone know about the Weitek P9000 graphics chip?
> As far as the low-level stuff goes, it looks pretty nice.  It's got this
> quadrilateral fill command that requires just the four points.

Do you have Weitek's address/phone number?  I'd like to get some information
about this chip.

--
Joe Green				Harris Corporation
jgreen@csd.harris.com			Computer Systems Division
"The only thing that really scares me is a person with no sense of humor."
						-- Jonathan Winters

From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)
Subject: Need info on 88-89 Bonneville
Organization: University at Buffalo
Lines: 10
News-Software: VAX/VMS VNEWS 1.41
Nntp-Posti

In [25]:
import numpy as np
import pandas as pd
import operator
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [26]:
# function to make dictionary of words with their frequency as values
def makeDict(data):
    
    result={}
    word_dict={}
    #word _dict stores all the words appearing in the text
    for i in range(len(data)):
        result[i]={}
        temp={}
        total = 1
        for word in data[i].split():
            #remove unwanted characters
            word = word.strip()
            word = word.strip(",")
            word = word.strip(":")
            word = word.strip("'")
            word = word.strip(".")
            word = word.strip("?")
            word = word.strip("!")
            word = word.strip("(")
            word = word.strip(")")
            word = word.strip('"')
            total = total+1
            # update in dictionary
            if( word in temp.keys()):
                temp[word] = temp[word] + 1
                
            else:
                temp[word] = 1
            if( word in word_dict.keys()):
                word_dict[word] = word_dict[word] + 1
            else:
                word_dict[word] = 1
        #store total number of words
        temp["total_count"] = total
        result[i] = temp
    return (result,word_dict)

In [27]:
# function to create feature table for training data using the dictionary of words created
def makeFeatureTable(data_dict,word_dict,y_train):
    
    # list of frequently occuring words which do not help in classifying the document
    remove_words = ["a", "about", "above", "across", "after", "afterwards", 
"again", "all", "almost", "alone", "along", "already", "also",    
"although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "as", "at", "be", "became", "because", "become","becomes", "becoming", "been", "before", "behind", "being", "beside", "besides", "between", "beyond", "both", "but", "by","can", "cannot", "cant", "could", "couldnt", "de", "describe", "do", "done", "each", "eg", "either", "else", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "find","for","found", "four", "from", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "i", "ie", "if", "in", "indeed", "is", "it", "its", "itself", "keep", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mine", "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next","no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part","perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "she", "should","since", "sincere","so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "take","than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they",
"this", "those", "though", "through", "throughout",
"thru", "thus", "to", "together", "too", "toward", "towards",
"under", "until", "up", "upon", "us",
"very", "was", "we", "well", "were", "what", "whatever", "when",
"whence", "whenever", "where", "whereafter", "whereas", "whereby",
"wherein", "whereupon", "wherever", "whether", "which", "while", 
"who", "whoever", "whom", "whose", "why", "will", "with",
"within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves"]
    
    
    
    for x in remove_words:
        if x in word_dict.keys():
            del word_dict[x]
    #sort the dictionary according to frequency of words and select most ocuuring words
    
    sorted_dict = sorted(word_dict.items(),key = operator.itemgetter(1))
    n = len(sorted_dict)
    feature_list = []      # to store features that are to be considered
    for i in range(n-1,n-1001,-1):
        feature_list.append(sorted_dict[i][0])
    feature_table = np.zeros(shape=(len(y_train),1000))      
    
    #create feature table 
    for i in range(len(feature_table)):
        total = data_dict[i]["total_count"]
        for j in range(1000):
            if feature_list[j] in data_dict[i].keys():
                feature_table[i][j] = data_dict[i][feature_list[j]]
    return (feature_table,feature_list)

In [28]:
# function to create feature table for testing data
def makeFeatureTable_test(data_dict_test,feature_list):
    feature_table = np.zeros(shape=(len(data_dict_test),len(feature_list)))
    
    # make feature table using feature list and dictionary
    for i in range(len(data_dict_test)):
        total = data_dict_test[i]["total_count"]
        for j in range(len(feature_list)):
            if feature_list[j] in data_dict_test[i].keys():
                feature_table[i][j] = data_dict_test[i][feature_list[j]]
    return feature_table

In [29]:
# function that creates dictionary of probablities according to training data
def fit(X_train, Y_train):
    result = {}
    class_values = set(Y_train)
    for current_class in class_values:
        result[current_class] = {}
        result["total_data"] = len(Y_train)
        current_class_rows = (Y_train == current_class)
        X_train_current = X_train[current_class_rows]
        Y_train_current = Y_train[current_class_rows]
        num_features = X_train.shape[1]
        result[current_class]["total_count"] = len(Y_train_current)
        for j in range(1, num_features + 1):
            result[current_class][j] = {}
            all_possible_values = set(X_train[:, j - 1])
            for current_value in all_possible_values:
                result[current_class][j][current_value] = (X_train_current[:, j - 1] == current_value).sum()
    return result

In [30]:
def probability(dictionary, x, current_class):
    output = np.log(dictionary[current_class]["total_count"]) - np.log(dictionary["total_data"])
    num_features = len(dictionary[current_class].keys()) - 1;
    for j in range(1, num_features + 1):
        xj = x[j - 1]
        count_current_class_with_value_xj = 1
        if(xj in dictionary[current_class][j].keys()):
            count_current_class_with_value_xj+= dictionary[current_class][j][xj]
        count_current_class = dictionary[current_class]["total_count"] + len(dictionary[current_class][j].keys())
        current_xj_probablity = np.log(count_current_class_with_value_xj) - np.log(count_current_class)
        output = output + current_xj_probablity
    return output

In [32]:
def predictSinglePoint(dictionary, x):
    classes = dictionary.keys()
    best_p = -1000
    best_class = -1
    first_run = True
    
    for current_class in classes:
        if (current_class == "total_data"):
            continue
        p_current_class = probability(dictionary, x, current_class)
        if (first_run or p_current_class > best_p):
            best_p = p_current_class
            best_class = current_class
        first_run = False
    return best_class

In [33]:
def predict(dictionary, X_test):
    y_pred = []
    for x in X_test:
        x_class = predictSinglePoint(dictionary, x)
        y_pred.append(x_class)
    return y_pred

In [34]:
x_train = data_train.data
y_train = data_train.target
x_test = data_test.data
y_test = data_test.target
data_dict,word_dict = makeDict(x_train)
data_dict_test,temp = makeDict(x_test)


In [35]:
feature_table,feature_list = makeFeatureTable(data_dict,word_dict,y_train)
feature_table_test = makeFeatureTable_test(data_dict_test,feature_list)

In [36]:
clf = MultinomialNB()
clf.fit(feature_table,y_train)
y_pred = clf.predict(feature_table_test)

print(accuracy_score(y_test,y_pred))

0.5586829527349974


In [29]:
dictionary = fit(feature_table,y_train)
y_pred = predict(dictionary,feature_table_test)


In [30]:
print(accuracy_score(y_test,y_pred))

0.5169941582580988
