In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import csv
from numpy.random import RandomState
from scipy.special import logsumexp
import operator



### Creating NaiveBayesian class

In [2]:
class NaiveBayes:
    def fit(self, X, y):
        raise NotImplementedError()
    def _predict_log_proba(self, X):
        jll = self._joint_log_likelihood(X)
        log_prob = logsumexp(jll, axis=1)
        return jll - np.atleast_2d(log_prob).T
    def predict_proba(self, X):
        return np.exp(self._predict_log_proba(X))
    def predict(self, X):
        return self._classes[np.argmax(self._joint_log_likelihood(X), axis=1)]
    def score(self, X, y):
        pred = self.predict(X)
        score = 0.0
        for i in range(pred.shape[0]):
            if (pred[i] == y[i]):
                score += 1
        return score / pred.shape[0]

### Creating Gaussian bayes class to calculate likelihood and fit model for naive bayes

In [None]:
class GaussianBayes(NaiveBayes):
    def fit(self, X, y):
        #y has 2 classes spam and not spam
        unq, unq_counts = np.unique(y, return_counts=True)
        # K x 1
        self._classes = unq
        self.priors = unq_counts / y.shape[0]
        
        self.num_classes = len(unq)
        mean = []
        var = []
        for y_i in unq:
            X_i = X[y == y_i, :]
            mean.append(np.mean(X_i, axis=0))
            var.append(np.var(X_i, axis=0))
        # K x N
        self.mean = self._weights = np.vstack(mean) 
        # K x N
        self.var = np.vstack(var)
        
    def _joint_log_likelihood(self, X):
        prob = []
        epsilon = 1e-9
        
        for k in range(self.num_classes):
            mean = self.mean[k, :]
            var = self.var[k, :] + epsilon # add epsilon so we never divide by zero
            gauss = -0.5 * np.sum(np.log(2.0 * np.pi * var))
            gauss -= 0.5 * np.sum(np.square(X - mean) / var, axis=1)
            prob.append(np.log(self.priors[k]) + gauss)

        prob = np.vstack(prob).T
        return prob

### Initializing Data, Initializing and run Naive bayes classifer for 50 times and take mean of the scores

In [19]:
word_labels = ['address', 'all', '3d', 'our', 'over', 'remove', 'internet','order', 
               'mail', 'receive', 'will', 'people', 'report', 'addresses','free', 
               'business', 'email', 'you', 'credit', 'your', 'font', '000',
               'money', 'hp', 'hpl', 'george', '650', 'lab', 'labs', 'telnet', '857',
               'data', '415', '85', 'technology', '1999', 'parts', 'pm', 'direct',
               'cs', 'meeting', 'original', 'project', 're', 'edu', 'table', 'conference']
scores = []
weights = []
iterations = 50
k = 5

for i in range(iterations):
    data = []
    f = open('spambase.csv')
    reader = csv.reader(f)
    next(reader, None)
    for row in reader:
        data.append(row)
    f.close()

    X = np.array([x[:-1] for x in data]).astype(np.float)
    y = np.array([x[-1] for x in data]).astype(np.float)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RandomState())
    
    #First 48 features chosen
    X_train = X_train[:, 0:48]
    X_test = X_test[:, 0:48]
    clf = GaussianBayes()
    clf.fit(X_train, y_train)
    scores.append(clf.score(X_test, y_test))

print('Average Accuracy: ',(np.mean(scores)*100))

weights = clf._weights
print('\n')
print('Top %d not-spam features:' % k)
print(sorted(zip(word_labels, weights[0, :]), reverse=True, key=operator.itemgetter(1))[:k])
print('\n')
print('Top %d spam features:' % k)
print(sorted(zip(word_labels, weights[1, :]), reverse=True, key=operator.itemgetter(1))[:k])
print('\n')

Average Accuracy:  80.5971014493


Top 5 not-spam features:
[('credit', 1.2791286519733476), ('650', 1.236740133264991), ('hpl', 0.90985135827780572), ('people', 0.51829830855971315), ('font', 0.43942593541773467)]


Top 5 spam features:
[('credit', 2.2459889676910931), ('font', 1.3650039401103229), ('business', 0.5447202521670611), ('people', 0.53241922773837602), ('over', 0.52296296296296274)]


