In [2]:
%matplotlib inline
import pandas as pd
import scipy as sp
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob

from math import ceil, log
from sklearn.base import BaseEstimator
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, make_scorer

In [3]:
def add_map(mp, key, val):
    """Accumulate ``val`` under ``key`` in the mapping ``mp`` (in place).

    When ``key`` is already present the stored value is updated with ``+=``;
    otherwise ``val`` becomes the initial value for that key.
    """
    if key in mp:
        mp[key] += val
        return
    mp[key] = val

def make_data_entry(filename):
    """Parse one message file into a sparse word-presence record.

    The file is read as whitespace-separated integer token ids. The first
    token of the first line is skipped (presumably a header label such as
    ``Subject:`` -- TODO confirm against the data files).

    Parameters
    ----------
    filename : str
        Path of the message file. The message is labelled as spam when the
        path contains the substring ``"spmsg"``.

    Returns
    -------
    dict
        ``{'is_spam': bool, <token id>: 1, ...}`` with one key per distinct
        token id found in the file. An empty file yields only ``'is_spam'``.

    Raises
    ------
    ValueError
        If any token (other than the skipped first one) is not an integer.
    """
    res = {'is_spam': "spmsg" in filename}
    with open(filename, 'r') as f:
        # Use a default instead of a bare next(f): on an empty file next(f)
        # raises StopIteration, and a StopIteration escaping from a function
        # that is being driven by map() ends that iteration silently, which
        # would drop every remaining file without any error.
        first_line = next(f, '')
        for token in first_line.split()[1:]:
            res[int(token)] = 1
        for line in f:
            for token in line.split():
                res[int(token)] = 1
    return res

def make_dataset():
    """Load every message file matching ``data/part*/*.txt`` into a DataFrame.

    Returns
    -------
    pandas.DataFrame
        One row per file, built from the dict returned by ``make_data_entry``:
        an ``'is_spam'`` column plus one column per token id seen in any file.
        Cells for token ids that a message does not contain are left missing
        (NaN); callers are expected to fill them.
    """
    # glob returns paths in arbitrary, OS-dependent order. Sorting fixes the
    # row order so that a seeded train/test split selects the same messages
    # on every machine and every run.
    files = sorted(glob.glob('data/part*/*.txt'))
    entries = [make_data_entry(path) for path in files]
    return pd.DataFrame(entries)

In [4]:
# Token ids that a message does not contain are missing (NaN) after the
# DataFrame is assembled; fill them with 0 so each token column holds 1 where
# the token occurs in the message and 0 where it does not.
data = make_dataset().fillna(0)

In [5]:
# Display the first rows of the assembled table as a quick sanity check.
data.head()

In [6]:
def accuracy_score(true, predicted):
    """Return the fraction of positions where ``predicted`` equals ``true``.

    Parameters
    ----------
    true, predicted : array-like
        Label sequences of equal length. Plain Python lists are accepted as
        well as NumPy arrays.

    Returns
    -------
    float
        Number of matching positions divided by ``len(true)``.
    """
    # Coerce first: comparing two plain lists with == yields a single bool,
    # which has no .sum() and would make the line below fail.
    true = np.asarray(true)
    predicted = np.asarray(predicted)
    return (predicted == true).sum() / len(true)

def f1_score(true, predicted):
    """Return the F1 score of the positive class (label ``1``).

    Parameters
    ----------
    true, predicted : array-like
        Label sequences of equal length. Plain Python lists are accepted as
        well as NumPy arrays.

    Returns
    -------
    float
        Harmonic mean of precision and recall for label ``1``. When there are
        no true positives the score is defined as ``0.0``.
    """
    true = np.asarray(true)
    predicted = np.asarray(predicted)
    tp = ((predicted == 1) & (true == 1)).sum()
    if tp == 0:
        # Without this guard precision and/or recall become 0/0 and the
        # result is NaN, which then turns any average over folds into NaN.
        return 0.0
    # tp > 0 guarantees both denominators below are non-zero.
    rec = tp / (true == 1).sum()
    prec = tp / (predicted == 1).sum()
    return 2 * prec * rec / (prec + rec)

In [7]:
def simple_selector(probs):
    """Pick, for every row of ``probs``, the column index with the highest score."""
    winners = np.argmax(probs, axis=1)
    return winners

def threshold_selector(probs, threshold):
    """Label a row 1 when its class-1 score beats its class-0 score by more than ``threshold``.

    ``probs`` is iterated row by row; each row must be indexable at 0 and 1.
    Returns a NumPy array holding one 0/1 label per row.
    """
    labels = []
    for row in probs:
        margin = row[1] - row[0]
        if margin > threshold:
            labels.append(1)
        else:
            labels.append(0)
    return np.array(labels)

def threshold_selector_factory(threshold):
    """Build a one-argument selector that applies ``threshold_selector`` with a fixed ``threshold``."""
    def select(probs):
        return threshold_selector(probs, threshold)

    return select

def recall0_selector(probs, true_classes, recall):
    """Label rows using a margin threshold chosen to reach a target class-0 recall.

    The margin of a row is ``probs[row, 1] - probs[row, 0]``. The threshold is
    taken from the sorted margins of the rows whose true class is 0, so that
    (ignoring ties) a fraction ``recall`` of those rows ends up labelled 0.

    Parameters
    ----------
    probs : array-like, shape (n_samples, >=2)
        Per-class scores; only columns 0 and 1 are used.
    true_classes : array-like, shape (n_samples,)
        True labels aligned with the rows of ``probs``.
    recall : float
        Target fraction of true class-0 rows to label as 0, in ``[0, 1]``.

    Returns
    -------
    numpy.ndarray
        One 0/1 label per row.

    Raises
    ------
    ValueError
        If ``recall`` is outside ``[0, 1]`` or there are no class-0 rows.
    """
    # Without this check a negative recall makes k <= 0 and the index k - 1
    # wraps around to the end of the array, silently picking a wrong threshold.
    if not 0 <= recall <= 1:
        raise ValueError("recall must be within [0, 1], got {!r}".format(recall))

    probs = np.asarray(probs)
    # Coerce so that a plain list of labels compares element-wise with 0.
    true_classes = np.asarray(true_classes)

    diffs = probs[:, 1] - probs[:, 0]
    class0_diffs = np.sort(diffs[true_classes == 0])
    if class0_diffs.shape[0] == 0:
        raise ValueError("true_classes contains no class-0 samples to calibrate on")

    if recall == 0:
        # Every class-0 margin is >= the smallest one, so all of them become 1.
        return (diffs >= class0_diffs[0]).astype(int)

    # The k smallest class-0 margins are <= the threshold and stay labelled 0.
    k = ceil(class0_diffs.shape[0] * recall)
    threshold = class0_diffs[k - 1]
    print("Used threshold:", threshold)
    return (diffs > threshold).astype(int)

def recall0_selector_factory(true_classes, recall):
    """Build a one-argument selector that applies ``recall0_selector`` with fixed labels and target recall."""
    def select(probs):
        return recall0_selector(probs, true_classes, recall)

    return select

In [8]:
class NaiveBayes(BaseEstimator):
    """Naive Bayes classifier over word-presence features with per-class weights.

    For every class the model stores the class prior and, per feature, the
    smoothed ratio "sum of that feature over the class's training rows /
    number of rows in the class". The score of a sample for a class is::

        lambdas[cl] + log(prior[cl]) + sum(log(word_probas[cl][j]) for x[j] > 0)

    Note that the class weight is *added* to the log score (it is not passed
    through ``log``), and that features equal to zero contribute nothing.

    Parameters
    ----------
    smooth_coef : float
        Additive smoothing term that keeps the per-feature ratios non-zero.
    lambdas : sequence of float or None
        Additive per-class score offsets, indexed by class label. ``None``
        means an offset of 1 for every class.
    selector : callable
        Maps the ``(n_samples, n_classes)`` score matrix to predicted labels.

    Attributes set by ``fit``
    -------------------------
    classes : sorted unique labels seen in ``y``.
    word_probas : ``(n_classes, n_features)`` smoothed per-class feature ratios.
    class_probas : ``(n_classes,)`` class priors.
    """

    def __init__(self, smooth_coef=1e-20, lambdas=None, selector=simple_selector):
        self.smooth_coef = smooth_coef
        self.lambdas = lambdas
        self.selector = selector

    def fit(self, X, y):
        """Estimate class priors and per-class feature ratios.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
        y : array-like, shape (n_samples,)
            Class labels; must be exactly ``0 .. n_classes - 1`` because the
            labels are used directly as row indices of the fitted arrays.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the labels in ``y`` are not ``0 .. n_classes - 1``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        classes = np.unique(y)
        n_classes = classes.shape[0]
        n_samples = y.shape[0]

        # Labels double as array indices below; anything else would either
        # raise an opaque IndexError or (for negative labels) wrap around and
        # silently write into the wrong row.
        if not np.array_equal(classes, np.arange(n_classes)):
            raise ValueError(
                "NaiveBayes expects labels 0..n_classes-1, got {}".format(classes.tolist())
            )

        # self.lambdas is deliberately left untouched: it is a constructor
        # hyper-parameter, and the None default is resolved at scoring time.
        self.classes = classes
        self.word_probas = np.zeros((n_classes, X.shape[1]))
        self.class_probas = np.zeros(n_classes)
        for cl in range(n_classes):
            in_class = y == cl
            cl_count = int(np.sum(in_class))

            # smooth_coef is there so that no zero probabilities come out.
            self.word_probas[cl] = (np.sum(X[in_class], axis=0) + self.smooth_coef) / (
                cl_count + self.smooth_coef * n_samples
            )
            self.class_probas[cl] = float(cl_count) / n_samples
        return self

    def class_log_proba(self, x, cl):
        """Return the weighted, unnormalized log score of sample ``x`` for class ``cl``."""
        weight = 1.0 if self.lambdas is None else self.lambdas[cl]
        present = x > 0
        return (
            weight
            + np.log(self.class_probas[cl])
            + np.sum(np.log(self.word_probas[cl][present]))
        )

    def predict_log_proba(self, X):
        """Return the ``(n_samples, n_classes)`` matrix of per-class scores.

        The values are the unnormalized scores from ``class_log_proba``; they
        are not normalized to sum to one across classes.
        """
        X = np.asarray(X)
        n_classes = self.classes.shape[0]
        probas = np.zeros((X.shape[0], n_classes))
        for i in range(X.shape[0]):
            for cl in range(n_classes):
                probas[i, cl] = self.class_log_proba(X[i], cl)
        return probas

    def predict(self, X):
        """Predict labels by passing the score matrix through ``self.selector``."""
        # Score once; the scoring loop is the expensive part of prediction.
        return self.selector(self.predict_log_proba(X))

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against ``y``."""
        return accuracy_score(y, self.predict(X))

In [9]:
def class_accuracy(y_test, y_pred, cl):
    """Return the share of samples whose true label is ``cl`` that were predicted as ``cl``.

    ``y_test`` and ``y_pred`` are aligned NumPy arrays; only the positions
    where ``y_test == cl`` are taken into account.
    """
    in_class = y_test == cl
    expected = y_test[in_class]
    got = y_pred[in_class]
    n_hits = np.sum(expected == got)
    return float(n_hits) / expected.shape[0]

In [10]:
# Feature matrix: every column except the label, as a plain NumPy array.
X = data.drop('is_spam', axis=1).values
# Labels as integers; is_spam is built as a boolean, so 1 = spam and 0 = ham.
y = data['is_spam'].values.astype(int)
# Hold out 20% of the messages for evaluation; the fixed random_state keeps
# the split repeatable for a given row order.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=43)

In [11]:
# Classifier whose decision threshold is chosen by recall0_selector so that
# every ham message (class 0) in the test set is labelled as ham (recall 1.0).
# NOTE(review): the threshold is derived from y_test itself, so the recalls
# printed below are tuned on the evaluation data rather than measured on
# unseen data.
nb = NaiveBayes(selector=recall0_selector_factory(y_test, 1.0))
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
# class_accuracy restricted to one true class is that class's recall.
print("Ham recall:", class_accuracy(y_test, y_pred, 0))
print("Spam recall:", class_accuracy(y_test, y_pred, 1))

In [16]:
# Fixed decision threshold on the (class 1 - class 0) score margin.
# The printed value is taken from this variable so the report can never
# disagree with the threshold the model actually used.
threshold1 = 345.0
nb1 = NaiveBayes(selector=threshold_selector_factory(threshold1))
nb1.fit(X_train, y_train)
y_pred1 = nb1.predict(X_test)
print("Used threshold:", threshold1)
# class_accuracy restricted to one true class is that class's recall.
print("Ham recall:", class_accuracy(y_test, y_pred1, 0))
print("Spam recall:", class_accuracy(y_test, y_pred1, 1))

In [18]:
# Second fixed decision threshold on the (class 1 - class 0) score margin.
# The printed value is taken from this variable so the report can never
# disagree with the threshold the model actually used.
threshold2 = 400.0
nb2 = NaiveBayes(selector=threshold_selector_factory(threshold2))
nb2.fit(X_train, y_train)
y_pred2 = nb2.predict(X_test)
print("Used threshold:", threshold2)
# class_accuracy restricted to one true class is that class's recall.
print("Ham recall:", class_accuracy(y_test, y_pred2, 0))
print("Spam recall:", class_accuracy(y_test, y_pred2, 1))

In [14]:
# Overall accuracy of the predictions currently held in y_pred.
# NOTE(review): y_pred is assigned by more than one cell in this notebook, so
# the value shown here depends on which of those cells ran last.
accuracy_score(y_test, y_pred)

In [15]:
# F1 score of the spam class (label 1) for the predictions currently in y_pred.
# NOTE(review): y_pred is assigned by more than one cell in this notebook, so
# the value shown here depends on which of those cells ran last.
f1_score(y_test, y_pred)

In [94]:
# Confusion matrix of true labels vs. the predictions currently in y_pred.
# NOTE(review): y_pred is assigned by more than one cell in this notebook, so
# the matrix shown here depends on which of those cells ran last.
confusion_matrix(y_test, y_pred)

In [17]:
%time
ls = np.arange(1, 300, 5)
sc0 = []
sc1 = []

for l in ls:
    nbc = NaiveBayes(lambdas=[l, 1])
    nbc.fit(X_train, y_train)
    y_pred = nbc.predict(X_test)
    
    sc0.append(class_accuracy(y_test, y_pred, 0))
    sc1.append(class_accuracy(y_test, y_pred, 1))

plt.plot(ls, sc0, 'r-', ls, sc1, 'b-')
plt.xlabel('Weight of "false spam" error')
plt.ylabel('Success ratio')
plt.legend(['Ham', 'Spam'])
plt.grid()
plt.show()

In [92]:
# Baseline: equal offsets for both classes and the default selector
# (arg-max over the per-class scores).
nbc = NaiveBayes(lambdas=[1, 1])
nbc.fit(X_train, y_train)
# Note: this reassigns the notebook-wide y_pred.
y_pred = nbc.predict(X_test)
# class_accuracy restricted to one true class is that class's recall.
print('Ham recall: {}'.format(class_accuracy(y_test, y_pred, 0)))
print('Spam recall: {}'.format(class_accuracy(y_test, y_pred, 1)))

In [115]:
# Cross-validated accuracy (cv=10) of a default NaiveBayes on the full data
# set, scored with this notebook's own accuracy_score wrapped by make_scorer.
cv_acc_scores = cross_val_score(NaiveBayes(), X, y, scoring=make_scorer(accuracy_score), cv=10)
# Mean over the folds.
cv_acc_scores.mean()

In [116]:
# Cross-validated F1 (cv=10) of a default NaiveBayes on the full data set,
# scored with this notebook's own f1_score wrapped by make_scorer.
cv_f1_scores = cross_val_score(NaiveBayes(), X, y, scoring=make_scorer(f1_score), cv=10)
# Mean over the folds.
cv_f1_scores.mean()