In [65]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
import glob
from sklearn.feature_extraction.text import CountVectorizer

In [66]:
%matplotlib inline

In [67]:
names = ["word_freq_make",
         "word_freq_address",
         "word_freq_all",
         "word_freq_3d",
         "word_freq_our",
         "word_freq_over",
         "word_freq_remove",
         "word_freq_internet",
         "word_freq_order",
         "word_freq_mail",
         "word_freq_receive",
         "word_freq_will",
         "word_freq_people",
         "word_freq_report",
         "word_freq_addresses",
         "word_freq_free",
         "word_freq_business",
         "word_freq_email",
         "word_freq_you",
         "word_freq_credit",
         "word_freq_your",
         "word_freq_font",
         "word_freq_000",
         "word_freq_money",
         "word_freq_hp",
         "word_freq_hpl",
         "word_freq_george",
         "word_freq_650",
         "word_freq_lab",
         "word_freq_labs",
         "word_freq_telnet",
         "word_freq_857",
         "word_freq_data",
         "word_freq_415",
         "word_freq_85",
         "word_freq_technology",
         "word_freq_1999",
         "word_freq_parts",
         "word_freq_pm",
         "word_freq_direct",
         "word_freq_cs",
         "word_freq_meeting",
         "word_freq_original",
         "word_freq_project",
         "word_freq_re",
         "word_freq_edu",
         "word_freq_table",
         "word_freq_conference",
         "char_freq_;",
         "char_freq_(",
         "char_freq_[",
         "char_freq_!",
         "char_freq_$",
         "char_freq_#",
         "capital_run_length_average",
         "capital_run_length_longest",
         "capital_run_length_total",
         "spam_filter"]

In [68]:
spambase = pd.read_csv("./spambase/spambase.data", names=names, header=0)

In [70]:
y = spambase["spam_filter"]

In [71]:
x = spambase.drop('spam_filter', 1)

In [89]:
train_input, test_input, train_spam, test_spam = train_test_split(x, y, test_size=0.40, random_state=15)

# Multinomial

In [91]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()

In [92]:
classifier.fit(train_input, train_spam)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [93]:
classifier.score(train_input, train_spam)

0.79492753623188406

In [94]:
classifier.score(test_input, test_spam)

0.79836956521739133

# Gaussian

In [95]:
from sklearn.naive_bayes import GaussianNB
g_classifier = GaussianNB()

In [96]:
g_classifier.fit(train_input, train_spam)

GaussianNB()

In [97]:
g_classifier.score(train_input, train_spam)

0.81159420289855078

In [98]:
g_classifier.score(test_input, test_spam)

0.80978260869565222

# Bernoulli

In [99]:
from sklearn.naive_bayes import BernoulliNB
b_classifier = BernoulliNB()

In [100]:
b_classifier.fit(train_input, train_spam)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [103]:
b_classifier.score(train_input, train_spam)

0.87898550724637681

In [102]:
b_classifier.score(test_input, test_spam)

0.8880434782608696

# Hard mode

In [None]:
import itertools
combos = list(itertools.combinations(train_input, 15))

In [None]:
def regression_for(combo):
    combo = list(combo)
    input_data = spambase[combo]
    spam = spambase['spam_filter']
    regr = linear_model.LinearRegression()
    regr.fit(input_data, spam)
    return regr, regr.score(input_data, spam)

In [None]:
choices = []

for combo in combos:
    regr, score = regression_for(combo)
    choices.append((combo, score))
                   
best = sorted(choices, key=lambda x: x[1])[-1]
print(best)
regr, score = regression_for(best[0])
print(regr.coef_, regr.intercept_)