In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, StratifiedKFold, ShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn import tree
from collections import defaultdict
import math
import re

In [21]:
# Spambase data set from the UCI repository; the file has no header row,
# so the column names are supplied explicitly below.
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"

# 48 word-frequency columns
word_freq_columns = [
    "word_freq_make", "word_freq_address", "word_freq_all", "word_freq_3d",
    "word_freq_our", "word_freq_over", "word_freq_remove", "word_freq_internet",
    "word_freq_order", "word_freq_mail", "word_freq_receive", "word_freq_will",
    "word_freq_people", "word_freq_report", "word_freq_addresses", "word_freq_free",
    "word_freq_business", "word_freq_email", "word_freq_you", "word_freq_credit",
    "word_freq_your", "word_freq_font", "word_freq_000", "word_freq_money",
    "word_freq_hp", "word_freq_hpl", "word_freq_george", "word_freq_650",
    "word_freq_lab", "word_freq_labs", "word_freq_telnet", "word_freq_857",
    "word_freq_data", "word_freq_415", "word_freq_85", "word_freq_technology",
    "word_freq_1999", "word_freq_parts", "word_freq_pm", "word_freq_direct",
    "word_freq_cs", "word_freq_meeting", "word_freq_original", "word_freq_project",
    "word_freq_re", "word_freq_edu", "word_freq_table", "word_freq_conference",
]

# 6 character-frequency columns
char_freq_columns = [
    "char_freq_;", "char_freq_(", "char_freq_[",
    "char_freq_!", "char_freq_$", "char_freq_#",
]

# 3 capital-letter run-length columns
capital_run_columns = [
    "capital_run_length_average",
    "capital_run_length_longest",
    "capital_run_length_total",
]

# Column order must match the file: features first, target ("is_spam") last
header = word_freq_columns + char_freq_columns + capital_run_columns + ["is_spam"]

source_data_df = pd.read_csv(data_url, names=header)

In [22]:
# Per-column summary statistics (count, mean, std, min, quartiles, max)
source_data_df.describe()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,is_spam
count,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,...,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0
mean,0.104553,0.213015,0.280656,0.065425,0.312223,0.095901,0.114208,0.105295,0.090067,0.239413,...,0.038575,0.13903,0.016976,0.269071,0.075811,0.044238,5.191515,52.172789,283.289285,0.394045
std,0.305358,1.290575,0.504143,1.395151,0.672513,0.273824,0.391441,0.401071,0.278616,0.644755,...,0.243471,0.270355,0.109394,0.815672,0.245882,0.429342,31.729449,194.89131,606.347851,0.488698
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.588,6.0,35.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.065,0.0,0.0,0.0,0.0,2.276,15.0,95.0,0.0
75%,0.0,0.0,0.42,0.0,0.38,0.0,0.0,0.0,0.0,0.16,...,0.0,0.188,0.0,0.315,0.052,0.0,3.706,43.0,266.0,1.0
max,4.54,14.28,5.1,42.81,10.0,5.88,7.27,11.11,5.26,18.18,...,4.385,9.752,4.081,32.478,6.003,19.829,1102.5,9989.0,15841.0,1.0


# Correlation analysis

In [23]:
# Correlation matrix over all columns (DataFrame.corr — correlation, not covariance)
correlation_matrix = source_data_df.corr()

# Keep each column's correlation with the target, largest signed value first
target_correlations = correlation_matrix["is_spam"]
correlation_values = target_correlations.sort_values(ascending=False)
print(correlation_values)

is_spam                       1.000000
word_freq_your                0.383234
word_freq_000                 0.334787
word_freq_remove              0.332117
char_freq_$                   0.323629
word_freq_you                 0.273651
word_freq_free                0.263215
word_freq_business            0.263204
capital_run_length_total      0.249164
word_freq_our                 0.241920
char_freq_!                   0.241888
word_freq_receive             0.234529
word_freq_over                0.232604
word_freq_order               0.231551
word_freq_money               0.216111
capital_run_length_longest    0.216097
word_freq_internet            0.206808
word_freq_email               0.204208
word_freq_all                 0.196988
word_freq_addresses           0.195902
word_freq_credit              0.189761
word_freq_mail                0.138962
word_freq_people              0.132927
word_freq_make                0.126208
capital_run_length_average    0.109999
word_freq_font           

In [24]:
# Rank columns by the magnitude of their correlation with the target.
# Note: the target column itself ("is_spam") is part of correlation_values
# and therefore also appears in the ranking.
feature_correlation_pairs = [
    (column_name, abs(correlation))
    for column_name, correlation in correlation_values.items()
]

# Stable sort on the absolute correlation, strongest first
sorted_feature_correlation_pairs = sorted(
    feature_correlation_pairs,
    key=lambda pair: pair[1],
    reverse=True,
)

# Column names only, in ranked order
most_correlated_features = [name for name, _ in sorted_feature_correlation_pairs]

for sorted_feature_correlation_pair in sorted_feature_correlation_pairs:
    print(sorted_feature_correlation_pair)

('is_spam', 1.0)
('word_freq_your', 0.3832338192835756)
('word_freq_000', 0.3347870388457389)
('word_freq_remove', 0.3321174156141586)
('char_freq_$', 0.3236288064980446)
('word_freq_you', 0.2736512865572106)
('word_freq_free', 0.26321469903669603)
('word_freq_business', 0.2632039828223917)
('word_freq_hp', 0.2567229163126616)
('capital_run_length_total', 0.24916412436334212)
('word_freq_our', 0.2419204377148765)
('char_freq_!', 0.24188836701122152)
('word_freq_receive', 0.23452927138249544)
('word_freq_hpl', 0.23296768680660979)
('word_freq_over', 0.232604300172187)
('word_freq_order', 0.23155143128601532)
('word_freq_money', 0.21611098224724426)
('capital_run_length_longest', 0.21609669406935564)
('word_freq_internet', 0.20680847576170805)
('word_freq_email', 0.20420813879714014)
('word_freq_all', 0.1969879726831455)
('word_freq_addresses', 0.19590245671258405)
('word_freq_credit', 0.1897611485080607)
('word_freq_george', 0.1834040051933605)
('word_freq_1999', 0.17804545978118416)
('

# Select features

In [27]:
# Work on a copy of the loaded frame; every column is kept (no feature subset is chosen here)
ready_data_df = source_data_df.copy()

# Convert pandas to numpy arrays

In [28]:
# X and y are the input and output of the classifier algorithm
y = ready_data_df["is_spam"].astype(int).values

# Drop the target into a new frame instead of mutating ready_data_df in place:
# an in-place drop makes this cell fail with KeyError("is_spam") when re-run.
features_df = ready_data_df.drop("is_spam", axis=1)

# X is the training and test data (we will use cross-validation to test the accuracy of each algorithm)
X = features_df.values

# Scaler

In [29]:
# Bind the scaler *class* (not an instance); pipelines build their own instance via scaler()
scaler = StandardScaler

# Runner

In [30]:
class ClassifierRunner(object):
    """Grid-search a pipeline's hyper-parameters and report on the best estimator.

    The pipeline is wrapped in a ``GridSearchCV``; after ``fit`` the winning
    estimator can be cross-validated (``get_scores``), inspected
    (``best_params``, ``feature_importances``) or used to ``predict``.

    Parameters
    ----------
    pipeline : estimator
        Pipeline to tune. ``feature_importances`` expects a step named
        ``"classifier"``.
    parameters : dict
        Parameter grid handed to ``GridSearchCV``.
    debug : bool, optional
        Stored on the instance; not used by any method of this class.
    num_folds : int, optional
        Passed to ``GridSearchCV`` as ``cv``.
    feature_names : sequence of str, optional
        Names of the inputs *as seen by the classifier step*. If omitted, or
        if the length does not match the number of importances (e.g. after a
        dimensionality-reduction step), positional labels ``feature_<i>`` are
        used instead.
    """

    def __init__(self, pipeline, parameters, debug=False, num_folds=5, feature_names=None):
        self.pipeline = pipeline
        self.parameters = parameters
        self.grid_search = GridSearchCV(self.pipeline, self.parameters, cv=num_folds)
        self.debug = debug
        # Kept on the instance so feature_importances does not rely on a notebook global
        self.feature_names = None if feature_names is None else list(feature_names)

    def fit(self, X, y):
        """Run the grid search on ``X`` / ``y``."""
        self.grid_search.fit(X, y)

    @property
    def best_params(self):
        """Parameter combination selected by the grid search (requires ``fit``)."""
        return self.grid_search.best_params_

    def get_scores(self, X, y, n_splits=5, test_size=0.2, random_seed=0):
        """Cross-validate the best estimator on shuffled, stratified folds.

        ``test_size`` is accepted for backward compatibility but is not used:
        the fold size is determined by ``n_splits``.

        Returns the array of per-fold scores from ``cross_val_score``.
        """
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
        scores = cross_val_score(self.grid_search.best_estimator_, X, y, cv=cv)
        return scores

    @property
    def feature_importances(self):
        """``(importance, name)`` pairs of the fitted classifier, most important first.

        Raises
        ------
        IndexError
            If the best pipeline has no step named ``"classifier"``.
        """
        for step_name, step_process in self.grid_search.best_estimator_.steps:
            if step_name == "classifier":
                break
        else:
            # IndexError is what the previous index-based lookup raised in this case
            raise IndexError("pipeline has no step named 'classifier'")

        importances = step_process.feature_importances_

        names = self.feature_names
        if names is None or len(names) != len(importances):
            # No trustworthy names for the classifier's inputs: label by position
            # rather than silently mislabelling or truncating via zip().
            names = ["feature_{}".format(index) for index in range(len(importances))]

        return sorted(zip(importances, names), reverse=True)

    def predict(self, X_test):
        """Predict with the refitted best estimator of the grid search."""
        prediction = self.grid_search.predict(X_test)
        return prediction

# KNN classifier

In [None]:
# KNN Pipeline
# Standardize *before* PCA: PCA follows the directions of largest variance, so
# unscaled columns with large ranges (the capital_run_length_* features) would
# otherwise dominate the components.
pipeline = Pipeline([
        ('scaler', scaler()),
        # Fixed seed: the seed is not a hyper-parameter, so it is not grid-searched
        ('reduce_dimensions', PCA(random_state=0)),
        ('classifier', KNeighborsClassifier())
    ])

k_values = [1, 2, 3, 4, 5, 7, 10, 15, 20, 25, 30, 40, 50]
parameters = {
    'reduce_dimensions__n_components': [10, 25, 50, None],
    'classifier__n_neighbors': k_values, 
    'classifier__weights': ["uniform", "distance"],
    'classifier__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

classifier = ClassifierRunner(pipeline=pipeline, parameters=parameters)

classifier.fit(X, y)

print ("Best parameters found: ")
print (classifier.best_params)

# Cross-validated accuracy of the best estimator, reported as mean and std in percent
scores = classifier.get_scores(X, y)
print ("Expected performance: {:.2f}% (+/-{:.2f}).".format(np.mean(scores)*100., np.std(scores)*100.))

# Random forest classifier

In [None]:
# Random forest Pipeline

# The forest itself is insensitive to feature scale, but PCA is not, so the
# data is standardized before the projection.
pipeline = Pipeline([
        ('scaler', scaler()),
        # Fixed seeds make the search reproducible; the seed is not a hyper-parameter
        ('reduce_dimensions', PCA(random_state=0)),
        ('classifier', RandomForestClassifier(random_state=0))
    ])


parameters = {
    'reduce_dimensions__n_components': [10, 25, 50, None],
    'classifier__max_depth' : [4, 6, 12, 20, None],
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__n_estimators': [2, 10, 50, 100],
    # 'auto' was dropped: for classifiers it is the same as 'sqrt' (a duplicate
    # grid entry) and it is rejected by newer scikit-learn releases.
    'classifier__max_features': ['sqrt', 'log2', None],
    'classifier__min_samples_split': [2, 3, 10],
    'classifier__min_samples_leaf': [1, 3, 10],
    'classifier__bootstrap': [True, False],
    'classifier__n_jobs': [-1]
}

classifier = ClassifierRunner(pipeline=pipeline, parameters=parameters)

classifier.fit(X, y)

print ("Best parameters found: ")
print (classifier.best_params)

# The classifier is fitted on the PCA output, so these importances rank
# principal components rather than the original spambase columns.
feature_importances = classifier.feature_importances
print("Selected features by importance: {}".format(feature_importances))

# Cross-validated accuracy of the best estimator, reported as mean and std in percent
scores = classifier.get_scores(X, y)
print ("Expected performance: {:.2f}% (+/-{:.2f}).".format(np.mean(scores)*100., np.std(scores)*100.))