In [78]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Email Spam Detection

In this problem, you will fit a linear regression model for detecting e-mail spam.

The "spambase" dataset consists of 4,601 e-mails, of which 1,813 are spam (39.4%).
The dataset contains a processed version of the e-mails wherein 57 real-valued features have been extracted and the spam/non-spam label has been assigned.

In [96]:
# load the data
data = pd.read_csv('spambase.data', header=None)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.778,0.000,0.000,3.756,61,278,1
1,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101,1028,1
2,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485,2259,1
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40,191,1
4,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40,191,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3,88,0
4597,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4,14,0
4598,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6,118,0
4599,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5,78,0


In [98]:
col_names = ['attribute '+str(i) for i in range(57)]
col_names.append('label')
data.columns = col_names
data.head(1)

Unnamed: 0,attribute 0,attribute 1,attribute 2,attribute 3,attribute 4,attribute 5,attribute 6,attribute 7,attribute 8,attribute 9,...,attribute 48,attribute 49,attribute 50,attribute 51,attribute 52,attribute 53,attribute 54,attribute 55,attribute 56,label
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1


**Attribute Information:**

The last column of the dataset denotes whether the e-mail was considered spam (1) or not (0).
Most of the attributes indicate whether a particular word or character was frequently occuring in the e-mail. 
The run-length attributes (55-57) measure the length of sequences of consecutive capital letters. 

- 48 continuous real [0,100] attributes of type word_freq_WORD = percentage of words in the e-mail that match WORD, i.e. 100 * (number of times the WORD appears in the e-mail) / total number of words in e-mail. A "word" in this case is any string of alphanumeric characters bounded by non-alphanumeric characters or end-of-string.

- 6 continuous real [0,100] attributes of type char_freq_CHAR= percentage of characters in the e-mail that match CHAR, i.e. 100 * (number of CHAR occurences) / total characters in e-mail

- 1 continuous real [1,...] attribute of type capital_run_length_average = average length of uninterrupted sequences of capital letters

- 1 continuous integer [1,...] attribute of type capital_run_length_longest = length of longest uninterrupted sequence of capital letters

- 1 continuous integer [1,...] attribute of type capital_run_length_total = sum of length of uninterrupted sequences of capital letters = total number of capital letters in the e-mail

- 1 nominal {0,1} class attribute of type spam = denotes whether the e-mail was considered spam (1) or not (0), i.e. unsolicited commercial e-mail.

See [here](http://archive.ics.uci.edu/ml/datasets/Spambase) for more details.

**Step 1:** Partition the data into a training and test 

In [81]:
X = data.iloc[:,0:57].to_numpy()
y = data.iloc[:,57].to_numpy()

In [82]:
# add bias term
X = np.c_[np.ones(len(y)),X]

In [83]:
permutation = np.random.permutation(range(4600))

In [84]:
X = X[permutation]
y = y[permutation]

In [85]:
X_train, y_train = X[0:3500],y[0:3500]
X_test, y_test = X[3500:],y[3500:]

In [86]:
X_train

array([[  1.   ,   0.   ,   0.   , ...,   3.7  ,  26.   ,  37.   ],
       [  1.   ,   0.   ,   0.   , ...,   3.478,   7.   ,  80.   ],
       [  1.   ,   0.   ,   0.   , ...,   4.36 ,  46.   , 109.   ],
       ...,
       [  1.   ,   0.   ,   0.   , ...,   5.809,  46.   , 122.   ],
       [  1.   ,   0.   ,   0.   , ...,   1.468,  12.   ,  47.   ],
       [  1.   ,   0.   ,   0.   , ...,   1.   ,   1.   ,   8.   ]])

In [87]:
y_train

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [88]:
theta = np.linalg.solve(X_train.T.dot(X_train),X_train.T.dot(y_train))

In [89]:
y_test_pred = X_test.dot(theta)

In [90]:
y_test_pred[y_test_pred>=0.5]=1
y_test_pred[y_test_pred<0.5]=0

In [91]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [92]:
confusion_matrix(y_test,y_test_pred)

array([[658,  34],
       [ 80, 328]], dtype=int64)

In [93]:
accuracy_score(y_test,y_test_pred)

0.8963636363636364