In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Problem 3: Email Spam Detection

The "spambase" dataset consists of 4,601 e-mails, of which 1,813 are spam (39.4%).
The dataset contains a processed version of the e-mails wherein 57 real-valued features have been extracted and the spam/non-spam label has been assigned.

In [2]:
# load the data
# Read the spambase CSV over HTTP from the raw GitHub URL below into a DataFrame.
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Theory/master/Data/spambase.csv'
data = pd.read_csv(url)
# Bare last expression: the notebook renders the frame as the cell output.
data

Unnamed: 0,attribute 0,attribute 1,attribute 2,attribute 3,attribute 4,attribute 5,attribute 6,attribute 7,attribute 8,attribute 9,...,attribute 48,attribute 49,attribute 50,attribute 51,attribute 52,attribute 53,attribute 54,attribute 55,attribute 56,label
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.778,0.000,0.000,3.756,61,278,1
1,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101,1028,1
2,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485,2259,1
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40,191,1
4,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40,191,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3,88,0
4597,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4,14,0
4598,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6,118,0
4599,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5,78,0


**Attribute Information:**

The last column of the dataset denotes whether the e-mail was considered spam (1) or not (0).
Most of the attributes indicate whether a particular word or character was frequently occurring in the e-mail.
The run-length attributes (54--56) measure the length of sequences of consecutive capital letters.
See [here](http://archive.ics.uci.edu/ml/datasets/Spambase) for more details.

| Attribute | Description |
| :- | -: |
| Attributes 0--47 | 48 continuous attributes of type word_freq_WORD = percentage of words in the e-mail that match WORD, i.e. 100 * (number of times the WORD appears in the e-mail) / total number of words in e-mail | 
| Attributes 48--53 | 6 continuous attributes of type char_freq_CHAR = percentage of characters in the e-mail that match CHAR, i.e. 100 * (number of CHAR occurrences) / total characters in e-mail |
| Attribute 54 | 1 continuous  attribute of type capital_run_length_average = average length of uninterrupted sequences of capital letters |
| Attribute 55 | 1 integer attribute of type capital_run_length_longest = length of longest uninterrupted sequence of capital letters |
| Attribute 56 | 1 integer attribute of type capital_run_length_total = sum of length of uninterrupted sequences of capital letters = total number of capital letters in the e-mail |
| label |  whether the e-mail was considered spam (1) or not (0) |


In this problem, you will fit a linear regression model for detecting e-mail spam.

In [3]:
# Feature matrix: the first 57 columns of the frame (attribute 0 .. attribute 56 in the table above)
X = data.iloc[:, :57].to_numpy()
# Target vector: the 'label' column, spam (1) or not (0)
y = data.loc[:, 'label'].to_numpy()

In [4]:
# optional: add bias (the column of all ones)
# Guarded so that re-running this cell does not stack a second column of ones
# onto X: a duplicated intercept column makes the design matrix rank-deficient,
# which breaks the least-squares fit further down.
if not np.all(X[:, 0] == 1):
    X = np.c_[np.ones(len(y)), X]

## Part 1

Partition the data into **training and test sets** (X_train,y_train) and (X_test,y_test). Don't forget to shuffle the data first.

In [6]:
# shuffle the dataset
# A seeded generator makes the shuffle deterministic, so the split (and every
# metric computed from it) is identical after Restart Kernel -> Run All.
rng = np.random.default_rng(0)
permutation = rng.permutation(len(y)) # random permutation of the row indices
# The same permutation is applied to X and y so rows and labels stay aligned.
X = X[permutation]
y = y[permutation]
# train/test split: the first n_train shuffled rows fit the model, the rest evaluate it
n_train = 3000
X_train,y_train = X[:n_train],y[:n_train]
X_test,y_test = X[n_train:],y[n_train:]

## Part 2

Use the data (X_train,y_train) to fit a linear regression model that predicts the vector y_train

In [7]:
# Least-squares fit: theta minimizes ||X_train @ theta - y_train||^2.
# lstsq works on X_train directly instead of forming X_train.T @ X_train, which
# squares the condition number and makes np.linalg.solve raise LinAlgError when
# that matrix is singular; lstsq returns the minimum-norm solution in that case.
# rcond=None selects NumPy's machine-precision-based cutoff for small singular values.
# Element [0] of the returned tuple is the coefficient vector.
theta = np.linalg.lstsq(X_train, y_train, rcond=None)[0]

## Part 3

Test your spam detector on the test set

In [8]:
# real-valued regression output for each held-out row (not yet a 0/1 label)
y_test_pred = X_test @ theta

In [9]:
# turn the regression outputs into hard labels, in place:
# scores at or above the threshold become 1, scores below it become 0
threshold = 0.5
at_or_above = y_test_pred >= threshold
below = y_test_pred < threshold
y_test_pred[at_or_above] = 1
y_test_pred[below] = 0

In [10]:
# confusion matrix
# C[i, j] = number of test rows whose true label is i and whose predicted label is j
n_classes = 2
C = np.zeros((n_classes,n_classes))
for i in range(n_classes):
    for j in range(n_classes):
        # count the matches in compiled code rather than iterating the boolean
        # array element by element with the builtin sum()
        C[i,j] = np.count_nonzero(y_test_pred[y_test==i]==j)
C

array([[914.,  47.],
       [129., 511.]])

In [11]:
# accuracy on the test set, expressed as a percentage
n_correct = (y_test_pred == y_test).sum()
100 * n_correct / y_test.size

89.00687070580886