In [47]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from cvxopt import matrix, solvers
from sklearn.metrics import classification_report

# Exploring the data

In [4]:
df = pd.read_csv('spambase.data', header=None)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.778,0.000,0.000,3.756,61,278,1
1,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101,1028,1
2,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485,2259,1
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40,191,1
4,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40,191,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3,88,0
4597,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4,14,0
4598,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6,118,0
4599,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5,78,0


In [None]:
df.describe

In [None]:
df.info

In [5]:
# Calculating NULL percentage in each feature
((df.isnull().sum() / len(df))*100).sort_values(ascending=False).to_frame()

Unnamed: 0,0
0,0.0
43,0.0
31,0.0
32,0.0
33,0.0
34,0.0
35,0.0
36,0.0
37,0.0
38,0.0


In [None]:
# Plotting heatmap to see correlation between features
plt.figure(figsize=(16,20))
sns.heatmap(df.corr(),annot=True)

# Building SVM

## Pre-processing data

In [6]:
# Randomize the order of rows in dataset
df = df.reindex(np.random.permutation(df.index)) 
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
4322,0.0,0.0,0.91,0.0,0.3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.505,14,128,0
236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.103,3,32,1
1081,0.0,0.0,0.44,0.0,0.22,0.22,0.0,0.0,0.66,0.44,...,0.065,0.261,0.0,0.13,0.196,0.0,7.4,75,629,1
3786,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.16,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.2,3,12,0
2803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.83,...,0.0,0.131,0.262,0.0,0.0,0.0,4.128,28,161,0


In [7]:
# reset the indices of df 
# It will help in partitioning the dataset further
df = df.reset_index(drop=True)

In [8]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.0,0.91,0.0,0.3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.505,14,128,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.103,3,32,1
2,0.0,0.0,0.44,0.0,0.22,0.22,0.0,0.0,0.66,0.44,...,0.065,0.261,0.0,0.13,0.196,0.0,7.4,75,629,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.16,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.2,3,12,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.83,...,0.0,0.131,0.262,0.0,0.0,0.0,4.128,28,161,0


In [9]:
# Separating X and y
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# generating splitting indices for 70% separation
split_idx = int(0.7 * len(df))

In [21]:
# Splitting into train and test dataset
X_train = X[:split_idx]
X_test = X[split_idx:]
y_train = y[:split_idx]
y_test = y[split_idx:]

In [22]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(3220, 57)
(1381, 57)
(3220,)
(1381,)


In [23]:
# Standardize the features
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)

X_train = (X_train - mean) / std
X_test = (X_test - mean) / std

## Training the model

In [24]:
# SVM kernel functions
def linear_kernel(x1, x2):
    return np.dot(x1, x2)

def quadratic_kernel(x1, x2):
    return (np.dot(x1, x2) ** 2)

def rbf_kernel(x1, x2, gamma=0.1):
    return np.exp(-gamma * np.linalg.norm(x1 - x2) ** 2)

In [71]:
class SVM:
    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iters=1000):
        self.lr = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.w = None
        self.b = None


    def fit(self, X, y):
        n_samples, n_features = X.shape

        y_ = np.where(y <= 0, -1, 1)

        self.w = np.zeros(n_features)
        self.b = 0

        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                condition = y_[idx] * (np.dot(x_i, self.w) - self.b) >= 1
                if condition:
                    self.w -= self.lr * (2 * self.lambda_param * self.w)
                else:
                    self.w -= self.lr * (2 * self.lambda_param * self.w - np.dot(x_i, y_[idx]))
                    self.b -= self.lr * y_[idx]


    def predict(self, X):
        approx = np.dot(X, self.w) - self.b
        if np.sign(approx) <=0:
            return 0
        else:
            return 1

In [72]:
svm=SVM()
svm.fit(X_train, y_train)

In [73]:
y_pred = []
for x in X_test:
    y_pred.append(svm.predict(x))
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.95      0.93       850
           1       0.91      0.87      0.89       531

    accuracy                           0.92      1381
   macro avg       0.92      0.91      0.91      1381
weighted avg       0.92      0.92      0.92      1381



In [59]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

clf = svm.SVC(kernel='linear', C=1)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Calculate accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy (Linear): {acc}")

clf = svm.SVC(kernel='poly', C=1)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Calculate accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy (Quadratic): {acc}")

clf = svm.SVC(kernel='rbf', C=1)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Calculate accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy (RBF): {acc}")

Accuracy (Linear): 0.9297610427226647
Accuracy (Quadratic): 0.776249094858798
Accuracy (RBF): 0.9333816075307748
