In [220]:
import cvxpy as cp
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import loadmat

In [221]:
# Read the training set from the MATLAB file.
spamTrain_sample = loadmat('spamTrain.mat')

# Number of leading rows kept for training (4000 is the full set, per the
# shapes printed below).
num_train = 4000

# Feature matrix: one row per e-mail, one column per feature -> (4000, 1899).
X = spamTrain_sample['X'][:num_train, :]

# Labels are stored as 0/1 in a column vector; (y - 0.5) * 2 remaps them to
# -1/+1 before the integer cast.
y_label = ((spamTrain_sample['y'] - 0.5) * 2).astype(np.int_)[:num_train, :]

print("Shape of X: ", X.shape)
print("Shape of y_label: ", y_label.shape)

# Flatten the (n, 1) column into a 1-D vector of length n.
y_label = np.reshape(y_label, len(y_label))
print("Shape of y_label after reshape: ", y_label.shape)


Shape of X:  (4000, 1899)
Shape of y_label:  (4000, 1)
Shape of y_label after reshape:  (4000,)


In the training file the labels are 0 and 1.
We have remapped them to -1 and +1, the encoding used by the SVM margin constraints below.

In [222]:
# Soft-margin SVM in primal form, solved as a convex program with cvxpy.

# Weight of the slack penalty term in the objective.
C = 0.01
# Exponent applied to the summed slack in the objective.
k = 2

# Scalar offset (bias) of the separating hyperplane.
b = cp.Variable(1)
n_feat = X.shape[1]
n_train = X.shape[0]

# Transposed feature matrix: one column per training sample.
M = X.T

print("Shape of M (X.T): ", M.shape)

# One weight per feature and one slack variable per training sample.
w = cp.Variable(n_feat)
zeta = cp.Variable(n_train)

print("This is primal form of soft margin for spam classification")

# Objective: 0.5 * ||w||_2^2 + C * (sum_i zeta_i)^k.
# NOTE(review): the exponent k is applied to the *sum* of the slacks, not to
# each slack individually (i.e. this is not C * sum_i zeta_i^k) -- confirm
# that this is the intended penalty.
objective_soft = cp.Minimize(0.5*cp.norm(w,2)**2 + C*(cp.sum(zeta))**k)

# Constraints, elementwise over the training samples:
#   zeta_i >= 0
#   y_i * (w . x_i + b) >= 1 - zeta_i
# y_label holds the -1/+1 labels; w.T@M + b is the vector of decision values.
constraints_soft = [
    zeta >= 0,
    cp.multiply(y_label, (w.T@M + b)) >= 1 - zeta,
]

problem_soft = cp.Problem(objective_soft, constraints_soft)
# Solve with the SCS solver; the value returned by solve() is the last
# expression of the cell and is therefore displayed as the cell output.
problem_soft.solve(solver=cp.SCS)

Shape of M (X.T):  (1899, 4000)
This is primal form of soft margin for spam classification


12.465617513863691

In [223]:
# Report the fitted hyperplane: weight vector (shape, values, one sample
# entry) and the offset.
print("Shape of W: ", np.shape(w.value))
print("Value of W: ", w.value)
print("Value of W[1]: ", w.value[1])
print("Value of b: ", b.value)

# Slack variables, rounded to 3 decimals for readability; the heading and the
# array are emitted on separate lines.
print("Zeta value rounded to 3 decimal: ", np.around(zeta.value, 3), sep="\n")

Shape of W:  (1899,)
Value of W:  [-0.01697897  0.02273248  0.04727581 ... -0.13315491 -0.00770544
  0.08025466]
Value of W[1]:  0.022732481617247193
Value of b:  [0.32833102]
Zeta value rounded to 3 decimal: 
[-0. -0. -0. ... -0. -0. -0.]


In [224]:
def accuracyCal(y_label, X, w, b):
    """Return the fraction of samples whose predicted class matches ``y_label``.

    A sample (one column of ``X``) is predicted positive when its decision
    value ``w @ X + b`` is strictly greater than zero.

    Parameters
    ----------
    y_label : array-like
        One label per sample. A value > 0 denotes the positive class, so both
        the 0/1 and the -1/+1 encodings are accepted. Any shape holding one
        entry per sample is allowed, e.g. ``(n,)`` or ``(n, 1)``; it is
        flattened before the comparison.
    X : ndarray, shape (n_features, n_samples)
        Samples stored column-wise.
    w : ndarray, shape (n_features,)
        Weight vector of the separating hyperplane.
    b : float or length-1 ndarray
        Offset of the separating hyperplane.

    Returns
    -------
    numpy.float64
        Accuracy in [0, 1].

    Raises
    ------
    ValueError
        If the number of labels differs from the number of samples.
    """
    scores = np.ravel(w @ X + b)
    # Flatten the labels: comparing an (n, 1) column against the (n,) scores
    # would otherwise broadcast to an (n, n) matrix and yield a bogus mean.
    labels = np.ravel(y_label)
    if labels.shape != scores.shape:
        raise ValueError(
            "y_label has %d entries but X has %d samples"
            % (labels.size, scores.size)
        )
    return np.mean((labels > 0) == (scores > 0))

In [225]:
# Read the held-out test set from the MATLAB file.
spamTest_sample = loadmat('spamTest.mat')

# Transpose so that samples are stored column-wise, which is the layout
# accuracyCal multiplies against: (1899, 1000).
X_test = np.transpose(spamTest_sample['Xtest'])

# Flatten the label column into a 1-D vector: (1000,).
y_label_test = spamTest_sample['ytest']
y_label_test = np.reshape(y_label_test, len(y_label_test))

print("Shape of X test: ", X_test.shape)
print("Shape of y_label test: ", y_label_test.shape)

# Echo the fitted parameters that are about to be evaluated.
print("Shape of W: ", np.shape(w.value))
print("Value of b: ", b.value)
print("Value of W: ", w.value)

print("Accuracy on test ", accuracyCal(y_label_test, X_test, w.value, b.value))
# Mean of the test labels, printed for reference next to the accuracy.
print("Mean of test label ", y_label_test.mean())

# Undo the -1/+1 remapping so the training labels are 0/1 again before scoring.
y_label2 = (y_label + 1) / 2

print("Accuracy on Train ", accuracyCal(y_label2, np.transpose(X), w.value, b.value))

Shape of X test:  (1899, 1000)
Shape of y_label test:  (1000,)
Shape of W:  (1899,)
Value of b:  [0.32833102]
Value of W:  [-0.01697897  0.02273248  0.04727581 ... -0.13315491 -0.00770544
  0.08025466]
Accuracy on test  0.981
Mean of test label  0.308
Accuracy on Train  0.99975
