In [1]:
import numpy as np

### Read the data

In [2]:
def read_in(file_path):
    """Load a comma-separated data file into features and -1/+1 labels.

    Every non-blank line holds the numeric feature values followed by the
    label in the LAST column.

    Parameters
    ----------
    file_path : str
        Path of the text file to read.

    Returns
    -------
    X : np.ndarray
        Feature values as floats, one row per non-blank line.
    y : np.ndarray
        Labels as floats; every label equal to 0 is replaced by -1.

    Raises
    ------
    ValueError
        If a field of a non-blank line cannot be parsed as a float.
    """
    X = []
    y = []
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            # skip blank lines (e.g. a trailing empty line); they would
            # otherwise reach float('') and raise ValueError
            if not line:
                continue
            info = line.split(',')
            # label is the last column, everything before it is a feature
            X.append([float(i) for i in info[:-1]])
            y.append(float(info[-1]))
    X = np.array(X)
    y = np.array(y)
    # change the output to *-1 and 1* instead of *0 and 1*
    y[y == 0] = -1
    return X, y

In [3]:
# read the three splits in the order train, test, validation
(X_train, y_train), (X_test, y_test), (X_valid, y_valid) = map(
    read_in,
    [
        'hw4_data/spam_train.data',
        'hw4_data/spam_test.data',
        'hw4_data/spam_validation.data',
    ],
)

In [4]:
print(X_train[2].shape)

(57,)


In [5]:
print(y_train)

[ 1.  1.  1. ..., -1. -1. -1.]


### Find the Principal Components

In [6]:
def normalize_data(X, X_train):
    """Standardize X with the per-feature statistics of X_train.

    Parameters
    ----------
    X : np.ndarray
        Data to transform; features are along the last axis.
    X_train : np.ndarray
        Data whose column-wise mean and standard deviation (axis=0) define
        the transformation.

    Returns
    -------
    np.ndarray
        (X - mean) / std, where a feature with zero standard deviation in
        X_train is only centered, not scaled.
    """
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    # a constant feature has std == 0; dividing by it would produce NaN/inf,
    # so such columns are divided by 1 instead
    std = np.where(std == 0, 1.0, std)
    return (X - mean) / std

In [7]:
# standardize every split with the mean and standard deviation of the
# training set (W is the standardized training matrix)
W, W_valid, W_test = (
    normalize_data(split, X_train) for split in (X_train, X_valid, X_test)
)

In [8]:
# use SVD to find the eigenvectors: the columns of u are the left singular
# vectors of W.T. full_matrices=False avoids building the square
# samples-by-samples matrix v, which is not used afterwards; u and s are
# unchanged because W.T has fewer rows than columns.
u, s, v = np.linalg.svd(W.T, full_matrices=False)

In [9]:
u.shape

(57, 57)

In [10]:
# keep the first 6 columns of u as the projection basis
U = u[:, 0:6]

In [11]:
U.shape

(57, 6)

In [12]:
# project the standardized training data onto the selected directions
X_tran = np.dot(W, U)

In [13]:
X_tran[1]

array([ 1.13787505, -1.52833793, -0.16534567, -0.15784105,  0.35479145,
       -0.28510092])

### PCA using covariance matrix

In [16]:
# construct the W matrix with zero mean and unit variance
W = normalize_data(X_train, X_train)

In [17]:
# covariance between features: every column of W is one variable
cov_mat = np.cov(W, rowvar=False)
cov_mat.shape

(57, 57)

In [18]:
# cov_mat = W.dot(W.T)

In [19]:
# cov_mat is a covariance matrix and therefore symmetric, so the symmetric
# solver eigh is used instead of the general one kept here for reference:
# eig_vals, eig_vecs = np.linalg.eig(cov_mat)
eig_vals, eig_vecs = np.linalg.eigh(cov_mat)

In [20]:
eig_vals.shape

(57,)

In [21]:
eig_vecs.shape

(57, 57)

In [22]:
print(eig_vals)

[ 0.00819714  0.12715707  0.14992838  0.20664016  0.21192009  0.30467746
  0.33500951  0.35544926  0.39232605  0.42174014  0.44831298  0.5392283
  0.55391231  0.56939916  0.58346395  0.63172972  0.63971963  0.66932111
  0.68527664  0.70649048  0.7125208   0.72509491  0.74551225  0.76626027
  0.81660999  0.82424393  0.82612169  0.84271298  0.85213603  0.86027004
  0.89324831  0.9110594   0.93073701  0.95236598  0.97023288  0.98137667
  0.994403    1.00235114  1.00683272  1.02783985  1.05525217  1.05954896
  1.08962648  1.12274881  1.13823278  1.2281143   1.23492607  1.30402369
  1.40200761  1.41980701  1.51572868  1.59257407  1.63959314  1.84721798
  2.25116837  3.26013288  6.67647395]


In [23]:
# reorder the eigenpairs from largest to smallest eigenvalue; the columns
# of eig_vecs are permuted together with their eigenvalues
idx = np.argsort(eig_vals)[::-1]
eig_vals, eig_vecs = eig_vals[idx], eig_vecs[:, idx]

In [24]:
print(eig_vals)

[ 6.67647395  3.26013288  2.25116837  1.84721798  1.63959314  1.59257407
  1.51572868  1.41980701  1.40200761  1.30402369  1.23492607  1.2281143
  1.13823278  1.12274881  1.08962648  1.05954896  1.05525217  1.02783985
  1.00683272  1.00235114  0.994403    0.98137667  0.97023288  0.95236598
  0.93073701  0.9110594   0.89324831  0.86027004  0.85213603  0.84271298
  0.82612169  0.82424393  0.81660999  0.76626027  0.74551225  0.72509491
  0.7125208   0.70649048  0.68527664  0.66932111  0.63971963  0.63172972
  0.58346395  0.56939916  0.55391231  0.5392283   0.44831298  0.42174014
  0.39232605  0.35544926  0.33500951  0.30467746  0.21192009  0.20664016
  0.14992838  0.12715707  0.00819714]


In [25]:
# keep the first 6 columns of eig_vecs as the projection basis
U = eig_vecs[:, 0:6]
U.shape

(57, 6)

In [26]:
W.shape

(3000, 57)

In [27]:
# project the standardized training data onto the selected eigenvectors
X_tran = np.dot(W, U)

In [28]:
X_tran[1]

array([-1.13787505,  1.52833793, -0.16534567, -0.15784105,  0.35479145,
       -0.28510092])

### Test using sklearn

In [49]:
# construct the W matrix with zero mean and unit variance
W = normalize_data(X_train, X_train)

In [50]:
# Cross-check: fit scikit-learn's PCA with 6 components on the same matrix W
# and return the projected training data.
# NOTE(review): the sign of each component is arbitrary, so columns may be
# flipped relative to the manual projection -- compare absolute values.
# NOTE(review): depending on the installed scikit-learn version the default
# solver may be a randomized approximation, which would explain small
# numeric differences in the later components -- confirm (svd_solver).
from sklearn import decomposition
pca = decomposition.PCA(n_components=6)
X_sk = pca.fit_transform(W)

In [51]:
X_sk.shape

(3000, 6)

In [52]:
X_sk[1]

array([-1.13755467,  1.53115676,  0.18911439,  0.14812845, -0.08693231,
       -0.23795884])

### Train SVM classifiers using PCA

In [14]:
from sklearn.svm import LinearSVC

In [54]:
# Grid search over the number of principal components k and the SVM
# penalty c, scored by the error rate on the validation set.
# NOTE(review): the recorded validation errors jump erratically between
# neighbouring c values, which suggests LinearSVC may not be converging --
# consider dual=False or a larger max_iter; confirm.
for k in range(1,6):
    # the projections depend only on k, so compute them once per k
    # instead of once per (k, c) pair
    U = eig_vecs[:,:k]
    X_tran = W.dot(U)
    X_valid_tran = W_valid.dot(U)
    for c in [1, 10, 100, 1000]:
        clf = LinearSVC(C=c, random_state=0)
        clf.fit(X_tran, y_train)
        preds = clf.predict(X_valid_tran)
        # fraction of misclassified validation samples
        error = 1 - np.mean(preds==y_valid)
        print('For k = ' + str(k) + ' and c = ' + str(c) +
                 ', valid error is ' + str(error))

For k = 1 and c = 1, valid error is 0.32125
For k = 1 and c = 10, valid error is 0.31875
For k = 1 and c = 100, valid error is 0.32125
For k = 1 and c = 1000, valid error is 0.0225
For k = 2 and c = 1, valid error is 0.16875
For k = 2 and c = 10, valid error is 0.16875
For k = 2 and c = 100, valid error is 0.17875
For k = 2 and c = 1000, valid error is 0.13
For k = 3 and c = 1, valid error is 0.17875
For k = 3 and c = 10, valid error is 0.17875
For k = 3 and c = 100, valid error is 0.1425
For k = 3 and c = 1000, valid error is 0.36375
For k = 4 and c = 1, valid error is 0.17875
For k = 4 and c = 10, valid error is 0.17625
For k = 4 and c = 100, valid error is 0.2275
For k = 4 and c = 1000, valid error is 0.46375
For k = 5 and c = 1, valid error is 0.18125
For k = 5 and c = 10, valid error is 0.18125
For k = 5 and c = 100, valid error is 0.1675
For k = 5 and c = 1000, valid error is 0.15875


In [30]:
# best k/c pair on the validation set is k=1, c=1000
# Note: accuracy changes randomly
k = 1
c = 1000
# refit on the training projection with the selected pair
U = eig_vecs[:,:k]
X_tran = W.dot(U)
clf = LinearSVC(C=c,random_state=0)
clf.fit(X_tran, y_train)
# score on the test set projected onto the same components
X_test_tran = W_test.dot(U)
preds = clf.predict(X_test_tran)
error = 1 - np.mean(preds==y_test)
# build the message from k and c so it cannot drift from the values used
print('Using k = ' + str(k) + ' and c = ' + str(c) +
      ', test error is ' + str(error))

Using k = 1 and c = 1000, test error is 0.00499375780275


In [34]:
# baseline: the same linear SVM fitted directly on X_train (no PCA
# projection), scored on the validation set for every penalty c
print('Training SVM without PCA...')
for c in (1, 10, 100, 1000):
    clf = LinearSVC(C=c, random_state=0).fit(X_train, y_train)
    preds = clf.predict(X_valid)
    # fraction of misclassified validation samples
    error = 1 - (preds == y_valid).mean()
    print('For c = ' + str(c) + ', valid error is ' + str(error))

Training SVM without PCA...
For c = 1, valid error is 0.03125
For c = 10, valid error is 0.16125
For c = 100, valid error is 0.03125
For c = 1000, valid error is 0.01125


In [36]:
# best c = 1000 (lowest validation error of the baseline without PCA)
c = 1000
clf = LinearSVC(C=c, random_state=0)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
# fraction of misclassified test samples
error = 1 - np.mean(preds==y_test)
# build the message from c so it cannot drift from the value used
print('With c = ' + str(c) + ', test set error is ' + str(error))

With c = 1000, test set error is 0.0387016229713


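Getting a better result with PCA: in this run the SVM trained on the PCA projection (k = 1, c = 1000) reaches a test error of about 0.005, compared with about 0.039 for the SVM trained without PCA (c = 1000).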