# Lecture : Graph SVM

## Lab 01 : Standard/Linear SVM -- Exercise

### Xavier Bresson, Guoji Fu 


In [None]:
# For Google Colaboratory
import sys, os
if 'google.colab' in sys.modules:
    # Running inside Colab: mount Google Drive so that the lab files are reachable
    from google.colab import drive
    drive.mount('/content/gdrive')
    path_to_file = '/content/gdrive/My Drive/CS5284_2024_codes/codes/04_Graph_SVM'
    print(path_to_file)
    # Make the lab folder the working directory, so that the relative paths
    # used by the cells below ('datasets/...', 'lib/...') resolve
    os.chdir(path_to_file)
    # Show the working directory (pure-Python replacement for the `!pwd` shell escape)
    print(os.getcwd())
    

In [None]:
# Load libraries
import inspect
import numpy as np
import scipy.io
import scipy.sparse.linalg
%matplotlib inline
#%matplotlib notebook 
from matplotlib import pyplot
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
from IPython.display import display, clear_output
import time
import sys; sys.path.insert(0, 'lib/')
from lib.utils import compute_purity
import warnings; warnings.filterwarnings("ignore")


# Linearly separable data points

In [None]:
# Dataset: read every array of the .mat file into a dict
mat = scipy.io.loadmat('datasets/data_linearSVM.mat')

# Training set: data matrix, ground-truth classes shifted down by one, and the label vector `l`
# NOTE(review): `l` is presumably in {-1,+1} (the classifier below uses np.sign) -- confirm
Xtrain = mat['Xtrain']
Cgt_train = (mat['Cgt_train'] - 1).squeeze()
l_train = mat['l'].squeeze()

# n = number of rows of Xtrain, d = number of columns, nc = number of distinct training classes
n, d = Xtrain.shape[0], Xtrain.shape[1]
nc = len(np.unique(Cgt_train))
print(n,d,nc)

# Test set: data matrix and ground-truth classes shifted down by one
Xtest = mat['Xtest']
Cgt_test = (mat['Cgt_test'] - 1).squeeze()


In [None]:
# Plot the training and test points side by side, colored by ground-truth class
size_vertex_plot = 100  # marker size; also read by the plotting code of the solver cell below
fig, (p1, p2) = plt.subplots(1, 2, figsize=(8,4))

# The colormap is passed explicitly with `cmap` instead of relying on the
# global side effect of pyplot.jet()
p1.scatter(Xtrain[:,0], Xtrain[:,1], s=size_vertex_plot, c=Cgt_train, cmap='jet')
p1.set_title('Training Data')

# A scalar marker size works for any number of points: an array of length n
# (the training size) is rejected by scatter when the test set has another size
p2.scatter(Xtest[:,0], Xtest[:,1], s=size_vertex_plot, c=Cgt_test, cmap='jet')
p2.set_title('Test Data')

fig.tight_layout()
plt.show()


**Question 1: Implement the linear SVM on linearly separable data using the primal-dual iterative algorithm**

*Hint:* Follow pages 18-20 of the Lecture 4 slides.

**Step 1:** Compute the Linear Kernel $Ker$ and $L, Q$ defined as
- $Ker= XX^\top$,
- $L = \text{diag}(l)$, 
- $Q = L \, Ker \, L$.
 
You may use function `np.diag()`, the transpose operator `.T`, and the matrix-matrix multiplication operator `.dot()`.
  

In [None]:
# Compute linear kernel, L, Q
# Exercise cell: the three assignments between the markers are left blank on purpose
# and must be completed (Step 1 above) before the following cells can run.

# Label vector of the training set (the 'l' field loaded from the dataset)
l = l_train

############################################################################
# Your code start
############################################################################

# Ker: linear kernel X X^T
# L:   diagonal matrix diag(l)
# Q:   product L Ker L
Ker = 
L = 
Q = 

############################################################################
# Your code end
############################################################################


**Step 2:** Initialize $\alpha^{k=0} = 0_n$ and $\beta^{k=0} = 1_n$.

You may use the function `np.zeros()` to initialize a vector of zeros, and the function `np.ones()` to initialize a vector of ones.


In [None]:
# Initialization
# Exercise cell: the two assignments are left blank on purpose (Step 2 above).
############################################################################
# Your code start
############################################################################

# alpha: initial value 0_n (vector of n zeros)
# beta:  initial value 1_n (vector of n ones)
alpha = 
beta = 

############################################################################
# Your code end
############################################################################


**Step 3:** Choose the time steps $\tau_\alpha, \tau_\beta$ such that $\tau_\alpha\tau_\beta \leq \frac{1}{\|Q\| \cdot \|L\|}$.

Some feasible choices can be $\tau_\alpha = \frac{a}{\|Q\|}, \tau_\beta = \frac{b}{\|L\|}$, where $ab \leq 1$.
 
For example: $\tau_\alpha = \frac{1}{\|Q\|}, \tau_\beta = \frac{1}{\|L\|}$.

You may use `np.linalg.norm()` to compute the norm of a matrix.

Try to evaluate the performance of linear SVM with different choices of time steps.
  

In [None]:
# Time steps
# Exercise cell: the two assignments are left blank on purpose (Step 3 above).
############################################################################
# Your code start
############################################################################

# Choose tau_alpha and tau_beta such that tau_alpha * tau_beta <= 1 / (||Q|| * ||L||),
# for example tau_alpha = 1/||Q|| and tau_beta = 1/||L||
tau_alpha = 
tau_beta = 

############################################################################
# Your code end
############################################################################


**Step 4:** Project alpha onto $[0, +\infty)$ inside the iterative update of alpha and beta (alpha is computed with conjugate gradient).


In [None]:
# Primal-dual iterations of the linear SVM; the predictions on the test set
# are plotted every 5 iterations and at convergence.

# For conjugate gradient: matrix of the linear system solved in the alpha update
Acg = tau_alpha* Q + np.eye(n)

# SciPy 1.12 renamed the `tol` keyword of cg() to `rtol` and SciPy 1.14 removed `tol`:
# pick the keyword understood by the installed version
cg_tol_name = 'rtol' if 'rtol' in inspect.signature(scipy.sparse.linalg.cg).parameters else 'tol'

# Pre-compute L.K(Xtrain,Xtest) for test data and L.K(Xtrain,Xtrain) for train data
LKXtest = L.dot(Xtrain.dot(Xtest.T))
LKXtrain = L.dot(Xtrain.dot(Xtrain.T))

# Initialization
alpha_old = alpha

# Loop: stop when alpha changes by less than 1e-3 or after num_iter iterations
k = 0
diff_alpha = 1e6
num_iter = 201
while (diff_alpha>1e-3) and (k<num_iter):

    # Update iteration
    k += 1
    #print('k',k,num_iter,diff_alpha)

    # Update alpha
    # Approximate solution of Acg.alpha = b0 with conjugate gradient, warm-started at the current alpha
    b0 = alpha + tau_alpha* Q.dot(alpha) - tau_alpha* l* beta
    alpha, _ = scipy.sparse.linalg.cg(Acg, b0, x0=alpha, maxiter=50, **{cg_tol_name: 1e-3})

    # Projection of alpha on [0,+infty]
    ############################################################################
    # Your code start
    ############################################################################

    alpha

    ############################################################################
    # Your code end
    ############################################################################

    # Update beta
    beta = beta + tau_beta* l.T.dot(alpha)

    # Stopping condition
    diff_alpha = np.linalg.norm(alpha-alpha_old)
    alpha_old = alpha

    # Plot
    if not(k%5) or (diff_alpha<1e-3):

        # Approximate bias value
        b = np.mean( l - alpha.T.dot(LKXtrain) )

        # Continuous score function
        f_test = alpha.T.dot(LKXtest) + b

        # Binary classification function
        C_test = np.sign(f_test) # decision function in {-1,1}
        accuracy_test = compute_purity(0.5*(1+C_test),Cgt_test,nc) # 0.5*(1+C_test) in {0,1}

        # Plot
        # Scalar marker size: an array of length n (training size) is rejected by
        # scatter when the test set has another size. The colormap is passed
        # explicitly with `cmap` instead of relying on the side effect of pyplot.jet().
        plt.figure(figsize=(8,4))
        p1 = plt.subplot(121)
        plt.scatter(Xtest[:,0], Xtest[:,1], s=size_vertex_plot, c=f_test, cmap='jet')
        plt.title('Score function $s(x)=w^Tx+b$ \n iter=' + str(k)+ ', diff_alpha=' + str(diff_alpha)[:7])
        plt.colorbar()
        p2 = plt.subplot(122)
        plt.scatter(Xtest[:,0], Xtest[:,1], s=size_vertex_plot, c=C_test, cmap='jet')
        plt.title('Classification function $f(x)=sign(w^Tx+b)$\n iter=' + str(k) + ', acc=' + str(accuracy_test)[:5])
        plt.colorbar()
        # Layout is computed once both colorbars exist
        plt.tight_layout()
        plt.show()
        # Replace the previous figure by the next one (the last figure stays displayed)
        if k<num_iter-1:
            clear_output(wait=True)
        

# Non-linearly separable data points

In [None]:
# Dataset: read every array of the .mat file into a dict
mat = scipy.io.loadmat('datasets/data_twomoons_softSVM.mat')

# Training set
Xtrain = mat['Xtrain']
Cgt_train = (mat['C_train_errors'] - 1).squeeze()
# The loaded training classes are overwritten: first 250 samples -> class 0, the others -> class 1
# NOTE(review): presumably restores the error-free classes for display/evaluation -- confirm
Cgt_train[:250] = 0
Cgt_train[250:] = 1
# Label vector handed to the SVM
l_train = mat['l'].squeeze()

# n = number of rows of Xtrain, d = number of columns, nc = number of distinct training classes
n, d = Xtrain.shape[0], Xtrain.shape[1]
nc = len(np.unique(Cgt_train))
print(n,d,nc)

# Test set: data matrix and ground-truth classes shifted down by one
Xtest = mat['Xtest']
Cgt_test = (mat['Cgt_test'] - 1).squeeze()


In [None]:
# Plot the training and test points side by side, colored by ground-truth class
size_vertex_plot = 33  # marker size
fig, (p1, p2) = plt.subplots(1, 2, figsize=(10,4))

# The colormap is passed explicitly with `cmap` instead of relying on the
# global side effect of pyplot.jet()
p1.scatter(Xtrain[:,0], Xtrain[:,1], s=size_vertex_plot, c=Cgt_train, cmap='jet')
p1.set_title('Training Data')

# A scalar marker size works for any number of points: an array of length n
# (the training size) is rejected by scatter when the test set has another size
p2.scatter(Xtest[:,0], Xtest[:,1], s=size_vertex_plot, c=Cgt_test, cmap='jet')
p2.set_title('Test Data')

#plt.tight_layout()
plt.show()


**Question 2: Compute linear kernel, L, Q, time steps, initialization and projection of alpha as for Question 1**

- Compare the results with the linearly separable case and determine which performs better. 

- What strategy can be used to enhance the performance of SVM on non-linearly separable data? 


In [None]:
# Run Linear SVM

# Compute linear kernel, L, Q
Ker = # YOUR CODE HERE 
l = # YOUR CODE HERE 
L = # YOUR CODE HERE 
Q = # YOUR CODE HERE 

# Time steps
tau_alpha = # YOUR CODE HERE 
tau_beta = # YOUR CODE HERE 

# For conjuguate gradient
Acg = tau_alpha* Q + np.eye(n)

# Pre-compute J.K(Xtest) for test data
LKXtest = L.dot(Xtrain.dot(Xtest.T))
LKXtrain = L.dot(Xtrain.dot(Xtrain.T)) 

# Initialization
alpha = # YOUR CODE HERE 
beta = # YOUR CODE HERE 
alpha_old = alpha

# Loop
k = 0
diff_alpha = 1e6
num_iter = 201
while (diff_alpha>1e-3) and (k<num_iter):
    
    # Update iteration
    k += 1
    #print('k',k,num_iter,diff_alpha)
    
    # Update alpha
    # Approximate solution with conjuguate gradient
    b0 = alpha + tau_alpha* Q.dot(alpha) - tau_alpha* l* beta 
    alpha, _ = scipy.sparse.linalg.cg(Acg, b0, x0=alpha, tol=1e-3, maxiter=50)   
    alpha# YOUR CODE HERE  # Projection on [0,+infty]

    # Update beta
    beta = beta + tau_beta* l.T.dot(alpha)
    
    # Stopping condition
    diff_alpha = np.linalg.norm(alpha-alpha_old)
    alpha_old = alpha
    
    # Plot
    if not(k%5) or (diff_alpha<1e-3):
           
        # Approximate bias value #
        b = np.mean( l - alpha.T.dot(LKXtrain) )
        
        # Continuous score function
        f_test = alpha.T.dot(LKXtest) + b 

        # Binary classification function
        C_test = np.sign(f_test) # decision function in {-1,1}
        accuracy_test = compute_purity(0.5*(1+C_test),Cgt_test,nc) # 0.5*(1+C_test) in {0,1}

        # Plot
        size_vertex_plot = 33
        plt.figure(figsize=(12,4))
        p1 = plt.subplot(121)
        plt.scatter(Xtest[:,0], Xtest[:,1], s=size_vertex_plot*np.ones(n), c=f_test, color=pyplot.jet())
        plt.title('Score function $s(x)=w^Tx+b$ \n iter=' + str(k)+ ', diff_alpha=' + str(diff_alpha)[:7])
        plt.colorbar()
        p2 = plt.subplot(122)
        plt.scatter(Xtest[:,0], Xtest[:,1], s=size_vertex_plot*np.ones(n), c=C_test, color=pyplot.jet())
        plt.title('Classification function $f(x)=sign(w^Tx+b)$\n iter=' + str(k) + ', acc=' + str(accuracy_test)[:5])
        #plt.tight_layout()
        plt.colorbar()
        plt.show()
        if k<num_iter-1:
            clear_output(wait=True)     
        