# Importing Dependancies

In [26]:
import numpy as np
import pandas as pd
import warnings
import copy
import random
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings("ignore")

# Read in Dataset

In [2]:
# read data into dataframe
df = pd.read_csv("./OnlineNewsPopularity.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39644 entries, 0 to 39643
Data columns (total 61 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   url                             39644 non-null  object 
 1    timedelta                      39644 non-null  float64
 2    n_tokens_title                 39644 non-null  float64
 3    n_tokens_content               39644 non-null  float64
 4    n_unique_tokens                39644 non-null  float64
 5    n_non_stop_words               39644 non-null  float64
 6    n_non_stop_unique_tokens       39644 non-null  float64
 7    num_hrefs                      39644 non-null  float64
 8    num_self_hrefs                 39644 non-null  float64
 9    num_imgs                       39644 non-null  float64
 10   num_videos                     39644 non-null  float64
 11   average_token_length           39644 non-null  float64
 12   num_keywords                   

# Preprocessing

### Split into data and target labels

In [3]:
y = df.iloc[:, 60]
x = df.iloc[:, 0:60]

### Independant Variables

In [4]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39644 entries, 0 to 39643
Data columns (total 60 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   url                             39644 non-null  object 
 1    timedelta                      39644 non-null  float64
 2    n_tokens_title                 39644 non-null  float64
 3    n_tokens_content               39644 non-null  float64
 4    n_unique_tokens                39644 non-null  float64
 5    n_non_stop_words               39644 non-null  float64
 6    n_non_stop_unique_tokens       39644 non-null  float64
 7    num_hrefs                      39644 non-null  float64
 8    num_self_hrefs                 39644 non-null  float64
 9    num_imgs                       39644 non-null  float64
 10   num_videos                     39644 non-null  float64
 11   average_token_length           39644 non-null  float64
 12   num_keywords                   

### Dependant Variables

In [5]:
# y.info()

### Drop Unused Features

In [6]:
# drop unused features
vals = [0, 1, 4, 5, 6]
for i in range(13, 39):
    vals.append(i)

x = x.drop(x.columns[vals], axis = 1)
x.head()

Unnamed: 0,n_tokens_title,n_tokens_content,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,LDA_00,LDA_01,...,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity
0,12.0,219.0,4.0,2.0,1.0,0.0,4.680365,5.0,0.500331,0.378279,...,0.378636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875
1,9.0,255.0,3.0,1.0,1.0,0.0,4.913725,4.0,0.799756,0.050047,...,0.286915,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0
2,9.0,211.0,3.0,1.0,1.0,0.0,4.393365,6.0,0.217792,0.033334,...,0.495833,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0
3,9.0,531.0,9.0,0.0,1.0,0.0,4.404896,7.0,0.028573,0.4193,...,0.385965,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0
4,13.0,1072.0,19.0,19.0,20.0,0.0,4.682836,7.0,0.028633,0.028794,...,0.411127,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364


### Shuffle Data

In [7]:
np.random.seed(42)

In [8]:
# convert to numpy objects
x = x.to_numpy()
y = y.to_numpy()

In [9]:
# shuffle
arr = np.arange(len(df))
np.random.shuffle(arr)
x = x[arr]
y = y[arr]

### Splitting Data

In [10]:
x_train = x[:1000].copy()
y_train = y[:1000].copy()
x_val = x[1000:2000].copy()
y_val = y[1000:2000].copy()
x_test = x[2000:3000].copy()
y_test = y[2000:3000].copy()

### Standardize Data

In [11]:
# first save the training mean and std
training_mean = np.mean(x_train, axis=0)
training_std = np.std(x_train, axis=0)

# first standardize the training data
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
# standardize the test and validation data using training mean and std
x_test = (x_test - training_mean) / training_std
x_val = (x_val - training_mean) / training_std

### Clamping Data
I'll clamp the shares attribute to be at most 6000.

In [12]:
for i in range(1000):
    if y_train[i] > 6000:
        y_train[i] = 6000
    if y_test[i] > 6000:
        y_test[i] = 6000
    if y_val[i] > 6000:
        y_val[i] = 6000

### Map target data into two distinct classes
+1 is shares >= 2000 and -1 is shares < 2000

In [13]:
for i in range(1000):
    if y_train[i] >= 2000:
        y_train[i] = 1
    else:
        y_train[i] = -1
        
    if y_test[i] >= 2000:
        y_test[i] = 1
    else:
        y_test[i] = -1
        
    if y_val[i] >= 2000:
        y_val[i] = 1
    else:
        y_val[i] = -1

# Computing the Kernel Matrices
I will compute the linear and gaussian kernel and use each one to implement the Dual SVM algorithm  

### Linear Kernel

In [14]:
D = x_train.copy()

In [15]:
linear_kernel = np.dot(D, D.T)

In [16]:
linear_kernel

array([[23.56861317, -1.78768508, -9.04492404, ...,  8.87583137,
         6.88528597,  3.20230281],
       [-1.78768508, 12.57868004,  1.10172574, ..., -6.69551931,
        -2.99318777, -0.90015711],
       [-9.04492404,  1.10172574, 58.1960898 , ..., -8.16555486,
         3.26771302, -3.40711312],
       ...,
       [ 8.87583137, -6.69551931, -8.16555486, ..., 62.61216466,
        -1.25606695,  5.56410057],
       [ 6.88528597, -2.99318777,  3.26771302, ..., -1.25606695,
        17.72336005, -2.0619736 ],
       [ 3.20230281, -0.90015711, -3.40711312, ...,  5.56410057,
        -2.0619736 , 23.86693631]])

### Gaussian Kernel

Get the vector of squared norms

In [17]:
S = np.linalg.norm(D, axis = 1)
S = S * S

Set variance

In [18]:
variance = 10000

Compute Gaussian Kernel

In [19]:
gaussian_kernel = np.exp( ((2 * np.matmul(D, D.T)) - S - S[:,None]) / (2*variance) )
gaussian_kernel

array([[1.        , 0.99801584, 0.99501972, ..., 0.99658439, 0.99862488,
        0.99795056],
       [0.99801584, 1.        , 0.9965773 , ..., 0.9955807 , 0.99818722,
        0.99808953],
       [0.99501972, 0.9965773 , 1.        , ..., 0.99316649, 0.99653681,
        0.995566  ],
       ...,
       [0.99658439, 0.9955807 , 0.99316649, ..., 1.        , 0.99586618,
        0.99623954],
       [0.99862488, 0.99818722, 0.99653681, ..., 0.99586618, 1.        ,
        0.9977169 ],
       [0.99795056, 0.99808953, 0.995566  , ..., 0.99623954, 0.9977169 ,
        1.        ]])

### SVM Algorithm Implementation 

In [20]:
def dual_SVM(D, kernel, C, epsilon):
    # augmented kernel matrix
    kernel_augmented = kernel + 1
    
    # make vector of step sizes
    eta = []
    for i in range(len(kernel_augmented)):
        val = 1 / kernel_augmented[i][i]
        eta.append(val)
    eta = np.array(eta)
    # initialize alpha vector
    t = 0
    alpha = np.random.uniform(0, 1, len(kernel_augmented))
    old_alpha = alpha.copy()
    while(True):
        # iterate thru n in random order
        r = list(range(len(kernel_augmented)))
        random.shuffle(r)
        # update the kth component in alpha
        for k in r:
            # first calculate the gradient
            # do the summation
            summation = 0
            for i in range(len(kernel)):
                summation += (alpha[i] * y_train[i] * kernel_augmented[i][k])
            # print(summation)
            # multiply the gradient by the step size
            t = eta[k] * (1 - (y_train[k] * summation))
            
            # update alpha in place
            alpha[k] = alpha[k] + t
            # finally clamp alpha between 0 and C
            if alpha[k] < 0:
                alpha[k] = 0
            if alpha[k] > C:
                alpha[k] = C
        
        t += 1
        # print(np.linalg.norm(alpha - old_alpha))
        if np.linalg.norm(alpha - old_alpha) < epsilon:
            break
        
        old_alpha = alpha.copy()
    
    
    return alpha

### Finding the Best Hyperparameter

Use the validation set to try some C values. First use the linear kernel to find the best C. 

In [27]:
accuracies = []
c_values = [1e-2, 1e-3, 1e-4, 1e-5]
for i in range(len(c_values)):
    c = c_values[i]
    # get the alpha vector
    a = dual_SVM(D, linear_kernel, c, 1e-3)
    # make predictions using the validation set
    K_new = np.dot(x_train, x_val.T)
    y_pred = []
    for j in range(1000):
        v = np.sum(K_new[:, j]) * y_train[j] * a[j]
        if v >= 0:
            y_pred.append(1)
        else:
            y_pred.append(-1)
            
    # compute accuracy score
    acc = accuracy_score(y_pred, y_val)
    accuracies.append(acc)
    

In [30]:
max_acc = max(accuracies)
ideal_c = c_values[accuracies.index(max_acc)]
print("maximum accuracy:", max_acc)
print("Ideal C value:", ideal_c)

maximum accuracy: 0.49
Ideal C value: 0.001


#### Ideal Hyperparameters

The ideal variance was found in assign3 to be either 10,000 or 100,000. In this assignment, I will use a variance of 10,000 and the ideal C value found above to calculate the predictions and accuracies on the test data.

### Making Predictions on the Test Set

First, the linear kernel.

In [31]:
c = 1e-3
a = dual_SVM(D, linear_kernel, c, 1e-3)

In [32]:
K_new = np.dot(x_train, x_test.T)

In [33]:
y_pred = []
for i in range(1000):
    pred = np.sum(K_new[:, i]) * y_train[i] * a[i]
    if pred >= 0:
        y_pred.append(1)
    else:
        y_pred.append(-1)
        
# compute accuracy score
acc = accuracy_score(y_pred, y_test)
print("Accuracy using Linear Kernel:", acc)

Accuracy using Linear Kernel: 0.477


Next, the gaussian kernel

In [34]:
c = 1e-3
a = dual_SVM(D, gaussian_kernel, c, 1e-3)

In [35]:
K_new = np.dot(x_train, x_test.T)

In [36]:
y_pred = []
for i in range(1000):
    pred = np.sum(K_new[:, i]) * y_train[i] * a[i]
    if pred >= 0:
        y_pred.append(1)
    else:
        y_pred.append(-1)
        
# compute accuracy score
acc = accuracy_score(y_pred, y_test)
print("Accuracy using Gaussian Kernel:", acc)

Accuracy using Linear Kernel: 0.478


# Exam 1 Question

Question 5:

![q5%20remastered.PNG](attachment:q5%20remastered.PNG)