# Importing Dependancies

In [1]:
import numpy as np
import pandas as pd
import warnings
import copy
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings("ignore")

# Read in Dataset

In [2]:
# read data into dataframe
df = pd.read_csv("./OnlineNewsPopularity.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39644 entries, 0 to 39643
Data columns (total 61 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   url                             39644 non-null  object 
 1    timedelta                      39644 non-null  float64
 2    n_tokens_title                 39644 non-null  float64
 3    n_tokens_content               39644 non-null  float64
 4    n_unique_tokens                39644 non-null  float64
 5    n_non_stop_words               39644 non-null  float64
 6    n_non_stop_unique_tokens       39644 non-null  float64
 7    num_hrefs                      39644 non-null  float64
 8    num_self_hrefs                 39644 non-null  float64
 9    num_imgs                       39644 non-null  float64
 10   num_videos                     39644 non-null  float64
 11   average_token_length           39644 non-null  float64
 12   num_keywords                   

# Preprocessing

### Split into data and target labels

In [3]:
y = df.iloc[:, 60]
x = df.iloc[:, 0:60]

In [4]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39644 entries, 0 to 39643
Data columns (total 60 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   url                             39644 non-null  object 
 1    timedelta                      39644 non-null  float64
 2    n_tokens_title                 39644 non-null  float64
 3    n_tokens_content               39644 non-null  float64
 4    n_unique_tokens                39644 non-null  float64
 5    n_non_stop_words               39644 non-null  float64
 6    n_non_stop_unique_tokens       39644 non-null  float64
 7    num_hrefs                      39644 non-null  float64
 8    num_self_hrefs                 39644 non-null  float64
 9    num_imgs                       39644 non-null  float64
 10   num_videos                     39644 non-null  float64
 11   average_token_length           39644 non-null  float64
 12   num_keywords                   

### Drop Unused Features

In [5]:
# drop unused features
vals = [0, 1, 4, 5, 6]
for i in range(13, 39):
    vals.append(i)

x = x.drop(x.columns[vals], axis = 1)
x.head()

Unnamed: 0,n_tokens_title,n_tokens_content,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,LDA_00,LDA_01,...,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity
0,12.0,219.0,4.0,2.0,1.0,0.0,4.680365,5.0,0.500331,0.378279,...,0.378636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875
1,9.0,255.0,3.0,1.0,1.0,0.0,4.913725,4.0,0.799756,0.050047,...,0.286915,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0
2,9.0,211.0,3.0,1.0,1.0,0.0,4.393365,6.0,0.217792,0.033334,...,0.495833,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0
3,9.0,531.0,9.0,0.0,1.0,0.0,4.404896,7.0,0.028573,0.4193,...,0.385965,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0
4,13.0,1072.0,19.0,19.0,20.0,0.0,4.682836,7.0,0.028633,0.028794,...,0.411127,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364


### Shuffle Data

In [6]:
np.random.seed(42)

In [7]:
x = x.to_numpy()
y = y.to_numpy()

In [8]:
arr = np.arange(len(df))
np.random.shuffle(arr)
x = x[arr]
y = y[arr]

### Split data

In [9]:
x_train = x[:31715]
y_train = y[:31715]
x_val = x[31715:35679]
y_val = y[31715:35679]
x_test = x[35679:39644]
y_test = y[35679:39644]

### Standardize Data

In [10]:
# first save the training mean and std
training_mean = np.mean(x_train, axis=0)
training_std = np.std(x_train, axis=0)

# first standardize the training data
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)

In [11]:
# standardize the test and validation data using training mean and std
x_test = (x_test - training_mean) / training_std
x_val = (x_val - training_mean) / training_std

# Augment data

In [12]:
# finally, augment data using a 1's column
ones_train = np.ones((len(x_train), 1))
ones_test = np.ones((len(x_test), 1))
ones_val = np.ones((len(x_val), 1))

In [13]:
x_train = np.hstack((ones_train, x_train))
x_test = np.hstack((ones_test, x_test))
x_val = np.hstack((ones_val, x_val))

# Part1: Linear Regression using QR Factorization

### First get the Q matrix

In [14]:
Q = []
cols = x_train.shape[1]
for i in range(cols):
    # get initial u
    u = x_train[:, i]
    # iteratively subtract projections 
    for j in range(i):
        proj = np.dot(x_train[:, i], Q[j]) / np.dot(Q[j], Q[j])
        prod = proj * Q[j]
        u = u - prod
    Q.append(u)


# turn it into a numpy object
Q = np.array(Q)
# transpose it so that the computed vectors are now column vectors
Q = Q.T

In [15]:
Q

array([[ 1.        ,  0.76066508,  0.99323414, ..., -0.380303  ,
         0.51625051,  0.10543767],
       [ 1.        , -0.65736617, -0.075949  , ..., -0.03853357,
         0.30884834, -0.03024183],
       [ 1.        , -0.65736617, -0.65535167, ...,  0.67256689,
         0.33261947, -0.18369264],
       ...,
       [ 1.        , -0.65736617,  0.92036704, ..., -0.12948242,
         0.24828613, -0.14947669],
       [ 1.        ,  0.28798799,  3.05636261, ..., -2.27291013,
        -0.50176845,  0.6460148 ],
       [ 1.        ,  1.23334216, -0.62732476, ...,  0.00744116,
        -0.68531192, -0.01831509]])

### Next, get the R matrix

We can start with a d+1 identity matrix, and iteratively fill out the upper diagonal with projections.

In [16]:
R = np.identity(cols)

for i in range(0, cols):
    for j in range(i+1, cols):
        R[i][j] = np.dot(x_train[:, j], Q[:, i]) / np.dot(Q[:, i], Q[:, i])
        

In [17]:
R

array([[ 1.00000000e+00,  2.83186510e-16,  8.15505457e-17,
        -5.06330311e-17,  4.48079922e-19, -5.32767027e-16,
         2.18214922e-16,  2.67046672e-14,  1.71166530e-16,
         1.07785625e-15,  5.32005291e-15,  2.78168015e-15,
         4.11852660e-15, -4.55562856e-15,  4.38497733e-14,
         5.16927402e-15, -2.00576256e-14, -1.45446743e-15,
        -6.33965877e-14,  2.55385392e-14,  1.72869234e-14,
         2.92999461e-13,  2.05016280e-13,  4.18517849e-15,
         4.12121508e-15, -1.35575542e-14,  1.15223752e-14,
         6.60469805e-15,  2.47429733e-15,  3.60788352e-15],
       [ 0.00000000e+00,  1.00000000e+00,  1.79697637e-02,
        -5.14268633e-02, -1.39518238e-02, -1.16355307e-02,
         5.24151355e-02, -7.11026642e-02, -7.23506518e-03,
        -7.14735047e-02,  6.59400538e-02,  4.11007356e-02,
         3.82800760e-02, -6.46411036e-02, -5.93214427e-02,
        -7.22143692e-02, -6.63649047e-02,  1.38040488e-02,
        -6.52650990e-02,  3.18683097e-02, -5.15223644e-

Now that we got Q and R, we can check if the QR factorization was done correctly. For QR factorization, the training data D will be 

D = Q * R

In [18]:
matr = np.matmul(Q, R)

In [19]:
matr

array([[ 1.        ,  0.76066508,  1.00690312, ..., -0.26330178,
         0.83711369, -0.69029101],
       [ 1.        , -0.65736617, -0.08776171, ..., -0.26330178,
         0.83711369, -0.69029101],
       [ 1.        , -0.65736617, -0.66716439, ..., -0.26330178,
         0.83711369, -0.69029101],
       ...,
       [ 1.        , -0.65736617,  0.90855432, ..., -0.26330178,
         0.83711369, -0.69029101],
       [ 1.        ,  0.28798799,  3.06153769, ..., -1.76983497,
        -0.75038339,  1.07998198],
       [ 1.        ,  1.23334216, -0.60516189, ...,  0.39580649,
        -0.88267481,  0.08420343]])

In [20]:
x_train

array([[ 1.        ,  0.76066508,  1.00690312, ..., -0.26330178,
         0.83711369, -0.69029101],
       [ 1.        , -0.65736617, -0.08776171, ..., -0.26330178,
         0.83711369, -0.69029101],
       [ 1.        , -0.65736617, -0.66716439, ..., -0.26330178,
         0.83711369, -0.69029101],
       ...,
       [ 1.        , -0.65736617,  0.90855432, ..., -0.26330178,
         0.83711369, -0.69029101],
       [ 1.        ,  0.28798799,  3.06153769, ..., -1.76983497,
        -0.75038339,  1.07998198],
       [ 1.        ,  1.23334216, -0.60516189, ...,  0.39580649,
        -0.88267481,  0.08420343]])

The Product of the Q and R matrices and the x_train matrix are the same, so the Factorization was done correctly. 

### Getting the Delta Matrix

The delta matrix is simply the diagonal matrix of the squared norms of the Q column vectors. 

In [21]:
# first create the vector, which will be the diagonal
delta_diagonal = []
for i in range(len(Q.T)):
    dot = np.dot(Q[:, i], Q[:, i])
    delta_diagonal.append(dot)

delta_diagonal = np.array(delta_diagonal)

# now make it into a diagonal matrix
delta = np.diag(delta_diagonal)

### Getting Delta Inverse

In [22]:
delta_inv = np.linalg.inv(delta)

### Getting R inverse

In [23]:
R_inv = np.linalg.inv(R)

### Solve for the weight vector

In [24]:
t1 = np.dot(Q.T, y_train)

In [25]:
t2 = np.dot(delta_inv, t1)

In [26]:
weights = np.dot(R_inv, t2)

In [27]:
weights

array([ 3.40671464e+03,  1.14979938e+02,  4.34234448e+01,  4.51159020e+02,
       -2.55131495e+02,  1.60766995e+02,  6.92344495e+01, -2.53262476e+02,
        2.09448716e+02,  4.34507970e+13,  3.65782466e+13,  4.66326922e+13,
        4.87532444e+13,  4.77916757e+13,  5.34998741e+02, -3.20966704e+01,
       -1.55606138e+02,  3.59904592e+01, -2.44292194e+02, -1.76580240e+02,
        1.54656846e+00, -9.25274573e+01, -4.42398271e+01, -2.01417049e+02,
        6.68041714e+01, -1.05269467e+02, -2.30375389e+01,  4.45946043e+01,
        6.01789544e+01,  1.55688861e+02])

### Calculate the Y-hat values

In [28]:
y_pred = np.matmul(x_test, weights)

### Calculate SSE

In [29]:
diff = y_test - y_pred
SSE = np.dot(diff, diff)

In [30]:
SSE

2.733760965153207e+28

### Calculate TSS

In [31]:
y_mu = np.mean(y_test)
diff2 = y_test - y_mu
TSS = np.dot(diff2, diff2)

In [32]:
TSS

277008716210.6516

# Calculate the R^2 metric

In [33]:
R2 = (TSS - SSE) / TSS

In [34]:
R2

-9.868862621183064e+16

# Part2: Linear Regression with Regularization

The x_train is already the augmented data matrix (done in the preprocessing step)

We'll keep doing the Ridge Regression algorithm with various values of alpha, and try and see which one produces the best weight vector using the SSE metric. 

In [47]:
# function that implements the ridge regression gradient descent algorithm
# using a given alpha value
# return the weight vector upon reaching convergence
def RidgeRegression(alpha):

    # establish constants
    t = 0
    alpha = 100000
    eta = 1e-6

    # create initial random weight vector
    w = np.random.uniform(0, 100, len(x_train.T))
    w_old = copy.deepcopy(w)
    
    # ridge regression algorithm
    while(True):
        # calculate gradient
        term1 = -1 * np.matmul(x_train.T, y_train)
        term2 = np.matmul(np.matmul(x_train.T, x_train), w)
        term3 = alpha * w
        gradient = term1 + term2 + term3
    
        # update weight vector
        w_old = copy.deepcopy(w)
        w = w_old - (eta * gradient)
    
        # check for convergence
        if np.linalg.norm((w - w_old)) <= 0.001:
            break
        
        t += 1
    
    return w

Create a list of alpha values to try

In [57]:
alphas = [0, 1, 10, 100, 1000, 10000]

Run the Ridge Regression on each alpha value and calculate the SSE values. 

In [58]:
SSE_array = []
for i in range(len(alphas)):
    # get the wieght vector
    weight_vector = RidgeRegression(alphas[i])
    
    # calculate y-hat using the validation set
    y_hat = np.dot(x_val, weight_vector)
    
    # calculate the SSE
    diff = y_val - y_hat
    ridge_SSE = np.dot(diff, diff)
    SSE_array.append(ridge_SSE)
    

The best alpha value is the one that yeilds the minimum SSE value

In [59]:
min_SSE = min(SSE_array)
index = SSE_array.index(min_SSE)
ideal_alpha = alphas[index]
print("Best Alpha Value:", ideal_alpha)

Best Alpha Value: 100
