# Importing Dependancies

In [1]:
import numpy as np
import pandas as pd
import warnings
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings("ignore")

# Read in Dataset

In [2]:
# read data into dataframe
df = pd.read_csv("./OnlineNewsPopularity.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39644 entries, 0 to 39643
Data columns (total 61 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   url                             39644 non-null  object 
 1    timedelta                      39644 non-null  float64
 2    n_tokens_title                 39644 non-null  float64
 3    n_tokens_content               39644 non-null  float64
 4    n_unique_tokens                39644 non-null  float64
 5    n_non_stop_words               39644 non-null  float64
 6    n_non_stop_unique_tokens       39644 non-null  float64
 7    num_hrefs                      39644 non-null  float64
 8    num_self_hrefs                 39644 non-null  float64
 9    num_imgs                       39644 non-null  float64
 10   num_videos                     39644 non-null  float64
 11   average_token_length           39644 non-null  float64
 12   num_keywords                   

# Preprocessing

### Split into data and target labels

In [3]:
y = df.iloc[:, 60]
x = df.iloc[:, 0:60]

In [4]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39644 entries, 0 to 39643
Data columns (total 60 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   url                             39644 non-null  object 
 1    timedelta                      39644 non-null  float64
 2    n_tokens_title                 39644 non-null  float64
 3    n_tokens_content               39644 non-null  float64
 4    n_unique_tokens                39644 non-null  float64
 5    n_non_stop_words               39644 non-null  float64
 6    n_non_stop_unique_tokens       39644 non-null  float64
 7    num_hrefs                      39644 non-null  float64
 8    num_self_hrefs                 39644 non-null  float64
 9    num_imgs                       39644 non-null  float64
 10   num_videos                     39644 non-null  float64
 11   average_token_length           39644 non-null  float64
 12   num_keywords                   

### Drop Unused Features

In [5]:
# drop unused features
vals = [0, 1, 4, 5, 6]
for i in range(13, 39):
    vals.append(i)

x = x.drop(x.columns[vals], axis = 1)
x.head()

Unnamed: 0,n_tokens_title,n_tokens_content,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,LDA_00,LDA_01,...,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity
0,12.0,219.0,4.0,2.0,1.0,0.0,4.680365,5.0,0.500331,0.378279,...,0.378636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875
1,9.0,255.0,3.0,1.0,1.0,0.0,4.913725,4.0,0.799756,0.050047,...,0.286915,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0
2,9.0,211.0,3.0,1.0,1.0,0.0,4.393365,6.0,0.217792,0.033334,...,0.495833,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0
3,9.0,531.0,9.0,0.0,1.0,0.0,4.404896,7.0,0.028573,0.4193,...,0.385965,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0
4,13.0,1072.0,19.0,19.0,20.0,0.0,4.682836,7.0,0.028633,0.028794,...,0.411127,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364


### Shuffle Data

In [6]:
np.random.seed(42)

In [7]:
x = x.to_numpy()
y = y.to_numpy()

In [8]:
arr = np.arange(len(df))
np.random.shuffle(arr)
x = x[arr]
y = y[arr]

### Split data

In [9]:
x_train = x[:31715]
y_train = y[:31715]
x_val = x[31715:35679]
y_val = y[31715:35679]
x_test = x[35679:39644]
y_test = y[35679:39644]

### Standardize Data

In [10]:
# first save the training mean and std
training_mean = np.mean(x_train, axis=0)
training_std = np.std(x_train, axis=0)

# first standardize the training data
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)

In [11]:
# standardize the test and validation data using training mean and std
x_test = (x_test - training_mean) / training_std
x_val = (x_val - training_mean) / training_std

# Augment data

In [12]:
# finally, augment data using a 1's column
ones_train = np.ones((len(x_train), 1))
ones_test = np.ones((len(x_test), 1))
ones_val = np.ones((len(x_val), 1))

In [13]:
x_train = np.hstack((ones_train, x_train))
x_test = np.hstack((ones_test, x_test))
x_val = np.hstack((ones_val, x_val))

# Part1: Linear Regression using QR Factorization

### First get the Q matrix

In [14]:
Q = []
cols = x_train.shape[1]
for i in range(cols):
    # get initial u
    u = x_train[:, i]
    # iteratively subtract projections 
    for j in range(i):
        proj = np.dot(x_train[:, i], Q[j]) / np.dot(Q[j], Q[j])
        prod = proj * Q[j]
        u = u - prod
    Q.append(u)


# turn it into a numpy object
Q = np.array(Q)
# transpose it so that the computed vectors are now column vectors
Q = Q.T

In [15]:
Q

array([[ 1.        ,  0.76066508,  0.99323414, ..., -0.38032909,
         0.51623674,  0.10543864],
       [ 1.        , -0.65736617, -0.075949  , ..., -0.03852483,
         0.30885073, -0.03024141],
       [ 1.        , -0.65736617, -0.65535167, ...,  0.67258597,
         0.33262519, -0.1836919 ],
       ...,
       [ 1.        , -0.65736617,  0.92036704, ..., -0.12948664,
         0.2482869 , -0.14947752],
       [ 1.        ,  0.28798799,  3.05636261, ..., -2.27290149,
        -0.50176896,  0.64601598],
       [ 1.        ,  1.23334216, -0.62732476, ...,  0.0074167 ,
        -0.68531803, -0.01831628]])

### Next, get the R matrix

We can start with a d+1 identity matrix, and iteratively fill out the upper diagonal with projections.

In [16]:
R = np.identity(cols)

for i in range(0, cols):
    for j in range(i+1, cols):
        R[i][j] = np.dot(x_train[:, j], Q[:, i]) / np.dot(Q[:, i], Q[:, i])
        

In [21]:
R

array([[ 1.00000000e+00,  1.97603245e-16,  8.33428654e-17,
        -4.72724317e-17, -3.04694347e-17, -4.69139678e-16,
         1.61756852e-16,  2.67127886e-14, -1.88193567e-17,
         1.08782603e-15,  5.32610199e-15,  2.77025412e-15,
         4.10866884e-15, -4.56010936e-15,  4.38665762e-14,
         5.15762394e-15, -2.00939200e-14, -1.43811251e-15,
        -6.34420678e-14,  2.55806587e-14,  1.72871474e-14,
         2.92216217e-13,  2.04928680e-13,  4.20153341e-15,
         4.14485130e-15, -1.35416474e-14,  1.15169982e-14,
         6.70820451e-15,  2.33158387e-15,  3.72163981e-15],
       [ 0.00000000e+00,  1.00000000e+00,  1.79697637e-02,
        -5.14268633e-02, -1.39518238e-02, -1.16355307e-02,
         5.24151355e-02, -7.11026642e-02, -7.23506518e-03,
        -7.14735047e-02,  6.59400538e-02,  4.11007356e-02,
         3.82800760e-02, -6.46411036e-02, -5.93214427e-02,
        -7.22143692e-02, -6.63649047e-02,  1.38040488e-02,
        -6.52650990e-02,  3.18683097e-02, -5.15223644e-

Now that we got Q and R, we can check if the QR factorization was done correctly. For QR factorization, the training data D will be 

D = Q * R

In [26]:
matr = np.matmul(Q, R)

In [27]:
matr

array([[ 1.        ,  0.76066508,  1.00690312, ..., -0.26330178,
         0.83711369, -0.69029101],
       [ 1.        , -0.65736617, -0.08776171, ..., -0.26330178,
         0.83711369, -0.69029101],
       [ 1.        , -0.65736617, -0.66716439, ..., -0.26330178,
         0.83711369, -0.69029101],
       ...,
       [ 1.        , -0.65736617,  0.90855432, ..., -0.26330178,
         0.83711369, -0.69029101],
       [ 1.        ,  0.28798799,  3.06153769, ..., -1.76983497,
        -0.75038339,  1.07998198],
       [ 1.        ,  1.23334216, -0.60516189, ...,  0.39580649,
        -0.88267481,  0.08420343]])

In [28]:
x_train

array([[ 1.        ,  0.76066508,  1.00690312, ..., -0.26330178,
         0.83711369, -0.69029101],
       [ 1.        , -0.65736617, -0.08776171, ..., -0.26330178,
         0.83711369, -0.69029101],
       [ 1.        , -0.65736617, -0.66716439, ..., -0.26330178,
         0.83711369, -0.69029101],
       ...,
       [ 1.        , -0.65736617,  0.90855432, ..., -0.26330178,
         0.83711369, -0.69029101],
       [ 1.        ,  0.28798799,  3.06153769, ..., -1.76983497,
        -0.75038339,  1.07998198],
       [ 1.        ,  1.23334216, -0.60516189, ...,  0.39580649,
        -0.88267481,  0.08420343]])

The Product of the Q and R matrices and the x_train matrix are the same, so the Factorization was done correctly. 

### Getting the Delta Matrix

The delta matrix is simply the diagonal matrix of the squared norms of the Q column vectors. 

In [31]:
# first create the vector, which will be the diagonal
delta_diagonal = []
for i in range(len(Q.T)):
    dot = np.dot(Q[:, i], Q[:, i])
    delta_diagonal.append(dot)

delta_diagonal = np.array(delta_diagonal)

# now make it into a diagonal matrix
delta = np.diag(delta_diagonal)

### Getting Delta Inverse

In [33]:
delta_inv = np.linalg.inv(delta)

### Getting R inverse

In [35]:
R_inv = np.linalg.inv(R)

### Solve for the weight vector

In [39]:
t1 = np.dot(Q.T, y_train)

In [40]:
t2 = np.dot(delta_inv, t1)

In [41]:
weights = np.dot(R_inv, t2)

In [42]:
weights

array([ 3.40671589e+03,  1.14983442e+02,  4.34197860e+01,  4.51148103e+02,
       -2.55123355e+02,  1.60751660e+02,  6.92285331e+01, -2.53209371e+02,
        2.09448153e+02,  4.33675185e+13,  3.65081402e+13,  4.65433152e+13,
        4.86598032e+13,  4.77000775e+13,  5.35003137e+02, -3.20932785e+01,
       -1.55609941e+02,  3.59935403e+01, -2.44346667e+02, -1.76628520e+02,
        1.54116966e+00, -9.25278069e+01, -4.42302714e+01, -2.01410725e+02,
        6.67962243e+01, -1.05268772e+02, -2.30387594e+01,  4.45954720e+01,
        6.01780802e+01,  1.55689608e+02])

### Calculate the Y-hat values

In [60]:
y_pred = np.dot(x_test, weights)

### Calculate SSE

In [63]:
diff = y_test - y_pred
SSE = np.dot(diff, diff)

In [64]:
SSE

2.723291872261555e+28

### Calculate TSS

In [67]:
y_mu = np.mean(y_test)
diff2 = y_test - y_mu
TSS = np.dot(diff2, diff2)

In [68]:
TSS

277008716210.6517

# Calculate the R^2 metric

In [69]:
R2 = (TSS - SSE) / TSS

In [70]:
R2

-9.831069251231155e+16

# Part2: Linear Regression with Regularization