# Importing Dependencies

In [1]:
import numpy as np
import pandas as pd
import warnings
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings("ignore")

# Read in Dataset

In [2]:
# Load the dataset into a DataFrame and print its column summary
csv_path = "./OnlineNewsPopularity.csv"
df = pd.read_csv(csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39644 entries, 0 to 39643
Data columns (total 61 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   url                             39644 non-null  object 
 1    timedelta                      39644 non-null  float64
 2    n_tokens_title                 39644 non-null  float64
 3    n_tokens_content               39644 non-null  float64
 4    n_unique_tokens                39644 non-null  float64
 5    n_non_stop_words               39644 non-null  float64
 6    n_non_stop_unique_tokens       39644 non-null  float64
 7    num_hrefs                      39644 non-null  float64
 8    num_self_hrefs                 39644 non-null  float64
 9    num_imgs                       39644 non-null  float64
 10   num_videos                     39644 non-null  float64
 11   average_token_length           39644 non-null  float64
 12   num_keywords                   

# Preprocessing

### Split into data and target labels

In [3]:
# Columns 0-59 are kept as candidate features; column 60 (the last of the 61) is the target
x, y = df.iloc[:, 0:60], df.iloc[:, 60]

In [4]:
# Print the column names, non-null counts and dtypes of the feature frame
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39644 entries, 0 to 39643
Data columns (total 60 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   url                             39644 non-null  object 
 1    timedelta                      39644 non-null  float64
 2    n_tokens_title                 39644 non-null  float64
 3    n_tokens_content               39644 non-null  float64
 4    n_unique_tokens                39644 non-null  float64
 5    n_non_stop_words               39644 non-null  float64
 6    n_non_stop_unique_tokens       39644 non-null  float64
 7    num_hrefs                      39644 non-null  float64
 8    num_self_hrefs                 39644 non-null  float64
 9    num_imgs                       39644 non-null  float64
 10   num_videos                     39644 non-null  float64
 11   average_token_length           39644 non-null  float64
 12   num_keywords                   

### Drop Unused Features

In [5]:
# Positional indices of the columns to discard:
#   0, 1, 4, 5, 6 -> url, timedelta, n_unique_tokens, n_non_stop_words,
#                    n_non_stop_unique_tokens
#   13 .. 38      -> a contiguous run of 26 further columns
vals = [0, 1, 4, 5, 6, *range(13, 39)]

# Translate the positions into column labels and drop them
x = x.drop(columns=x.columns[vals])
x.head()

Unnamed: 0,n_tokens_title,n_tokens_content,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,LDA_00,LDA_01,...,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity
0,12.0,219.0,4.0,2.0,1.0,0.0,4.680365,5.0,0.500331,0.378279,...,0.378636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875
1,9.0,255.0,3.0,1.0,1.0,0.0,4.913725,4.0,0.799756,0.050047,...,0.286915,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0
2,9.0,211.0,3.0,1.0,1.0,0.0,4.393365,6.0,0.217792,0.033334,...,0.495833,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0
3,9.0,531.0,9.0,0.0,1.0,0.0,4.404896,7.0,0.028573,0.4193,...,0.385965,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0
4,13.0,1072.0,19.0,19.0,20.0,0.0,4.682836,7.0,0.028633,0.028794,...,0.411127,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364


### Shuffle Data

In [6]:
# Seed NumPy's global random generator so the shuffle below is reproducible
np.random.seed(42)

In [7]:
# Convert the pandas objects to NumPy arrays for the positional indexing that follows
x, y = x.to_numpy(), y.to_numpy()

In [8]:
# Draw one random permutation of the row positions and apply it to both arrays,
# so that every feature row stays paired with its target value
arr = np.arange(len(df))
np.random.shuffle(arr)
x, y = x[arr], y[arr]

### Split data

In [9]:
# Row ranges of the three consecutive partitions of the shuffled data
train_rows = slice(0, 31715)
val_rows = slice(31715, 35679)
test_rows = slice(35679, 39644)

# Apply the same row range to the features and the targets of each partition
x_train, y_train = x[train_rows], y[train_rows]
x_val, y_val = x[val_rows], y[val_rows]
x_test, y_test = x[test_rows], y[test_rows]

### Standardize Data

In [10]:
# Keep the per-feature mean and standard deviation of the raw training data;
# these have to be taken before x_train is overwritten below
training_mean = x_train.mean(axis=0)
training_std = x_train.std(axis=0)

# Fit the scaler on the training data only, then standardize the training data with it
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)

In [11]:
# Standardize the test and validation data with the scaler that was fitted on
# the training data, so only training statistics are used.
# Reusing `scaler` (instead of dividing by training_std by hand) keeps all three
# splits on exactly the same transform and avoids a division by zero for any
# feature that is constant in the training split: StandardScaler falls back to
# a scale of 1 there, whereas the manual division would yield NaN/inf.
x_test = scaler.transform(x_test)
x_val = scaler.transform(x_val)

# Augment data

In [12]:
# Build a single-column array of 1's for each split, with one row per sample
ones_train, ones_test, ones_val = (
    np.ones((split.shape[0], 1)) for split in (x_train, x_test, x_val)
)

In [13]:
# Prepend the 1's column to each split, making it column 0
x_train = np.concatenate((ones_train, x_train), axis=1)
x_test = np.concatenate((ones_test, x_test), axis=1)
x_val = np.concatenate((ones_val, x_val), axis=1)

# Part 1: Linear Regression using QR Factorization

### First get the Q matrix

In [14]:
# Gram-Schmidt over the columns of x_train: each new vector is the original
# column minus its projections onto all previously computed vectors.
# The vectors are not divided by their norm.
cols = x_train.shape[1]
basis = []
for i in range(cols):
    column = x_train[:, i]
    u = column
    # remove the component of the original column along each earlier vector
    for q in basis:
        u = u - (np.dot(column, q) / np.dot(q, q)) * q
    basis.append(u)

# stack the vectors as rows, then transpose so that each one becomes a column of Q
Q = np.array(basis).T

In [19]:
# Display Q to inspect the computed vectors
Q

array([[ 1.        ,  0.76066508,  0.99323414, ..., -0.380303  ,
         0.51625051,  0.10543767],
       [ 1.        , -0.65736617, -0.075949  , ..., -0.03853357,
         0.30884834, -0.03024183],
       [ 1.        , -0.65736617, -0.65535167, ...,  0.67256689,
         0.33261947, -0.18369264],
       ...,
       [ 1.        , -0.65736617,  0.92036704, ..., -0.12948242,
         0.24828613, -0.14947669],
       [ 1.        ,  0.28798799,  3.05636261, ..., -2.27291013,
        -0.50176845,  0.6460148 ],
       [ 1.        ,  1.23334216, -0.62732476, ...,  0.00744116,
        -0.68531192, -0.01831509]])

### Next, get the R matrix

We can start with a (d+1) × (d+1) identity matrix and iteratively fill in the entries above the diagonal with the projection coefficients.

In [16]:
# R has one row and one column per column of x_train. `cols` already counts the
# prepended 1's column, so the matrix is cols x cols (not cols + 1), which is
# the shape needed for the product Q @ R.
# The columns of Q are not normalised, so R keeps a unit diagonal and
# R[i, j] (i < j) is the coefficient of Q[:, i] in x_train[:, j],
# giving x_train = Q @ R.
R = np.identity(cols)

for j in range(1, cols):
    target = x_train[:, j]
    # fill only the rows above the diagonal; the diagonal stays 1 and the
    # entries below it stay exactly 0, so R is upper triangular
    for i in range(j):
        q_i = Q[:, i]
        R[i, j] = np.dot(target, q_i) / np.dot(q_i, q_i)
