In [None]:
# Import Statements
import numpy as np
import csv
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import StandardScaler

### Import and set up data

In [None]:
# Path of the raw dataset; it is read twice (numeric values, then column titles).
csv_path = 'OnlineNewsPopularity.csv'

data_orig = np.genfromtxt(csv_path, delimiter=',')

# Columns kept from the raw table: 2-3, 7-12 and 39-60. Row 0 (the header row)
# is skipped. The last kept column is the one later cells use as the target.
kept_columns = (slice(2, 4), slice(7, 13), slice(39, 61))
data = np.concatenate([data_orig[1:, cols] for cols in kept_columns], axis=1)

num_points, num_attributes = data.shape
print("Number of attributes:", num_attributes)
print("Number of points:", num_points)

# Read in the titles. The context manager closes the file once the header row
# has been read, and the same column slices are applied so that header[k] is
# the title of data[:, k].
with open(csv_path, newline='') as file:
    all_titles = next(csv.reader(file))
header = [title for cols in kept_columns for title in all_titles[cols]]

Number of attributes: 30
Number of points: 39644


### Data Preprocessing

In [None]:
# Shuffle the rows reproducibly before splitting.
np.random.seed(42)
np.random.shuffle(data)
# The second shuffle adds no extra randomness; it is kept so the row order
# stays the same as in earlier runs of this notebook.
np.random.shuffle(data)

# 80% training / 10% validation / remainder test, derived from the row count
# (integer arithmetic: 31715 and 3964 rows for the 39644-row dataset).
TRAINING_PERCENT, VALIDATION_PERCENT = 80, 10
num_training = num_points * TRAINING_PERCENT // 100
# NOTE: num_validation is the END index of the validation slice, not its size.
num_validation = num_training + num_points * VALIDATION_PERCENT // 100

training_raw = data[:num_training, :]
validation_raw = data[num_training:num_validation, :]
test_raw = data[num_validation:, :]

# The last column is the regression target; everything before it is a feature.
training_target = training_raw[:, -1]
validation_target = validation_raw[:, -1]
test_target = test_raw[:, -1]

# Fit the scaler on the training features only, then reuse it for every split.
scaler = StandardScaler(with_mean=True, with_std=True)
scaler.fit(training_raw[:, :-1])


def _standardize_with_bias(raw_split):
    """Scale the feature columns of ``raw_split`` and prepend a column of ones."""
    bias = np.ones((raw_split.shape[0], 1), dtype=float)
    return np.concatenate((bias, scaler.transform(raw_split[:, :-1])), axis=1)


training = _standardize_with_bias(training_raw)
validation = _standardize_with_bias(validation_raw)
test = _standardize_with_bias(test_raw)

print("Training shape:", training.shape)
print("Validation shape:", validation.shape)
print("Test shape:", test.shape)


Training shape: (31715, 30)
Validation shape: (3964, 30)
Test shape: (3965, 30)


### Linear Regression via QR Decomposition

In [None]:
def QR_Factorization(data):
    """Factor ``data`` as ``Q @ R`` with un-normalised Gram-Schmidt.

    Parameters
    ----------
    data : array of shape (n_rows, n_cols)

    Returns
    -------
    Q : float array of shape (n_rows, n_cols)
        Columns are mutually orthogonal but NOT scaled to unit length.
    R : float array of shape (n_cols, n_cols)
        Upper triangular with ones on the diagonal; ``R[j, i]`` is the
        coefficient of ``Q[:, j]`` in the expansion of ``data[:, i]``.

    Notes
    -----
    The first column of ``Q`` is the first column of ``data`` (it is no longer
    assumed to be a column of ones). A column of ``Q`` that is exactly zero is
    skipped when projecting, so it produces a zero coefficient instead of NaN.
    """
    data = np.asarray(data, dtype=float)
    num_cols = data.shape[1]

    # Q starts as a copy of the input; column i is orthogonalised in place.
    Q = np.array(data, dtype=float)
    R = np.eye(num_cols, dtype=float)
    # Squared norm of every finished column of Q, computed once per column.
    sq_norms = np.zeros(num_cols, dtype=float)

    for i in range(num_cols):
        for j in range(i):
            if sq_norms[j] == 0:
                # Zero direction: nothing to project onto, R[j, i] stays 0.
                continue
            # Coefficient of the ORIGINAL column i along the finished column j.
            pji = np.dot(data[:, i], Q[:, j]) / sq_norms[j]
            R[j, i] = pji
            Q[:, i] -= pji * Q[:, j]
        sq_norms[i] = np.dot(Q[:, i], Q[:, i])

    return Q, R


In [None]:
def back_sub(R, ortho_q_y):
    """Solve the upper-triangular system ``R @ w = ortho_q_y`` by back substitution.

    Parameters
    ----------
    R : array of shape (n, n)
        Upper-triangular coefficient matrix; only the diagonal and the entries
        above it are read. Every diagonal entry must be non-zero.
    ortho_q_y : array of length n
        Right-hand side.

    Returns
    -------
    w : float array of shape (n,)

    Notes
    -----
    Each row is divided by its pivot ``R[i, i]``, so the result is also correct
    when the diagonal is not all ones. For a unit diagonal the division is by
    1.0 and leaves the values unchanged.
    """
    size = R.shape[0]
    w = np.zeros((size,))

    # Walk from the last unknown up to the first.
    for i in range(size - 1, -1, -1):
        w[i] = ortho_q_y[i]

        # Remove the contribution of the unknowns that are already solved.
        for j in range(i + 1, size):
            w[i] -= w[j] * R[i, j]

        w[i] /= R[i, i]

    return w

In [None]:
def multiple_regression(training, Y):
    """Fit least-squares weights for ``training @ w ~ Y`` via the QR factorisation.

    Parameters
    ----------
    training : array of shape (n_rows, n_cols)
        Design matrix passed to ``QR_Factorization``.
    Y : array with n_rows entries
        Target values.

    Returns
    -------
    w : float array of shape (n_cols,)
        Weights obtained by solving ``R @ w = D^-1 @ Q.T @ Y`` with ``back_sub``,
        where ``D`` holds the squared norms of the columns of ``Q``.

    Notes
    -----
    Only the diagonal of ``Q.T @ Q`` is inverted. The off-diagonal entries are
    round-off around zero, and taking their reciprocals would turn them into
    large spurious coefficients.
    """
    Q, R = QR_Factorization(training)

    # Squared norm of each column of Q, i.e. the diagonal of Q.T @ Q.
    col_sq_norms = np.sum(Q * Q, axis=0)

    # A column whose squared norm is at or below this cutoff has a reciprocal of
    # 1e10 or more; it is treated as linearly dependent and given zero weight.
    min_sq_norm = 1e-10
    usable = col_sq_norms > min_sq_norm
    inv_sq_norms = np.zeros(col_sq_norms.shape, dtype=float)
    inv_sq_norms[usable] = 1.0 / col_sq_norms[usable]
    ortho = np.diag(inv_sq_norms)

    # Coordinates of Y along the orthogonal columns of Q.
    ortho_q_y = np.matmul(ortho, np.matmul(Q.T, Y))

    w = back_sub(R, ortho_q_y)

    return w





In [None]:
def find_results(w, test, Y):
    """Score the linear predictions ``test @ w`` against the targets ``Y``.

    Returns a tuple ``(SSE, TSS, R2)``:
    SSE is the sum of squared residuals, TSS is the sum of squared deviations
    of ``Y`` from its mean, and R2 is ``(TSS - SSE) / TSS``.
    """
    residuals = Y - np.matmul(test, w)
    deviations = Y - np.mean(Y)

    sum_sq_error = np.dot(residuals.T, residuals)
    total_sum_sq = np.sum(deviations ** 2)
    r_squared = (total_sum_sq - sum_sq_error) / total_sum_sq

    return sum_sq_error, total_sum_sq, r_squared




In [None]:
# Fit the weights on the training split, then score them on the held-out test split.
w = multiple_regression(training, training_target)
SSE, TSS, R2 = find_results(w, test, test_target)

# Report each metric on its own line.
for label, value in (("SSE is:", SSE), ("TSS is:", TSS), ("R-squared is:", R2)):
    print(label, value)



SSE is: 342677374158.57104
TSS is: 350624971631.2273
R-squared is: 0.02266694649751074
