In [1]:
## Reference
# https://mubaris.com/posts/linear-regression/
# https://towardsdatascience.com/gradient-descent-in-python-a0d07285742f
# Ridge regression closed form: $\hat{\beta}^{\mathrm{ridge}}_{\lambda} = (X^{T}X + \lambda I_p)^{-1} X^{T} y$

In [2]:
### DATA DESCRIPTION

#     1. CRIM      per capita crime rate by town
#     2. ZN        proportion of residential land zoned for lots over 
#                  25,000 sq.ft.
#     3. INDUS     proportion of non-retail business acres per town
#     4. CHAS      Charles River dummy variable (= 1 if tract bounds 
#                  river; 0 otherwise)
#     5. NOX       nitric oxides concentration (parts per 10 million)
#     6. RM        average number of rooms per dwelling
#     7. AGE       proportion of owner-occupied units built prior to 1940
#     8. DIS       weighted distances to five Boston employment centres
#     9. RAD       index of accessibility to radial highways
#     10. TAX      full-value property-tax rate per $10,000
#     11. PTRATIO  pupil-teacher ratio by town
#     12. B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks 
#                  by town
#     13. LSTAT    % lower status of the population
#     14. MEDV     Median value of owner-occupied homes in $1000's

In [3]:
## LIBRARIES

import numpy as np
import pandas as pd
import math
from numpy.linalg import inv

In [58]:
## LOAD DATA
# Whitespace-delimited files with no header row. The separator is a raw string so
# that "\s" is read as a regex whitespace class, not an (invalid) string escape.
train = pd.read_csv('housing_train.txt', sep=r"\s+", header=None)
#data.columns = ["a", "b", "c", "etc."]

# Split into features (columns 0-12) and labels (column 13).
train_x = train.iloc[:, 0:13]
train_y = train.iloc[:, 13]

# Prepend a column of ones so that the first weight acts as the intercept.
x0 = np.ones(len(train_x))
train_x = pd.concat([pd.Series(x0), train_x], axis=1)

test = pd.read_csv('housing_test.txt', sep=r"\s+", header=None)

# Split into features and labels, same layout as the training file.
test_x = test.iloc[:, 0:13]
test_y = test.iloc[:, 13]

# The bias column must have exactly one entry per *test* row. Reusing the
# train-sized x0 here makes pd.concat align on the index and pad test_x with
# NaN rows (or leave NaN bias values) whenever the two files differ in length.
test_x = pd.concat([pd.Series(np.ones(len(test_x))), test_x], axis=1)


In [64]:
# Feature - Target
# Convert the pandas objects to independent float64 NumPy arrays for the solver.
X = train_x.to_numpy(dtype=np.float64, copy=True)
Y = train_y.to_numpy(dtype=np.float64, copy=True)


In [205]:
# Number of columns in the design matrix (the 13 features plus the bias column).
# Read it from the shape instead of indexing the second row, which would raise
# IndexError on a single-row matrix.
X.shape[1]

14

In [227]:
#GRADIENT DESCENT
#batch
alpha = 1e-8 #Step size
iterations = 900_000 #No. of iterations

#stochastic (alternative settings, currently unused)
# alpha = 1e-11 #Step size
# iterations = 12000 #No. of iterations

m = len(Y) #No. of data points
np.random.seed(123) #Set the seed

# One weight per column of X (bias included). Sizing from X keeps the weight
# vector in step with the design matrix instead of relying on a hard-coded 14.
#weights = np.random.rand(X.shape[1]) #Pick some random values to start with
weights = np.zeros(X.shape[1])

#GRADIENT DESCENT
def gradient_descent(x, y, weights, iterations, alpha):
    """Fit linear-regression weights by full-batch gradient descent.

    On every iteration the sum of squared errors for the current weights is
    recorded, then the weights are moved against ``x.T @ error`` scaled by
    ``alpha``. The gradient is not divided by the number of samples, so the
    step size has to absorb that factor.

    Parameters
    ----------
    x : array exposing ``.dot`` / ``.T`` (one row per sample).
    y : targets, one per row of ``x``.
    weights : starting weight vector; it is rebound, never modified in place.
    iterations : number of update steps to take.
    alpha : step size.

    Returns
    -------
    (thetas, costs)
        ``thetas`` is the starting weights followed by the weights after each
        step (``iterations + 1`` entries). ``costs`` is the cost measured
        before each step (``iterations`` entries).
    """
    theta = weights
    theta_history = [theta]
    cost_history = []

    # A per-sample (stochastic) update was explored but is not enabled; only
    # the full-batch update below runs, using every row of x on each step.
    for _ in range(iterations):
        residual = x.dot(theta) - y
        cost_history.append(np.dot(residual.T, residual))

        theta = theta - alpha * np.dot(x.T, residual)
        theta_history.append(theta)

    return theta_history, cost_history

In [228]:
# Run the fit on the training data: t is the weight history, c the cost history.
t,c = gradient_descent(X,Y,weights, iterations, alpha)

In [229]:
# Last recorded cost: the sum of squared errors measured just before the final update.
c[-1]

10791.637053391918

In [230]:
# Change in the weights made by the final update; small values suggest the fit has nearly settled.
t[-1] - t[-2]

array([ 3.43147359e-07, -4.32906742e-09, -1.74579645e-08, -9.19935921e-09,
        6.93525370e-07,  4.84677997e-08,  1.22472330e-06, -2.68654026e-08,
       -1.79230047e-07,  4.41916093e-08, -1.84340401e-09, -2.91179699e-07,
       -2.36301542e-09,  1.02435882e-07])

In [231]:
# Model Evaluation - MSE
def mse(Y, Y_pred):
    """Return the mean squared error between targets and predictions.

    This is the plain MSE (no square root is taken). Values are paired by
    position, so pandas objects with a non-default index are handled the same
    way as arrays and lists.

    Parameters
    ----------
    Y : sequence of true values.
    Y_pred : sequence of predictions. Only the first ``len(Y)`` entries are
        used; any extra trailing predictions are ignored.

    Returns
    -------
    The mean of the squared differences, or ``0`` when ``Y`` is empty.

    Raises
    ------
    IndexError
        If ``Y_pred`` has fewer entries than ``Y``.
    """
    actual = np.asarray(Y, dtype=np.float64)
    n = len(actual)
    if n == 0:
        return 0

    predicted = np.asarray(Y_pred, dtype=np.float64)
    if len(predicted) < n:
        # Fail loudly rather than letting NumPy broadcast a short prediction vector.
        raise IndexError(
            "Y_pred has %d entries but Y has %d" % (len(predicted), n)
        )

    # axis=0 averages over samples, so 1-D input yields a scalar.
    return np.mean((predicted[:n] - actual) ** 2, axis=0)

# Model Evaluation - R2 Score
def r2_score(Y, Y_pred):
    """Return the coefficient of determination, ``1 - SS_res / SS_tot``.

    Parameters
    ----------
    Y : sequence of true values.
    Y_pred : sequence of predictions, same length as ``Y``.

    Returns
    -------
    The R^2 score. When ``Y`` is constant (``SS_tot == 0``) the ratio is
    undefined; this returns 1.0 for a perfect fit and 0.0 otherwise, instead of
    nan or -inf.

    Raises
    ------
    ValueError
        If ``Y`` is empty.
    """
    actual = np.asarray(Y, dtype=np.float64)
    predicted = np.asarray(Y_pred, dtype=np.float64)
    if actual.size == 0:
        raise ValueError("r2_score requires at least one sample")

    ss_tot = np.sum((actual - np.mean(actual)) ** 2)
    ss_res = np.sum((actual - predicted) ** 2)

    if ss_tot == 0:
        # Constant target: avoid dividing by zero and keep the result finite.
        return 1.0 if ss_res == 0 else 0.0
    return 1 - (ss_res / ss_tot)

# Score the fitted model on the training set using the final weight vector.
Y_pred = X @ t[-1]

print("Train MSE: " + str(mse(Y, Y_pred)))
#print(r2_score(Y, Y_pred))

Train MSE: 24.9229482842699


In [232]:
# Score the same final weights on the held-out test set.
X_test = test_x.to_numpy(copy=True)
Y_test = test_y.to_numpy(copy=True)
test_pred = X_test @ t[-1]
print("Test MSE: " + str(mse(Y_test, test_pred)))

Test MSE: 23.388310069728696
