In [1]:
import math
import statistics
import numpy as np
import scipy.stats
import pandas as pd
import matplotlib.pyplot as plt 
import csv
from time import perf_counter 

In [2]:
name_cols = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
raw_df = pd.read_csv("housing.data", names = name_cols, header=None, delim_whitespace = True)
raw_df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


In [3]:
# print(raw_df.isnull().sum() ) # check for null values
row_num = raw_df.shape[0]
col_num = raw_df.shape[1]
# print(f'Number of rows/datapoints: {row_num}')
# print(f'Number of columns/features: {col_num}')

In [4]:
df = pd.DataFrame(raw_df).to_numpy() # convert from dataframe format to numpy format
print(f'last row last column: {df[505][-1]}, type: {type(df[1][-1])}')

last row last column: 11.9, type: <class 'numpy.float64'>


In [5]:
x_features_only = df[:, :-1]  # all features
# print('x_input shape', x_features_only.shape)
# print(x_features_only[-1])
y_target = df[:, -1]  # only y label
print(y_target.shape)
print(y_target[-10:-1])

(506,)
[19.7 18.3 21.2 17.5 16.8 22.4 20.6 23.9 22. ]


In [6]:
def dataNorm(X):
    '''
    input: X is a matrix of x-features and y-label
    output: x-features only all in proper columns
    '''

    xMerged = X[:,-1].T #label col. switch it's column to a row for vstack later   
    f_transpose = X.T #feature cols. switch the columns to rows for iteration later
#     print('xmerged shape:', xMerged.shape)
#     print('f_transpose shape:', f_transpose.shape)
    for i in f_transpose:  
#         print('i shape', i.shape)
        arr_transpose = (i - np.min(i)) / np.ptp(i)
        xMerged = np.vstack((xMerged, arr_transpose)) # merging output and features row-wise
#     print('xMerged shape:', xMerged.shape)    
#     y_output = xMerged[0] # a row of 'Rings' data points
#     final_merged = np.vstack((xMerged[1:], y_label)) # vstack the 8 features and  output at the bottom of the stack 
    final_merged = xMerged[1:] # remove output row at the top of stack
    return final_merged.T # transpose to make features col-wise again
    
x_features_only = dataNorm(x_features_only)
print(x_features_only.shape)
print(x_features_only)

(506, 13)
[[0.00000000e+00 1.80000000e-01 6.78152493e-02 ... 2.87234043e-01
  1.00000000e+00 8.96799117e-02]
 [2.35922539e-04 0.00000000e+00 2.42302053e-01 ... 5.53191489e-01
  1.00000000e+00 2.04470199e-01]
 [2.35697744e-04 0.00000000e+00 2.42302053e-01 ... 5.53191489e-01
  9.89737254e-01 6.34657837e-02]
 ...
 [6.11892474e-04 0.00000000e+00 4.20454545e-01 ... 8.93617021e-01
  1.00000000e+00 1.07891832e-01]
 [1.16072990e-03 0.00000000e+00 4.20454545e-01 ... 8.93617021e-01
  9.91300620e-01 1.31070640e-01]
 [4.61841693e-04 0.00000000e+00 4.20454545e-01 ... 8.93617021e-01
  1.00000000e+00 1.69701987e-01]]


In [7]:
# test unit for joining the x_input and y_target back together

print(y_target)
y_response = y_target.reshape((506,1)) # gotta reshape for concatenate purpose
print(y_response.shape)
x_features_y_response = np.concatenate((x_features_only, y_response), axis=1) # normalised features + y_label
print(x_features_y_response[:, -1])
print(x_features_y_response.shape)

[24.  21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 15.  18.9 21.7 20.4
 18.2 19.9 23.1 17.5 20.2 18.2 13.6 19.6 15.2 14.5 15.6 13.9 16.6 14.8
 18.4 21.  12.7 14.5 13.2 13.1 13.5 18.9 20.  21.  24.7 30.8 34.9 26.6
 25.3 24.7 21.2 19.3 20.  16.6 14.4 19.4 19.7 20.5 25.  23.4 18.9 35.4
 24.7 31.6 23.3 19.6 18.7 16.  22.2 25.  33.  23.5 19.4 22.  17.4 20.9
 24.2 21.7 22.8 23.4 24.1 21.4 20.  20.8 21.2 20.3 28.  23.9 24.8 22.9
 23.9 26.6 22.5 22.2 23.6 28.7 22.6 22.  22.9 25.  20.6 28.4 21.4 38.7
 43.8 33.2 27.5 26.5 18.6 19.3 20.1 19.5 19.5 20.4 19.8 19.4 21.7 22.8
 18.8 18.7 18.5 18.3 21.2 19.2 20.4 19.3 22.  20.3 20.5 17.3 18.8 21.4
 15.7 16.2 18.  14.3 19.2 19.6 23.  18.4 15.6 18.1 17.4 17.1 13.3 17.8
 14.  14.4 13.4 15.6 11.8 13.8 15.6 14.6 17.8 15.4 21.5 19.6 15.3 19.4
 17.  15.6 13.1 41.3 24.3 23.3 27.  50.  50.  50.  22.7 25.  50.  23.8
 23.8 22.3 17.4 19.1 23.1 23.6 22.6 29.4 23.2 24.6 29.9 37.2 39.8 36.2
 37.9 32.5 26.4 29.6 50.  32.  29.8 34.9 37.  30.5 36.4 31.1 29.1 50.
 33.3 3

In [8]:
# concat an all-ones col to x_input features (for y-intercept computation)
# x_in is for gradient descent computation

########## ensure x_features don't normalize here. take from raw_df here!!! confirmed on 14 feb ##########

x_features_ones = np.concatenate([x_features_only, np.ones([np.shape(x_features_only)[0], 1])], axis=1)
print(x_features_ones)
print(x_features_ones.shape)

[[0.00000000e+00 1.80000000e-01 6.78152493e-02 ... 1.00000000e+00
  8.96799117e-02 1.00000000e+00]
 [2.35922539e-04 0.00000000e+00 2.42302053e-01 ... 1.00000000e+00
  2.04470199e-01 1.00000000e+00]
 [2.35697744e-04 0.00000000e+00 2.42302053e-01 ... 9.89737254e-01
  6.34657837e-02 1.00000000e+00]
 ...
 [6.11892474e-04 0.00000000e+00 4.20454545e-01 ... 1.00000000e+00
  1.07891832e-01 1.00000000e+00]
 [1.16072990e-03 0.00000000e+00 4.20454545e-01 ... 9.91300620e-01
  1.31070640e-01 1.00000000e+00]
 [4.61841693e-04 0.00000000e+00 4.20454545e-01 ... 1.00000000e+00
  1.69701987e-01 1.00000000e+00]]
(506, 14)


In [9]:
# concat an all-ones col to x_input features (for y-intercept computation)
# x_in is for gradient descent computation
import pprint as pp

########## ensure x_features don't normalize here. take from raw_df here!!! confirmed on 14 feb ##########

x_features_ones_ylabel = np.concatenate([x_features_only, np.ones([np.shape(x_features_only)[0], 1]), y_response], axis=1)
pp.pprint(x_features_ones_ylabel)
print(x_features_ones_ylabel.shape)

array([[0.00000000e+00, 1.80000000e-01, 6.78152493e-02, ...,
        8.96799117e-02, 1.00000000e+00, 2.40000000e+01],
       [2.35922539e-04, 0.00000000e+00, 2.42302053e-01, ...,
        2.04470199e-01, 1.00000000e+00, 2.16000000e+01],
       [2.35697744e-04, 0.00000000e+00, 2.42302053e-01, ...,
        6.34657837e-02, 1.00000000e+00, 3.47000000e+01],
       ...,
       [6.11892474e-04, 0.00000000e+00, 4.20454545e-01, ...,
        1.07891832e-01, 1.00000000e+00, 2.39000000e+01],
       [1.16072990e-03, 0.00000000e+00, 4.20454545e-01, ...,
        1.31070640e-01, 1.00000000e+00, 2.20000000e+01],
       [4.61841693e-04, 0.00000000e+00, 4.20454545e-01, ...,
        1.69701987e-01, 1.00000000e+00, 1.19000000e+01]])
(506, 15)


In [10]:
#unit test
y_out = x_features_ones_ylabel[:, -1]
y_out.shape

(506,)

In [11]:
# In real life we don't want to code it directly
np.linalg.lstsq(x_features_ones, y_target, rcond=None)[0] # the last in the output is the y-intercept

array([ -9.60975755,   4.64204584,   0.56083933,   2.68673382,
        -8.63457306,  19.88368651,   0.06721501, -16.22666104,
         7.03913802,  -6.46332721,  -8.95582398,   3.69282735,
       -19.01724361,  26.62026758])

In [12]:
# Part 5
def gradient_func(weights, X, y_target):  # Vectorized gradient function
    '''
    Given `weights` - a current "Guess" of what our weights should be
          `X` - matrix of shape (N,D) of input features
          `y_target` - target y values
    Return gradient of each weight evaluated at the current value
    '''
    N, D = np.shape(X)
    y_pred = np.dot(X, weights)  # alternative, use np.matmul()
    error = np.subtract(y_pred, y_target)
    return y_pred, error  # return the gradient of the cost function

In [13]:
def predict(x_test, y_test, w):
# def rmse_func(X, y_target, alpha):
    '''
    Given `X` - matrix of shape (N,D) of input features
          `t` - target y values
    Solves for linear regression weights.
    Return weights after `niter` iterations.
    '''
#     N, D = np.shape(X)                                  # feature matrix has N rows and D cols
#     w = np.zeros([D])                                   # initialize all the weights to zeros based on N cols of feature matrix
    y_pred, error = gradient_func(w, x_test, y_test)   # call the gradient function. get y_pred, error output
    print('y_pred shape:', y_pred.shape)
#     print('error:', error)
    
    rmse = np.sqrt(np.square(np.subtract(y_test,y_pred)).mean()) 
    print('rmse:', rmse)
    return y_pred, y_test, rmse  # return the gradient of the cost function

In [14]:
# Part 5
def gradient_descent(X, y_target, alpha, print_every=5000, niter=100000):  # gotta varies the alpha to get the most accurate w
    '''
    Given `X` - matrix of shape (N,D) of input features
          `t` - target y values
    Solves for linear regression weights.
    Return weights after `niter` iterations.
    '''
    N, D = np.shape(X)                                  # feature matrix has N rows and D cols
    w = np.zeros([D])                                   # initialize all the weights to zeros based on N cols of feature matrix
    for k in range(niter):   # loop over niter counts
       
        y_pred, error = gradient_func(w, X, y_target)        # call the gradient function. get y_pred, error output
        dw = np.dot(np.transpose(X), error) / float(N)
        # -------------------------------------------------------------------------------
        prev = w                           # assign the previous weight to prev variable
        w = w - alpha * dw                 # update the weight with the learning rate and gradient change 
        new = w                            # update the new weight to new variable
        # ------------------------------------------------------------------------------        
#         if k % print_every == 0:           # for every 5000 count
#             if np.all(new-prev) == False:  # when there is no improvement over the previous w, then get the latest optimal value
#                 print(f"Learning rate (alpha) is: {str(alpha)}")
#                 print(f'Weight after {k} iteration:\n {str(w)}')
#                 print()
#                 break                 
#             elif k == 95000:
#                 print()
    return w

In [15]:
# Part 6
def splitCV(X_norm, K): # Split a dataset into k folds
    dataset_split = []
    np.random.shuffle(X_norm) # shuffles the rows in the X_norm matrix
    fold_size = int(len(X_norm) / K) # compute the num of rows per fold
    row_num = X_norm.shape[0]

    for i in range(K):
        if i == K-1:
            fold = np.array(X_norm)
            dataset_split.append(X_norm)
        else:
            dataset_split.append(X_norm[:fold_size])
            X_norm = X_norm[fold_size:]       
    return dataset_split

In [16]:
#Part 6
def CV_Main(x_features_ones_ylabel, cv_num): # k = number of neighbors
    cv_list = []
    X_cv = splitCV(x_features_ones_ylabel, cv_num) # split the data set into K folds = number of parts. X_cv is a list of folds
    print('\nCV_computation ongoing ... ')
    for idx, list_array in enumerate(X_cv): # looping the dataset for cross validation 
        duplicate = X_cv.copy()
        test = list_array
        del duplicate[idx]  # delete the test element from duplicate set, remaining become train elements
        train = duplicate   # remaining elements in duplicate become train set
        train = np.vstack((train)) # convert train stack up vertically
        cv_list.append(np.array([test, train])) #append test and train into a list before return
    return cv_list  # cv_list is a list type containing 2 elements - test and train

In [23]:
# def main(): 
## PART 6 and 7    
# MAIN CALL BLOCK for CROSS VALIDATION over 5, 10, 15
cv5_ypred = []   # stores 5 elements of y_pred.
cv10_ypred = []  # stores 10 elements of y_pred.
cv15_ypred = []  # stores 15 elements of y_pred.
cv5_yactual = []   # stores 5 elements of y_actual.
cv10_yactual = []  # stores 10 elements of y_actual.
cv15_yactual = []  # stores 15 elements of y_actual.
cv5_rmse = []    # stores 5 rmse values
cv10_rmse = []   # stores 10 rmse values
cv15_rmse = []   # stores 15 rmse values

for cv in [5, 10, 15]:  # Looping over the cv numbers

    t1_start = perf_counter() # Start the stopwatch / counter 
    cv_list = CV_Main(x_features_ones_ylabel, cv)    
    print(f"-------- CV {cv} --------")
    
    for num in cv_list:  # for each fold in a list of k folds

        test = num[0]            # grab the test set from the fold
        x_test = test[:, :-1]    # grab the features from the test set
        print('x_test shape:', x_test.shape)
        ########## gotta normalize x_test here!!! confirmed on 14 feb ##########
        
        
        y_test = test[:, -1]     # grab the label from the test set
        print('y_test shape:', y_test.shape)
        train = num[1]           # grab the train set from the fold
        x_train = train[:, :-1]  # grab the features from the train set
        print('x_train shape:', x_train.shape)
        ########## gotta normalize x_train here!!! confirmed on 14 feb ##########
        
        y_train = train[:, -1]   # grab the label from the train set
        print('y_train shape:', y_train.shape)
        w = gradient_descent(x_train, y_train, alpha=0.5)  # get the fitted weights from x, y train sets 
        print('w:', w)
        y_pred, y_actual, rmse = predict(x_train, y_train, w)          # apply the w onto the x, y test sets to yield y_pred 
#         print('type of y_pred:', type(y_pred))
#         print('type of y_actual:', type(y_actual))
#         print('type of rmse:', type(rmse))
#         print('rmse:', rmse)
        print()
        if cv == 5:
            cv5_ypred.append(y_pred)
            cv5_yactual.append(y_actual)
            cv5_rmse.append(rmse)
        elif cv == 10:
            cv10_ypred.append(y_pred)
            cv10_yactual.append(y_actual)
            cv10_rmse.append(rmse)
        elif cv == 15:
            cv15_ypred.append(y_pred)
            cv15_yactual.append(y_actual)
            cv15_rmse.append(rmse)

    t1_stop = perf_counter() # Stop the stopwatch / counter 
    print(f'\nElapsed time {t1_stop-t1_start} secs\n') 
print()
print('---- Run completed ----')    
# -------------------------------------------------
with open('cv5_ypred.csv', 'w') as f: 
    write = csv.writer(f) 
    write.writerows(val for val in cv5_ypred) 
    
with open('cv5_yactual.csv', 'w') as f: 
    write = csv.writer(f)       
    write.writerows(val for val in cv5_yactual) 
    
with open('cv5_rmse.csv', 'w', newline='') as f: 
    write = csv.writer(f) 
    write.writerow(val for val in cv5_rmse)
    
with open('cv5_train.csv', 'w') as f: 
    write = csv.writer(f) 
    write.writerows(list(val) for val in train)
    
with open('cv5_test.csv', 'w') as f: 
    write = csv.writer(f) 
    write.writerows(list(val) for val in test) 
#------------------------------------------------    
with open('cv10_ypred.csv', 'w') as f: 
    write = csv.writer(f) 
    write.writerows(val for val in cv10_ypred) 
    
with open('cv10_yactual.csv', 'w') as f: 
    write = csv.writer(f)       
    write.writerows(val for val in cv10_yactual) 
    
with open('cv10_rmse.csv', 'w', newline='') as f: 
    write = csv.writer(f) 
    write.writerow(val for val in cv10_rmse)
    
with open('cv10_train.csv', 'w') as f: 
    write = csv.writer(f) 
    write.writerows(list(val) for val in train)
    
with open('cv10_test.csv', 'w') as f: 
    write = csv.writer(f) 
    write.writerows(list(val) for val in test)
#------------------------------------------------
with open('cv15_ypred.csv', 'w') as f: 
    write = csv.writer(f) 
    write.writerows(val for val in cv15_ypred) 
    
with open('cv15_yactual.csv', 'w') as f: 
    write = csv.writer(f)       
    write.writerows(val for val in cv15_yactual) 
    
with open('cv15_rmse.csv', 'w', newline='') as f: 
    write = csv.writer(f) 
    write.writerow(val for val in cv15_rmse)
    
with open('cv15_train.csv', 'w') as f: 
    write = csv.writer(f) 
    write.writerows(list(val) for val in train)
    
with open('cv15_test.csv', 'w') as f: 
    write = csv.writer(f) 
    write.writerows(list(val) for val in test) 

# if __name__ == "__main__": 
#     main() 


CV_computation ongoing ... 
-------- CV 5 --------
x_test shape: (101, 14)
y_test shape: (101,)
x_train shape: (405, 14)
y_train shape: (405,)
w: [-10.908737     5.56008437   1.26272504   2.15850519  -7.18402937
  20.3109857   -0.62291034 -18.80404825   7.32182847  -7.75801062
  -9.14599108   4.50212421 -19.92692892  26.632875  ]
y_pred shape: (405,)
rmse: 4.568612364464523

x_test shape: (101, 14)
y_test shape: (101,)
x_train shape: (405, 14)
y_train shape: (405,)
w: [ -9.87056194   4.14708304   0.31371589   2.06714542  -9.5734851
  19.73737927   0.80027347 -16.2641563    7.71444146  -6.62715906
  -9.20925051   3.64009419 -19.44479209  27.07959957]
y_pred shape: (405,)
rmse: 4.89398818347444

x_test shape: (101, 14)
y_test shape: (101,)
x_train shape: (405, 14)
y_train shape: (405,)
w: [-10.10014418   3.6329536    0.70034721   3.76327754  -8.53775767
  20.30811797  -0.45275442 -16.21262241   6.36155914  -5.39882002
  -8.59850834   3.97606831 -18.85357814  26.07264045]
y_pred shape: (

KeyboardInterrupt: 

In [18]:
df = pd.read_csv('cv15_train.csv', header=None)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.000175,0.6,0.090543,0.0,0.032922,0.620617,0.072091,0.462858,0.000000,0.148855,0.319149,0.991099,0.091060,1.0,31.1
1,0.000676,0.0,0.131598,0.0,0.257202,0.632113,0.736354,0.162382,0.173913,0.208015,0.425532,0.985804,0.143212,1.0,29.9
2,0.124781,0.0,0.646628,0.0,0.582305,0.257712,1.000000,0.004056,1.000000,0.914122,0.808511,1.000000,0.911700,1.0,13.8
3,0.161036,0.0,0.646628,0.0,0.471193,0.511209,0.876416,0.074712,1.000000,0.914122,0.808511,0.965757,0.314018,1.0,21.4
4,0.015530,0.0,0.281525,0.0,0.314815,0.457751,0.814624,0.260110,0.130435,0.229008,0.893617,0.585708,0.716887,1.0,13.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
457,0.134256,0.0,0.646628,0.0,0.563786,0.392221,1.000000,0.014149,1.000000,0.914122,0.808511,0.836578,0.286976,1.0,27.9
458,0.003071,0.0,0.338343,0.0,0.411523,0.453152,0.408857,0.113859,0.217391,0.389313,0.702128,1.000000,0.327263,1.0,24.5
459,0.153668,0.0,0.646628,0.0,0.730453,0.454876,0.875386,0.062836,1.000000,0.914122,0.808511,0.173055,0.891004,1.0,8.4
460,0.003856,0.0,0.346041,0.0,0.327160,0.461966,0.760041,0.179405,0.130435,0.223282,0.617021,0.998336,0.227373,1.0,20.3


In [19]:
df.drop(df.columns[[-2]], axis=1, inplace=True)

In [20]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,14
0,0.000175,0.6,0.090543,0.0,0.032922,0.620617,0.072091,0.462858,0.000000,0.148855,0.319149,0.991099,0.091060,31.1
1,0.000676,0.0,0.131598,0.0,0.257202,0.632113,0.736354,0.162382,0.173913,0.208015,0.425532,0.985804,0.143212,29.9
2,0.124781,0.0,0.646628,0.0,0.582305,0.257712,1.000000,0.004056,1.000000,0.914122,0.808511,1.000000,0.911700,13.8
3,0.161036,0.0,0.646628,0.0,0.471193,0.511209,0.876416,0.074712,1.000000,0.914122,0.808511,0.965757,0.314018,21.4
4,0.015530,0.0,0.281525,0.0,0.314815,0.457751,0.814624,0.260110,0.130435,0.229008,0.893617,0.585708,0.716887,13.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
457,0.134256,0.0,0.646628,0.0,0.563786,0.392221,1.000000,0.014149,1.000000,0.914122,0.808511,0.836578,0.286976,27.9
458,0.003071,0.0,0.338343,0.0,0.411523,0.453152,0.408857,0.113859,0.217391,0.389313,0.702128,1.000000,0.327263,24.5
459,0.153668,0.0,0.646628,0.0,0.730453,0.454876,0.875386,0.062836,1.000000,0.914122,0.808511,0.173055,0.891004,8.4
460,0.003856,0.0,0.346041,0.0,0.327160,0.461966,0.760041,0.179405,0.130435,0.223282,0.617021,0.998336,0.227373,20.3


In [21]:
# def dataStandard(X):

#     xMerged = X[:,-1].T #output col. switch it's column to a row for vstack later   
#     f_transpose = X[:, :-1].T #feature cols. switch the columns to rows for iteration later
    
#     for i in f_transpose:  
#         arr_transpose = (i - np.average(i)) / np.std(i)
#         xMerged = np.vstack((xMerged, arr_transpose))
        
#     y_output = xMerged[0] # a row of 'Rings' data points
#     final_merged = np.vstack((xMerged[1:], y_output)) # vstack the 8 features and the output at the bottom of the stack 
#     return final_merged.T 
    
# X_norm = dataStandard(X)
# print(X_norm.shape)
# print(len(X_norm))
# X_norm[0:2]