# Simple Linear versus Ridge Regression 

## Step 1:  Getting, understanding, and preprocessing the dataset

We first import the standard libaries and some libraries that will help us scale the data and perform some "feature engineering" by transforming the data into $\Phi_2({\bf x})$

In [183]:
import numpy as np
import sklearn
from sklearn.datasets import load_boston
from sklearn.preprocessing import PolynomialFeatures
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import sklearn.linear_model
from sklearn.model_selection import KFold

###  Importing the dataset

In [184]:
# Import the boston dataset from sklearn
# Load dataset to some variable 
boston_data = load_boston()
# print(boston_data)
# boston_data.DESCR

In [185]:
#  Create X and Y variables - X holding the .data and Y holding .target 
X = boston_data.data
y = boston_data.target
#  Reshape Y to be a rank 2 matrix using y.reshape()
# print(y.shape)
# print(X.shape)
# y.reshape(253,2)
y.reshape(506,1)
# print("Rank", np.linalg.matrix_rank(y))

# Observe the number of features and the number of labels
print('The number of features is: ', X.shape[1])
# Printing out the features
print('The features: ', boston_data.feature_names)
# The number of examples
print('The number of exampels in our dataset: ', X.shape[0])
# Observing the first 2 rows of the data
print(X[0:2])
print(y[0:2])


The number of features is:  13
The features:  ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
The number of exampels in our dataset:  506
[[6.3200e-03 1.8000e+01 2.3100e+00 0.0000e+00 5.3800e-01 6.5750e+00
  6.5200e+01 4.0900e+00 1.0000e+00 2.9600e+02 1.5300e+01 3.9690e+02
  4.9800e+00]
 [2.7310e-02 0.0000e+00 7.0700e+00 0.0000e+00 4.6900e-01 6.4210e+00
  7.8900e+01 4.9671e+00 2.0000e+00 2.4200e+02 1.7800e+01 3.9690e+02
  9.1400e+00]]
[24.  21.6]


We will also create polynomial feeatures for the dataset to test linear and ridge regression on data with d = 1 and data with d = 2. Feel free to increase the # of degress and see what effect it has on the training and test error. 

In [186]:
# Create a PolynomialFeatures object with degree = 2. Using PolynomialFeatures(degree=2)
poly = PolynomialFeatures(2)
# Transform X and save it into X_2 using poly.fit_transform(X)
# Simply copy Y into Y_2 

X_2 = poly.fit_transform(X)
y_2 = y
# print(X_2)
# print("======================================")
# print(y_2)

In [187]:
# the shape of X_2 and Y_2 - should be (506, 105) and (506, 1) respectively
print(X_2.shape)
print(y_2.shape)

(506, 105)
(506,)


# Your code goes here

In [188]:
# Define the get_coeff_ridge_normaleq function. Use the normal equation method.
# Return w values

def get_coeff_ridge_normaleq(X_train, y_train, alpha):
  i,j = X_train.shape
  I = np.identity(j)
  X_train_trans_dot_X_train = np.dot(np.transpose(X_train),X_train)
  alpha_times_I = alpha*I
  X_train_trans_dot_y_train = np.dot(np.transpose(X_train),y_train)
  M = X_train_trans_dot_X_train + alpha_times_I
  M_Inverse = np.linalg.pinv(M)

  w = np.dot(M_Inverse, X_train_trans_dot_y_train)
  return w



In [189]:
# Define the get_coeff_ridge_normaleq function. Use the normal equation method.
# Return w values

def get_coeff_linear_normaleq(X_train, y_train):
  X_train_trans_dot_X_train = np.dot(np.transpose(X_train),X_train)
  X_train_trans_dot_y_train = np.dot(np.transpose(X_train),y_train)
  M = X_train_trans_dot_X_train
  M_Inverse = np.linalg.pinv(M)

  w = np.dot(M_Inverse, X_train_trans_dot_y_train)
  return w
  

In [190]:
# Define the evaluate_err_ridge function.
# Return the train_error and test_error values
def evaluate_err(X_train, X_test, y_train, y_test, w): 
    # pred_train=prediction using w and X_train+np.mean(y_train)
    pred_train= np.dot(X_train, w)
   
#     pred_test=prediction using w and X_test
    pred_test= np.dot(X_test, w)

#     remember to add the mean back
    # M = (y_train - pred_train)**2
    M = np.square(y_train - pred_train)

    train_error= np.mean(M)
    N = np.square(y_test - pred_test)
    test_error= np.mean(N)
    
    return train_error, test_error

In [191]:
# Finish writting the k_fold_cross_validation function. 
# Returns the average training error and average test error from the k-fold cross validation
# Sklearns K-Folds cross-validator: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
# train_error_list = []
# test_error_list =[]
def k_fold_cross_validation_for_RR(k, X, y, alpha):
    kf = KFold(n_splits=k, random_state=21, shuffle=True)
    total_E_val_test = 0
    total_E_val_train = 0
    train_error_list = []
    test_error_list =[]

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        # Centering the data so we do not need the intercept term (we could have also chose w_0=average y value)
        # Subtract y_train_mean from y_train and y_test
        y_train_mean = np.mean(y_train)
        y_train = y_train - y_train_mean
        y_test = y_test - y_train_mean
        
        # Scaling the data matrix
        scaler= sklearn.preprocessing.StandardScaler().fit(X_train, y_train)

        # And scaler.transform(...)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        
        # Determine the training error and the test error
        # Use get_coeff_linear_normaleq or get_coeff_ridge_normaleq to get w

        # w = get_coeff_linear_normaleq(X_train, y_train)
        w = get_coeff_ridge_normaleq(X_train, y_train, alpha)
        # And use evaluate_err()
        total_E_val_test, total_E_val_train = evaluate_err(X_train, X_test, y_train, y_test, w)
        # print("---------------------- ", alpha,"----------------------")
        # print("Test Error ", total_E_val_test, "Train Error  ", total_E_val_train)

        train_error_list.append(total_E_val_train)
        test_error_list.append(total_E_val_test)
        




#        ##############
    print("Average train error: ", np.mean(train_error_list))
    print("Average test error : ", np.mean(test_error_list))
    return  total_E_val_test, total_E_val_train
    


In [192]:
def k_fold_cross_validation_for_LR(k, X, y):
    kf = KFold(n_splits=k, random_state=21, shuffle=True)
    total_E_val_test = 0
    total_E_val_train = 0
    train_error_list = []
    test_error_list =[]

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        # Centering the data so we do not need the intercept term (we could have also chose w_0=average y value)
        # Subtract y_train_mean from y_train and y_test
        y_train_mean = np.mean(y_train)
        y_train = y_train - y_train_mean
        y_test = y_test - y_train_mean
        
        # Scaling the data matrix
        scaler= sklearn.preprocessing.StandardScaler().fit(X_train, y_train)

        # And scaler.transform(...)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        
        # Determine the training error and the test error
        # Use get_coeff_linear_normaleq or get_coeff_ridge_normaleq to get w

        w = get_coeff_linear_normaleq(X_train, y_train)
        # And use evaluate_err()
        total_E_val_test, total_E_val_train = evaluate_err(X_train, X_test, y_train, y_test, w)
        # print("---------------------- ", alpha,"----------------------")
        # print("Test Error ", total_E_val_test, "Train Error  ", total_E_val_train)

        train_error_list.append(total_E_val_train)
        test_error_list.append(total_E_val_test)
        




#        ##############
    print("Average train error: ", np.mean(train_error_list))
    print("Average test error : ", np.mean(test_error_list))
    return  total_E_val_test, total_E_val_train

In [193]:
# print the error for the both linear regression and ridge regression

# the error should include both training error and testing error

print("----------------------------------------------------------------------------------------------------")
print("\n")
print("============== Error for Ridge Regression  ==========================")
print("\n")


test, train = k_fold_cross_validation_for_RR(13, X, y, 10)
print("Test Error ", test, "Train Error  ", train)

print("\n")

print("\n\n")
print("================== Using PolynomialFeatures(degree=2) ======================")
print("\n")

test, train = k_fold_cross_validation_for_RR(13, X_2, y_2, 10)
print("Test Error ", test, "Train Error  ", train)

print("\n")



print("----------------------------------------------------------------------------------------------------")
print("\n")







----------------------------------------------------------------------------------------------------




Average train error:  23.744830717361012
Average test error :  21.909107606951896
Test Error  20.257047684784677 Train Error   44.37520576905882







Average train error:  13.421591299161484
Average test error :  10.044478093371858
Test Error  9.203691183870086 Train Error   29.748698809980397


----------------------------------------------------------------------------------------------------




In [194]:
print("----------------------------------------------------------------------------------------------------")
print("\n")
print("============== Error for Linear Regression  ==========================")
print("\n")


test, train = k_fold_cross_validation_for_LR(13, X, y)
print("Test Error ", test, "Train Error  ", train)

print("\n")

print("\n\n")
print("================== Using PolynomialFeatures(degree=2) ======================")
print("\n")

test, train = k_fold_cross_validation_for_LR(13, X_2, y_2)
print("Test Error ", test, "Train Error  ", train)

print("\n")



print("----------------------------------------------------------------------------------------------------")
print("\n")

----------------------------------------------------------------------------------------------------




Average train error:  23.71387887738641
Average test error :  21.82585480837458
Test Error  20.17690678000032 Train Error   43.710386884038705







Average train error:  12.3890617366037
Average test error :  5.844996839595582
Test Error  5.557537678823688 Train Error   17.9763604203099


----------------------------------------------------------------------------------------------------




In [195]:
# use the model to predict the new test case.
print("----------------------------------------------------------------------------------------------------")
print("\n")
print("===================== Error for Ridge Regression ===============================")
print("\n\n")

for i in np.logspace(1,7,num=13):
  print("---------------------- ", i,"----------------------")
  
  test, train = k_fold_cross_validation_for_RR(13, X, y, i)
  print("Test Error ", test, "Train Error  ", train)
  print("\n")
# print("Average train error: ", np.mean(train_error_list))
# print("Average test error : ", np.mean(test_error_list))


# =============================================================
print("\n\n")
print("================== Using PolynomialFeatures(degree=2) ======================")
print("\n\n")

for i in np.logspace(1,7,num=13):
  print("---------------------- ", i,"----------------------")
  
  test, train = k_fold_cross_validation_for_RR(13, X_2, y_2, i)
  print("Test Error ", test, "Train Error  ", train)
  print("\n")

# print("Average train error: ", np.mean(train_error_list))
# print("Average test error : ", np.mean(test_error_list))
print("----------------------------------------------------------------------------------------------------")
print("\n")


----------------------------------------------------------------------------------------------------





----------------------  10.0 ----------------------
Average train error:  23.744830717361012
Average test error :  21.909107606951896
Test Error  20.257047684784677 Train Error   44.37520576905882


----------------------  31.622776601683793 ----------------------
Average train error:  24.032539044297817
Average test error :  22.28971802780797
Test Error  20.617757706422093 Train Error   45.70334308336826


----------------------  100.0 ----------------------
Average train error:  25.21802757161072
Average test error :  23.693429811532578
Test Error  21.94463034147355 Train Error   48.90915092711351


----------------------  316.22776601683796 ----------------------
Average train error:  29.21717426911817
Average test error :  28.034013497430863
Test Error  26.080609596701102 Train Error   56.170224587084654


----------------------  1000.0 ----------------------
Average train erro