In [7]:
%matplotlib notebook

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from sklearn.model_selection import train_test_split

# Read the overlay measurements and discard the x-direction error column;
# only the y-direction error is used as the regression target below.
df = pd.read_csv('term_project_overlay_data.csv').drop('overlay_error_x', axis=1)

# Detach the target column (its header starts with a space) and snapshot the
# remaining predictor values before any column is renamed or added.
OyOriginal = df.pop(' overlay_error_y')
dataOriginal = df.values

# Give the ' x' column a clean name so it can be addressed like the others.
df = df.rename(columns={' x': 'x'})

# Second-order terms, appended in this order: x2, y2, xy, X2, Y2, XY.
for new_col, left, right in (
    ('x2', 'x', 'x'),
    ('y2', 'y', 'y'),
    ('xy', 'x', 'y'),
    ('X2', 'X', 'X'),
    ('Y2', 'Y', 'Y'),
    ('XY', 'X', 'Y'),
):
    df[new_col] = df[left] * df[right]

# Constant column of ones.
# NOTE(review): every recorded fit in this notebook reports a 0 coefficient
# for this column, so it looks redundant with the estimators' own intercept
# -- confirm before removing.
df['0'] = 1

# Design matrix with a fixed column order: constant, upper-case coordinate
# terms, then lower-case coordinate terms.
dfout = df[['0', 'X', 'Y', 'XY', 'X2', 'Y2', 'x', 'y', 'xy', 'x2', 'y2']]

## Ordinary Least Squares Method with Training and Test Set Split

In [8]:
from sklearn.linear_model import LinearRegression

# Split the design matrix and target with the same seed used by the other
# cells, then fit ordinary least squares on the unscaled features.
X_train, X_test, y_train, y_test = train_test_split(
    dfout, OyOriginal, random_state=0
)
linreg = LinearRegression()
linreg.fit(X_train, y_train)

# Report the fitted parameters followed by R-squared on each partition.
for template, value in (
    ('linear model intercept: {}', linreg.intercept_),
    ('linear model coeff:\n{}', linreg.coef_),
    ('R-squared score (training): {:.3f}', linreg.score(X_train, y_train)),
    ('R-squared score (test): {:.3f}', linreg.score(X_test, y_test)),
):
    print(template.format(value))

linear model intercept: 0.7485286953336125
linear model coeff:
[ 0.00000000e+00  3.04014130e-03  3.97338613e-03  1.03852304e-05
  5.72920217e-06 -1.69748997e-05  1.60371908e-02  1.45566852e-02
  2.75898064e-04 -3.65549058e-03  2.72713620e-03]
R-squared score (training): 0.457
R-squared score (test): 0.141


## Ridge Regression

In [9]:
from sklearn.linear_model import Ridge

# Same seeded split as the other cells.
X_train, X_test, y_train, y_test = train_test_split(
    dfout, OyOriginal, random_state=0
)

# Ridge fit on the raw (unscaled) feature columns.
linridge = Ridge(alpha=20.0)
linridge.fit(X_train, y_train)

# Collect the report lines first, then emit them in one go.
ridge_summary = [
    'ridge regression linear model intercept: {}'.format(linridge.intercept_),
    'ridge regression linear model coeff:\n{}'.format(linridge.coef_),
    'R-squared score (training): {:.3f}'.format(linridge.score(X_train, y_train)),
    'R-squared score (test): {:.3f}'.format(linridge.score(X_test, y_test)),
    'Number of non-zero features: {}'.format(np.sum(linridge.coef_ != 0)),
]
print('\n'.join(ridge_summary))

ridge regression linear model intercept: 0.7484596024496036
ridge regression linear model coeff:
[ 0.00000000e+00  3.03919844e-03  3.97283637e-03  1.03837731e-05
  5.73346762e-06 -1.69738261e-05  1.59955609e-02  1.45452115e-02
  2.76051384e-04 -3.65446276e-03  2.72715717e-03]
R-squared score (training): 0.457
R-squared score (test): 0.141
Number of non-zero features: 10


In [10]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import MinMaxScaler

# Same seeded split as the other cells.
X_train, X_test, y_train, y_test = train_test_split(
    dfout, OyOriginal, random_state=0
)

# Fit the scaler on the training partition only, then apply that same
# mapping to both partitions.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Ridge fit on the scaled features.
linridge = Ridge(alpha=20.0)
linridge.fit(X_train_scaled, y_train)

print('Scaled data:')
for template, value in (
    ('ridge regression linear model intercept: {}', linridge.intercept_),
    ('ridge regression linear model coeff:\n{}', linridge.coef_),
    ('R-squared score (training): {:.3f}',
     linridge.score(X_train_scaled, y_train)),
    ('R-squared score (test): {:.3f}',
     linridge.score(X_test_scaled, y_test)),
    ('Number of non-zero features: {}', np.sum(linridge.coef_ != 0)),
):
    print(template.format(value))

Scaled data:
ridge regression linear model intercept: 0.24083232615128025
ridge regression linear model coeff:
[ 0.          0.33502682  0.4021618   0.06921812  0.07011548 -0.1146341
  0.11788948  0.17023391  0.02507628 -0.1905958   0.48167654]
R-squared score (training): 0.333
R-squared score (test): 0.196
Number of non-zero features: 10


In [13]:
print('Ridge regression: effect of alpha regularization parameter\n')

# NOTE(review): this cell uses X_train_scaled, X_test_scaled, y_train and
# y_test as left in the kernel by an earlier cell; it does not create them.
# NOTE(review): the sweep starts at alpha = 0; scikit-learn's Ridge
# documentation advises against alpha = 0 -- confirm this is intended.
sweep_line = ('Alpha = {:.2f}\n'
              'num abs(coeff) > 0: {}, '
              'r-squared training: {:.2f}, r-squared test: {:.2f}')

for this_alpha in range(0, 50, 5):
    # Refit on the scaled training data at this penalty strength.
    linridge = Ridge(alpha=this_alpha)
    linridge.fit(X_train_scaled, y_train)

    r2_train = linridge.score(X_train_scaled, y_train)
    r2_test = linridge.score(X_test_scaled, y_test)

    # Count coefficients whose magnitude is strictly positive.
    num_coeff_bigger = (np.abs(linridge.coef_) > 0).sum()

    print(sweep_line.format(this_alpha, num_coeff_bigger, r2_train, r2_test))


Ridge regression: effect of alpha regularization parameter

Alpha = 0.00
num abs(coeff) > 0: 10, r-squared training: 0.46, r-squared test: 0.14
Alpha = 5.00
num abs(coeff) > 0: 10, r-squared training: 0.43, r-squared test: 0.20
Alpha = 10.00
num abs(coeff) > 0: 10, r-squared training: 0.40, r-squared test: 0.21
Alpha = 15.00
num abs(coeff) > 0: 10, r-squared training: 0.36, r-squared test: 0.20
Alpha = 20.00
num abs(coeff) > 0: 10, r-squared training: 0.33, r-squared test: 0.20
Alpha = 25.00
num abs(coeff) > 0: 10, r-squared training: 0.31, r-squared test: 0.19
Alpha = 30.00
num abs(coeff) > 0: 10, r-squared training: 0.29, r-squared test: 0.18
Alpha = 35.00
num abs(coeff) > 0: 10, r-squared training: 0.27, r-squared test: 0.17
Alpha = 40.00
num abs(coeff) > 0: 10, r-squared training: 0.25, r-squared test: 0.16
Alpha = 45.00
num abs(coeff) > 0: 10, r-squared training: 0.24, r-squared test: 0.15


## Lasso Regression

In [14]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler

# Same seeded split as the other cells.
X_train, X_test, y_train, y_test = train_test_split(
    dfout, OyOriginal, random_state=0
)

# Fit a fresh scaler on the training partition only and apply it to both.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# NOTE(review): in the recorded run this alpha leaves every coefficient at
# zero (training R-squared 0.000) -- the penalty may be too large for this
# target; confirm.
linlasso = Lasso(alpha=2.0)
linlasso.fit(X_train_scaled, y_train)

for template, value in (
    ('lasso regression linear model intercept: {}', linlasso.intercept_),
    ('lasso regression linear model coeff:\n{}', linlasso.coef_),
    ('Non-zero features: {}', np.sum(linlasso.coef_ != 0)),
    ('R-squared score (training): {:.3f}',
     linlasso.score(X_train_scaled, y_train)),
    ('R-squared score (test): {:.3f}\n',
     linlasso.score(X_test_scaled, y_test)),
):
    print(template.format(value))
print('Features with non-zero weight (sorted by absolute magnitude):')

# Pair each design-matrix column with its coefficient, largest magnitude
# first, and list only the columns the model actually kept.
ranked_weights = sorted(zip(dfout.columns, linlasso.coef_),
                        key=lambda pair: -abs(pair[1]))
for feature, weight in ranked_weights:
    if weight == 0:
        continue
    print('\t{}, {:.3f}'.format(feature, weight))

lasso regression linear model intercept: 0.9265930000000001
lasso regression linear model coeff:
[ 0.  0.  0.  0.  0. -0.  0.  0.  0. -0.  0.]
Non-zero features: 0
R-squared score (training): 0.000
R-squared score (test): -0.013

Features with non-zero weight (sorted by absolute magnitude):


In [15]:
print('Lasso regression: effect of alpha regularization\n'
      'parameter on number of features kept in final model\n')

# NOTE(review): this cell uses X_train_scaled, X_test_scaled, y_train and
# y_test as left in the kernel by an earlier cell.
# NOTE(review): in the recorded run every alpha in this grid kept 0 features,
# so the sweep shows no variation -- smaller alphas are probably needed;
# confirm.
lasso_line = ('Alpha = {:.2f}\n'
              'Features kept: {}, r-squared training: {:.2f}, '
              'r-squared test: {:.2f}')

for alpha in (0.5, 1, 2, 3, 5, 10, 20, 50):
    # Refit on the scaled training data at this penalty strength.
    linlasso = Lasso(alpha=alpha)
    linlasso.fit(X_train_scaled, y_train)

    r2_train = linlasso.score(X_train_scaled, y_train)
    r2_test = linlasso.score(X_test_scaled, y_test)
    features_kept = np.sum(linlasso.coef_ != 0)

    print(lasso_line.format(alpha, features_kept, r2_train, r2_test))

Lasso regression: effect of alpha regularization
parameter on number of features kept in final model

Alpha = 0.50
Features kept: 0, r-squared training: 0.00, r-squared test: -0.01
Alpha = 1.00
Features kept: 0, r-squared training: 0.00, r-squared test: -0.01
Alpha = 2.00
Features kept: 0, r-squared training: 0.00, r-squared test: -0.01
Alpha = 3.00
Features kept: 0, r-squared training: 0.00, r-squared test: -0.01
Alpha = 5.00
Features kept: 0, r-squared training: 0.00, r-squared test: -0.01
Alpha = 10.00
Features kept: 0, r-squared training: 0.00, r-squared test: -0.01
Alpha = 20.00
Features kept: 0, r-squared training: 0.00, r-squared test: -0.01
Alpha = 50.00
Features kept: 0, r-squared training: 0.00, r-squared test: -0.01
