In [1]:
import numpy as np
import utils
from numpy import genfromtxt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error


In [2]:
# get the data and the labels from csv and check the shapes
X = genfromtxt("easier_data.csv", delimiter=',')
y = genfromtxt("label.csv", delimiter=',')

print(X.shape)
print(y.shape)

(300, 10)
(300,)


__Part 1 (Linear Regression)__

In [3]:
# split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

# fitting the model
reg = LinearRegression().fit(X_train, y_train)
print(f'sklearn score: {reg.score(X_train, y_train)}')
print(f'sklearn coefficients (linear): {reg.coef_}\n')

y_pred_linear = reg.predict(X_test)
# print the first 20 test samples and true values
for i in range(20):
    print(f'Predicted: {y_pred_linear[i]}   Actual: {y_test[i]}')

# errors
print(f'\nModel score on test set: {reg.score(X_test, y_test)}')
print(f'Model error: {mean_squared_error(y_test, y_pred_linear)}')

sklearn score: 0.9991066310921242
sklearn coefficients (linear): [-0.0027997   1.37760394  0.00368597 -0.0017419   0.33217045  1.01779495
  1.38159824  1.01544405  0.33455298  0.35074258]

Predicted: 12.43049081556034   Actual: 12.63
Predicted: -8.543971099105637   Actual: -8.58
Predicted: 1.8065647174071948   Actual: 1.98
Predicted: -0.7486928677999032   Actual: -0.46
Predicted: -5.145471632596664   Actual: -4.87
Predicted: 6.764636193175818   Actual: 6.95
Predicted: -4.758245413814564   Actual: -4.65
Predicted: -10.963099656751453   Actual: -10.92
Predicted: 16.568048955455254   Actual: 16.61
Predicted: 3.222440576632188   Actual: 3.35
Predicted: 2.3750065580620436   Actual: 2.39
Predicted: -2.0929183237876394   Actual: -2.29
Predicted: 10.287537268757923   Actual: 10.31
Predicted: -2.352894625371654   Actual: -2.59
Predicted: -8.122736000095326   Actual: -8.23
Predicted: 2.2799343134813164   Actual: 2.49
Predicted: 2.556958671482221   Actual: 2.63
Predicted: 8.955758836572706   Actu

In [9]:
# with built linear regression
scaler = StandardScaler()
X_train_linear = scaler.fit_transform(X_train)
X_test_linear = scaler.transform(X_test)
print(X_train_linear.shape)

w_linear = utils.gradient_descent(X_train_linear, y_train, learning_rate=0.002, iters=20000)
print(f"output coefficients: {w_linear}\n")

y_pred_linear = np.matmul(X_test_linear, w_linear)
# print the first 20 predictions
for i in range(20):
    print(f'Predicted: {y_pred_linear[i]}   Actual: {y_test[i]}')

# print the error
print(f'\nmean squared error: {mean_squared_error(y_test, y_pred_linear)}')

(240, 10)
output coefficients: [-0.00769903  3.95904879  0.01059071 -0.00483846  0.98509411  2.95488417
  3.99198557  2.9985356   0.96788702  1.01914286]

Predicted: 11.568115815560814   Actual: 12.63
Predicted: -9.406346099111014   Actual: -8.58
Predicted: 0.9441897174059268   Actual: 1.98
Predicted: -1.611067867800137   Actual: -0.46
Predicted: -6.007846632605115   Actual: -4.87
Predicted: 5.902261193189823   Actual: 6.95
Predicted: -5.620620413824785   Actual: -4.65
Predicted: -11.825474656756931   Actual: -10.92
Predicted: 15.70567395544858   Actual: 16.61
Predicted: 2.360065576644908   Actual: 3.35
Predicted: 1.5126315580578371   Actual: 2.39
Predicted: -2.9552933237892054   Actual: -2.29
Predicted: 9.425162268742255   Actual: 10.31
Predicted: -3.2152696253706585   Actual: -2.59
Predicted: -8.985111000104522   Actual: -8.23
Predicted: 1.4175593134783764   Actual: 2.49
Predicted: 1.6945836714876759   Actual: 2.63
Predicted: 8.093383836565126   Actual: 8.61
Predicted: -9.65952371692

In [5]:
# Linear closed form solution
phi = np.column_stack((np.ones(len(X_train)), X_train))
w_closed = np.matmul(np.matmul(np.linalg.inv(np.matmul(np.transpose(phi), phi)), np.transpose(phi)), y_train)

print(f'coefficients: {w_closed}\n')

X_test_closed = np.column_stack((np.ones(len(X_test)), X_test))
y_pred_closed = np.matmul(X_test_closed, w_closed)

# print the first 20 predictions
for i in range(20):
    print(f'Predicted: {y_pred_closed[i]}   Actual: {y_test[i]}')

# print the error
print(f'\nmean squared error: {mean_squared_error(y_test, y_pred_closed)}')

coefficients: [-2.82039109e+01 -2.79970026e-03  1.37760394e+00  3.68597197e-03
 -1.74190380e-03  3.32170445e-01  1.01779495e+00  1.38159824e+00
  1.01544405e+00  3.34552976e-01  3.50742582e-01]

Predicted: 12.43049081556034   Actual: 12.63
Predicted: -8.543971099105642   Actual: -8.58
Predicted: 1.8065647174072148   Actual: 1.98
Predicted: -0.7486928677998819   Actual: -0.46
Predicted: -5.145471632596559   Actual: -4.87
Predicted: 6.764636193175712   Actual: 6.95
Predicted: -4.758245413814468   Actual: -4.65
Predicted: -10.963099656751396   Actual: -10.92
Predicted: 16.56804895545535   Actual: 16.61
Predicted: 3.2224405766320894   Actual: 3.35
Predicted: 2.3750065580621005   Actual: 2.39
Predicted: -2.0929183237876594   Actual: -2.29
Predicted: 10.287537268758108   Actual: 10.31
Predicted: -2.3528946253717122   Actual: -2.59
Predicted: -8.122736000095232   Actual: -8.23
Predicted: 2.2799343134813226   Actual: 2.49
Predicted: 2.556958671482166   Actual: 2.63
Predicted: 8.955758836572821

__Comparing__

The predictions for sklearn, my own gradient descent, and the closed form solution were all very similar, and all did very well against the true data points. However, my own gradient descent did significantly worse than the other two, with an error of about 0.84 while the others had errors under 0.05. Even so, most of the errors were small, as indicated by the mean squared error metric.

__Part 2 (Polynomial Regression)__

In [14]:
# Quadratic regression with sklearn
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_quad = poly.fit_transform(X_train)
X_test_quad = poly.transform(X_test)

reg_quad = LinearRegression().fit(X_train_quad, y_train)
print(f'sklearn score: {reg_quad.score(X_train_quad, y_train)}')
print(f'sklearn coefficients (linear): {reg_quad.coef_}\n')

# printing the first 20 test samples for quadratic
y_pred_quad = reg_quad.predict(X_test_quad)
for i in range(20):
    print(f'Predicted: {y_pred_quad[i]}   Actual: {y_test[i]}')

print(f'\nModel score on test set: {reg_quad.score(X_test_quad, y_test)}')
print(f'Model error: {mean_squared_error(y_test, y_pred_quad)}')

sklearn score: 0.9992785463086552
sklearn coefficients (linear): [-3.22318033e-02  1.34416635e+00  3.60219336e-02 -1.03705437e-02
  3.62683756e-01  1.00994413e+00  1.41199967e+00  1.01951626e+00
  3.17833189e-01  4.37396059e-01 -1.61484183e-03  1.12321079e-03
  1.65168531e-03  9.60938169e-04  3.54511656e-03  1.13275062e-04
  1.16923730e-03 -2.30558687e-04  3.14329788e-03 -1.94559398e-03
 -1.63249421e-04 -3.27788795e-04  3.73290074e-03  1.96245117e-03
 -1.92336384e-04  6.68916187e-04  1.22173367e-03  1.74503013e-03
 -3.42358407e-03 -4.77160522e-04 -1.38556312e-03 -2.25714922e-03
 -1.47455249e-03 -1.20087547e-03  2.55613954e-03 -1.21421290e-03
 -1.95522643e-03  3.57623497e-04 -3.52029139e-03 -1.22977370e-04
  1.00292542e-03  2.95379260e-04  1.57373147e-03 -1.64749029e-03
 -3.11773433e-03  5.08785432e-03 -3.87928936e-03 -1.64359573e-04
 -1.36537691e-03  7.08882662e-04  1.65713780e-03 -1.87475585e-03
 -1.72119467e-03  1.60379661e-04 -1.47694530e-03 -1.09146949e-03
 -9.73710405e-04  8.70616

In [32]:
# with built polynomial (quadratic) regression
poly = PolynomialFeatures(2)
X_train_quad = poly.fit_transform(X_train)
X_test_quad = poly.transform(X_test)

# I dont think I should have this many iterations... but the function wouldnt work with a larger learning rate
w_quad = utils.gradient_descent(X_train_quad, y_train, learning_rate=0.00001, iters=40000)
print(f"output coefficients: {w_quad}\n")

y_pred_quad = np.matmul(X_test_quad, w_quad)
# print the first 20 predictions
for i in range(20):
    print(f'Predicted: {y_pred_quad[i]}   Actual: {y_test[i]}')

# print the error (definitely overfitting here or something)
print(f'\nmean squared error: {mean_squared_error(y_test, y_pred_quad)}')

output coefficients: [-1.70643009e-01 -4.42092101e-01 -2.81700525e-01 -4.17437413e-01
 -4.12070514e-01 -3.69164976e-01 -3.38679793e-01 -3.25967176e-01
 -3.39568921e-01 -3.43887434e-01 -3.80073327e-01 -3.69938605e-02
  2.54041071e-02 -8.03661229e-03 -6.61919430e-04  4.08715217e-02
 -1.05181886e-02  2.13348033e-02 -3.60809548e-05  4.60668160e-02
  3.40922650e-02 -1.20906981e-02  4.68426361e-02  3.00032737e-02
  5.35806528e-02  4.23521782e-02  4.60732871e-02  4.18647457e-02
  3.07148278e-02  1.31845957e-02 -4.24344048e-02  1.78299453e-02
 -3.75666045e-04  1.85913793e-02  6.44748254e-03  2.15657892e-02
  8.16010840e-03  3.87882228e-02 -4.81139092e-02  1.42388461e-02
  2.85472389e-02  1.44754835e-02  3.40132285e-02  4.03184436e-03
  1.12138314e-02 -5.49700140e-02  5.24739513e-02  3.35817987e-02
  1.94330188e-02  1.09931576e-02  1.61340692e-03  7.13543764e-03
  5.29891754e-02  1.36228392e-02  4.00238482e-02  7.50130558e-03
  1.86277009e-02  5.58432150e-02  1.30573068e-02  3.66519825e-02
  2.

In [7]:
# Quadratic with closed form
phi_quad = np.column_stack((np.ones(len(X_train_quad)), X_train_quad))
w_closed_quad = np.matmul(np.matmul(np.linalg.inv(np.matmul(phi_quad.T, phi_quad)), phi_quad.T), y_train)

print(f'coefficients: {w_closed_quad}\n')

X_test_closed_quad = np.column_stack((np.ones(len(X_test_quad)), X_test_quad))
y_pred_closed_quad = np.matmul(X_test_closed_quad, w_closed_quad)

# print the first 20 predictions
for i in range(20):
    print(f'Predicted: {y_pred_closed_quad[i]}   Actual: {y_test[i]}')

# print the error
print(f'\nmean squared error: {mean_squared_error(y_test, y_pred_closed_quad)}')

coefficients: [-2.83946202e+01 -3.22318033e-02  1.34416635e+00  3.60219336e-02
 -1.03705437e-02  3.62683756e-01  1.00994413e+00  1.41199967e+00
  1.01951626e+00  3.17833189e-01  4.37396059e-01 -1.61484183e-03
  1.12321079e-03  1.65168531e-03  9.60938169e-04  3.54511656e-03
  1.13275062e-04  1.16923730e-03 -2.30558687e-04  3.14329788e-03
 -1.94559398e-03 -1.63249421e-04 -3.27788795e-04  3.73290074e-03
  1.96245117e-03 -1.92336384e-04  6.68916187e-04  1.22173367e-03
  1.74503013e-03 -3.42358407e-03 -4.77160522e-04 -1.38556312e-03
 -2.25714922e-03 -1.47455249e-03 -1.20087547e-03  2.55613954e-03
 -1.21421290e-03 -1.95522643e-03  3.57623497e-04 -3.52029139e-03
 -1.22977370e-04  1.00292542e-03  2.95379260e-04  1.57373147e-03
 -1.64749029e-03 -3.11773433e-03  5.08785432e-03 -3.87928936e-03
 -1.64359573e-04 -1.36537691e-03  7.08882662e-04  1.65713780e-03
 -1.87475585e-03 -1.72119467e-03  1.60379661e-04 -1.47694530e-03
 -1.09146949e-03 -9.73710405e-04  8.70616334e-04  5.53466476e-04
  8.0498353

__Comparing__

The predictions for sklearn and the closed form solution were all very similar, and all did very well against the true data points. However, my own gradient descent did not work, and I kept getting errors when running the gradient descent. Even so, most of the errors for the models that did work were small, as indicated by the mean squared error metric.