In [1]:
import numpy as np
%matplotlib widget
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import relu,linear
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

from assigment_utils import *
import logging
# Only let TensorFlow log records at ERROR level or above through.
logging.getLogger("tensorflow").setLevel(logging.ERROR)

# Make float64 the default float type used by Keras.
tf.keras.backend.set_floatx('float64')

# Set AutoGraph's logging verbosity to 0.
tf.autograph.set_verbosity(0)

## Evaluating a Learning Algorithm (Polynomial Regression)

### 1 Splitting your data set

In [2]:
def gen_data(m, seed=1, scale=0.7):
    """
    Build a noisy quadratic data set.

    The inputs are m evenly spaced points on [0, 49]. The noise-free target
    is x^2; each noisy target adds a perturbation proportional to the
    noise-free value, with a relative factor drawn uniformly from
    [-scale/2, scale/2).

    Args:
      m     : number of samples to generate
      seed  : value handed to np.random.seed; note that this reseeds the
              global NumPy random generator on every call
      scale : width of the relative noise band

    Returns:
      x_train : inputs
      y_train : noisy targets
      x_ideal : the very same array object as x_train (kept so plots can be
                redrawn when new data is included in X)
      y_ideal : noise-free targets
    """
    offset = 0
    inputs = np.linspace(0, 49, m)
    clean_targets = inputs**2 + offset

    # Seed immediately before drawing so the noise is reproducible per seed.
    np.random.seed(seed)
    jitter = np.random.sample((m,)) - 0.5
    noisy_targets = clean_targets + scale * clean_targets * jitter

    return inputs, noisy_targets, inputs, clean_targets


In [3]:
# Generate 18 samples with seed 2 and noise scale 0.7, then confirm the shapes.
X,y,x_ideal,y_ideal = gen_data(18,2,0.7)
print('X.shape', X.shape, 'y.shape',y.shape)


X.shape (18,) y.shape (18,)


In [4]:
# Split the data: test_size=0.33 holds out roughly a third of the samples for
# testing and the rest becomes training data. random_state=1 fixes the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=1)
print("X_train.shape", X_train.shape, "y_train.shape", y_train.shape)
print("X_test.shape", X_test.shape, "y_test.shape", y_test.shape)

X_train.shape (12,) y_train.shape (12,)
X_test.shape (6,) y_test.shape (6,)


In [5]:
# Draw the noise-free curve as a dashed reference line, then overlay the
# training and test samples so the split can be inspected visually.
fig, ax = plt.subplots(1,1,figsize=(4,4))
ax.plot(x_ideal, y_ideal, "--", color = "orangered", label="y_ideal", lw=1)
ax.set_title("Training, Test",fontsize = 14)
ax.set_xlabel("x")
ax.set_ylabel("y")

ax.scatter(X_train, y_train, color = "red",           label="train")
# NOTE(review): `dlc` is not defined in this notebook; presumably a colour
# lookup provided by the star import from assigment_utils -- confirm.
ax.scatter(X_test, y_test,   color = dlc["dlblue"],   label="test")
ax.legend(loc='upper left')
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### 2) Error calculation for model evaluation, linear regression

$$ J_\text{test}(\mathbf{w},b) = 
            \frac{1}{2m_\text{test}}\sum_{i=0}^{m_\text{test}-1} ( f_{\mathbf{w},b}(\mathbf{x}^{(i)}_\text{test}) - y^{(i)}_\text{test} )^2 
            \tag{1}
$$

In [7]:
def eval_mse(y, yhat):
    """
    Compute the halved mean squared error between targets and predictions:

        err = sum((yhat - y)**2) / (2 * m)

    which is the J_test cost of equation (1).

    Args:
      y    : (array_like, m values)  target values
      yhat : (array_like, m values)  predicted values

    Returns:
      err  : (scalar)  halved mean squared error over the m samples

    Raises:
      ValueError: if the inputs are empty or do not hold the same number of
                  values (a longer yhat used to be silently truncated).
    """
    # Flatten both inputs so that (m,) and (m,1) arrays pair up element by
    # element instead of broadcasting into an (m, m) matrix.
    targets = np.asarray(y, dtype=float).ravel()
    preds = np.asarray(yhat, dtype=float).ravel()

    m = targets.size
    if m == 0:
        raise ValueError("eval_mse requires at least one sample")
    if preds.size != m:
        raise ValueError(
            f"y and yhat must hold the same number of values, got {m} and {preds.size}"
        )

    residuals = preds - targets
    return np.sum(residuals ** 2) / (2 * m)

### 3) Compare performance on training and test data

In [8]:
# Polynomial degree to fit (high relative to the 12 training samples).
degree = 10
# Build the model for that degree and fit it on the training split only.
# NOTE(review): lin_model is not defined in this notebook; presumably it comes
# from the assigment_utils star import -- confirm what preprocessing it does.
model = lin_model(degree)
model.fit(X_train,y_train)

In [9]:
# Predict on the training data.
yhat = model.predict(X_train)
# Training error, computed with the model's own mse method rather than the
# eval_mse function defined in this notebook.
# NOTE(review): whether model.mse applies the same 1/(2m) scaling as
# equation (1) is not visible here -- confirm.
err_train = model.mse(y_train,yhat)

In [11]:
# Predict on the held-out test data (this overwrites the training predictions
# stored in yhat).
yhat = model.predict(X_test)
# Test error, computed with the same model.mse method as the training error.
err_test = model.mse(y_test,yhat)

In [12]:
# Report training and test error side by side, formatted to two decimals.
print(f"training err {err_train:0.2f}, test err {err_test:0.2f}")

training err 58.01, test err 171215.01


The errors above show that the model fits the training data well but does not fit the test data well: the test error is far higher than the training error.
This is a case of high variance (overfitting).

In [14]:
# Plot predictions over the data range.
# 100 evenly spaced inputs from 0 to int(X.max()).
x = np.linspace(0,int(X.max()),100)  # predict values for plot
# Reshape the predictions into a single column.
y_pred = model.predict(x).reshape(-1,1)

# NOTE(review): plt_train_test is not defined in this notebook; presumably a
# plotting helper from the assigment_utils star import -- confirm.
plt_train_test(X_train, y_train, X_test, y_test, x, y_pred, x_ideal, y_ideal, degree)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### 4) Splitting data into 3 subsets
If you use the test error to guide improvements in the model, then the model will perform well on the test data... but the test data was meant to represent *new* data.
You need yet another set of data to test new data performance, so the data is split into three subsets:

1. Training Data
2. Cross Validation Data
3. Test Data

In [15]:
# Generate a new data set: 40 samples, seed 5, noise scale 0.7.
# This rebinds X, y, x_ideal and y_ideal from the earlier 18-sample set.
X,y, x_ideal,y_ideal = gen_data(40, 5, 0.7)
print("X.shape", X.shape, "y.shape", y.shape)

X.shape (40,) y.shape (40,)


In [16]:

# Two-stage split with sklearn's train_test_split:
#   1) hold out 40% of the samples (X_, y_), leaving 60% for training;
#   2) halve the held-out part into cross-validation and test sets.
# random_state=1 makes both splits reproducible.
X_train, X_, y_train, y_ = train_test_split(X,y,test_size=0.40, random_state=1)
X_cv, X_test, y_cv, y_test = train_test_split(X_,y_,test_size=0.50, random_state=1)
print("X_train.shape", X_train.shape, "y_train.shape", y_train.shape)
print("X_cv.shape", X_cv.shape, "y_cv.shape", y_cv.shape)
print("X_test.shape", X_test.shape, "y_test.shape", y_test.shape)

X_train.shape (24,) y_train.shape (24,)
X_cv.shape (8,) y_cv.shape (8,)
X_test.shape (8,) y_test.shape (8,)


### 5) Finding the optimal degree

Here we will try polynomial degrees from 1 to 9 to fit the data.

In [21]:
# Highest polynomial degree to try; degrees 1..max_degree are fitted below.
max_degree = 9
# Training error per degree (index k holds the error for degree k+1).
err_train = np.zeros(max_degree)
# Cross-validation error per degree (same indexing).
err_cv = np.zeros(max_degree)

# 100 evenly spaced inputs used to draw each fitted curve.
x = np.linspace(0,int(X.max()),100)  
y_pred = np.zeros((100,max_degree))  #columns are lines to plot

for degree in range(max_degree):
    # `degree` is a zero-based index here (0..max_degree-1); the polynomial
    # degree actually fitted is degree+1, i.e. 1..max_degree.
    model = lin_model(degree+1)
    # Fit the model on the training data only.
    model.fit(X_train,y_train)
    # Predict on the training data.
    yhat = model.predict(X_train)
    # Training error for this degree.
    err_train[degree] = model.mse(y_train,yhat)
    
    # Predict on the cross-validation data.
    yhat = model.predict(X_cv)
    # Cross-validation error for this degree.
    err_cv[degree] = model.mse(y_cv,yhat)
    
    # Store this degree's fitted curve as one column for plotting.
    y_pred[:,degree] = model.predict(x)
# argmin gives the zero-based index of the lowest CV error; add 1 to turn it
# back into a polynomial degree.
optimal_degree = np.argmin(err_cv) + 1
    

In [22]:
# Close every open figure before drawing the new one.
plt.close("all")
# NOTE(review): plt_optimal_degree is not defined in this notebook; presumably
# a plotting helper from the assigment_utils star import -- confirm.
plt_optimal_degree(X_train, y_train, X_cv, y_cv, x, y_pred, x_ideal, y_ideal, 
                   err_train, err_cv, optimal_degree, max_degree)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …