Importing required libraries

In [75]:
import pickle
import random
import matplotlib.pyplot as plt
import tabulate
import numpy
import sklearn.preprocessing
import sklearn.linear_model

Loading data from train and test files

In [76]:
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Same load order as before: test data first, then training data.
test = _read_pickle('./data/test.pkl')
train = _read_pickle('./data/train.pkl')

Declaring list variables to store the data

In [77]:
# One independent, initially empty accumulator per reported metric.
bias_list, bias_square_list, mse_list, err_list, var_list = ([] for _ in range(5))

Shuffling fetched data and splitting into 16 datasets

In [78]:
# Shuffle the training rows in place, then split them into 16 subsets.
#
# The standard-library random.shuffle swaps items with
# `x[i], x[j] = x[j], x[i]`. If `train` is a 2-D NumPy array (which the
# hsplit calls further down suggest -- TODO confirm), those row reads are
# views, so the swap duplicates some rows and silently drops others.
# NumPy's Generator.shuffle permutes along the first axis without that
# problem, and also accepts plain mutable sequences.
#
# The fixed seed makes the 16 subsets reproducible under Restart & Run All.
numpy.random.default_rng(42).shuffle(train)
m= numpy.array_split(train,16)

Calculating the bias, variance, error and MSE for each polynomial degree after finding the predicted values for each model

In [79]:
# For each polynomial degree 1..15: fit one linear regression per training
# subset in `m`, predict on the test set, and record the averaged bias,
# squared bias, variance, MSE, and the residual MSE - (bias^2 + variance).

# Empty the accumulators first so that re-running this cell does not append a
# second batch of 15 entries (the table and plots expect exactly one per degree).
for results in (bias_list, bias_square_list, mse_list, err_list, var_list):
  results.clear()

# The test split does not depend on the degree or the training subset, so it
# is computed once. hsplit(..., 2) cuts the columns into two equal halves; the
# first half is used as model input and the second as the target.
inp_test, y_test = numpy.hsplit(test,2)

for i in range(1,16,1):
  polynomial = sklearn.preprocessing.PolynomialFeatures(i)
  # Test features depend only on the degree: expand them once per degree
  # instead of once per training subset.
  deg_test = polynomial.fit_transform(inp_test)

  # One prediction array per training subset, all on the same test inputs.
  predicted_values_list = []
  for subset in m:
    inp_train, y_train = numpy.hsplit(subset,2)
    reg = sklearn.linear_model.LinearRegression().fit(polynomial.fit_transform(inp_train), y_train)
    predicted_values_list.append(reg.predict(deg_test))

  # Average prediction across the fitted models, per test point.
  mean_prediction = numpy.mean(predicted_values_list, axis = 0)

  bias = abs(mean_prediction - y_test)
  bias_avg = numpy.mean(bias)
  bias_square_avg = numpy.mean(numpy.square(mean_prediction - y_test))

  # Spread of the models' predictions per test point, then averaged.
  variance = numpy.var(predicted_values_list, axis = 0)
  variance_avg = numpy.mean(variance)

  # Squared error of every model on every test point, averaged over models
  # first and then over test points.
  mse = numpy.mean((numpy.square(predicted_values_list - y_test)), axis=0)
  mse_avg = numpy.mean(mse)

  # Whatever part of the MSE is not accounted for by bias^2 + variance.
  error = mse_avg - (bias_square_avg + variance_avg)

  bias_list.append(bias_avg)
  bias_square_list.append(bias_square_avg)
  var_list.append(variance_avg)
  mse_list.append(mse_avg)
  err_list.append(error)

Printing Bias, Bias Square, Variance, Irreducible Error and MSE for each model.

In [None]:
# Heading/values pairs, in the order they are reported.
summary_sections = (
    ("Bias:", bias_list),
    ("Bias Square:", bias_square_list),
    ("Variance:", var_list),
    ("Mean Square Error or Total Error:", mse_list),
    ("Irreducible Error:", err_list),
)

for position, (heading, values) in enumerate(summary_sections):
    # A blank line separates consecutive sections (none before the first).
    if position > 0:
        print()
    print(heading)
    print(values)

Tabulating the data

In [None]:
# Degree labels 1..15, one per entry in the result lists.
index_for_deg = list(range(1, 16))

column_headers = ["Degree", "Bias", "Bias square", "Variance", "Mean Square Error", "Irreducible Error"]
metric_rows = [index_for_deg, bias_list, bias_square_list, var_list, mse_list, err_list]

# Each metric is one row of `table`; transpose so each degree becomes a row.
table = numpy.array(metric_rows)
print(tabulate.tabulate(numpy.transpose(table), headers=column_headers))

Plotting Bias<sup>2</sup> vs Variance Graph

In [None]:
f1 = plt.figure(1)

# Draw each metric against the polynomial degree (1..15) on the same axes.
degrees = range(1,16)
curves = (
    (bias_square_list, 'blue', 'bias_square'),
    (var_list, 'red', 'variance'),
    (mse_list, 'green', 'Error'),
)
for series, colour, legend_name in curves:
    plt.plot(degrees, series, colour, label=legend_name)

plt.title('Graph')
plt.xlabel('Model Complexity')
plt.ylabel('Error')
plt.legend()
plt.show()