<a href="https://colab.research.google.com/github/zach-gousseau/syde675-project/blob/main/models/675_Project_Polar_Sea_Ice_Prediction_mML_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive; the data and score-file paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Preprocessing


## Polar Sea Ice Prediction

In [3]:
pip install attention

Collecting attention
  Downloading attention-4.1-py3-none-any.whl (8.6 kB)
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 7.1 MB/s 
Installing collected packages: tf-estimator-nightly, attention
Successfully installed attention-4.1 tf-estimator-nightly-2.8.0.dev2021122109


## Initial Setup

In [87]:
import os
import math
import glob
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from keras.models import Sequential
# import tensorflow.keras
from tensorflow.keras.optimizers import Adam
from attention import Attention
from keras.layers import Dense, Dropout
from keras.layers import LSTM,TimeDistributed
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping, ModelCheckpoint
from numpy.random import seed
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

import os
import csv

# Seed NumPy's global random generator; the same number is logged with every score row.
seed_number = 2
np.random.seed(seed_number)

## Loading Combined Data 1979-2018

Features:
'wind_10m', 'specific_humidity', 'LW_down', 'SW_down', 'rainfall', 'snowfall', 'sosaline', 'sst', 't2m', 'surface_pressure','sea_ice_extent'




In [88]:
# Drive folder holding the monthly feature and target arrays.
path = '/content/drive/MyDrive/SYDE 675/Project/Data'
data_path = f'{path}/monthly_features.npy'
target_path = f'{path}/monthly_target.npy'

In [89]:
# Load the monthly feature and target arrays from the Drive paths defined above
# (an earlier version read them from a local '/.../' folder instead).
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects, which can
# execute code from an untrusted file. Presumably these are plain numeric arrays, in which
# case allow_pickle=False would be enough -- confirm before changing.

data = np.load(data_path, allow_pickle=True)
target = np.load(target_path, allow_pickle=True)

### Adding a Lag to Y values
Here lag = 1 month


In [90]:
# Pair the features of month t with the target of month t + lag.
# NOTE: `data` and `target` are reassigned in place, so running this cell a second
# time trims (and shifts) them again.
lag = 1
data, target = data[:-lag, :, :], target[lag:]

print(data.shape, target.shape, sep='\n')


(479, 1, 11)
(479,)


## Train Validation Split

The feature array is stored in the layout an LSTM network expects: [samples, time steps, features]. The features and the target were loaded above from the `.npy` files into separate NumPy arrays. The split below is chronological: the last 60 months are held out for testing and all earlier months are used for training.

In [91]:
# Chronological train/test split: the most recent months are held out for testing,
# so every model is evaluated on data that comes after its training period.

LEN_DATA = len(data)  # number of monthly samples (first axis of `data`)

NUM_TEST = 60  # hold out the last 60 months for testing
NUM_TRAIN = LEN_DATA - NUM_TEST

assert 0 < NUM_TRAIN < LEN_DATA, 'NUM_TEST must leave at least one training sample'
assert len(target) == LEN_DATA, 'features and targets must have the same number of samples'

print('LEN_DATA:',LEN_DATA)
print('NUM_TRAIN:',NUM_TRAIN)
print('NUM_TEST:',NUM_TEST)

# Features
x_train = data[:NUM_TRAIN]
x_test = data[NUM_TRAIN:]

# Targets (`target` is a separate array aligned sample-by-sample with `data`)
y_train = target[:NUM_TRAIN]
y_test = target[NUM_TRAIN:]


LEN_DATA: 479
NUM_TRAIN: 419
NUM_TEST: 60


In [92]:
# Report the array shapes produced by the split.
print(
    f'x_train.shape: {x_train.shape}',
    f'y_train.shape: {y_train.shape}',
    f'x_test.shape: {x_test.shape}',
    f'y_test.shape: {y_test.shape}',
    sep='\n',
)

x_train.shape: (419, 1, 11)
y_train.shape: (419,)
x_test.shape: (60, 1, 11)
y_test.shape: (60,)


## Reshaping Input and Target Features

In [93]:
# # convert an array of values into a dataset matrix
# def reshape_features(dataset, timesteps=1):
#     print(dataset.shape)
#     X = dataset.reshape((int(dataset.shape[0]/timesteps)), dataset.shape[2])
#     return X

## Normalization


In [94]:
# Standardize features and target with statistics fitted on the training period only;
# the test period is transformed with those same statistics.
#
# The scaled arrays are rebuilt from `data` / `target` instead of from x_train / y_train
# themselves, so re-running this cell gives the same result rather than refitting the
# scalers on already-scaled values (which would break scaler_l.inverse_transform later).

n_features = data.shape[-1]  # inferred from the data instead of hard-coding 11

scaler_f = StandardScaler()
x_train = scaler_f.fit_transform(data[:NUM_TRAIN].reshape(-1, n_features))  # 2-D for the scaler
x_test = scaler_f.transform(data[NUM_TRAIN:].reshape(-1, n_features))

scaler_l = StandardScaler()
y_train = scaler_l.fit_transform(target[:NUM_TRAIN].reshape(-1, 1))  # single column for the scaler
y_test = scaler_l.transform(target[NUM_TRAIN:].reshape(-1, 1))


In [95]:
# #Reshaping data to 3D for modeling
# timesteps = 1
# x_train = reshape_features(x_train, timesteps) # reshaping to 3d for model
# x_test = reshape_features(x_test, timesteps) # reshaping to 3d for model


In [96]:
# Shapes after scaling: features and targets are now 2-D.
print(
    f'x_train.shape: {x_train.shape}',
    f'y_train.shape: {y_train.shape}',
    f'x_test.shape: {x_test.shape}',
    f'y_test.shape: {y_test.shape}',
    sep='\n',
)

x_train.shape: (419, 11)
y_train.shape: (419, 1)
x_test.shape: (60, 11)
y_test.shape: (60, 1)


In [97]:
# `extent` is a second name for the lagged target array (same object, not a copy).
extent = target
print(extent.shape)

(479,)


# Linear Regression

In [98]:
# Ordinary least-squares baseline on the standardized features.
model = LinearRegression()
model.fit(X=x_train, y=y_train)

LinearRegression()

## Model Predictions

In [99]:
# Predictions are still in standardized target units at this point.
trainPred, testPred = (model.predict(features) for features in (x_train, x_test))

In [100]:
# Test predictions first, then train predictions.
print(testPred.shape, trainPred.shape, sep='\n')

(60, 1)
(419, 1)


In [101]:
# #Reverting data back to 2D from 3D
# x_test_t = x_test.reshape((x_test.shape[0], x_test.shape[2]))
# print(x_test_t.shape)
# print(testPred.shape)

In [102]:
# Map the scaled predictions and the scaled test targets back to the original
# target units, then print the second test sample of each as a quick check.
inv_testPred = scaler_l.inverse_transform(testPred)
inv_y_test = scaler_l.inverse_transform(y_test)

print(inv_testPred[1], inv_y_test[1], sep='\n')


[14913891.82438048]
[14545530.]


In [103]:
# Test RMSE in the original (inverse-transformed) target units.
from math import sqrt
from sklearn.metrics import mean_squared_error

rmse = sqrt(mean_squared_error(y_true=inv_y_test, y_pred=inv_testPred))
print('Test RMSE: %.3f' % (rmse,))

Test RMSE: 519659.688


In [104]:
# Normalized RMSE: the RMSE divided by the mean of the observed test targets.
# y_min / y_max are computed but do not enter the formula.
y_min = inv_y_test.min()
y_max = inv_y_test.max()
nrmse = rmse / inv_y_test.mean()
print('Test NRMSE:', nrmse)

Test NRMSE: 0.04853880787557791


In [105]:
# Mean of the inverse-transformed test targets (the denominator used for the NRMSE).
inv_y_test.mean()

10706066.15

In [106]:
# Coefficient of determination on the test period, in original target units.
from math import sqrt
from sklearn.metrics import r2_score

r_sq = r2_score(y_true=inv_y_test, y_pred=inv_testPred)
print('Test R_Square: %.4f' % (r_sq,))

Test R_Square: 0.9749


## Write scores in csv files

In [107]:


# Refit the model and append one row of test scores per repetition.
# Relies on `sqrt`, `mean_squared_error` and `r2_score` imported by the metric cells above.
# NOTE(review): LinearRegression has no random component, so every repetition
# produces the same row.

file_path = '/content/drive/MyDrive/SYDE 675/Project/Models/m-ML_models/linear_reg_scores.csv'
header = ['seed', 'Test RMSE', 'Test normalized RMSE', 'R_Square']

# Make sure the output folder exists before the first write.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# The ground truth does not depend on the fitted model: invert its scaling once.
inv_y_test = scaler_l.inverse_transform(y_test)

for i in range(2):

  model = LinearRegression()
  model.fit(x_train, y_train)

  trainPred = model.predict(x_train)
  testPred = model.predict(x_test)

  # invert scaling for forecasted values
  inv_testPred = scaler_l.inverse_transform(testPred)

  rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
  nrmse = rmse / inv_y_test.mean()  # RMSE relative to the mean observed value
  r_sq = r2_score(inv_y_test, inv_testPred)

  # Write the header only when starting a new (or empty) file. newline='' is what the
  # csv module requires of the file object; it prevents blank lines on some platforms.
  needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
  with open(file_path, 'a', newline='', encoding='UTF8') as f:
    writer = csv.writer(f)
    if needs_header:
      writer.writerow(header)
    writer.writerow([seed_number, rmse, nrmse, r_sq])

In [108]:
# Summary statistics over every row stored in the score file. Rows are appended on each
# run of the loop cell, so the file may also contain rows from earlier runs.
results = pd.read_csv(file_path)
results.describe()

Unnamed: 0,seed,Test RMSE,Test normalized RMSE,R_Square
count,4.0,4.0,4.0,4.0
mean,1.5,519659.687958,0.048539,0.974918
std,0.57735,0.0,0.0,0.0
min,1.0,519659.687958,0.048539,0.974918
25%,1.0,519659.687958,0.048539,0.974918
50%,1.5,519659.687958,0.048539,0.974918
75%,2.0,519659.687958,0.048539,0.974918
max,2.0,519659.687958,0.048539,0.974918


# Decision Trees

In [109]:
# Single regression tree with default hyperparameters.
model = DecisionTreeRegressor()
model.fit(X=x_train, y=y_train)

DecisionTreeRegressor()

## Model Predictions

In [110]:
# Predictions are still in standardized target units at this point.
trainPred, testPred = (model.predict(features) for features in (x_train, x_test))

In [111]:
# Test predictions first, then train predictions.
print(testPred.shape, trainPred.shape, sep='\n')

(60,)
(419,)


In [112]:
# Turn the predictions into single-column 2-D arrays, the shape scaler_l was fitted on.
trainPred = np.reshape(trainPred, (-1, 1))
testPred = np.reshape(testPred, (-1, 1))

In [113]:
# Map the scaled predictions and the scaled test targets back to the original
# target units, then print the second test sample of each as a quick check.
inv_testPred = scaler_l.inverse_transform(testPred)
inv_y_test = scaler_l.inverse_transform(y_test)

print(inv_testPred[1], inv_y_test[1], sep='\n')


[15068129.]
[14545530.]


In [114]:
# Test RMSE in the original (inverse-transformed) target units.
from math import sqrt
from sklearn.metrics import mean_squared_error

rmse = sqrt(mean_squared_error(y_true=inv_y_test, y_pred=inv_testPred))
print('Test RMSE: %.3f' % (rmse,))

Test RMSE: 561535.789


In [115]:
# Normalized RMSE: the RMSE divided by the mean of the observed test targets.
# y_min / y_max are computed but do not enter the formula.
y_min = inv_y_test.min()
y_max = inv_y_test.max()
nrmse = rmse / inv_y_test.mean()
print('Test NRMSE:', nrmse)

Test NRMSE: 0.05245024463376686


In [116]:
# Mean of the inverse-transformed test targets (the denominator used for the NRMSE).
inv_y_test.mean()

10706066.15

In [117]:
# Coefficient of determination on the test period, in original target units.
from math import sqrt
from sklearn.metrics import r2_score

r_sq = r2_score(y_true=inv_y_test, y_pred=inv_testPred)
print('Test R_Square: %.4f' % (r_sq,))

Test R_Square: 0.9707


## Write scores in csv files

In [118]:


# Refit the tree several times and append one row of test scores per fit.
# Relies on `sqrt`, `mean_squared_error` and `r2_score` imported by the metric cells above.
# NOTE(review): the logged `seed` is the global NumPy seed set at the top of the notebook;
# the estimator itself is created without random_state.

file_path = '/content/drive/MyDrive/SYDE 675/Project/Models/m-ML_models/dt_scores.csv'
header = ['seed', 'Test RMSE', 'Test normalized RMSE', 'R_Square']

# Make sure the output folder exists before the first write.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# The ground truth does not depend on the fitted model: invert its scaling once.
inv_y_test = scaler_l.inverse_transform(y_test)

for i in range(10):

  model = DecisionTreeRegressor()
  model.fit(x_train, y_train)

  # reshape the predictions into a single column for the scaler
  trainPred = model.predict(x_train).reshape(-1, 1)
  testPred = model.predict(x_test).reshape(-1, 1)

  # invert scaling for forecasted values
  inv_testPred = scaler_l.inverse_transform(testPred)

  rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
  nrmse = rmse / inv_y_test.mean()  # RMSE relative to the mean observed value
  r_sq = r2_score(inv_y_test, inv_testPred)

  # Write the header only when starting a new (or empty) file. newline='' is what the
  # csv module requires of the file object; it prevents blank lines on some platforms.
  needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
  with open(file_path, 'a', newline='', encoding='UTF8') as f:
    writer = csv.writer(f)
    if needs_header:
      writer.writerow(header)
    writer.writerow([seed_number, rmse, nrmse, r_sq])

In [119]:
# Summary statistics over every row stored in the score file. Rows are appended on each
# run of the loop cell, so the file may also contain rows from earlier runs.
results = pd.read_csv(file_path)
results.describe()

Unnamed: 0,seed,Test RMSE,Test normalized RMSE,R_Square
count,20.0,20.0,20.0,20.0
mean,1.5,596290.126923,0.055696,0.966827
std,0.512989,40941.042154,0.003824,0.004414
min,1.0,507012.168464,0.047357,0.960586
25%,1.0,572302.255097,0.053456,0.963795
50%,1.5,614522.941218,0.0574,0.964924
75%,2.0,624332.858259,0.058316,0.96957
max,2.0,651418.380581,0.060846,0.976124


# Random Forest

In [120]:
from sklearn.ensemble import RandomForestRegressor

In [121]:
# y_train = y_train.reshape(-1)

In [122]:
# Random forest with default hyperparameters.
# y_train is an (n, 1) column; ravel() passes the 1-D target scikit-learn expects here
# and avoids the DataConversionWarning raised for column-vector targets.
model = RandomForestRegressor()
model.fit(x_train, y_train.ravel())

  


RandomForestRegressor()

## Model Predictions

In [123]:
# Predictions are still in standardized target units at this point.
trainPred, testPred = (model.predict(features) for features in (x_train, x_test))

In [124]:
# Test predictions first, then train predictions.
print(testPred.shape, trainPred.shape, sep='\n')

(60,)
(419,)


In [125]:
# Turn the predictions into single-column 2-D arrays, the shape scaler_l was fitted on.
trainPred = np.reshape(trainPred, (-1, 1))
testPred = np.reshape(testPred, (-1, 1))

In [126]:
# Map the scaled predictions and the scaled test targets back to the original
# target units, then print the second test sample of each as a quick check.
inv_testPred = scaler_l.inverse_transform(testPred)
inv_y_test = scaler_l.inverse_transform(y_test)

print(inv_testPred[1], inv_y_test[1], sep='\n')


[14729127.73]
[14545530.]


In [127]:
# Test RMSE in the original (inverse-transformed) target units.
from math import sqrt
from sklearn.metrics import mean_squared_error

rmse = sqrt(mean_squared_error(y_true=inv_y_test, y_pred=inv_testPred))
print('Test RMSE: %.3f' % (rmse,))

Test RMSE: 468143.453


In [128]:
# Normalized RMSE: the RMSE divided by the mean of the observed test targets.
# y_min / y_max are computed but do not enter the formula.
y_min = inv_y_test.min()
y_max = inv_y_test.max()
nrmse = rmse / inv_y_test.mean()
print('Test NRMSE:', nrmse)

Test NRMSE: 0.043726934459623126


In [129]:
# Mean of the inverse-transformed test targets (the denominator used for the NRMSE).
inv_y_test.mean()

10706066.15

In [130]:
# Coefficient of determination on the test period, in original target units.
from math import sqrt
from sklearn.metrics import r2_score

r_sq = r2_score(y_true=inv_y_test, y_pred=inv_testPred)
print('Test R_Square: %.4f' % (r_sq,))

Test R_Square: 0.9796


## Write scores in csv files

In [131]:

# Refit the forest several times and append one row of test scores per fit.
# Relies on `sqrt`, `mean_squared_error` and `r2_score` imported by the metric cells above.
# NOTE(review): the logged `seed` is the global NumPy seed set at the top of the notebook;
# the estimator itself is created without random_state.

file_path = '/content/drive/MyDrive/SYDE 675/Project/Models/m-ML_models/rf_scores.csv'
header = ['seed', 'Test RMSE', 'Test normalized RMSE', 'R_Square']

# Make sure the output folder exists before the first write.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# The ground truth does not depend on the fitted model: invert its scaling once.
inv_y_test = scaler_l.inverse_transform(y_test)

for i in range(10):

  model = RandomForestRegressor()
  # ravel(): pass a 1-D target so scikit-learn does not warn about the (n, 1) column
  model.fit(x_train, y_train.ravel())

  # reshape the predictions into a single column for the scaler
  trainPred = model.predict(x_train).reshape(-1, 1)
  testPred = model.predict(x_test).reshape(-1, 1)

  # invert scaling for forecasted values
  inv_testPred = scaler_l.inverse_transform(testPred)

  rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
  nrmse = rmse / inv_y_test.mean()  # RMSE relative to the mean observed value
  r_sq = r2_score(inv_y_test, inv_testPred)

  # Write the header only when starting a new (or empty) file. newline='' is what the
  # csv module requires of the file object; it prevents blank lines on some platforms.
  needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
  with open(file_path, 'a', newline='', encoding='UTF8') as f:
    writer = csv.writer(f)
    if needs_header:
      writer.writerow(header)
    writer.writerow([seed_number, rmse, nrmse, r_sq])

  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys


In [132]:
# Summary statistics over every row stored in the score file. Rows are appended on each
# run of the loop cell, so the file may also contain rows from earlier runs.
results = pd.read_csv(file_path)
results.describe()

Unnamed: 0,seed,Test RMSE,Test normalized RMSE,R_Square
count,20.0,20.0,20.0,20.0
mean,1.5,454531.921059,0.042456,0.980808
std,0.512989,5685.518273,0.000531,0.000481
min,1.0,444400.341088,0.041509,0.979966
25%,1.0,451013.911734,0.042127,0.980361
50%,1.5,453336.751566,0.042344,0.980911
75%,2.0,459820.778017,0.04295,0.981107
max,2.0,464432.948052,0.04338,0.981657


# Gradient Boosting Regressor

In [133]:
from sklearn.ensemble import GradientBoostingRegressor

In [134]:
# y_train = y_train.reshape(-1)

In [135]:
# Gradient-boosted trees with default hyperparameters.
# y_train is an (n, 1) column; ravel() passes the 1-D target scikit-learn expects here
# and avoids the DataConversionWarning raised for column-vector targets.
model = GradientBoostingRegressor()
model.fit(x_train, y_train.ravel())

  y = column_or_1d(y, warn=True)


GradientBoostingRegressor()

## Model Predictions

In [136]:
# Predictions are still in standardized target units at this point.
trainPred, testPred = (model.predict(features) for features in (x_train, x_test))

In [137]:
# Test predictions first, then train predictions.
print(testPred.shape, trainPred.shape, sep='\n')

(60,)
(419,)


In [138]:
# Turn the predictions into single-column 2-D arrays, the shape scaler_l was fitted on.
trainPred = np.reshape(trainPred, (-1, 1))
testPred = np.reshape(testPred, (-1, 1))

In [139]:
# Map the scaled predictions and the scaled test targets back to the original
# target units, then print the second test sample of each as a quick check.
inv_testPred = scaler_l.inverse_transform(testPred)
inv_y_test = scaler_l.inverse_transform(y_test)

print(inv_testPred[1], inv_y_test[1], sep='\n')


[14775040.95521954]
[14545530.]


In [140]:
# Test RMSE in the original (inverse-transformed) target units.
from math import sqrt
from sklearn.metrics import mean_squared_error

rmse = sqrt(mean_squared_error(y_true=inv_y_test, y_pred=inv_testPred))
print('Test RMSE: %.3f' % (rmse,))

Test RMSE: 519335.844


In [141]:
# Normalized RMSE: the RMSE divided by the mean of the observed test targets.
# y_min / y_max are computed but do not enter the formula.
y_min = inv_y_test.min()
y_max = inv_y_test.max()
nrmse = rmse / inv_y_test.mean()
print('Test NRMSE:', nrmse)

Test NRMSE: 0.04850855922600896


In [142]:
# Mean of the inverse-transformed test targets (the denominator used for the NRMSE).
inv_y_test.mean()

10706066.15

In [143]:
# Coefficient of determination on the test period, in original target units.
from math import sqrt
from sklearn.metrics import r2_score

r_sq = r2_score(y_true=inv_y_test, y_pred=inv_testPred)
print('Test R_Square: %.4f' % (r_sq,))

Test R_Square: 0.9749


## Write scores in csv files

In [144]:

# Refit the boosted model several times and append one row of test scores per fit.
# Relies on `sqrt`, `mean_squared_error` and `r2_score` imported by the metric cells above.
# NOTE(review): the logged `seed` is the global NumPy seed set at the top of the notebook;
# the estimator itself is created without random_state.

file_path = '/content/drive/MyDrive/SYDE 675/Project/Models/m-ML_models/boost_scores.csv'
header = ['seed', 'Test RMSE', 'Test normalized RMSE', 'R_Square']

# Make sure the output folder exists before the first write.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# The ground truth does not depend on the fitted model: invert its scaling once.
inv_y_test = scaler_l.inverse_transform(y_test)

for i in range(10):

  model = GradientBoostingRegressor()
  # ravel(): pass a 1-D target so scikit-learn does not warn about the (n, 1) column
  model.fit(x_train, y_train.ravel())

  # reshape the predictions into a single column for the scaler
  trainPred = model.predict(x_train).reshape(-1, 1)
  testPred = model.predict(x_test).reshape(-1, 1)

  # invert scaling for forecasted values
  inv_testPred = scaler_l.inverse_transform(testPred)

  rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
  nrmse = rmse / inv_y_test.mean()  # RMSE relative to the mean observed value
  r_sq = r2_score(inv_y_test, inv_testPred)

  # Write the header only when starting a new (or empty) file. newline='' is what the
  # csv module requires of the file object; it prevents blank lines on some platforms.
  needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
  with open(file_path, 'a', newline='', encoding='UTF8') as f:
    writer = csv.writer(f)
    if needs_header:
      writer.writerow(header)
    writer.writerow([seed_number, rmse, nrmse, r_sq])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [145]:
# Summary statistics over every row stored in the score file. Rows are appended on each
# run of the loop cell, so the file may also contain rows from earlier runs.
results = pd.read_csv(file_path)
results.describe()

Unnamed: 0,seed,Test RMSE,Test normalized RMSE,R_Square
count,20.0,20.0,20.0,20.0
mean,1.5,519832.217114,0.048555,0.974901
std,0.512989,1924.986891,0.00018,0.000186
min,1.0,515505.461543,0.048151,0.974614
25%,1.0,518922.586914,0.04847,0.974788
50%,1.5,519787.274585,0.048551,0.974905
75%,2.0,521004.656793,0.048664,0.974989
max,2.0,522797.759673,0.048832,0.975317


# XGBoost

In [146]:
from xgboost import XGBRegressor

In [147]:
# y_train = y_train.reshape(-1)

In [148]:
# XGBoost regressor with a squared-error objective; other settings are left at their defaults.
model = XGBRegressor(objective="reg:squarederror")
model.fit(X=x_train, y=y_train)

XGBRegressor(objective='reg:squarederror')

## Model Predictions

In [149]:
# Predictions are still in standardized target units at this point.
trainPred, testPred = (model.predict(features) for features in (x_train, x_test))

In [150]:
# Test predictions first, then train predictions.
print(testPred.shape, trainPred.shape, sep='\n')

(60,)
(419,)


In [151]:
# Turn the predictions into single-column 2-D arrays, the shape scaler_l was fitted on.
trainPred = np.reshape(trainPred, (-1, 1))
testPred = np.reshape(testPred, (-1, 1))

In [152]:
# Map the scaled predictions and the scaled test targets back to the original
# target units, then print the second test sample of each as a quick check.
inv_testPred = scaler_l.inverse_transform(testPred)
inv_y_test = scaler_l.inverse_transform(y_test)

print(inv_testPred[1], inv_y_test[1], sep='\n')


[14815781.]
[14545530.]


In [153]:
# Test RMSE in the original (inverse-transformed) target units.
from math import sqrt
from sklearn.metrics import mean_squared_error

rmse = sqrt(mean_squared_error(y_true=inv_y_test, y_pred=inv_testPred))
print('Test RMSE: %.3f' % (rmse,))

Test RMSE: 476411.340


In [154]:
# Normalized RMSE: the RMSE divided by the mean of the observed test targets.
# y_min / y_max are computed but do not enter the formula.
y_min = inv_y_test.min()
y_max = inv_y_test.max()
nrmse = rmse / inv_y_test.mean()
print('Test NRMSE:', nrmse)

Test NRMSE: 0.04449919640369534


In [155]:
# Mean of the inverse-transformed test targets (the denominator used for the NRMSE).
inv_y_test.mean()

10706066.15

In [156]:
# Coefficient of determination on the test period, in original target units.
from math import sqrt
from sklearn.metrics import r2_score

r_sq = r2_score(y_true=inv_y_test, y_pred=inv_testPred)
print('Test R_Square: %.4f' % (r_sq,))

Test R_Square: 0.9789


## Write scores in csv files

In [157]:

# Refit the XGBoost model several times and append one row of test scores per fit.
# Relies on `sqrt`, `mean_squared_error` and `r2_score` imported by the metric cells above.
# NOTE(review): the stored scores show essentially zero spread, i.e. repeated fits with
# these settings appear to give the same result -- confirm whether repetitions are needed.

file_path = '/content/drive/MyDrive/SYDE 675/Project/Models/m-ML_models/xgboost_scores.csv'
header = ['seed', 'Test RMSE', 'Test normalized RMSE', 'R_Square']

# Make sure the output folder exists before the first write.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# The ground truth does not depend on the fitted model: invert its scaling once.
inv_y_test = scaler_l.inverse_transform(y_test)

for i in range(10):

  model = XGBRegressor(objective ='reg:squarederror')
  model.fit(x_train, y_train)

  # reshape the predictions into a single column for the scaler
  trainPred = model.predict(x_train).reshape(-1, 1)
  testPred = model.predict(x_test).reshape(-1, 1)

  # invert scaling for forecasted values
  inv_testPred = scaler_l.inverse_transform(testPred)

  rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
  nrmse = rmse / inv_y_test.mean()  # RMSE relative to the mean observed value
  r_sq = r2_score(inv_y_test, inv_testPred)

  # Write the header only when starting a new (or empty) file. newline='' is what the
  # csv module requires of the file object; it prevents blank lines on some platforms.
  needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
  with open(file_path, 'a', newline='', encoding='UTF8') as f:
    writer = csv.writer(f)
    if needs_header:
      writer.writerow(header)
    writer.writerow([seed_number, rmse, nrmse, r_sq])

In [158]:
# Summary statistics over every row stored in the score file. Rows are appended on each
# run of the loop cell, so the file may also contain rows from earlier runs.
results = pd.read_csv(file_path)
results.describe()

Unnamed: 0,seed,Test RMSE,Test normalized RMSE,R_Square
count,20.0,20.0,20.0,20.0
mean,1.5,476411.3,0.0444992,0.9789188
std,0.512989,1.791594e-10,7.119155e-18,2.27813e-16
min,1.0,476411.3,0.0444992,0.9789188
25%,1.0,476411.3,0.0444992,0.9789188
50%,1.5,476411.3,0.0444992,0.9789188
75%,2.0,476411.3,0.0444992,0.9789188
max,2.0,476411.3,0.0444992,0.9789188


# Polynomial Regression

In [159]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial expansion of the features followed by ordinary least squares.
degree = 2
model = make_pipeline(PolynomialFeatures(degree=degree), LinearRegression())
model.fit(x_train, y_train)

Pipeline(steps=[('polynomialfeatures', PolynomialFeatures()),
                ('linearregression', LinearRegression())])

## Model Predictions

In [160]:
# Predictions are still in standardized target units at this point.
trainPred, testPred = (model.predict(features) for features in (x_train, x_test))

In [161]:
# Test predictions first, then train predictions.
print(testPred.shape, trainPred.shape, sep='\n')

(60, 1)
(419, 1)


In [162]:
# Keep the predictions as single-column 2-D arrays, the shape scaler_l was fitted on.
trainPred = np.reshape(trainPred, (-1, 1))
testPred = np.reshape(testPred, (-1, 1))

In [163]:
# Map the scaled predictions and the scaled test targets back to the original
# target units, then print the second test sample of each as a quick check.
inv_testPred = scaler_l.inverse_transform(testPred)
inv_y_test = scaler_l.inverse_transform(y_test)

print(inv_testPred[1], inv_y_test[1], sep='\n')


[15058585.41367294]
[14545530.]


In [164]:
# Test RMSE in the original (inverse-transformed) target units.
from math import sqrt
from sklearn.metrics import mean_squared_error

rmse = sqrt(mean_squared_error(y_true=inv_y_test, y_pred=inv_testPred))
print('Test RMSE: %.3f' % (rmse,))

Test RMSE: 889224.778


In [165]:
# Normalized RMSE: the RMSE divided by the mean of the observed test targets.
# y_min / y_max are computed but do not enter the formula.
y_min = inv_y_test.min()
y_max = inv_y_test.max()
nrmse = rmse / inv_y_test.mean()
print('Test NRMSE:', nrmse)

Test NRMSE: 0.08305803135680728


In [166]:
# Mean of the inverse-transformed test targets (the denominator used for the NRMSE).
inv_y_test.mean()

10706066.15

In [167]:
# Coefficient of determination on the test period, in original target units.
from math import sqrt
from sklearn.metrics import r2_score

r_sq = r2_score(y_true=inv_y_test, y_pred=inv_testPred)
print('Test R_Square: %.4f' % (r_sq,))

Test R_Square: 0.9266


## Write scores in csv files

In [171]:

# Refit the degree-2 polynomial regression and append one row of test scores per repetition.
# Relies on `sqrt`, `mean_squared_error` and `r2_score` imported by the metric cells above.
# NOTE(review): the pipeline has no random component, so every repetition
# produces the same row.

file_path = '/content/drive/MyDrive/SYDE 675/Project/Models/m-ML_models/poly_reg_scores.csv'
header = ['seed', 'Test RMSE', 'Test normalized RMSE', 'R_Square']

# Make sure the output folder exists before the first write.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# The ground truth does not depend on the fitted model: invert its scaling once.
inv_y_test = scaler_l.inverse_transform(y_test)

degree = 2

for i in range(10):

  model = make_pipeline(PolynomialFeatures(degree),LinearRegression())
  model.fit(x_train, y_train)

  # keep the predictions as a single column for the scaler
  trainPred = model.predict(x_train).reshape(-1, 1)
  testPred = model.predict(x_test).reshape(-1, 1)

  # invert scaling for forecasted values
  inv_testPred = scaler_l.inverse_transform(testPred)

  rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
  nrmse = rmse / inv_y_test.mean()  # RMSE relative to the mean observed value
  r_sq = r2_score(inv_y_test, inv_testPred)

  # Write the header only when starting a new (or empty) file. newline='' is what the
  # csv module requires of the file object; it prevents blank lines on some platforms.
  needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
  with open(file_path, 'a', newline='', encoding='UTF8') as f:
    writer = csv.writer(f)
    if needs_header:
      writer.writerow(header)
    writer.writerow([seed_number, rmse, nrmse, r_sq])

In [169]:
# Summary statistics over every row stored in the score file. Rows are appended on each
# run of the loop cell, so the file may also contain rows from earlier runs.
# NOTE(review): the saved output of this cell (execution count 169) predates the last run
# of the loop cell (171), and its RMSE / R_Square disagree with the single-fit metrics
# printed above -- re-run the notebook top to bottom before relying on this summary.
results = pd.read_csv(file_path)
results.describe()

Unnamed: 0,seed,Test RMSE,Test normalized RMSE,R_Square
count,20.0,20.0,20.0,20.0
mean,1.5,33096890.0,3.091415,-100.7433
std,0.512989,1.528827e-08,9.112518e-16,1.458003e-14
min,1.0,33096890.0,3.091415,-100.7433
25%,1.0,33096890.0,3.091415,-100.7433
50%,1.5,33096890.0,3.091415,-100.7433
75%,2.0,33096890.0,3.091415,-100.7433
max,2.0,33096890.0,3.091415,-100.7433
