# Data Preprocessing


## Initial Setup

In [1]:
# Mount Google Drive at /content/drive; the data and score-file paths used below live under it
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import math
from math import sqrt
import glob
import numpy as np
from numpy.random import seed
import pandas as pd
import csv
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

# Seed NumPy's global random generator; seed_number is also the value written to the
# 'seed' column of the score CSV files further down
seed_number = 1
seed(seed_number)

## Loading Combined Data 1979-2018

Features:
'wind_10m', 'specific_humidity', 'LW_down', 'SW_down', 'rainfall', 'snowfall', 'sosaline', 'sst', 't2m', 'surface_pressure','sea_ice_extent'




In [3]:
# Locations of the monthly feature and target arrays on the mounted Drive
path = '/content/drive/MyDrive/SYDE 675/Project/Data'
data_path = f'{path}/monthly_features.npy'
target_path = f'{path}/monthly_target.npy'

# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can execute
# arbitrary code -- keep it only if these files are trusted and actually require it
target = np.load(target_path, allow_pickle=True)
data = np.load(data_path, allow_pickle=True)

In [4]:
# Report the dimensions of the loaded feature and target arrays
print(f"Shape of data =  {data.shape}")
print(f"Shape of target data =  {target.shape}")

Shape of data =  (480, 1, 11)
Shape of target data =  (480,)


### Adding a Lag to Y values
Here lag = 1 month: the features of month t are paired with the target of month t + 1.


In [5]:
# Pair the features of month t with the target `lag` months later.
# NOTE: this cell overwrites `data` and `target`, so re-running it without reloading
# the arrays shifts the series again.
lag = 1

# Slice by explicit length: `data[:-lag]` would return an empty array when lag == 0
data = data[:len(data) - lag, :, :]
target = target[lag:]

print(data.shape)
print(target.shape)

(479, 1, 11)
(479,)


## Train/Test Split

In [7]:
# Chronological train/test split: the most recent months are held out for testing

LEN_DATA = len(data) # total number of datapoints

NUM_TEST = 60 # reserve last 60 months for testing
NUM_TRAIN = LEN_DATA - NUM_TEST

# Guard against a dataset that is too short to leave any training months
assert 0 < NUM_TEST < LEN_DATA, 'test window must be smaller than the dataset'

print('LEN_DATA:', LEN_DATA)
print('NUM_TRAIN:', NUM_TRAIN)
print('NUM_TEST:', NUM_TEST)

x_train = data[:NUM_TRAIN]
x_test = data[NUM_TRAIN:]

# Targets are cut at the same index so features and labels stay aligned
y_train = target[:NUM_TRAIN]
y_test = target[NUM_TRAIN:]

LEN_DATA: 479
NUM_TRAIN: 419
NUM_TEST: 60


In [9]:
# Report the dimensions of each split
print(f'x_train.shape: {x_train.shape}')
print(f'y_train.shape: {y_train.shape}')
print(f'x_test.shape: {x_test.shape}')
print(f'y_test.shape: {y_test.shape}')

x_train.shape: (419, 1, 11)
y_train.shape: (419,)
x_test.shape: (60, 1, 11)
y_test.shape: (60,)


## Normalization


In [11]:
# Standardize the features; the scaler is fitted on the training months only and then
# reused for the test months
scaler_f = StandardScaler()
# Flatten (samples, 1, features) to (samples, features); the feature count is read from
# the array itself rather than hard-coded
x_train = scaler_f.fit_transform(x_train.reshape(-1, x_train.shape[-1]))
x_test = scaler_f.transform(x_test.reshape(-1, x_test.shape[-1]))

# Standardize the targets the same way; they are reshaped to a 2-D column for the scaler
scaler_l = StandardScaler()
y_train = scaler_l.fit_transform(y_train.reshape(-1, 1))
y_test = scaler_l.transform(y_test.reshape(-1, 1))

In [13]:
# Report the dimensions of each split after scaling
print(f'x_train.shape: {x_train.shape}')
print(f'y_train.shape: {y_train.shape}')
print(f'x_test.shape: {x_test.shape}')
print(f'y_test.shape: {y_test.shape}')

x_train.shape: (419, 11)
y_train.shape: (419, 1)
x_test.shape: (60, 11)
y_test.shape: (60, 1)


# Linear Regression

In [14]:
# Fit a linear regression on the standardized training features and targets
model = LinearRegression()
model.fit(x_train, y_train)

LinearRegression()

## Model Predictions

In [15]:
# Predict (in scaled units) for the held-out months and for the training months
testPred = model.predict(x_test)
trainPred = model.predict(x_train)

print(testPred.shape, trainPred.shape, sep='\n')

(60, 1)
(419, 1)


In [23]:
# Undo the label scaling for both the ground truth and the forecasts
inv_y_test = scaler_l.inverse_transform(y_test)
inv_testPred = scaler_l.inverse_transform(testPred)

# Show the first forecast next to the first observed value
print("Sample of inverted predited values = ", inv_testPred[0])
print("Sampe of inverted actual values = ", inv_y_test[0])

Sample of inverted predited values =  [14262226.98923777]
Sampe of inverted actual values =  [13859281.]


In [24]:
# Test RMSE, computed on the inverse-scaled targets and forecasts
rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
print('Test RMSE: %.3f' % rmse)

# Normalized RMSE: RMSE divided by the mean of the observed test targets
nrmse = rmse / inv_y_test.mean()
print('Test NRMSE:', nrmse)

# Coefficient of determination on the test set
r_sq = r2_score(inv_y_test, inv_testPred)
print('Test R_Square: %.3f' % r_sq)

Test RMSE: 519659.688
Test NRMSE: 0.04853880787557791
Test R_Square: 0.975


## Write scores in csv files

In [25]:
# Train the model 20 times and append the test scores of every run to a CSV file
file_path = '/content/drive/MyDrive/SYDE 675/Project/Models/m-ML_models/linear_reg_scores.csv'
header = ['seed', 'Test RMSE', 'Test normalized RMSE', 'R_Square']

# The ground truth is the same for every run, so its scaling is inverted once up front
inv_y_test = scaler_l.inverse_transform(y_test)

# NOTE(review): the recorded summary of this file shows ~0 std across runs, i.e. the
# repeated fits appear to give identical scores -- TODO confirm 20 repeats are needed
for i in range(20):

  model = LinearRegression()
  model.fit(x_train, y_train)

  trainPred = model.predict(x_train)
  testPred = model.predict(x_test)

  # Invert scaling for forecasted values
  inv_testPred = scaler_l.inverse_transform(testPred)

  # RMSE, RMSE normalized by the mean observed target, and R-square
  rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
  nrmse = rmse / inv_y_test.mean()
  r_sq = r2_score(inv_y_test, inv_testPred)

  # The header row is only needed when the file is missing or still empty
  needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0

  # Append mode creates the file on first use; newline='' is what the csv module
  # expects so that it alone controls the line endings
  with open(file_path, 'a', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    if needs_header:
      writer.writerow(header)
    writer.writerow([seed_number, rmse, nrmse, r_sq])

In [26]:
# Load every logged run from the scores file and summarize the metrics
# (file_path is the variable left behind by the logging loop in the previous cell)
results = pd.read_csv(file_path)
results.describe()

Unnamed: 0,seed,Test RMSE,Test normalized RMSE,R_Square
count,20.0,20.0,20.0,20.0
mean,1.5,519659.7,0.04853881,0.9749176
std,0.512989,1.791594e-10,2.1357460000000002e-17,1.139065e-16
min,1.0,519659.7,0.04853881,0.9749176
25%,1.0,519659.7,0.04853881,0.9749176
50%,1.5,519659.7,0.04853881,0.9749176
75%,2.0,519659.7,0.04853881,0.9749176
max,2.0,519659.7,0.04853881,0.9749176


# Decision Trees

In [27]:
# Fit a decision-tree regressor (default settings) on the standardized training data
model = DecisionTreeRegressor()
model.fit(x_train, y_train)

DecisionTreeRegressor()

## Model Predictions

In [28]:
# Predict (in scaled units) for the held-out months and for the training months
testPred = model.predict(x_test)
trainPred = model.predict(x_train)

print(testPred.shape, trainPred.shape, sep='\n')

(60,)
(419,)


In [35]:
# The label scaler was fitted on a column vector, so give the predictions that shape
testPred = testPred.reshape(-1, 1)
trainPred = trainPred.reshape(-1, 1)

# Undo the label scaling for both the ground truth and the forecasts
inv_y_test = scaler_l.inverse_transform(y_test)
inv_testPred = scaler_l.inverse_transform(testPred)

# Show the first forecast next to the first observed value
print("Sample of inverted predited values = ", inv_testPred[0])
print("Sampe of inverted actual values = ", inv_y_test[0])

Sample of inverted predited values =  [14189868.]
Sampe of inverted actual values =  [13859281.]


In [36]:
# Test RMSE, computed on the inverse-scaled targets and forecasts
rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
print('Test RMSE: %.3f' % rmse)

# Normalized RMSE: RMSE divided by the mean of the observed test targets
nrmse = rmse / inv_y_test.mean()
print('Test NRMSE:', nrmse)

# Coefficient of determination on the test set
r_sq = r2_score(inv_y_test, inv_testPred)
print('Test R_Square: %.3f' % r_sq)

Test RMSE: 516212.033
Test NRMSE: 0.04821677967885059
Test R_Square: 0.975


## Write scores in csv files

In [37]:
# Train the model 20 times and append the test scores of every run to a CSV file
file_path = '/content/drive/MyDrive/SYDE 675/Project/Models/m-ML_models/dt_scores.csv'
header = ['seed', 'Test RMSE', 'Test normalized RMSE', 'R_Square']

# The ground truth is the same for every run, so its scaling is inverted once up front
inv_y_test = scaler_l.inverse_transform(y_test)

# NOTE(review): every row logs the same seed_number although no random_state is passed
# to the estimator and the recorded scores differ between runs -- TODO confirm intended
for i in range(20):

  model = DecisionTreeRegressor()
  model.fit(x_train, y_train)

  # Predictions are reshaped to a column vector to match the label scaler
  trainPred = model.predict(x_train).reshape(-1, 1)
  testPred = model.predict(x_test).reshape(-1, 1)

  # Invert scaling for forecasted values
  inv_testPred = scaler_l.inverse_transform(testPred)

  # RMSE, RMSE normalized by the mean observed target, and R-square
  rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
  nrmse = rmse / inv_y_test.mean()
  r_sq = r2_score(inv_y_test, inv_testPred)

  # The header row is only needed when the file is missing or still empty
  needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0

  # Append mode creates the file on first use; newline='' is what the csv module
  # expects so that it alone controls the line endings
  with open(file_path, 'a', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    if needs_header:
      writer.writerow(header)
    writer.writerow([seed_number, rmse, nrmse, r_sq])

In [38]:
# Load every logged run from the scores file and summarize the metrics
# (file_path is the variable left behind by the logging loop in the previous cell)
results = pd.read_csv(file_path)
results.describe()

Unnamed: 0,seed,Test RMSE,Test normalized RMSE,R_Square
count,20.0,20.0,20.0,20.0
mean,1.5,596290.126923,0.055696,0.966827
std,0.512989,40941.042154,0.003824,0.004414
min,1.0,507012.168464,0.047357,0.960586
25%,1.0,572302.255097,0.053456,0.963795
50%,1.5,614522.941218,0.0574,0.964924
75%,2.0,624332.858259,0.058316,0.96957
max,2.0,651418.380581,0.060846,0.976124


# Random Forest

In [40]:
# Fit a random forest (default settings) on the standardized training data
model = RandomForestRegressor()
# y_train is an (n_samples, 1) column after scaling; pass it flattened so scikit-learn
# does not emit a DataConversionWarning about a column-vector y
model.fit(x_train, y_train.ravel())

  


RandomForestRegressor()

## Model Predictions

In [41]:
# Predict (in scaled units) for the held-out months and for the training months
testPred = model.predict(x_test)
trainPred = model.predict(x_train)

print(testPred.shape, trainPred.shape, sep='\n')

(60,)
(419,)


In [48]:
# The label scaler was fitted on a column vector, so give the predictions that shape
testPred = testPred.reshape(-1, 1)
trainPred = trainPred.reshape(-1, 1)

# Undo the label scaling for both the ground truth and the forecasts
inv_y_test = scaler_l.inverse_transform(y_test)
inv_testPred = scaler_l.inverse_transform(testPred)

# Show the first forecast next to the first observed value
print("Sample of inverted predited values = ", inv_testPred[0])
print("Sampe of inverted actual values = ", inv_y_test[0])

Sample of inverted predited values =  [14060204.44]
Sampe of inverted actual values =  [13859281.]


In [49]:
# Test RMSE, computed on the inverse-scaled targets and forecasts
rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
print('Test RMSE: %.3f' % rmse)

# Normalized RMSE: RMSE divided by the mean of the observed test targets
nrmse = rmse / inv_y_test.mean()
print('Test NRMSE:', nrmse)

# Coefficient of determination on the test set
r_sq = r2_score(inv_y_test, inv_testPred)
print('Test R_Square: %.3f' % r_sq)

Test RMSE: 457958.943
Test NRMSE: 0.04277565038201116
Test R_Square: 0.981


## Write scores in csv files

In [51]:
# Train the model 20 times and append the test scores of every run to a CSV file
file_path = '/content/drive/MyDrive/SYDE 675/Project/Models/m-ML_models/rf_scores.csv'
header = ['seed', 'Test RMSE', 'Test normalized RMSE', 'R_Square']

# The ground truth is the same for every run, so its scaling is inverted once up front
inv_y_test = scaler_l.inverse_transform(y_test)

# NOTE(review): every row logs the same seed_number although no random_state is passed
# to the estimator and the recorded scores differ between runs -- TODO confirm intended
for i in range(20):

  model = RandomForestRegressor()
  # Flattened targets avoid scikit-learn's DataConversionWarning for a column-vector y
  model.fit(x_train, y_train.ravel())

  # Predictions are reshaped to a column vector to match the label scaler
  trainPred = model.predict(x_train).reshape(-1, 1)
  testPred = model.predict(x_test).reshape(-1, 1)

  # Invert scaling for forecasted values
  inv_testPred = scaler_l.inverse_transform(testPred)

  # RMSE, RMSE normalized by the mean observed target, and R-square
  rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
  nrmse = rmse / inv_y_test.mean()
  r_sq = r2_score(inv_y_test, inv_testPred)

  # The header row is only needed when the file is missing or still empty
  needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0

  # Append mode creates the file on first use; newline='' is what the csv module
  # expects so that it alone controls the line endings
  with open(file_path, 'a', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    if needs_header:
      writer.writerow(header)
    writer.writerow([seed_number, rmse, nrmse, r_sq])

  """


In [52]:
# Load every logged run from the scores file and summarize the metrics
# (file_path is the variable left behind by the logging loop in the previous cell)
results = pd.read_csv(file_path)
results.describe()

Unnamed: 0,seed,Test RMSE,Test normalized RMSE,R_Square
count,20.0,20.0,20.0,20.0
mean,1.5,454691.037076,0.04247,0.980794
std,0.512989,6433.258528,0.000601,0.000545
min,1.0,445023.934462,0.041567,0.979794
25%,1.0,449120.096046,0.04195,0.980305
50%,1.5,453586.769711,0.042367,0.98089
75%,2.0,460480.293384,0.043011,0.981265
max,2.0,466413.863629,0.043565,0.981605


# Gradient Boosting Regressor

In [54]:
# Fit a gradient-boosting regressor (default settings) on the standardized training data
model = GradientBoostingRegressor()
# y_train is an (n_samples, 1) column after scaling; pass it flattened so scikit-learn
# does not emit a DataConversionWarning about a column-vector y
model.fit(x_train, y_train.ravel())

  y = column_or_1d(y, warn=True)


GradientBoostingRegressor()

## Model Predictions

In [55]:
# Predict (in scaled units) for the held-out months and for the training months
testPred = model.predict(x_test)
trainPred = model.predict(x_train)

print(testPred.shape, trainPred.shape, sep='\n')

(60,)
(419,)


In [63]:
# The label scaler was fitted on a column vector, so give the predictions that shape
testPred = testPred.reshape(-1, 1)
trainPred = trainPred.reshape(-1, 1)

# Undo the label scaling for both the ground truth and the forecasts
inv_y_test = scaler_l.inverse_transform(y_test)
inv_testPred = scaler_l.inverse_transform(testPred)

# Show the first forecast next to the first observed value
print("Sample of inverted predited values = ", inv_testPred[0])
print("Sampe of inverted actual values = ", inv_y_test[0])

Sample of inverted predited values =  [13969722.09091432]
Sampe of inverted actual values =  [13859281.]


In [64]:
# Test RMSE, computed on the inverse-scaled targets and forecasts
rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
print('Test RMSE: %.3f' % rmse)

# Normalized RMSE: RMSE divided by the mean of the observed test targets
nrmse = rmse / inv_y_test.mean()
print('Test NRMSE:', nrmse)

# Coefficient of determination on the test set
r_sq = r2_score(inv_y_test, inv_testPred)
print('Test R_Square: %.3f' % r_sq)

Test RMSE: 519576.532
Test NRMSE: 0.04853104071077857
Test R_Square: 0.975


## Write scores in csv files

In [66]:
# Train the model 20 times and append the test scores of every run to a CSV file
file_path = '/content/drive/MyDrive/SYDE 675/Project/Models/m-ML_models/boost_scores.csv'
header = ['seed', 'Test RMSE', 'Test normalized RMSE', 'R_Square']

# The ground truth is the same for every run, so its scaling is inverted once up front
inv_y_test = scaler_l.inverse_transform(y_test)

# NOTE(review): the loop previously ran range(1) despite the "20 times" comment; it now
# matches the comment and the other model sections -- TODO confirm 20 is intended.
# Every row logs the same seed_number although no random_state is passed to the estimator.
for i in range(20):

  model = GradientBoostingRegressor()
  # Flattened targets avoid scikit-learn's DataConversionWarning for a column-vector y
  model.fit(x_train, y_train.ravel())

  # Predictions are reshaped to a column vector to match the label scaler
  trainPred = model.predict(x_train).reshape(-1, 1)
  testPred = model.predict(x_test).reshape(-1, 1)

  # Invert scaling for forecasted values
  inv_testPred = scaler_l.inverse_transform(testPred)

  # RMSE, RMSE normalized by the mean observed target, and R-square
  rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
  nrmse = rmse / inv_y_test.mean()
  r_sq = r2_score(inv_y_test, inv_testPred)

  # The header row is only needed when the file is missing or still empty
  needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0

  # Append mode creates the file on first use; newline='' is what the csv module
  # expects so that it alone controls the line endings
  with open(file_path, 'a', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    if needs_header:
      writer.writerow(header)
    writer.writerow([seed_number, rmse, nrmse, r_sq])

  y = column_or_1d(y, warn=True)


In [67]:
# Load every logged run from the scores file and summarize the metrics
# (file_path is the variable left behind by the logging loop in the previous cell)
results = pd.read_csv(file_path)
results.describe()

Unnamed: 0,seed,Test RMSE,Test normalized RMSE,R_Square
count,20.0,20.0,20.0,20.0
mean,1.5,519234.459231,0.048499,0.974958
std,0.512989,2261.805441,0.000211,0.000218
min,1.0,514729.996686,0.048078,0.974545
25%,1.0,518090.767461,0.048392,0.974835
50%,1.5,518774.779959,0.048456,0.975003
75%,2.0,520516.797556,0.048619,0.975069
max,2.0,523507.761067,0.048898,0.975391


# XGBoost

In [69]:
# Fit an XGBoost regressor with a squared-error objective on the standardized training data
model = XGBRegressor(objective = "reg:squarederror")
model.fit(x_train, y_train)

XGBRegressor(objective='reg:squarederror')

## Model Predictions

In [70]:
# Predict (in scaled units) for the held-out months and for the training months
testPred = model.predict(x_test)
trainPred = model.predict(x_train)

print(testPred.shape, trainPred.shape, sep='\n')

(60,)
(419,)


In [78]:
# The label scaler was fitted on a column vector, so give the predictions that shape
testPred = testPred.reshape(-1, 1)
trainPred = trainPred.reshape(-1, 1)

# Undo the label scaling for both the ground truth and the forecasts
inv_y_test = scaler_l.inverse_transform(y_test)
inv_testPred = scaler_l.inverse_transform(testPred)

# Show the first forecast next to the first observed value
print("Sample of inverted predited values = ", inv_testPred[0])
print("Sampe of inverted actual values = ", inv_y_test[0])

Sample of inverted predited values =  [14166947.]
Sampe of inverted actual values =  [13859281.]


In [79]:
# Test RMSE, computed on the inverse-scaled targets and forecasts
rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
print('Test RMSE: %.3f' % rmse)

# Normalized RMSE: RMSE divided by the mean of the observed test targets
nrmse = rmse / inv_y_test.mean()
print('Test NRMSE:', nrmse)

# Coefficient of determination on the test set
r_sq = r2_score(inv_y_test, inv_testPred)
print('Test R_Square: %.3f' % r_sq)

Test RMSE: 476411.340
Test NRMSE: 0.04449919640369534
Test R_Square: 0.979


## Write scores in csv files

In [81]:
# Train the model 20 times and append the test scores of every run to a CSV file
file_path = '/content/drive/MyDrive/SYDE 675/Project/Models/m-ML_models/xgboost_scores.csv'
header = ['seed', 'Test RMSE', 'Test normalized RMSE', 'R_Square']

# The ground truth is the same for every run, so its scaling is inverted once up front
inv_y_test = scaler_l.inverse_transform(y_test)

# NOTE(review): the recorded summary of this file shows ~0 std across runs, i.e. the
# repeated fits appear to give identical scores -- TODO confirm 20 repeats are needed
for i in range(20):

  model = XGBRegressor(objective ='reg:squarederror')
  model.fit(x_train, y_train)

  # Predictions are reshaped to a column vector to match the label scaler
  trainPred = model.predict(x_train).reshape(-1, 1)
  testPred = model.predict(x_test).reshape(-1, 1)

  # Invert scaling for forecasted values
  inv_testPred = scaler_l.inverse_transform(testPred)

  # RMSE, RMSE normalized by the mean observed target, and R-square
  rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
  nrmse = rmse / inv_y_test.mean()
  r_sq = r2_score(inv_y_test, inv_testPred)

  # The header row is only needed when the file is missing or still empty
  needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0

  # Append mode creates the file on first use; newline='' is what the csv module
  # expects so that it alone controls the line endings
  with open(file_path, 'a', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    if needs_header:
      writer.writerow(header)
    writer.writerow([seed_number, rmse, nrmse, r_sq])

In [82]:
# Load every logged run from the scores file and summarize the metrics
# (file_path is the variable left behind by the logging loop in the previous cell)
results = pd.read_csv(file_path)
results.describe()

Unnamed: 0,seed,Test RMSE,Test normalized RMSE,R_Square
count,20.0,20.0,20.0,20.0
mean,1.5,476411.3,0.0444992,0.9789188
std,0.512989,1.791594e-10,7.119155e-18,2.27813e-16
min,1.0,476411.3,0.0444992,0.9789188
25%,1.0,476411.3,0.0444992,0.9789188
50%,1.5,476411.3,0.0444992,0.9789188
75%,2.0,476411.3,0.0444992,0.9789188
max,2.0,476411.3,0.0444992,0.9789188


# Polynomial Regression

In [83]:
# Expand the standardized features into polynomial terms of the given degree,
# then fit a linear regression on the expanded features
degree = 2
model = make_pipeline(PolynomialFeatures(degree),LinearRegression())
model.fit(x_train, y_train)

Pipeline(steps=[('polynomialfeatures', PolynomialFeatures()),
                ('linearregression', LinearRegression())])

## Model Predictions

In [84]:
# Predict (in scaled units) for the held-out months and for the training months
testPred = model.predict(x_test)
trainPred = model.predict(x_train)

print(testPred.shape, trainPred.shape, sep='\n')

(60, 1)
(419, 1)


In [92]:
# Force column-vector shape; the shapes printed in the previous cell are already
# (n_samples, 1), so this leaves the predictions unchanged
testPred = testPred.reshape(-1, 1)
trainPred = trainPred.reshape(-1, 1)

# Undo the label scaling for both the ground truth and the forecasts
inv_y_test = scaler_l.inverse_transform(y_test)
inv_testPred = scaler_l.inverse_transform(testPred)

# Show the first forecast next to the first observed value
print("Sample of inverted predited values = ", inv_testPred[0])
print("Sampe of inverted actual values = ", inv_y_test[0])

Sample of inverted predited values =  [14120415.99889667]
Sampe of inverted actual values =  [13859281.]


In [93]:
# Test RMSE, computed on the inverse-scaled targets and forecasts
rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
print('Test RMSE: %.3f' % rmse)

# Normalized RMSE: RMSE divided by the mean of the observed test targets
nrmse = rmse / inv_y_test.mean()
print('Test NRMSE:', nrmse)

# Coefficient of determination on the test set
r_sq = r2_score(inv_y_test, inv_testPred)
print('Test R_Square: %.3f' % r_sq)

Test RMSE: 889224.778
Test NRMSE: 0.08305803135680728
Test R_Square: 0.927


## Write scores in csv files

In [95]:
# Train the model 20 times and append the test scores of every run to a CSV file
file_path = '/content/drive/MyDrive/SYDE 675/Project/Models/m-ML_models/poly_reg_scores.csv'
header = ['seed', 'Test RMSE', 'Test normalized RMSE', 'R_Square']

# Degree of the polynomial feature expansion (the same for every run)
degree = 2

# The ground truth is the same for every run, so its scaling is inverted once up front
inv_y_test = scaler_l.inverse_transform(y_test)

# NOTE(review): the recorded summary of this file shows ~0 std across runs, i.e. the
# repeated fits appear to give identical scores -- TODO confirm 20 repeats are needed
for i in range(20):

  model = make_pipeline(PolynomialFeatures(degree),LinearRegression())
  model.fit(x_train, y_train)

  # Predictions are reshaped to a column vector to match the label scaler
  trainPred = model.predict(x_train).reshape(-1, 1)
  testPred = model.predict(x_test).reshape(-1, 1)

  # Invert scaling for forecasted values
  inv_testPred = scaler_l.inverse_transform(testPred)

  # RMSE, RMSE normalized by the mean observed target, and R-square
  rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
  nrmse = rmse / inv_y_test.mean()
  r_sq = r2_score(inv_y_test, inv_testPred)

  # The header row is only needed when the file is missing or still empty
  needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0

  # Append mode creates the file on first use; newline='' is what the csv module
  # expects so that it alone controls the line endings
  with open(file_path, 'a', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    if needs_header:
      writer.writerow(header)
    writer.writerow([seed_number, rmse, nrmse, r_sq])

In [96]:
# Load every logged run from the scores file and summarize the metrics
# (file_path is the variable left behind by the logging loop in the previous cell)
results = pd.read_csv(file_path)
results.describe()

Unnamed: 0,seed,Test RMSE,Test normalized RMSE,R_Square
count,20.0,20.0,20.0,20.0
mean,1.5,889224.8,0.08305803,0.9265563
std,0.512989,1.194396e-10,1.423831e-17,1.139065e-16
min,1.0,889224.8,0.08305803,0.9265563
25%,1.0,889224.8,0.08305803,0.9265563
50%,1.5,889224.8,0.08305803,0.9265563
75%,2.0,889224.8,0.08305803,0.9265563
max,2.0,889224.8,0.08305803,0.9265563


# SVM Regression

In [97]:
# Fit a support-vector regressor (default settings) on the standardized training data
model = SVR()
# y_train is an (n_samples, 1) column after scaling; pass it flattened so scikit-learn
# does not emit a DataConversionWarning about a column-vector y
model.fit(x_train, y_train.ravel())

  y = column_or_1d(y, warn=True)


SVR()

## Model Predictions

In [98]:
# Predict (in scaled units) for the held-out months and for the training months
testPred = model.predict(x_test)
trainPred = model.predict(x_train)

print(testPred.shape, trainPred.shape, sep='\n')

(60,)
(419,)


In [107]:
# The label scaler was fitted on a column vector, so give the predictions that shape
testPred = testPred.reshape(-1, 1)
trainPred = trainPred.reshape(-1, 1)

# Undo the label scaling for both the ground truth and the forecasts
inv_y_test = scaler_l.inverse_transform(y_test)
inv_testPred = scaler_l.inverse_transform(testPred)

# Show the first forecast next to the first observed value
print("Sample of inverted predited values = ", inv_testPred[0])
print("Sampe of inverted actual values = ", inv_y_test[0])

Sample of inverted predited values =  [14195292.86620352]
Sampe of inverted actual values =  [13859281.]


In [108]:
# Test RMSE, computed on the inverse-scaled targets and forecasts
rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
print('Test RMSE: %.3f' % rmse)

# Normalized RMSE: RMSE divided by the mean of the observed test targets
nrmse = rmse / inv_y_test.mean()
print('Test NRMSE:', nrmse)

# Coefficient of determination on the test set
r_sq = r2_score(inv_y_test, inv_testPred)
print('Test R_Square: %.3f' % r_sq)

Test RMSE: 580888.465
Test NRMSE: 0.054257881188535924
Test R_Square: 0.969


## Write scores in csv files

In [109]:
# Train the model 20 times and append the test scores of every run to a CSV file
file_path = '/content/drive/MyDrive/SYDE 675/Project/Models/m-ML_models/svr_scores.csv'
header = ['seed', 'Test RMSE', 'Test normalized RMSE', 'R_Square']

# The ground truth is the same for every run, so its scaling is inverted once up front
inv_y_test = scaler_l.inverse_transform(y_test)

# NOTE(review): the recorded summary of this file shows ~0 std across runs, i.e. the
# repeated fits appear to give identical scores -- TODO confirm 20 repeats are needed
for i in range(20):

  model = SVR()
  # Flattened targets avoid scikit-learn's DataConversionWarning for a column-vector y
  model.fit(x_train, y_train.ravel())

  # Predictions are reshaped to a column vector to match the label scaler
  trainPred = model.predict(x_train).reshape(-1, 1)
  testPred = model.predict(x_test).reshape(-1, 1)

  # Invert scaling for forecasted values
  inv_testPred = scaler_l.inverse_transform(testPred)

  # RMSE, RMSE normalized by the mean observed target, and R-square
  rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
  nrmse = rmse / inv_y_test.mean()
  r_sq = r2_score(inv_y_test, inv_testPred)

  # The header row is only needed when the file is missing or still empty
  needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0

  # Append mode creates the file on first use; newline='' is what the csv module
  # expects so that it alone controls the line endings
  with open(file_path, 'a', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    if needs_header:
      writer.writerow(header)
    writer.writerow([seed_number, rmse, nrmse, r_sq])

  y = column_or_1d(y, warn=True)


In [110]:
# Load every logged run from the scores file and summarize the metrics
# (file_path is the variable left behind by the logging loop in the previous cell)
results = pd.read_csv(file_path)
results.describe()

Unnamed: 0,seed,Test RMSE,Test normalized RMSE,R_Square
count,20.0,20.0,20.0,20.0
mean,1.5,580888.5,0.054258,0.9686587
std,0.512989,1.194396e-10,0.0,1.139065e-16
min,1.0,580888.5,0.054258,0.9686587
25%,1.0,580888.5,0.054258,0.9686587
50%,1.5,580888.5,0.054258,0.9686587
75%,2.0,580888.5,0.054258,0.9686587
max,2.0,580888.5,0.054258,0.9686587


# Ridge Regression

In [111]:
# Fit a ridge regression (default settings) on the standardized training data
model = Ridge()
model.fit(x_train, y_train)

Ridge()

## Model Predictions

In [112]:
# Predict (in scaled units) for the held-out months and for the training months
testPred = model.predict(x_test)
trainPred = model.predict(x_train)

print(testPred.shape, trainPred.shape, sep='\n')

(60, 1)
(419, 1)


In [120]:
# Undo the label scaling for both the ground truth and the forecasts
inv_y_test = scaler_l.inverse_transform(y_test)
inv_testPred = scaler_l.inverse_transform(testPred)

# Show the first forecast next to the first observed value
print("Sample of inverted predited values = ", inv_testPred[0])
print("Sampe of inverted actual values = ", inv_y_test[0])

Sample of inverted predited values =  [14250272.46301217]
Sampe of inverted actual values =  [13859281.]


In [121]:
# Test RMSE, computed on the inverse-scaled targets and forecasts
rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
print('Test RMSE: %.3f' % rmse)

# Normalized RMSE: RMSE divided by the mean of the observed test targets
nrmse = rmse / inv_y_test.mean()
print('Test NRMSE:', nrmse)

# Coefficient of determination on the test set
r_sq = r2_score(inv_y_test, inv_testPred)
print('Test R_Square: %.3f' % r_sq)

Test RMSE: 521187.258
Test NRMSE: 0.04868149050308042
Test R_Square: 0.975


## Write scores in csv files

In [122]:
# Train the model 20 times and append the test scores of every run to a CSV file
file_path = '/content/drive/MyDrive/SYDE 675/Project/Models/m-ML_models/ridge_reg_scores.csv'
header = ['seed', 'Test RMSE', 'Test normalized RMSE', 'R_Square']

# The ground truth is the same for every run, so its scaling is inverted once up front
inv_y_test = scaler_l.inverse_transform(y_test)

# NOTE(review): the recorded summary of this file shows ~0 std across runs, i.e. the
# repeated fits appear to give identical scores -- TODO confirm 20 repeats are needed
for i in range(20):

  model = Ridge()
  model.fit(x_train, y_train)

  trainPred = model.predict(x_train)
  testPred = model.predict(x_test)

  # Invert scaling for forecasted values
  inv_testPred = scaler_l.inverse_transform(testPred)

  # RMSE, RMSE normalized by the mean observed target, and R-square
  rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
  nrmse = rmse / inv_y_test.mean()
  r_sq = r2_score(inv_y_test, inv_testPred)

  # The header row is only needed when the file is missing or still empty
  needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0

  # Append mode creates the file on first use; newline='' is what the csv module
  # expects so that it alone controls the line endings
  with open(file_path, 'a', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    if needs_header:
      writer.writerow(header)
    writer.writerow([seed_number, rmse, nrmse, r_sq])

In [123]:
# Load every logged run from the scores file and summarize the metrics
# (file_path is the variable left behind by the logging loop in the previous cell)
results = pd.read_csv(file_path)
results.describe()

Unnamed: 0,seed,Test RMSE,Test normalized RMSE,R_Square
count,20.0,20.0,20.0,20.0
mean,1.5,521187.3,0.04868149,0.9747699
std,0.512989,2.388792e-10,2.1357460000000002e-17,2.27813e-16
min,1.0,521187.3,0.04868149,0.9747699
25%,1.0,521187.3,0.04868149,0.9747699
50%,1.5,521187.3,0.04868149,0.9747699
75%,2.0,521187.3,0.04868149,0.9747699
max,2.0,521187.3,0.04868149,0.9747699


# Lasso Regression

In [124]:
# Fit a lasso regression (default settings) on the standardized training data.
# NOTE(review): the test R-square reported below is negative; with the default
# regularization strength and a standardized target the L1 penalty may be shrinking
# the coefficients to (near) zero -- TODO confirm and consider tuning alpha
model = Lasso()
model.fit(x_train, y_train)

Lasso()

## Model Predictions

In [125]:
# Predict (in scaled units) for the held-out months and for the training months
testPred = model.predict(x_test)
trainPred = model.predict(x_train)

print(testPred.shape, trainPred.shape, sep='\n')

(60,)
(419,)


In [133]:
# The label scaler was fitted on a column vector, so give the predictions that shape
testPred = testPred.reshape(-1, 1)
trainPred = trainPred.reshape(-1, 1)

# Undo the label scaling for both the ground truth and the forecasts
inv_y_test = scaler_l.inverse_transform(y_test)
inv_testPred = scaler_l.inverse_transform(testPred)

# Show the first forecast next to the first observed value
print("Sample of inverted predited values = ", inv_testPred[0])
print("Sampe of inverted actual values = ", inv_y_test[0])

Sample of inverted predited values =  [11827175.3627685]
Sampe of inverted actual values =  [13859281.]


In [134]:
# Test RMSE, computed on the inverse-scaled targets and forecasts
rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
print('Test RMSE: %.3f' % rmse)

# Normalized RMSE: RMSE divided by the mean of the observed test targets
nrmse = rmse / inv_y_test.mean()
print('Test NRMSE:', nrmse)

# Coefficient of determination on the test set
r_sq = r2_score(inv_y_test, inv_testPred)
print('Test R_Square: %.3f' % r_sq)

Test RMSE: 3467454.744
Test NRMSE: 0.3238775751883375
Test R_Square: -0.117


## Write scores in csv files

In [135]:
# Train the model 20 times and append the test scores of every run to a CSV file
file_path = '/content/drive/MyDrive/SYDE 675/Project/Models/m-ML_models/lasso_reg_scores.csv'
header = ['seed', 'Test RMSE', 'Test normalized RMSE', 'R_Square']

# The ground truth is the same for every run, so its scaling is inverted once up front
inv_y_test = scaler_l.inverse_transform(y_test)

# NOTE(review): the recorded summary of this file shows ~0 std across runs, i.e. the
# repeated fits appear to give identical scores -- TODO confirm 20 repeats are needed
for i in range(20):

  model = Lasso()
  model.fit(x_train, y_train)

  # Predictions are reshaped to a column vector to match the label scaler
  trainPred = model.predict(x_train).reshape(-1, 1)
  testPred = model.predict(x_test).reshape(-1, 1)

  # Invert scaling for forecasted values
  inv_testPred = scaler_l.inverse_transform(testPred)

  # RMSE, RMSE normalized by the mean observed target, and R-square
  rmse = sqrt(mean_squared_error(inv_y_test, inv_testPred))
  nrmse = rmse / inv_y_test.mean()
  r_sq = r2_score(inv_y_test, inv_testPred)

  # The header row is only needed when the file is missing or still empty
  needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0

  # Append mode creates the file on first use; newline='' is what the csv module
  # expects so that it alone controls the line endings
  with open(file_path, 'a', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    if needs_header:
      writer.writerow(header)
    writer.writerow([seed_number, rmse, nrmse, r_sq])

In [136]:
# Load every logged run from the scores file and summarize the metrics
# (file_path is the variable left behind by the logging loop in the previous cell)
results = pd.read_csv(file_path)
results.describe()

Unnamed: 0,seed,Test RMSE,Test normalized RMSE,R_Square
count,20.0,20.0,20.0,20.0
mean,1.5,3467455.0,0.3238776,-0.116742
std,0.512989,1.433275e-09,5.695324e-17,0.0
min,1.0,3467455.0,0.3238776,-0.116742
25%,1.0,3467455.0,0.3238776,-0.116742
50%,1.5,3467455.0,0.3238776,-0.116742
75%,2.0,3467455.0,0.3238776,-0.116742
max,2.0,3467455.0,0.3238776,-0.116742
