# Load Libraries

In [1]:
# General libraries
import pandas as pd
import numpy as np
from typing import List, Tuple

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Models
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Error calculation
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load Dataset

In [2]:
def load_csv(filepath: str) -> pd.DataFrame:
    """Read a tab-separated file into a DataFrame.

    The first column of the file is used as the index, and the 'Comp'
    column is read as text instead of being parsed as a number.

    Args:
        filepath: Location of the tab-separated file to read.

    Returns:
        The file contents as a DataFrame.
    """
    read_options = {
        'sep': '\t',
        'dtype': {'Comp': str},
        'index_col': 0,
    }
    return pd.read_csv(filepath, **read_options)

# Input files, each loaded right after its path is declared
dataset_before_21_filepath = 'dataset_before_21.txt'
df = load_csv(dataset_before_21_filepath)

dataset_after_20_filepath = 'dataset_after_20.txt'
df_uwy_after_20 = load_csv(dataset_after_20_filepath)

In [3]:
# Cut-off used to discard outliers: twice the standard deviation of ULR/ALR
ulr_alr_cutoff = df['ULR/ALR'].std() * 2

print("To remove outliers, filter data to keep only ULR/ALR below {}".format(
    ulr_alr_cutoff
))

# Keep only the rows strictly below the cut-off, as an independent copy
below_cutoff = df['ULR/ALR'] < ulr_alr_cutoff
df_no_ol = df.loc[below_cutoff].copy()

To remove outliers, filter data to keep only ULR/ALR below 1.934359979270388


# Separating train and test sets

In [4]:
def prepare_data_for_X_y(
    df: pd.DataFrame,
    target_col: str,
    exclude_cols: List[str]) -> Tuple[pd.DataFrame, pd.Series]:
    """Turn a raw DataFrame into a feature matrix and a target vector.

    Args:
        df: Source data; it is not modified.
        target_col: Name of the column to predict.
        exclude_cols: Columns to discard before building the features.

    Returns:
        A tuple ``(features, target)``: ``features`` holds every remaining
        column except ``target_col``, with object-dtype columns replaced
        by dummy columns; ``target`` is the ``target_col`` column.
    """
    # Work on a copy that no longer holds the excluded columns
    trimmed = df.drop(columns=exclude_cols).copy()

    # Object-dtype (string) columns are the ones turned into dummy columns
    text_cols = trimmed.select_dtypes(include=['object']).columns.values
    encoded = pd.get_dummies(data=trimmed, columns=text_cols)

    # Target as a Series; everything else becomes the feature DataFrame
    target = encoded[target_col]
    features = encoded.drop(columns=[target_col])

    return features, target

# Prep data for dataset with year < 2021
X, y = prepare_data_for_X_y(df_no_ol,
                            target_col='ULR/ALR',
                            exclude_cols=['Class', 'UW Yr'])


# Prep data for dataset with year > 2020
X_21up, y_21up = prepare_data_for_X_y(df_uwy_after_20,
                                      target_col='ULR/ALR',
                                      exclude_cols=['Class', 'UW Yr'])

# Dummy columns are derived from each dataset separately, so the two feature
# matrices may differ in column set or order. Align the later dataset with
# the training layout: missing dummy columns are added as 0 and columns
# never seen in the training data are dropped.
X_21up = X_21up.reindex(columns=X.columns, fill_value=0)

# Hold out 20% of the pre-2021 data for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=20)


# SVR

In [5]:
# Scaling numeric features using sklearn StandardScaler
# This is used only for SVR, since DT and RF have no need to scale the data
#
# The scaled arrays are stored under their own names instead of overwriting
# X_train / X_test. That keeps this cell idempotent: re-running it always
# fits the scaler on the original, unscaled training features.
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)
X_21up_test = sc.transform(X_21up)

# Create SVR instance, and fit dataset to algorithm
svr_clf = svm.SVR(kernel='rbf', gamma='auto', C=1)
svr_clf.fit(X_train_scaled, y_train)

# Errors on the held-out part of the pre-2021 dataset
svr_y_pred = svr_clf.predict(X_test_scaled)
print("Dataset with data before 2021")
print("RMSE", "{:.2%}".format(np.sqrt(mean_squared_error(y_test, svr_y_pred))))
print("MAE", "{:.2%}".format(mean_absolute_error(y_test, svr_y_pred)))

# Errors on the post-2020 dataset, which the model never saw during fitting
svr_y_pred_21up = svr_clf.predict(X_21up_test)
print("\nDataset with data after 2020")
print("RMSE", "{:.2%}".format(
    np.sqrt(mean_squared_error(y_21up, svr_y_pred_21up))))
print("MAE", "{:.2%}".format(mean_absolute_error(y_21up, svr_y_pred_21up)))


Dataset with data before 2021
RMSE 28.77%
MAE 18.73%

Dataset with data after 2020
RMSE 13.03%
MAE 10.33%


# Decision Tree

In [6]:
# Define again X and y because DT and RF don't need normalized data
# NOTE(review): the following cell repeats this exact preparation before
# fitting the decision tree, so this cell looks redundant.
X, y = prepare_data_for_X_y(df_no_ol,
                            target_col='ULR/ALR',
                            exclude_cols=['Class', 'UW Yr'])

X_21up, y_21up = prepare_data_for_X_y(df_uwy_after_20,
                                      target_col='ULR/ALR',
                                      exclude_cols=['Class', 'UW Yr'])

# Dummy columns are derived from each dataset separately; align the later
# dataset with the training layout (missing dummy columns become 0, columns
# unseen in the training data are dropped).
X_21up = X_21up.reindex(columns=X.columns, fill_value=0)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=20)


In [7]:
# Define again X and y because DT and RF don't need normalized data
X, y = prepare_data_for_X_y(df_no_ol,
                            target_col='ULR/ALR',
                            exclude_cols=['Class', 'UW Yr'])

X_21up, y_21up = prepare_data_for_X_y(df_uwy_after_20,
                                      target_col='ULR/ALR',
                                      exclude_cols=['Class', 'UW Yr'])

# Dummy columns are derived from each dataset separately; align the later
# dataset with the training layout (missing dummy columns become 0, columns
# unseen in the training data are dropped) so the fitted models receive the
# same features, in the same order, at prediction time.
X_21up = X_21up.reindex(columns=X.columns, fill_value=0)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=20)

# Fit the decision tree on the unscaled training split and predict the
# held-out split; the seed keeps the per-split feature sampling repeatable
dt_reg = DecisionTreeRegressor(max_depth=6, max_features=10, random_state=20)
dt_reg.fit(X_train, y_train.values.ravel())
dt_reg_y_pred = dt_reg.predict(X_test)

In [8]:

# Decision tree errors on the held-out part of the pre-2021 dataset
print("Dataset with data before 2021")
dt_rmse_before_21 = np.sqrt(mean_squared_error(y_test, dt_reg_y_pred))
print("RMSE: {:.2%}".format(dt_rmse_before_21))
dt_mae_before_21 = mean_absolute_error(y_test, dt_reg_y_pred)
print("MAE: {:.2%}".format(dt_mae_before_21))
print('')

# Decision tree errors on the post-2020 dataset
dt_reg_y_pred_21up = dt_reg.predict(X_21up)
print("\nDataset with data after 2020")
dt_rmse_after_20 = np.sqrt(mean_squared_error(y_21up, dt_reg_y_pred_21up))
print("RMSE", "{:.2%}".format(dt_rmse_after_20))
dt_mae_after_20 = mean_absolute_error(y_21up, dt_reg_y_pred_21up)
print("MAE", "{:.2%}".format(dt_mae_after_20))


Dataset with data before 2021
RMSE: 28.79%
MAE: 20.11%


Dataset with data after 2020
RMSE 13.61%
MAE 10.29%


# Random Forest

In [9]:
# max_features=1.0 considers every feature at each split. This is what the
# former 'auto' setting meant for regressors; 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, where it raises an error.
rf = RandomForestRegressor(n_estimators=250,
                           criterion='squared_error',
                           max_depth=9,
                           max_features=1.0,
                           random_state=20)

# Fit on the unscaled training split and predict the held-out split
rf.fit(X_train, y_train.values.ravel())
rf_y_pred = rf.predict(X_test)

print("Dataset with data before 2021")
print("RMSE: {:.2%}".format(np.sqrt(mean_squared_error(y_test, rf_y_pred))))
print("MAE", "{:.2%}".format(mean_absolute_error(y_test, rf_y_pred)))

# Errors on the post-2020 dataset, which the model never saw during fitting
rf_y_pred_21up = rf.predict(X_21up)
print("\nDataset with data after 2020")
print("RMSE", "{:.2%}".format(
    np.sqrt(mean_squared_error(y_21up, rf_y_pred_21up))))
print("MAE", "{:.2%}".format(mean_absolute_error(y_21up, rf_y_pred_21up)))


Dataset with data before 2021
RMSE: 26.24%
MAE 17.05%

Dataset with data after 2020
RMSE 11.25%
MAE 9.13%


The code below (currently commented out) exports the trained model using the `pickle` library, so that it can be loaded into another notebook.

In [None]:
# import pickle
# pickle.dump(rf, open('ml_model', 'wb'))