In [1]:
# import libraries
import pandas as pd
import numpy as np
import helpers
import matplotlib.pyplot as plt
import xgboost as xgb
import math

from sklearn.svm import SVR, NuSVR, LinearSVR
from sklearn.ensemble import (
    BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor,
    AdaBoostRegressor, HistGradientBoostingRegressor
)
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, RidgeCV, ElasticNetCV, LassoCV,
    HuberRegressor, GammaRegressor, BayesianRidge, PoissonRegressor, LassoLars, 
    LassoLarsIC, LassoLarsCV, Lars, LarsCV, SGDRegressor, TweedieRegressor, RANSACRegressor,
    OrthogonalMatchingPursuitCV, OrthogonalMatchingPursuit, PassiveAggressiveRegressor
)
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.dummy import DummyRegressor
from sklearn.kernel_ridge import KernelRidge
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# set plot theme
plt.style.use('ggplot')

# set dataframe display 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 10000)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 10000)

In [2]:
# import training data

train_df = pd.read_csv('../data/train.csv')

train_df = helpers.clean_headers(train_df)


print(train_df.head())

  sex  length  diameter  height     weight  shucked_weight  viscera_weight  shell_weight  age
0   I  1.5250    1.1750  0.3750  28.973189       12.728926        6.647958      8.348928    9
1   I  1.1000    0.8250  0.2750  10.418441        4.521745        2.324659      3.401940    8
2   M  1.3875    1.1125  0.3750  24.777463       11.339800        5.556502      6.662133    9
3   F  1.7000    1.4125  0.5000  50.660556       20.354941       10.991839     14.996885   11
4   I  1.2500    1.0125  0.3375  23.289114       11.977664        4.507570      5.953395    8


# Checking for multi-collinearity

In [3]:
# Checking for MULTICOLLINEARITY

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Add a constant term for the intercept
X_with_constant = add_constant(train_df.iloc[:,1:])

# Calculate VIF for each feature
vif = pd.DataFrame()
vif["Variable"] = X_with_constant.columns
vif["VIF"] = [variance_inflation_factor(X_with_constant.values, i) for i in range(X_with_constant.shape[1])]

print(vif)

         Variable        VIF
0           const  75.590550
1          length  50.036365
2        diameter  52.355138
3          height   7.811277
4          weight  78.053036
5  shucked_weight  25.301892
6  viscera_weight  17.968955
7    shell_weight  20.997071
8             age   2.170267


All features have a very high VIF, which means that we will have to engineer features and remove the raw features given by the dataset. Ideally, we want to reach a point where all features have a VIF of between 1-5.

# Feature Engineering

## 1. Surface Area

Assuming surface area to be elliptical, we can use the diameter and height to create just one feature.

In [4]:
def calc_surf_area(length, diameter):
    a = length/2
    b = diameter/2
    surf_area = math.pi * a * b
    return surf_area

In [5]:
train_df['surface_area'] = calc_surf_area(train_df['length'], train_df['diameter'])

In [6]:
train_df.drop(columns=['length', 'diameter'], inplace = True)
train_df.head()

Unnamed: 0,sex,height,weight,shucked_weight,viscera_weight,shell_weight,age,surface_area
0,I,0.375,28.973189,12.728926,6.647958,8.348928,9,1.407335
1,I,0.275,10.418441,4.521745,2.324659,3.40194,8,0.712749
2,M,0.375,24.777463,11.3398,5.556502,6.662133,9,1.212336
3,F,0.5,50.660556,20.354941,10.991839,14.996885,11,1.885937
4,I,0.3375,23.289114,11.977664,4.50757,5.953395,8,0.99402


## 2. Weight Ratios

Weight is likely the sum of shucked_weight, viscera_weight and shell_weight, thus there is definitely a high level of multicollinearity across these 4 variables. Instead of having these variables, we can calculate the ratio of each.

In [7]:
# Create a function to calculate ratios

def calc_ratio(weight, portion_weight):
    return portion_weight/weight

In [8]:
# Calculate ratios as new features

train_df['meat_ratio'] = calc_ratio(train_df['weight'], train_df['shucked_weight'])
train_df['viscera_ratio'] = calc_ratio(train_df['weight'], train_df['viscera_weight'])
train_df['shell_ratio'] = calc_ratio(train_df['weight'], train_df['shell_weight'])

In [9]:
# Drop original features

train_df.drop(columns=['weight', 'shucked_weight', 'viscera_weight', 'shell_weight'], inplace = True)
train_df.head()

Unnamed: 0,sex,height,age,surface_area,meat_ratio,viscera_ratio,shell_ratio
0,I,0.375,9,1.407335,0.439335,0.229452,0.28816
1,I,0.275,8,0.712749,0.434014,0.223129,0.326531
2,M,0.375,9,1.212336,0.457666,0.224256,0.268879
3,F,0.5,11,1.885937,0.401791,0.21697,0.296027
4,I,0.3375,8,0.99402,0.514303,0.193548,0.25563


In [10]:
## Checking for Multi-Collinearity again

# Add a constant term for the intercept
X_with_constant = add_constant(train_df.iloc[:,1:])

# Calculate VIF for each feature
vif = pd.DataFrame()
vif["Variable"] = X_with_constant.columns
vif["VIF"] = [variance_inflation_factor(X_with_constant.values, i) for i in range(X_with_constant.shape[1])]

print(vif)

        Variable         VIF
0          const  170.806737
1         height    6.715998
2            age    1.920014
3   surface_area    6.760852
4     meat_ratio    1.083541
5  viscera_ratio    1.014563
6    shell_ratio    1.181895


Now, height and surface_area still have a VIF of > 5. I want to experiment if height and surface_area and multi-collinear, we will drop height to see if it improves the VIF score.

In [11]:
# Drop height feature

train_df.drop(columns=['height'], inplace = True)
train_df.head()

Unnamed: 0,sex,age,surface_area,meat_ratio,viscera_ratio,shell_ratio
0,I,9,1.407335,0.439335,0.229452,0.28816
1,I,8,0.712749,0.434014,0.223129,0.326531
2,M,9,1.212336,0.457666,0.224256,0.268879
3,F,11,1.885937,0.401791,0.21697,0.296027
4,I,8,0.99402,0.514303,0.193548,0.25563


In [12]:
## Checking for Multi-Collinearity again

# Add a constant term for the intercept
X_with_constant = add_constant(train_df.iloc[:,1:])

# Calculate VIF for each feature
vif = pd.DataFrame()
vif["Variable"] = X_with_constant.columns
vif["VIF"] = [variance_inflation_factor(X_with_constant.values, i) for i in range(X_with_constant.shape[1])]

print(vif)

        Variable         VIF
0          const  163.251120
1            age    1.834304
2   surface_area    1.849151
3     meat_ratio    1.083046
4  viscera_ratio    1.014548
5    shell_ratio    1.174522


After dropping height, we have a good VIF score across all features. With this we will move on to modelling.

## Preparing train-val datasets

In [13]:
# Train dataset features and target variables

X = train_df.drop('age', axis=1)
y = train_df['age']

In [14]:
# Train-Val split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'X_train shape: {X_train.shape}')
print(f'X_val shape: {X_val.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_val shape: {y_val.shape}')


X_train shape: (59240, 5)
X_val shape: (14811, 5)
y_train shape: (59240,)
y_val shape: (14811,)


# Normalising feature values

In [15]:
# normalize numerical variables and one hot encode categorical variables

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['surface_area', 'meat_ratio', 'viscera_ratio', 'shell_ratio']),
        ('cat', OneHotEncoder(drop='first'), ['sex'])
    ]
)

# apply transformations
X_train_sc = preprocessor.fit_transform(X_train)
X_val_sc = preprocessor.transform(X_val)

# Modelling

## Model 1: Linear Regression

### Model Fitting

In [16]:
# Instantiate the model
lr = LinearRegression()

# Fit the model
lr.fit(X_train_sc, y_train)

# Make predictions on the training set
y_train_pred = lr.predict(X_train_sc)

# Make predictions on the validation set
y_val_pred = lr.predict(X_val_sc)

### Model Evaluation

In [17]:
# Calculate evaluation metrics for training data
train_mae = mean_absolute_error(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

# Calculate evaluation metrics for validation data
val_mae = mean_absolute_error(y_val, y_val_pred)
val_mse = mean_squared_error(y_val, y_val_pred)
val_r2 = r2_score(y_val, y_val_pred)

# Print evaluation metrics
print("Training Data Evaluation:")
print(f"Mean Absolute Error: {train_mae:.2f}")
print(f"Mean Squared Error: {train_mse:.2f}")
print(f"R-squared: {train_r2:.2f}")

print("\nValidation Data Evaluation:")
print(f"Mean Absolute Error: {val_mae:.2f}")
print(f"Mean Squared Error: {val_mse:.2f}")
print(f"R-squared: {val_r2:.2f}")

Training Data Evaluation:
Mean Absolute Error: 1.61
Mean Squared Error: 5.28
R-squared: 0.47

Validation Data Evaluation:
Mean Absolute Error: 1.63
Mean Squared Error: 5.43
R-squared: 0.47


The Linear Regression model doesn't perform very well, with a high MAE score and a low R-squared score, despite the scores being balanced on both train and validation sets.

My next step will be to explore cross validation and regularization methods to improve the model's performance

In [18]:
# Perform cross-validation
cv_scores = cross_val_score(lr, X_train_sc, y_train, cv=5, scoring='r2')

# Print cross-validation results
print(f"Cross-Validation R-squared scores: {cv_scores}")
print(f"Average Cross-Validation R-squared: {cv_scores.mean():.2f}")

Cross-Validation R-squared scores: [0.47422281 0.4771258  0.47259864 0.47814143 0.46758992]
Average Cross-Validation R-squared: 0.47


In [19]:
# Define the Ridge and Lasso models
ridge = Ridge()
lasso = Lasso()

# Define parameter grids for both models
param_grid_ridge = {'alpha': [20.0, 30.0, 40.0]}
param_grid_lasso = {'alpha': [0.00001, 0.0001, 0.001]}

# Perform grid search with cross-validation for Ridge
ridge_cv = GridSearchCV(ridge, param_grid_ridge, cv=5, scoring='r2')
ridge_cv.fit(X_train_sc, y_train)

# Perform grid search with cross-validation for Lasso
lasso_cv = GridSearchCV(lasso, param_grid_lasso, cv=5, scoring='r2')
lasso_cv.fit(X_train_sc, y_train)

# Print the best parameters and scores
print(f"Best Ridge alpha: {ridge_cv.best_params_['alpha']}")
print(f"Best Ridge R-squared: {ridge_cv.best_score_:.2f}")

print(f"Best Lasso alpha: {lasso_cv.best_params_['alpha']}")
print(f"Best Lasso R-squared: {lasso_cv.best_score_:.2f}")

Best Ridge alpha: 20.0
Best Ridge R-squared: 0.47
Best Lasso alpha: 0.0001
Best Lasso R-squared: 0.47


Even with Cross Validation and Regularization techniques applied, the Linear Regression Model still fails to perform. Thus we will move on to other Regression models below.

## Other Models

There are too many models to test systematically, thus we will define a function to run through multiple models in a for loop to get a first look at all the models' performance

In [20]:
# Define a function to calculate adjusted R squared
def adjusted_r2(r2, n, k):
    return 1 - ((1 - r2) * (n - 1)) / (n - k - 1)

In [21]:
# Define a function to evaluate the model and return the metrics
def evaluate_model(model, X_train, X_test, y_train, y_test):
    print(f"Evaluating {model.__class__.__name__}...")
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    n_train = X_train.shape[0]
    n_test = X_test.shape[0]
    k = X_train.shape[1]
    
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    
    train_adj_r2 = adjusted_r2(train_r2, n_train, k)
    test_adj_r2 = adjusted_r2(test_r2, n_test, k)
    
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    
    print(f"Evaluation complete for {model.__class__.__name__}.")
    
    return {
        "Model": model.__class__.__name__,
        "Train R squared": train_r2,
        "Test R squared": test_r2,
        "Train Adjusted R squared": train_adj_r2,
        "Test Adjusted R squared": test_adj_r2,
        "Train MAE": train_mae,
        "Test MAE": test_mae
    }

### Models: 
SVR, BaggingRegressor, NuSVR, RandomForestRegressor, XGBRegressor, GradientBoostingRegressor, ExtraTreesRegressor

In [22]:
# Define the models to evaluate
models = [
    SVR(),
    BaggingRegressor(),
    NuSVR(),
    RandomForestRegressor(),
    XGBRegressor(),
    GradientBoostingRegressor(),
    ExtraTreesRegressor()
]

# Initialize a list to store the results
results = []

In [23]:
# Iterate through the models and evaluate each one
for model in models:
    result = evaluate_model(model, X_train_sc, X_val_sc, y_train, y_val)
    results.append(result)

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the results
print(results_df)

Evaluating SVR...
Evaluation complete for SVR.
Evaluating BaggingRegressor...
Evaluation complete for BaggingRegressor.
Evaluating NuSVR...
Evaluation complete for NuSVR.
Evaluating RandomForestRegressor...
Evaluation complete for RandomForestRegressor.
Evaluating XGBRegressor...
Evaluation complete for XGBRegressor.
Evaluating GradientBoostingRegressor...
Evaluation complete for GradientBoostingRegressor.
Evaluating ExtraTreesRegressor...
Evaluation complete for ExtraTreesRegressor.
                       Model  Train R squared  Test R squared  Train Adjusted R squared  Test Adjusted R squared  Train MAE  Test MAE
0                        SVR         0.549571        0.553625                  0.549526                 0.553444   1.393965  1.405248
1           BaggingRegressor         0.907744        0.507225                  0.907735                 0.507025   0.627659  1.572471
2                      NuSVR         0.556982        0.560426                  0.556937                 0.560

### Models: 
AdaBoostRegressor, HistGradientBoostingRegressor, PoissonRegressor, LGBMRegressor, KNeighborsRegressor, DecisionTreeRegressor, MLPRegressor 

In [24]:
models_2 = [
    AdaBoostRegressor(),
    HistGradientBoostingRegressor(),
    PoissonRegressor(),
    LGBMRegressor(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(),
    MLPRegressor(max_iter=500)  # Adjust max_iter as needed
]

# Initialize a list to store the results
results_2 = []

In [25]:
# Iterate through the models and evaluate each one
for model in models_2:
    result = evaluate_model(model, X_train_sc, X_val_sc, y_train, y_val)
    results_2.append(result)

# Create a DataFrame from the results
results2_df = pd.DataFrame(results_2)

# Print the results
print(results2_df)

Evaluating AdaBoostRegressor...
Evaluation complete for AdaBoostRegressor.
Evaluating HistGradientBoostingRegressor...
Evaluation complete for HistGradientBoostingRegressor.
Evaluating PoissonRegressor...
Evaluation complete for PoissonRegressor.
Evaluating LGBMRegressor...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008750 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1024
[LightGBM] [Info] Number of data points in the train set: 59240, number of used features: 6
[LightGBM] [Info] Start training from score 9.970172
Evaluation complete for LGBMRegressor.
Evaluating KNeighborsRegressor...
Evaluation complete for KNeighborsRegressor.
Evaluating DecisionTreeRegressor...
Evaluation complete for DecisionTreeRegressor.
Evaluating MLPRegressor...
Evaluation complete for MLPRegressor.
                           Model  Train R squared  Test R squared  Train Adjusted R squared  Test Adjusted R squared  

### Models: 
HuberRegressor, GammaRegressor, LinearSVR, RidgeCV, BayesianRidge, Ridge, ElasticNetCV, LassoCV

In [26]:
models_3 = [
    HuberRegressor(),
    GammaRegressor(),
    LinearSVR(),
    RidgeCV(),
    BayesianRidge(),
    Ridge(),
    ElasticNetCV(),
    LassoCV()
]

# Initialize a list to store the results
results_3 = []

# Iterate through the models and evaluate each one
for model in models_3:
    result = evaluate_model(model, X_train_sc, X_val_sc, y_train, y_val)
    results_3.append(result)

# Create a DataFrame from the results
results3_df = pd.DataFrame(results_3)

# Print the results
print(results3_df)

Evaluating HuberRegressor...
Evaluation complete for HuberRegressor.
Evaluating GammaRegressor...
Evaluation complete for GammaRegressor.
Evaluating LinearSVR...
Evaluation complete for LinearSVR.
Evaluating RidgeCV...
Evaluation complete for RidgeCV.
Evaluating BayesianRidge...
Evaluation complete for BayesianRidge.
Evaluating Ridge...
Evaluation complete for Ridge.
Evaluating ElasticNetCV...
Evaluation complete for ElasticNetCV.
Evaluating LassoCV...
Evaluation complete for LassoCV.
            Model  Train R squared  Test R squared  Train Adjusted R squared  Test Adjusted R squared  Train MAE  Test MAE
0  HuberRegressor         0.448755        0.443445                  0.448699                 0.443219   1.550769  1.563197
1  GammaRegressor         0.354389        0.352309                  0.354323                 0.352046   1.788950  1.804691
2       LinearSVR         0.440044        0.436148                  0.439987                 0.435920   1.548354  1.560440
3         RidgeCV 

### Models: 
LassoLarsIC, LassoLarsCV, Lars, LarsCV, SGDRegressor, TweedieRegressor, RANSACRegressor

In [27]:
# List of models to evaluate
models_4 = [
    LassoLarsIC(criterion='aic'),
    LassoLarsCV(cv=5),
    Lars(),
    LarsCV(cv=5),
    SGDRegressor(),
    TweedieRegressor(),
    RANSACRegressor(random_state=0)  # Remove base_estimator=None
]

results_4 = []

# Iterate through the models and evaluate each one
for model in models_4:
    result = evaluate_model(model, X_train_sc, X_val_sc, y_train, y_val)
    results_4.append(result)

# Create a DataFrame from the results
results4_df = pd.DataFrame(results_4)

# Print the results
print(results4_df)

Evaluating LassoLarsIC...
Evaluation complete for LassoLarsIC.
Evaluating LassoLarsCV...
Evaluation complete for LassoLarsCV.
Evaluating Lars...
Evaluation complete for Lars.
Evaluating LarsCV...
Evaluation complete for LarsCV.
Evaluating SGDRegressor...
Evaluation complete for SGDRegressor.
Evaluating TweedieRegressor...
Evaluation complete for TweedieRegressor.
Evaluating RANSACRegressor...
Evaluation complete for RANSACRegressor.
              Model  Train R squared  Test R squared  Train Adjusted R squared  Test Adjusted R squared  Train MAE  Test MAE
0       LassoLarsIC         0.474587        0.466948                  0.474534                 0.466732   1.612497  1.627154
1       LassoLarsCV         0.474587        0.466948                  0.474534                 0.466732   1.612497  1.627154
2              Lars         0.474587        0.466948                  0.474534                 0.466732   1.612497  1.627154
3            LarsCV         0.474587        0.466948           

### Models: 
OrthogonalMatchingPursuitCV, PassiveAggressiveRegressor, OrthogonalMatchingPursuit, ExtraTreeRegressor, DummyRegressor, LassoLars, Catboost

In [29]:
# List of models to evaluate
models_5 = [
    OrthogonalMatchingPursuitCV(),
    OrthogonalMatchingPursuit(),
    PassiveAggressiveRegressor(max_iter=1000, random_state=42),
    DummyRegressor(strategy='mean'),
    LassoLars(alpha=0.1),
    CatBoostRegressor(iterations=1000,  # Number of trees (iterations)
                          learning_rate=0.1,  # Learning rate
                          depth=6,  # Depth of trees
                          loss_function='RMSE',  # Loss function to optimize
                          random_seed=42)  # Random seed for reproducibility
]

results_5 = []

# Iterate through the models and evaluate each one
for model in models_5:
    result = evaluate_model(model, X_train_sc, X_val_sc, y_train, y_val)
    results_5.append(result)

# Create a DataFrame from the results
results5_df = pd.DataFrame(results_5)

# Print the results
print(results5_df)

Evaluating OrthogonalMatchingPursuitCV...
Evaluation complete for OrthogonalMatchingPursuitCV.
Evaluating OrthogonalMatchingPursuit...
Evaluation complete for OrthogonalMatchingPursuit.
Evaluating PassiveAggressiveRegressor...
Evaluation complete for PassiveAggressiveRegressor.
Evaluating DummyRegressor...
Evaluation complete for DummyRegressor.
Evaluating LassoLars...
Evaluation complete for LassoLars.
Evaluating CatBoostRegressor...
0:	learn: 3.0191532	total: 60.8ms	remaining: 1m
1:	learn: 2.8878617	total: 63ms	remaining: 31.4s
2:	learn: 2.7749601	total: 65.5ms	remaining: 21.8s
3:	learn: 2.6776342	total: 67.6ms	remaining: 16.8s
4:	learn: 2.5938084	total: 69.5ms	remaining: 13.8s
5:	learn: 2.5239129	total: 71.6ms	remaining: 11.9s
6:	learn: 2.4630254	total: 73.7ms	remaining: 10.5s
7:	learn: 2.4111661	total: 75.9ms	remaining: 9.41s
8:	learn: 2.3682239	total: 78ms	remaining: 8.59s
9:	learn: 2.3312515	total: 80.2ms	remaining: 7.94s
10:	learn: 2.2987253	total: 83ms	remaining: 7.46s
11:	lear

In [30]:
# Concatenate the DataFrames vertically
concatenated_df = pd.concat([results_df, results2_df, results3_df, results4_df, results5_df], ignore_index=True)

# Print the concatenated DataFrame
print(concatenated_df)

                            Model  Train R squared  Test R squared  Train Adjusted R squared  Test Adjusted R squared  Train MAE  Test MAE
0                             SVR         0.549571        0.553625                  0.549526                 0.553444   1.393965  1.405248
1                BaggingRegressor         0.907744        0.507225                  0.907735                 0.507025   0.627659  1.572471
2                           NuSVR         0.556982        0.560426                  0.556937                 0.560248   1.403508  1.413038
3           RandomForestRegressor         0.932733        0.542807                  0.932726                 0.542621   0.566542  1.509389
4                    XGBRegressor         0.656742        0.556128                  0.656708                 0.555948   1.298965  1.471864
5       GradientBoostingRegressor         0.572543        0.569150                  0.572500                 0.568975   1.429928  1.450187
6             ExtraTreesReg

## Shortlisting Models

In [35]:
# Filter rows where both 'Train MAE' and 'Test MAE' are less than 1.5
# shortlist_models = concatenated_df[(concatenated_df['Train MAE'] < 1.5) & (concatenated_df['Test MAE'] < 1.5)]

shortlist_models = concatenated_df[concatenated_df['Train MAE'] < 1.4]
print(shortlist_models)

                    Model  Train R squared  Test R squared  Train Adjusted R squared  Test Adjusted R squared  Train MAE  Test MAE
0                     SVR         0.549571        0.553625                  0.549526                 0.553444   1.393965  1.405248
1        BaggingRegressor         0.907744        0.507225                  0.907735                 0.507025   0.627659  1.572471
3   RandomForestRegressor         0.932733        0.542807                  0.932726                 0.542621   0.566542  1.509389
4            XGBRegressor         0.656742        0.556128                  0.656708                 0.555948   1.298965  1.471864
6     ExtraTreesRegressor         0.997723        0.528290                  0.997723                 0.528099   0.012272  1.531947
10          LGBMRegressor         0.596959        0.568985                  0.596918                 0.568811   1.393031  1.446836
11    KNeighborsRegressor         0.654837        0.490612                  0.65480

Based on the performances above, we will move on to further finetune the models below:

1. BaggingRegressor
2. RandomForest
3. XGBoost
4. Extra Trees
5. CatBoost

# Model Finetuning

### Model 1: Bagging Regressor

In [40]:
# Define parameter grid
param_grid = {
    'n_estimators': [150, 175, 200],
    'max_samples': [0.01, 0.05, 0.1],
    'bootstrap': [True],
    'bootstrap_features': [False]
}

# Setup GridSearchCV
grid_search = GridSearchCV(estimator=BaggingRegressor(random_state=42),
                           param_grid=param_grid,
                           scoring='neg_mean_absolute_error',  # Optimised for MAE
                           cv=5,
                           verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train_sc, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END bootstrap=True, bootstrap_features=False, max_samples=0.01, n_estimators=150;, score=-1.471 total time=   0.4s
[CV 2/5] END bootstrap=True, bootstrap_features=False, max_samples=0.01, n_estimators=150;, score=-1.462 total time=   0.4s
[CV 3/5] END bootstrap=True, bootstrap_features=False, max_samples=0.01, n_estimators=150;, score=-1.481 total time=   0.4s
[CV 4/5] END bootstrap=True, bootstrap_features=False, max_samples=0.01, n_estimators=150;, score=-1.466 total time=   0.4s
[CV 5/5] END bootstrap=True, bootstrap_features=False, max_samples=0.01, n_estimators=150;, score=-1.465 total time=   0.4s
[CV 1/5] END bootstrap=True, bootstrap_features=False, max_samples=0.01, n_estimators=175;, score=-1.468 total time=   0.5s
[CV 2/5] END bootstrap=True, bootstrap_features=False, max_samples=0.01, n_estimators=175;, score=-1.460 total time=   0.5s
[CV 3/5] END bootstrap=True, bootstrap_features=False, max_samples=0.01,

In [41]:
# Get best parameters and best score
best_params = grid_search.best_params_
best_score = -grid_search.best_score_  # Convert negative score back to positive MAE
print(f"Best parameters: {best_params}")
print(f"Best score (Mean Absolute Error): {best_score}")

# Evaluate on validation set
y_pred = grid_search.best_estimator_.predict(X_val_sc)
mae = mean_absolute_error(y_val, y_pred)
print(f"Mean Absolute Error on validation set: {mae}")

Best parameters: {'bootstrap': True, 'bootstrap_features': False, 'max_samples': 0.05, 'n_estimators': 175}
Best score (Mean Absolute Error): 1.461762740747886
Mean Absolute Error on validation set: 1.4626322778115366


### Model 2: Random Forest Regressor

In [45]:
# Define parameter grid
param_grid = {
    'n_estimators': [225, 250],
    'max_depth': [10],
    'min_samples_split': [10],
    'min_samples_leaf': [4],
    'max_features': ['sqrt']
}

# Setup GridSearchCV with MAE as scoring metric
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                           param_grid=param_grid,
                           scoring='neg_mean_absolute_error',  # Use neg_mean_absolute_error for GridSearchCV
                           cv=5,  # Example cross-validation
                           verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train_sc, y_train)

# Get best parameters and best score
best_params = grid_search.best_params_
best_score = -grid_search.best_score_  # Convert negative score back to positive MAE
print(f"Best parameters: {best_params}")
print(f"Best score (Mean Absolute Error): {best_score}")

# Evaluate on validation set
y_pred = grid_search.best_estimator_.predict(X_val_sc)
mae = mean_absolute_error(y_val, y_pred)
print(f"Mean Absolute Error on validation set: {mae}")

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=225; total time=   6.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=225; total time=   6.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=225; total time=   6.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=225; total time=   6.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=225; total time=   6.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=250; total time=   7.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=250; total time=   7.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimat

### Model 3: XGBoost Regressor

In [48]:
# Define parameter grid for XGBRegressor
param_grid = {
    'n_estimators': [100],
    'max_depth': [5, 7],
    'learning_rate': [0.05],
    'subsample': [0.4, 0.6],
    'colsample_bytree': [0.8],
    'gamma': [5],
    'reg_alpha': [1],
    'reg_lambda': [2]
}

# Setup GridSearchCV with MAE as scoring metric
grid_search = GridSearchCV(estimator=XGBRegressor(random_state=42, objective='reg:squarederror'),
                           param_grid=param_grid,
                           scoring='neg_mean_absolute_error',  # Use neg_mean_absolute_error for GridSearchCV
                           cv=5,  # Example cross-validation
                           verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train_sc, y_train)

# Get best parameters and best score
best_params = grid_search.best_params_
best_score = -grid_search.best_score_  # Convert negative score back to positive MAE
print(f"Best parameters: {best_params}")
print(f"Best score (Mean Absolute Error): {best_score}")

# Evaluate on validation set
y_pred = grid_search.best_estimator_.predict(X_val_sc)
mae = mean_absolute_error(y_val, y_pred)
print(f"Mean Absolute Error on validation set: {mae}")

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END colsample_bytree=0.8, gamma=5, learning_rate=0.05, max_depth=5, n_estimators=100, reg_alpha=1, reg_lambda=2, subsample=0.4; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=5, learning_rate=0.05, max_depth=5, n_estimators=100, reg_alpha=1, reg_lambda=2, subsample=0.4; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=5, learning_rate=0.05, max_depth=5, n_estimators=100, reg_alpha=1, reg_lambda=2, subsample=0.4; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=5, learning_rate=0.05, max_depth=5, n_estimators=100, reg_alpha=1, reg_lambda=2, subsample=0.4; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=5, learning_rate=0.05, max_depth=5, n_estimators=100, reg_alpha=1, reg_lambda=2, subsample=0.4; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=5, learning_rate=0.05, max_depth=5, n_estimators=100, reg_alpha=1, reg_lambda=2, subsample=0.6; total time=   0.1s
[CV] END colsample_bytree=0.

## Model 4: Extra Trees

In [50]:
# 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
# Define parameter grid for ExtraTreesRegressor
param_grid = {
    'n_estimators': [200, 250],
    'max_depth': [20, 25],
    'min_samples_split': [10, 15],
    'min_samples_leaf': [1],
    'max_features': ['sqrt'],
    'bootstrap': [False]
}

# Setup GridSearchCV with MAE as scoring metric
grid_search = GridSearchCV(estimator=ExtraTreesRegressor(random_state=42),
                           param_grid=param_grid,
                           scoring='neg_mean_absolute_error',  # Use neg_mean_absolute_error for GridSearchCV
                           cv=5,  # Example cross-validation
                           verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train_sc, y_train)

# Get best parameters and best score
best_params = grid_search.best_params_
best_score = -grid_search.best_score_  # Convert negative score back to positive MAE
print(f"Best parameters: {best_params}")
print(f"Best score (Mean Absolute Error): {best_score}")

# Evaluate on validation set
y_pred = grid_search.best_estimator_.predict(X_val_sc)
mae = mean_absolute_error(y_val, y_pred)
print(f"Mean Absolute Error on validation set: {mae}")

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   2.3s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   2.3s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   2.4s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   2.3s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   2.4s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=250; total time=   3.0s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimator

## Model 5: Cat Boost

In [53]:
# Best parameters: {'border_count': 50, 'depth': 8, 'iterations': 200, 'l2_leaf_reg': 3, 'learning_rate': 0.03}
# Best parameters: {'border_count': 64, 'depth': 8, 'iterations': 250, 'l2_leaf_reg': 3, 'learning_rate': 0.03}

# Define parameter grid for CatBoostRegressor
param_grid = {
    'iterations': [250, 300],
    'depth': [8],
    'learning_rate': [0.03],
    'l2_leaf_reg': [3],
    'border_count': [64, 100]
}

# Setup GridSearchCV with MAE as scoring metric
grid_search = GridSearchCV(estimator=CatBoostRegressor(random_state=42, silent=True),
                           param_grid=param_grid,
                           scoring='neg_mean_absolute_error',  # Use neg_mean_absolute_error for GridSearchCV
                           cv=5,  # Example cross-validation
                           verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train_sc, y_train)

# Get best parameters and best score
best_params = grid_search.best_params_
best_score = -grid_search.best_score_  # Convert negative score back to positive MAE
print(f"Best parameters: {best_params}")
print(f"Best score (Mean Absolute Error): {best_score}")

# Evaluate on validation set
y_pred = grid_search.best_estimator_.predict(X_val_sc)
mae = mean_absolute_error(y_val, y_pred)
print(f"Mean Absolute Error on validation set: {mae}")

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END border_count=64, depth=8, iterations=250, l2_leaf_reg=3, learning_rate=0.03; total time=   0.6s
[CV] END border_count=64, depth=8, iterations=250, l2_leaf_reg=3, learning_rate=0.03; total time=   0.6s
[CV] END border_count=64, depth=8, iterations=250, l2_leaf_reg=3, learning_rate=0.03; total time=   0.7s
[CV] END border_count=64, depth=8, iterations=250, l2_leaf_reg=3, learning_rate=0.03; total time=   0.6s
[CV] END border_count=64, depth=8, iterations=250, l2_leaf_reg=3, learning_rate=0.03; total time=   0.6s
[CV] END border_count=64, depth=8, iterations=300, l2_leaf_reg=3, learning_rate=0.03; total time=   0.7s
[CV] END border_count=64, depth=8, iterations=300, l2_leaf_reg=3, learning_rate=0.03; total time=   0.7s
[CV] END border_count=64, depth=8, iterations=300, l2_leaf_reg=3, learning_rate=0.03; total time=   0.8s
[CV] END border_count=64, depth=8, iterations=300, l2_leaf_reg=3, learning_rate=0.03; total time=   

# Preparing test datasets

In [None]:
# Test dataset dataframe

test_df = pd.read_csv('../data/test.csv')

test_df = helpers.clean_headers(test_df)

In [None]:
# Test dataset feature engineering

test_df['surface_area'] = calc_surf_area(test_df['length'], test_df['diameter']) # Surface area
test_df['meat_ratio'] = calc_ratio(test_df['weight'], test_df['shucked_weight']) # Meat ratio
test_df['viscera_ratio'] = calc_ratio(test_df['weight'], test_df['viscera_weight']) # Viscera ratio
test_df['shell_ratio'] = calc_ratio(test_df['weight'], test_df['shell_weight']) # Shell ratio

# Drop all original features to match train dataset

test_df.drop(columns=['length', 'diameter', 'weight', 'shucked_weight', 'viscera_weight', 'shell_weight', 'height'], inplace = True)

test_df.head()

In [None]:
# Data pre-processing

# apply transformations
scaled_test_data = preprocessor.transform(test_df)

# get column headers
column_names = preprocessor.get_feature_names_out()

# Convert transformed data back to Dataframe
scaled_test = pd.DataFrame(scaled_test_data, columns=column_names)

print(scaled_test.head()) # debug