In [6]:
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# --- Load dataset ---
# The path is relative, so steel.csv is resolved against the kernel's
# working directory.
data = pd.read_csv(filepath_or_buffer="steel.csv")

# Sanity check: print (rows, columns), then preview the first five rows.
print(data.shape)
data.head(n=5)


(553, 10)


Unnamed: 0,normalising_temperature,tempering_temperature,percent_silicon,percent_chromium,percent_copper,percent_nickel,percent_sulphur,percent_carbon,percent_manganese,tensile_strength
0,178.5,275,0.153,0.970575,0.942,0.887,0.0,1.92,0.0,25.107613
1,178.5,950,0.153,1.212726,0.942,0.887,0.0,1.92,0.0,140.035334
2,178.5,375,0.153,1.621165,0.942,0.887,0.0,1.92,0.0,42.21765
3,178.5,900,0.153,0.809989,0.942,0.887,0.0,1.92,0.0,95.015309
4,189.525,900,0.1624,1.036229,0.849,0.9382,0.0,2.035,0.0,113.266773


In [9]:
# CT4101 Assignment 2 - RandomForestRegressor

# The code below is the baseline model: no hyperparameters are tuned manually.
# Apart from the fixed random_state, every hyperparameter in this block keeps the library's default value.

import pandas as pd
from sklearn.model_selection import KFold, cross_validate
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
import numpy as np

# 1. Read the dataset; the relative path means steel.csv has to be in the
#    notebook's working directory.
data = pd.read_csv(filepath_or_buffer="steel.csv")

# 2. Split the frame into the regression target (y) and the predictors (X).
y = data.loc[:, "tensile_strength"]
X = data.drop("tensile_strength", axis=1)

# 3. Baseline forest: only the seed is fixed; every other hyperparameter
#    keeps its library default (n_estimators, max_depth etc. are tuned later).
rf = RandomForestRegressor(random_state=42)

# 4. 10-fold cross-validation, shuffled with a fixed seed. The later cells
#    pass this same splitter to their own CV / grid-search calls.
kf = KFold(shuffle=True, n_splits=10, random_state=42)

# 5. define scorers
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, aligned with ``y_true``.

    Returns
    -------
    float
        Square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Metrics reported for every model in this notebook. make_scorer with
# greater_is_better=False negates rmse, so those scores come back negative
# and are sign-flipped before being reported.
scoring = dict(
    rmse=make_scorer(rmse, greater_is_better=False),
    r2="r2",
)

# 6. Cross-validate the default forest on the shared folds, keeping the
#    training-fold scores next to the held-out ones.
cv_results = cross_validate(
    estimator=rf,
    X=X,
    y=y,
    scoring=scoring,
    cv=kf,
    return_train_score=True,
)

# 7. Average each metric over the folds. The RMSE scorer returns negated
#    values, so the sign is flipped back here.
mean_train_rmse = -np.mean(cv_results["train_rmse"])
mean_test_rmse = -np.mean(cv_results["test_rmse"])
mean_train_r2 = np.mean(cv_results["train_r2"])
mean_test_r2 = np.mean(cv_results["test_r2"])

# One print call, one line per entry.
print(
    "RandomForestRegressor (No Hyperparams)",
    f"Avg TRAIN RMSE: {mean_train_rmse:.3f}",
    f"Avg TEST  RMSE: {mean_test_rmse:.3f}",
    f"Avg TRAIN R2 : {mean_train_r2:.3f}",
    f"Avg TEST  R2 : {mean_test_r2:.3f}",
    sep="\n",
)


RandomForestRegressor (No Hyperparams)
Avg TRAIN RMSE: 10.934
Avg TEST  RMSE: 28.276
Avg TRAIN R2 : 0.986
Avg TEST  R2 : 0.896


In [8]:
# This model is tuned: candidate hyperparameter values are chosen by hand and compared with a grid search.

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor


# Candidate values for the search: 4 forest sizes x 5 depth settings,
# i.e. 20 hyperparameter combinations.
param_grid = {
    "n_estimators": [50, 100, 200, 300],   # number of trees
    "max_depth": [None, 5, 10, 15, 20],    # tree depth (None = no depth limit)
}

rf_base = RandomForestRegressor(random_state=42)

# GridSearchCV automates what would otherwise be done by hand: it
# cross-validates every combination in param_grid on the given folds and
# records the scores of each one in cv_results_.
grid_rf = GridSearchCV(
    estimator=rf_base,
    param_grid=param_grid,
    cv=kf,                    # same 10-fold split as the baseline
    scoring=scoring,          # reuse the shared metric dict so every cell scores identically
    refit="rmse",             # best model = lowest RMSE (scores are negated RMSE)
    return_train_score=True,
    n_jobs=-1,                # run the fits in parallel on all available cores
)

grid_rf.fit(X, y)

print("Best choice for Hyperparameters:", grid_rf.best_params_)

# With refit="rmse", best_estimator_ is the winning configuration refitted
# on the full X, y.
best_rf = grid_rf.best_estimator_

# Re-run 10-fold CV for the winning configuration with the shared splitter
# and metrics.
# NOTE(review): this appears to repeat folds GridSearchCV already scored
# (the averages should also be in grid_rf.cv_results_ at
# grid_rf.best_index_) - confirm before removing the refit.
cv_best = cross_validate(
    estimator=best_rf,
    X=X,
    y=y,
    scoring=scoring,
    cv=kf,
    return_train_score=True,
)

# Fold averages; RMSE scores are negated by the scorer, so flip the sign.
best_train_rmse = -np.mean(cv_best["train_rmse"])
best_test_rmse = -np.mean(cv_best["test_rmse"])
best_train_r2 = np.mean(cv_best["train_r2"])
best_test_r2 = np.mean(cv_best["test_r2"])

print(
    "\nRandomForestRegressor (tuned)",
    f"Avg TRAIN RMSE: {best_train_rmse:.3f}",
    f"Avg TEST  RMSE: {best_test_rmse:.3f}",
    f"Avg TRAIN R2 : {best_train_r2:.3f}",
    f"Avg TEST  R2 : {best_test_r2:.3f}",
    sep="\n",
)


Best choice for Hyperparameters: {'max_depth': None, 'n_estimators': 300}

RandomForestRegressor (tuned)
Avg TRAIN RMSE: 10.667
Avg TEST  RMSE: 28.071
Avg TRAIN R² : 0.986
Avg TEST  R² : 0.897


In [10]:
import pandas as pd

# Columns of the grid-search log that are relevant for the comparison.
cols_to_show = [
    "param_n_estimators",
    "param_max_depth",
    "mean_test_rmse",
    "mean_test_r2",
    "rank_test_rmse",
]

# Build the summary table in a single chain:
#   1. turn the grid-search log into a DataFrame,
#   2. keep only the relevant columns,
#   3. make RMSE positive again (it is stored negated because the scorer
#      was created with greater_is_better=False),
#   4. order the rows by rank (rank 1 is the best RMSE).
results_df = (
    pd.DataFrame(grid_rf.cv_results_)
    .loc[:, cols_to_show]
    .assign(mean_test_rmse=lambda frame: frame["mean_test_rmse"].abs())
    .sort_values("rank_test_rmse")
)

results_df


Unnamed: 0,param_n_estimators,param_max_depth,mean_test_rmse,mean_test_r2,rank_test_rmse
3,300,,28.070871,0.89704,1
19,300,20.0,28.071908,0.897034,2
15,300,15.0,28.10869,0.896832,3
18,200,20.0,28.108866,0.896982,4
2,200,,28.109203,0.896978,5
14,200,15.0,28.15593,0.896749,6
1,100,,28.275983,0.895853,7
17,100,20.0,28.279979,0.895851,8
13,100,15.0,28.317277,0.89554,9
11,300,10.0,28.485846,0.894018,10


In [11]:
# Tuning a different pair of hyperparameters (max_features and min_samples_split)

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_validate
import numpy as np

# Second hyperparameter grid: 3 feature-subset rules x 4 split thresholds,
# i.e. 12 combinations. n_estimators and max_depth stay at their defaults.
param_grid_2 = {
    "max_features": ["sqrt", "log2", None],
    "min_samples_split": [2, 5, 10, 20],
}

rf_base_2 = RandomForestRegressor(random_state=42)

# Cross-validate every combination in param_grid_2 on the shared folds.
grid_rf_2 = GridSearchCV(
    estimator=rf_base_2,
    param_grid=param_grid_2,
    cv=kf,                    # same 10-fold CV
    scoring=scoring,          # reuse the shared metric dict so every cell scores identically
    refit="rmse",             # best model = lowest RMSE (scores are negated RMSE)
    return_train_score=True,
    n_jobs=-1,                # run the fits in parallel on all available cores
)

grid_rf_2.fit(X, y)

print("Best choice of hyperparameters (grid 2):", grid_rf_2.best_params_)

# With refit="rmse", best_estimator_ is the winning configuration refitted
# on the full X, y.
best_rf_2 = grid_rf_2.best_estimator_

# 10-fold CV of the best grid-2 configuration with the shared splitter and
# metrics.
cv_best_2 = cross_validate(
    estimator=best_rf_2,
    X=X,
    y=y,
    scoring=scoring,
    cv=kf,
    return_train_score=True,
)

# Fold averages; RMSE scores are negated by the scorer, so flip the sign.
best2_train_rmse = -np.mean(cv_best_2["train_rmse"])
best2_test_rmse = -np.mean(cv_best_2["test_rmse"])
best2_train_r2 = np.mean(cv_best_2["train_r2"])
best2_test_r2 = np.mean(cv_best_2["test_r2"])

print(
    "\nRandomForestRegressor (tuned – grid 2)",
    f"Avg TRAIN RMSE: {best2_train_rmse:.3f}",
    f"Avg TEST  RMSE: {best2_test_rmse:.3f}",
    f"Avg TRAIN R2 : {best2_train_r2:.3f}",
    f"Avg TEST  R2 : {best2_test_r2:.3f}",
    sep="\n",
)


Best choice of hyperparameters (grid 2): {'max_features': None, 'min_samples_split': 2}

RandomForestRegressor (tuned – grid 2)
Avg TRAIN RMSE: 10.934
Avg TEST  RMSE: 28.276
Avg TRAIN R2 : 0.986
Avg TEST  R2 : 0.896


In [12]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
import numpy as np

# KNN baseline model.
# The features are standardised first so that large-valued columns (the
# temperatures) do not outweigh the small-valued ones in the distance
# computation. KNeighborsRegressor keeps its default hyperparameters.
knn_pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("knn", KNeighborsRegressor()),
    ]
)

# 10-fold cross-validation using the same kf and scoring as the Random Forest.
cv_knn = cross_validate(
    estimator=knn_pipe,
    X=X,
    y=y,
    scoring=scoring,
    cv=kf,
    return_train_score=True,
)

# Average each metric across the folds; RMSE scores are negated by the
# scorer, so the sign is flipped back.
knn_train_rmse = -np.mean(cv_knn["train_rmse"])
knn_test_rmse = -np.mean(cv_knn["test_rmse"])
knn_train_r2 = np.mean(cv_knn["train_r2"])
knn_test_r2 = np.mean(cv_knn["test_r2"])

print(
    "KNN Regressor (baseline/default hyperparams)",
    f"Avg TRAIN RMSE: {knn_train_rmse:.3f}",
    f"Avg TEST  RMSE: {knn_test_rmse:.3f}",
    f"Avg TRAIN R2 : {knn_train_r2:.3f}",
    f"Avg TEST  R2 : {knn_test_r2:.3f}",
    sep="\n",
)


KNN Regressor (baseline/default hyperparams)
Avg TRAIN RMSE: 34.308
Avg TEST  RMSE: 42.656
Avg TRAIN R2 : 0.858
Avg TEST  R2 : 0.766


In [14]:
# Hyperparameter tuning model to compare to baseline.

from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

# Hyperparameter grid for KNN: 8 values of k x 2 distance metrics,
# i.e. 16 combinations. The "knn__" prefix addresses the "knn" step inside
# the pipeline below.
param_grid_knn = {
    "knn__n_neighbors": [1, 3, 5, 7, 9, 11, 15, 20],  # candidate values of k
    "knn__p": [1, 2],                                 # 1 = Manhattan, 2 = Euclidean
}

# Scaling + KNN in one pipeline: the scaler is fitted on the training part
# of each fold only, so no information from the held-out fold leaks into
# the model and invalidates the results.
knn_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsRegressor())
])

# Grid search with the same k-fold setup as the other models.
grid_knn = GridSearchCV(
    estimator=knn_pipe,
    param_grid=param_grid_knn,
    cv=kf,
    scoring=scoring,          # reuse the shared metric dict so every cell scores identically
    refit="rmse",             # best model = lowest RMSE (scores are negated RMSE)
    return_train_score=True,
    n_jobs=-1,                # run the fits in parallel on all available cores
)

# Fit the grid search
grid_knn.fit(X, y)

print("Best KNN hyperparams:", grid_knn.best_params_)

# With refit="rmse", best_estimator_ is the winning pipeline refitted on the
# full X, y; it is evaluated with 10-fold CV next.
best_knn = grid_knn.best_estimator_

# 10-fold CV of the best KNN pipeline with the shared splitter and metrics.
cv_knn_best = cross_validate(
    estimator=best_knn,
    X=X,
    y=y,
    scoring=scoring,
    cv=kf,
    return_train_score=True,
)

# Fold averages; RMSE scores are negated by the scorer, so flip the sign.
best_knn_train_rmse = -np.mean(cv_knn_best["train_rmse"])
best_knn_test_rmse = -np.mean(cv_knn_best["test_rmse"])
best_knn_train_r2 = np.mean(cv_knn_best["train_r2"])
best_knn_test_r2 = np.mean(cv_knn_best["test_r2"])

print(
    "\nKNN Regressor (tuned)",
    f"Avg TRAIN RMSE: {best_knn_train_rmse:.3f}",
    f"Avg TEST  RMSE: {best_knn_test_rmse:.3f}",
    f"Avg TRAIN R2 : {best_knn_train_r2:.3f}",
    f"Avg TEST  R2 : {best_knn_test_r2:.3f}",
    sep="\n",
)


Best KNN hyperparams: {'knn__n_neighbors': 3, 'knn__p': 1}

KNN Regressor (tuned)
Avg TRAIN RMSE: 29.381
Avg TEST  RMSE: 42.495
Avg TRAIN R2 : 0.896
Avg TEST  R2 : 0.771
