<a href="https://colab.research.google.com/github/thegayankalinga/see_model_implementation/blob/main/see_implementation_v5_p2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📌 Phase 2: Traditional Machine Learning Model Implementation


### Install & Import Required Libraries

In [10]:
# Install necessary libraries (if not installed)
!pip install pandas numpy scikit-learn xgboost matplotlib seaborn joblib --quiet

# Import required libraries
import pandas as pd
import numpy as np
import os
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Mount Google Drive; every project directory configured below lives under it.
from google.colab import drive
drive.mount('/content/drive')

# Project directories on the mounted drive:
#   DATA_PATH        - preprocessed train/validation/test CSV splits (read)
#   PERFORMANCE_DATA - model performance comparison CSV (written)
#   MODELS_PATH      - serialized tuned models (written)
DATA_PATH = "/content/drive/MyDrive/Projects/msc_project/results_data/"
PERFORMANCE_DATA = "/content/drive/MyDrive/Projects/msc_project/performance_data/"
MODELS_PATH = "/content/drive/MyDrive/Projects/msc_project/models/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Load preprocessed training, validation & test data

In [2]:
# Read the six preprocessed CSV splits (features X_*, targets y_*) from DATA_PATH
def _load_split(filename):
    """Read one CSV file located in DATA_PATH into a DataFrame."""
    return pd.read_csv(os.path.join(DATA_PATH, filename))


X_train, X_val, X_test = (
    _load_split(filename) for filename in ("X_train.csv", "X_val.csv", "X_test.csv")
)
y_train, y_val, y_test = (
    _load_split(filename) for filename in ("y_train.csv", "y_val.csv", "y_test.csv")
)

# Report the (rows, columns) shape of each feature/target pair
print(f"X_train Shape: {X_train.shape}, y_train Shape: {y_train.shape}")
print(f"X_val Shape: {X_val.shape}, y_val Shape: {y_val.shape}")
print(f"X_test Shape: {X_test.shape}, y_test Shape: {y_test.shape}")

# Preview the first rows of the training features (last expression -> rich display)
X_train.head()

X_train Shape: (35000, 41), y_train Shape: (35000, 4)
X_val Shape: (7500, 41), y_val Shape: (7500, 4)
X_test Shape: (7500, 41), y_test Shape: (7500, 4)


Unnamed: 0,region,dev_environment,sit_environment,uat_environment,staging_environment,training_environment,production_environment,dr_environment,compliance_pci_sff,compliance_country_specific,...,tps_required,warranty_months,no_of_functional_modules,no_of_none_functional_modules,uat_cycles,test_coverage,rest_integration_points,soap_integration_points,iso8583_integration_points,sdk_integration_points
0,2,0,0,0,0,0,0,0,0,1,...,-1.790662,-0.655402,1.080574,1.012685,-0.44441,1.348596,-0.833149,1.173808,-0.621911,-0.620713
1,2,0,1,0,0,0,0,0,0,0,...,0.017691,-0.655402,-0.686674,-1.003552,-1.338956,1.348596,-1.031342,-0.690523,1.019882,-0.620713
2,0,0,1,0,0,0,0,1,0,0,...,1.309371,-0.655402,-0.486608,0.157312,-0.44441,0.452424,-0.337667,-0.379802,-0.621911,1.021125
3,2,0,1,0,0,0,0,1,0,0,...,0.792699,-0.655402,-1.486937,-0.087081,-1.338956,0.452424,-0.238571,0.863086,-0.621911,-0.620713
4,2,0,1,0,0,0,0,0,0,1,...,-1.015654,1.525781,-1.220182,-1.858925,-0.44441,1.348596,1.446068,0.863086,-0.621911,1.021125


### Implement Baseline Models (RF & XGB)

#### Random Forest (RF) Baseline Model

In [3]:
# Baseline Random Forest for multi-output regression: 100 trees, fixed seed,
# all CPU cores. y_train is passed as loaded, so all target columns are fit together.
# fit() returns the fitted estimator, so construction and training are chained.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Predictions on the validation split, used for the baseline evaluation
y_pred_rf = rf_model.predict(X_val)

# Evaluate the model
def evaluate_model(y_true, y_pred, model_name):
    """Print and return regression metrics for one set of predictions.

    Args:
        y_true: Ground-truth targets (DataFrame or array-like), one column per target.
        y_pred: Predicted targets with the same number of elements as ``y_true``.
        model_name: Label used in the printed report header.

    Returns:
        dict with the keys ``"MSE"``, ``"RMSE"``, ``"R-Squared"``, ``"MAPE (%)"``
        and ``"MMRE"`` mapped to float values, so callers can collect the metrics
        instead of re-reading them from the printed output.

    Raises:
        ValueError: if ``y_pred`` cannot be reshaped to the shape of ``y_true``.
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    # Work on plain float arrays: the mean is then a single scalar regardless of
    # the input container, and a DataFrame/ndarray mix cannot misalign.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float).reshape(actual.shape)

    # Relative error is undefined for a zero target. Clamp the *magnitude* of the
    # denominator to a small epsilon; adding epsilon to the signed value would
    # still hit zero for a target of -epsilon.
    eps = 1e-10
    relative_error = np.abs(actual - predicted) / np.maximum(np.abs(actual), eps)
    mmre = float(np.mean(relative_error))
    mape = mmre * 100  # MAPE is MMRE expressed as a percentage

    print(f"\n📌 {model_name} Model Evaluation:")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R-Squared: {r2:.4f}")
    print(f"MAPE: {mape:.2f}%")
    print(f"MMRE: {mmre:.4f}")

    return {
        "MSE": float(mse),
        "RMSE": float(rmse),
        "R-Squared": float(r2),
        "MAPE (%)": mape,
        "MMRE": mmre,
    }

# Report the baseline Random Forest's metrics on the validation split
evaluate_model(y_true=y_val, y_pred=y_pred_rf, model_name="Random Forest")


📌 Random Forest Model Evaluation:
MSE: 0.1376
RMSE: 0.3710
R-Squared: 0.8632
MAPE: 1026.43%
MMRE: 10.2643


#### XGBoost Baseline Model

In [4]:
# Baseline XGBoost for multi-output regression: squared-error objective,
# 100 boosting rounds, fixed seed. fit() returns the fitted estimator, so
# construction and training are chained.
xgb_model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predictions on the validation split
y_pred_xgb = xgb_model.predict(X_val)

# Report the baseline XGBoost metrics on the validation split
evaluate_model(y_true=y_val, y_pred=y_pred_xgb, model_name="XGBoost")


📌 XGBoost Model Evaluation:
MSE: 0.0179
RMSE: 0.1336
R-Squared: 0.9822
MAPE: 453.01%
MMRE: 4.5301


### Hyperparameter Tuning

#### Tune Random Forest (RF)

In [5]:
# Hyperparameter search space for the Random Forest.
#
# A wider grid was planned but is not used because of Colab time/resource limits:
# rf_params = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [10, 20, 30, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

# Randomized search: 10 sampled candidates, each scored with 3-fold CV.
# random_state pins which candidates are sampled, so a fresh run of the
# notebook evaluates the same combinations and reproduces the reported result.
rf_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42, n_jobs=-1),
    param_distributions=rf_params,
    n_iter=10,
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42,
)

rf_search.fit(X_train, y_train)

# Best parameters
print("\n📌 Best RF Hyperparameters:", rf_search.best_params_)

# best_estimator_ was already refit on the full training data by the search
# (refit is left at its default), so no extra training step is needed here.
best_rf = rf_search.best_estimator_
y_pred_rf_best = best_rf.predict(X_val)

# Evaluate tuned RF on the validation split
evaluate_model(y_val, y_pred_rf_best, "Optimized Random Forest")

Fitting 3 folds for each of 10 candidates, totalling 30 fits

📌 Best RF Hyperparameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 20}

📌 Optimized Random Forest Model Evaluation:
MSE: 0.1361
RMSE: 0.3689
R-Squared: 0.8647
MAPE: 992.53%
MMRE: 9.9252


#### Tune XGBoost

In [7]:
# Hyperparameter search space for XGBoost.
#
# A wider grid that is currently not used:
# xgb_params = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [3, 5, 7],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'subsample': [0.8, 0.9, 1.0],
#     'colsample_bytree': [0.8, 0.9, 1.0]
# }
xgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.05],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}

# Randomized search: 10 sampled candidates, each scored with 3-fold CV.
# random_state pins which candidates are sampled, so a fresh run of the
# notebook evaluates the same combinations and reproduces the reported result.
xgb_search = RandomizedSearchCV(
    XGBRegressor(objective="reg:squarederror", random_state=42),
    param_distributions=xgb_params,
    n_iter=10,
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42,
)

xgb_search.fit(X_train, y_train)

# Best parameters
print("\n📌 Best XGBoost Hyperparameters:", xgb_search.best_params_)

# best_estimator_ was already refit on the full training data by the search
# (refit is left at its default), so no extra training step is needed here.
best_xgb = xgb_search.best_estimator_
y_pred_xgb_best = best_xgb.predict(X_val)

# Evaluate tuned XGBoost on the validation split
evaluate_model(y_val, y_pred_xgb_best, "Optimized XGBoost")

Fitting 3 folds for each of 10 candidates, totalling 30 fits

📌 Best XGBoost Hyperparameters: {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.05, 'colsample_bytree': 0.8}

📌 Optimized XGBoost Model Evaluation:
MSE: 0.0260
RMSE: 0.1612
R-Squared: 0.9741
MAPE: 471.86%
MMRE: 4.7186


### Save Model Performance Data and the Best Models

In [11]:
# Save Model Performance Data to a Single CSV File
def _validation_metrics(y_true, y_pred, eps=1e-10):
    """Compute the comparison-table metrics for one set of predictions.

    Args:
        y_true: Ground-truth targets (DataFrame or array-like).
        y_pred: Predictions with the same number of elements as ``y_true``.
        eps: Lower bound applied to ``|y_true|`` so a zero target cannot cause
            a division by zero in the relative-error metrics.

    Returns:
        dict keyed by the performance-table column names
        ("MSE", "RMSE", "R-Squared", "MAPE (%)", "MMRE").
    """
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float).reshape(actual.shape)
    relative_error = np.abs(actual - predicted) / np.maximum(np.abs(actual), eps)
    mmre = float(np.mean(relative_error))
    return {
        "MSE": float(mse),
        "RMSE": float(np.sqrt(mse)),
        "R-Squared": float(r2),
        "MAPE (%)": mmre * 100,  # MAPE is MMRE expressed as a percentage
        "MMRE": mmre,
    }


# The metrics are computed here from the validation predictions produced by the
# earlier cells; evaluate_model() only prints them, so there are no stored
# metric variables to reuse.
validation_predictions = {
    "Baseline Random Forest": y_pred_rf,
    "Baseline XGBoost": y_pred_xgb,
    "Optimized Random Forest": y_pred_rf_best,
    "Optimized XGBoost": y_pred_xgb_best,
}

performance_data = {
    "Model": [],
    "MSE": [],
    "RMSE": [],
    "R-Squared": [],
    "MAPE (%)": [],
    "MMRE": [],
}
for model_label, predictions in validation_predictions.items():
    performance_data["Model"].append(model_label)
    for metric_name, metric_value in _validation_metrics(y_val, predictions).items():
        performance_data[metric_name].append(metric_value)

performance_df = pd.DataFrame(performance_data)

# Create the output directories up front so a fresh Drive does not fail the writes
os.makedirs(PERFORMANCE_DATA, exist_ok=True)
os.makedirs(MODELS_PATH, exist_ok=True)

performance_csv_path = os.path.join(PERFORMANCE_DATA, "ml_model_performance_comparison.csv")
performance_df.to_csv(performance_csv_path, index=False)

print(f"\n✅ Model performance data saved to {performance_csv_path}")

# Save Best Models
# NOTE(review): the tuned estimators are saved unconditionally. The validation
# output recorded in this notebook shows the tuned XGBoost scoring below the
# baseline XGBoost - consider choosing which model to persist by validation metric.
joblib.dump(best_rf, os.path.join(MODELS_PATH, "best_random_forest.pkl"))
joblib.dump(best_xgb, os.path.join(MODELS_PATH, "best_xgboost.pkl"))

print("\n✅ Best models saved successfully in MODELS_PATH.")


✅ Model performance data saved to /content/drive/MyDrive/Projects/msc_project/performance_data/model_performance_comparison.csv

✅ Best models saved successfully in MODELS_PATH.
