## Random Forest:

- #### Training on the complete data set, which uses a 90-10 hold-out split and a further 70-15-15 internal train/validation/test split
- #### Also executing on unseen data - the merged_train_5_MOSFET and merged_test_1_MOSFET csv files - to check the generalisation and how the model performs on a completely new, untrained MOSFET

First, execute Step 1: training on the complete data - a 70-15-15 split of train_for_model.csv,
evaluated additionally on merged_test_with_features (the hold-out data) - and then save the predictions and the models.

In [3]:
# src/train/random_forest_train.py
# Usage (PowerShell): $env:PYTHONPATH="."; python Classical_Models\RF\random_forest_train.py

import os
import pandas as pd
import numpy as np
import time
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# === CONFIGURATION ===
# When True, the script trains on a 25 % subsample with a smaller forest.
FAST_MODE = False

# Input CSV files and the output directory for scalers, model and reports.
TRAIN_FILE = r"C:\Users\pc\Desktop\PROJECT_THESIS_Thrisha_Rajkumar\data\processed\train_for_model.csv"
HOLDOUT_FILE = r"C:\Users\pc\Desktop\PROJECT_THESIS_Thrisha_Rajkumar\data\processed\merged_test_with_features.csv"
MODEL_DIR = r"C:\Users\pc\Desktop\PROJECT_THESIS_Thrisha_Rajkumar\Classical_Models\models\random_forest"

# Regression targets; this order also fixes the column order of every report.
TARGET_COLUMNS = [
    "voltage_rise_time_pulse1",
    "voltage_rise_time_pulse2",
    "voltage_fall_time_pulse1",
    "voltage_fall_time_pulse2",
    "current_rise_time_pulse1",
    "current_rise_time_pulse2",
    "current_fall_time_pulse1",
    "current_fall_time_pulse2",
    "overshoot_pulse_1",
    "overshoot_pulse_2",
    "undershoot_pulse_1",
    "undershoot_pulse_2",
    "ringing_frequency_MHz",
]

# Columns removed from the feature matrix in addition to the targets.
DROP_COLUMNS = ["DeviceID"]

# Make sure the output directory exists before anything is written to it.
os.makedirs(MODEL_DIR, exist_ok=True)

# === 1. Load Data ===
# Keep only the rows in which every target column is present.
df = pd.read_csv(TRAIN_FILE).dropna(subset=TARGET_COLUMNS)

if FAST_MODE:
    # Reproducible quarter-size subsample.
    df = df.sample(frac=0.25, random_state=42)

y = df[TARGET_COLUMNS]
X = df.drop(columns=DROP_COLUMNS + TARGET_COLUMNS)

# 70 % goes to training; the remaining 30 % is halved into validation and
# internal test (15 % each).
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42
)

# === 2. Scaling ===
# Both scalers are fitted on the training split only and then applied to
# every split.
scaler_X = StandardScaler().fit(X_train)
scaler_y = StandardScaler().fit(y_train)

X_train_scaled = scaler_X.transform(X_train)
y_train_scaled = scaler_y.transform(y_train)

X_val_scaled = scaler_X.transform(X_val)
y_val_scaled = scaler_y.transform(y_val)

X_test_scaled = scaler_X.transform(X_test)
y_test_scaled = scaler_y.transform(y_test)

# Persist the fitted scalers in the model directory.
for _scaler, _filename in (
    (scaler_X, "input_scaler.pkl"),
    (scaler_y, "output_scaler.pkl"),
):
    joblib.dump(_scaler, os.path.join(MODEL_DIR, _filename))

# === 3. Train Model ===
# MultiOutputRegressor fits one random forest per target column.
# FAST_MODE uses a smaller forest (30 trees instead of 100).
n_estimators = 30 if FAST_MODE else 100
model = MultiOutputRegressor(
    RandomForestRegressor(n_estimators=n_estimators, n_jobs=-1, random_state=42)
)

# perf_counter() is monotonic, so the reported duration cannot be distorted by
# a system clock adjustment during a long fit (time.time() can be).
start_time = time.perf_counter()
model.fit(X_train_scaled, y_train_scaled)
print(f"\nTraining completed in {time.perf_counter() - start_time:.2f} seconds")

# === 4. Evaluation Function ===
def evaluate(model, X_scaled, y_scaled, label, save_name):
    """Score ``model`` on one scaled split, print a report and save the results.

    Predictions and targets are converted back to original units with the
    module-level ``scaler_y`` before RMSE and R² are computed per target.
    Two CSV files are written into ``MODEL_DIR``: the actual-vs-predicted
    table (``save_name``) and a per-target metrics table.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_scaled: Scaled feature matrix of the split.
        y_scaled: Scaled targets of the split, columns ordered as
            ``TARGET_COLUMNS``.
        label: Split name used in the printed report header.
        save_name: File name of the predictions CSV. The metrics file is
            named ``<stem>_metrics<ext>`` (``.csv`` if ``save_name`` has no
            extension).

    Returns:
        The predictions in original (unscaled) units.
    """
    y_pred = scaler_y.inverse_transform(model.predict(X_scaled))
    y_true = scaler_y.inverse_transform(y_scaled)

    # 'raw_values' yields one score per target column instead of an average.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred, multioutput='raw_values'))
    r2 = r2_score(y_true, y_pred, multioutput='raw_values')

    print(f"\n{label} Evaluation:")
    print(f"{'Target Output':<35} {'RMSE':>15} {'R² Score':>12}")
    print("-" * 64)
    for i, col in enumerate(TARGET_COLUMNS):
        print(f"{col:<35} {rmse[i]:>15.2E} {r2[i]:>12.4f}")

    # Save actual vs predicted: all "*_actual" columns first, then all
    # "*_predicted" columns, each group in TARGET_COLUMNS order.
    result_columns = {
        f"{col}_actual": y_true[:, i] for i, col in enumerate(TARGET_COLUMNS)
    }
    result_columns.update(
        {f"{col}_predicted": y_pred[:, i] for i, col in enumerate(TARGET_COLUMNS)}
    )
    pd.DataFrame(result_columns).to_csv(
        os.path.join(MODEL_DIR, save_name), index=False
    )

    # Save metrics
    df_metrics = pd.DataFrame({
        "Target": TARGET_COLUMNS,
        "RMSE": [f"{val:.2E}" for val in rmse],
        "R2_Score": [round(val, 4) for val in r2]
    })
    # Derive the metrics name from the file stem. A plain str.replace would
    # return save_name unchanged when it has no ".csv" (so the metrics would
    # overwrite the predictions) and would rewrite every ".csv" occurrence.
    stem, ext = os.path.splitext(save_name)
    metrics_file = f"{stem}_metrics{ext or '.csv'}"
    df_metrics.to_csv(os.path.join(MODEL_DIR, metrics_file), index=False)

    print(f"Saved predictions to: {save_name}")
    print(f"Saved metrics to: {metrics_file}")
    return y_pred

# === 5. Evaluate
# Report on the two internal splits: validation first, then internal test.
for _X_split, _y_split, _label, _save_name in (
    (X_val_scaled, y_val_scaled, "Validation", "validation_predictions.csv"),
    (X_test_scaled, y_test_scaled, "Internal Test", "internal_test_predictions.csv"),
):
    evaluate(model, _X_split, _y_split, _label, _save_name)

# === 6. External Holdout Evaluation ===
df_holdout = pd.read_csv(HOLDOUT_FILE)
# Rows with a missing target cannot be scored.
df_holdout.dropna(subset=TARGET_COLUMNS, inplace=True)

# Select the features by the training column list so the holdout matrix has
# exactly the columns the scaler and model were fitted on, in the same order.
# An extra or reordered column no longer breaks (or silently misaligns) the
# transform, and a missing feature raises a KeyError right here.
X_holdout = df_holdout.drop(columns=DROP_COLUMNS + TARGET_COLUMNS)[list(X.columns)]
y_holdout = df_holdout[TARGET_COLUMNS]

# Reuse the scalers fitted on the training split; never refit on holdout data.
X_holdout_scaled = scaler_X.transform(X_holdout)
y_holdout_scaled = scaler_y.transform(y_holdout)

evaluate(model, X_holdout_scaled, y_holdout_scaled, "External Holdout", "holdout_predictions.csv")

# === 7. Save Model
# The fitted multi-output model is stored in the model directory, where the
# scalers were saved earlier.
model_path = os.path.join(MODEL_DIR, "model.pkl")
joblib.dump(model, model_path)
print(f"\nModel and scalers saved to: {MODEL_DIR}")


KeyboardInterrupt: 

Step 2: Test on the completely unseen data of an entirely new MOSFET - to check the generalisation and make improvements