In [1]:
# Usage: $env:PYTHONPATH="."; python Classical_Models\XGBoost\xgboost_train.py

import os
import time
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# === CONFIG ===
# FAST_MODE: when True, the "Load & Sample" step below trains on a 25% random
# sample of the training file for quick iteration. Set to False for a full run.
FAST_MODE = True

# Project root. Defaults to the original author's location; set the
# THESIS_PROJECT_ROOT environment variable to run this notebook from another
# machine or checkout without editing the cell.
PROJECT_ROOT = os.environ.get(
    "THESIS_PROJECT_ROOT",
    r"C:\Users\pc\Desktop\PROJECT_THESIS_Thrisha_Rajkumar",
)
TRAIN_FILE = os.path.join(PROJECT_ROOT, "data", "processed", "train_for_model.csv")
HOLDOUT_FILE = os.path.join(PROJECT_ROOT, "data", "processed", "merged_test_with_features.csv")
# Scalers, model, predictions and metrics are all written to this directory.
MODEL_DIR = os.path.join(PROJECT_ROOT, "Classical_Models", "models", "xgboost", "testing")
os.makedirs(MODEL_DIR, exist_ok=True)

# Columns the model predicts (one regressor per column).
TARGET_COLUMNS = [
    'voltage_rise_time_pulse1', 'voltage_rise_time_pulse2',
    'voltage_fall_time_pulse1', 'voltage_fall_time_pulse2',
    'current_rise_time_pulse1', 'current_rise_time_pulse2',
    'current_fall_time_pulse1', 'current_fall_time_pulse2',
    'overshoot_pulse_1', 'overshoot_pulse_2',
    'undershoot_pulse_1', 'undershoot_pulse_2',
    'ringing_frequency_MHz'
]
# Non-target columns excluded from the feature matrix.
DROP_COLUMNS = ['DeviceID']

# === 1. Load & Sample ===
# Read the training table, keep only rows where every target is present, and
# discard the optional 'Device' column when the file carries one.
df = (
    pd.read_csv(TRAIN_FILE)
    .dropna(subset=TARGET_COLUMNS)
    .drop(columns=['Device'], errors='ignore')
)

if FAST_MODE:
    # Quick-iteration mode: continue with a reproducible 25% subset.
    df = df.sample(frac=0.25, random_state=42)

# Features are every remaining column except the ID column(s) and the targets.
non_feature_columns = DROP_COLUMNS + TARGET_COLUMNS
X = df.drop(columns=non_feature_columns)
y = df[TARGET_COLUMNS]

# 70 / 15 / 15 split: carve off 30% first, then halve it into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42
)

# === 2. Scale ===
# Both scalers learn their statistics from the training split only; the
# validation and test splits are transformed with those same statistics.
scaler_X = StandardScaler().fit(X_train)
scaler_y = StandardScaler().fit(y_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler_X.transform(split) for split in (X_train, X_val, X_test)
)
y_train_scaled, y_val_scaled, y_test_scaled = (
    scaler_y.transform(split) for split in (y_train, y_val, y_test)
)

# Persist the fitted scalers next to the model so predictions can be
# reproduced later with the same transforms.
joblib.dump(scaler_X, os.path.join(MODEL_DIR, "input_scaler.pkl"))
joblib.dump(scaler_y, os.path.join(MODEL_DIR, "output_scaler.pkl"))

# === 3. Train XGBoost ===
xgb_params = dict(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=6,
    n_jobs=-1,
    verbosity=0,
)
# MultiOutputRegressor fits one independent XGBRegressor per target column.
model = MultiOutputRegressor(XGBRegressor(**xgb_params))
start_time = time.time()
model.fit(X_train_scaled, y_train_scaled)
print(f"Training completed in {time.time() - start_time:.2f} seconds")

# === 4. Evaluate ===
def evaluate(model, X_scaled, y_scaled, label, save_name):
    """Score `model` on one scaled split and write predictions and metrics to CSV.

    Predictions and targets are mapped back to their original (unscaled)
    values with the module-level `scaler_y` before scoring, so the reported
    RMSE is in each target's own units.

    Args:
        model: Fitted estimator whose `predict` output has one column per
            entry in `TARGET_COLUMNS`.
        X_scaled: Feature matrix already transformed with the input scaler.
        y_scaled: Target matrix already transformed with `scaler_y`.
        label: Split name shown in the printed report header.
        save_name: File name (inside `MODEL_DIR`) for the predictions CSV.
            The metrics CSV is written beside it with `_metrics` appended to
            the file stem.

    Returns:
        None. The report is printed and two CSV files are written to `MODEL_DIR`.
    """
    y_pred = scaler_y.inverse_transform(model.predict(X_scaled))
    y_true = scaler_y.inverse_transform(y_scaled)

    # Per-target metrics (one value per column) rather than a single average.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred, multioutput='raw_values'))
    r2 = r2_score(y_true, y_pred, multioutput='raw_values')

    print(f"\n{label} Evaluation:")
    print(f"{'Target Output':<35} {'RMSE':>15} {'R² Score':>12}")
    print("-" * 64)
    for col, rmse_val, r2_val in zip(TARGET_COLUMNS, rmse, r2):
        # Field widths mirror the header row so the RMSE column lines up.
        print(f"{col:<35} {rmse_val:>15.2E} {r2_val:>12.4f}")

    # Save predictions: all "<target>_actual" columns first, then all
    # "<target>_predicted" columns.
    result_columns = {
        f"{col}_actual": y_true[:, i] for i, col in enumerate(TARGET_COLUMNS)
    }
    result_columns.update(
        {f"{col}_predicted": y_pred[:, i] for i, col in enumerate(TARGET_COLUMNS)}
    )
    predictions_path = os.path.join(MODEL_DIR, save_name)
    pd.DataFrame(result_columns).to_csv(predictions_path, index=False)

    # Save metrics
    df_metrics = pd.DataFrame({
        "Target": TARGET_COLUMNS,
        "RMSE": [f"{val:.2E}" for val in rmse],
        "R2_Score": [round(val, 4) for val in r2]
    })
    # Derive the metrics name from the stem so it can never equal the
    # predictions file name, even when save_name lacks a ".csv" suffix.
    stem, ext = os.path.splitext(save_name)
    metrics_path = os.path.join(MODEL_DIR, f"{stem}_metrics{ext or '.csv'}")
    df_metrics.to_csv(metrics_path, index=False)
    print(f"Saved predictions to: {predictions_path}")
    print(f"Saved metrics to: {metrics_path}")

# === 5. Evaluate Splits ===
evaluate(model, X_val_scaled, y_val_scaled, "Validation", "validation_predictions.csv")
evaluate(model, X_test_scaled, y_test_scaled, "Internal Test", "internal_test_predictions.csv")

# === 6. Holdout Test ===
# Same cleaning as the training file: rows need every target present.
df_holdout = pd.read_csv(HOLDOUT_FILE).dropna(subset=TARGET_COLUMNS)

# The scaler and model were fitted on the training feature columns in a fixed
# order. The holdout data comes from a different CSV, so select exactly those
# columns, in that order, instead of relying on the holdout file's layout.
feature_columns = list(X_train.columns)
missing_features = [col for col in feature_columns if col not in df_holdout.columns]
if missing_features:
    raise ValueError(
        f"Holdout file is missing training feature columns: {missing_features}"
    )
ignored_columns = [
    col for col in df_holdout.columns
    if col not in feature_columns
    and col not in TARGET_COLUMNS
    and col not in DROP_COLUMNS
    and col != 'Device'
]
if ignored_columns:
    # Surface schema drift instead of dropping the columns silently.
    print(f"Ignoring holdout columns not used in training: {ignored_columns}")

X_holdout = df_holdout[feature_columns]
y_holdout = df_holdout[TARGET_COLUMNS]
X_holdout_scaled = scaler_X.transform(X_holdout)
y_holdout_scaled = scaler_y.transform(y_holdout)
evaluate(model, X_holdout_scaled, y_holdout_scaled, "External Holdout", "holdout_predictions.csv")

# === 7. Save Model ===
joblib.dump(model, os.path.join(MODEL_DIR, "model.pkl"))
print(f"\nModel and scalers saved to: {MODEL_DIR}")


Training completed in 7.93 seconds

Validation Evaluation:
Target Output                                  RMSE     R² Score
----------------------------------------------------------------
voltage_rise_time_pulse1            3.77E-08      -0.0057
voltage_rise_time_pulse2            2.04E-07       0.9998
voltage_fall_time_pulse1            3.72E-08       1.0000
voltage_fall_time_pulse2            7.31E-08       0.9999
current_rise_time_pulse1            9.25E-07       0.9932
current_rise_time_pulse2            1.82E-07       0.9211
current_fall_time_pulse1            5.86E-06       0.7367
current_fall_time_pulse2            2.75E-08       0.5473
overshoot_pulse_1                   9.88E-01       0.9986
overshoot_pulse_2                   3.29E+00       0.9999
undershoot_pulse_1                  4.55E+00       0.9691
undershoot_pulse_2                  1.06E+01       0.9311
ringing_frequency_MHz               1.51E+00       0.9989
Saved predictions to: validation_predictions.csv
Saved me

In [3]:
# Usage: $env:PYTHONPATH="."; python Classical_Models\XGBoost\xgboost_generalization.py

import os
import time
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# === CONFIG ===
# FAST_MODE: when True, the training file is down-sampled to a 25% random
# subset in the "Load & Sample Train Data" step. Set to False for a full run.
FAST_MODE = True

# Project root. Defaults to the original author's location; set the
# THESIS_PROJECT_ROOT environment variable to run this notebook from another
# machine or checkout without editing the cell.
PROJECT_ROOT = os.environ.get(
    "THESIS_PROJECT_ROOT",
    r"C:\Users\pc\Desktop\PROJECT_THESIS_Thrisha_Rajkumar",
)
TRAIN_FILE = os.path.join(PROJECT_ROOT, "data", "raw", "merged_train_5_MOSFETS.csv")
TEST_FILE = os.path.join(PROJECT_ROOT, "data", "raw", "merged_test_1_MOSFET.csv")

# Scalers, model, predictions and metrics are all written to this directory.
MODEL_DIR = os.path.join(PROJECT_ROOT, "Classical_Models", "models", "xgboost", "generalization")
os.makedirs(MODEL_DIR, exist_ok=True)

# Columns the model predicts (one regressor per column).
TARGET_COLUMNS = [
    'voltage_rise_time_pulse1', 'voltage_rise_time_pulse2',
    'voltage_fall_time_pulse1', 'voltage_fall_time_pulse2',
    'current_rise_time_pulse1', 'current_rise_time_pulse2',
    'current_fall_time_pulse1', 'current_fall_time_pulse2',
    'overshoot_pulse_1', 'overshoot_pulse_2',
    'undershoot_pulse_1', 'undershoot_pulse_2',
    'ringing_frequency_MHz'
]
# Non-target columns excluded from the feature matrix.
DROP_COLUMNS = ['DeviceID']

# === 1. Load & Sample Train Data ===
# Keep only rows where every target is present; drop the optional 'Device'
# column when the file carries one.
df_train = (
    pd.read_csv(TRAIN_FILE)
    .dropna(subset=TARGET_COLUMNS)
    .drop(columns=['Device'], errors='ignore')
)

if FAST_MODE:
    # Quick-iteration mode: train on a reproducible 25% subset.
    df_train = df_train.sample(frac=0.25, random_state=42)

X_train = df_train.drop(columns=DROP_COLUMNS + TARGET_COLUMNS)
y_train = df_train[TARGET_COLUMNS]

# === 2. Load Test Data (Unseen MOSFET) ===
df_test = pd.read_csv(TEST_FILE).dropna(subset=TARGET_COLUMNS)

# The scaler and model are fitted on the training feature columns in a fixed
# order. The test data comes from a separate CSV, so select exactly those
# columns, in that order, instead of relying on the test file's layout.
feature_columns = list(X_train.columns)
missing_features = [col for col in feature_columns if col not in df_test.columns]
if missing_features:
    raise ValueError(
        f"Test file is missing training feature columns: {missing_features}"
    )
ignored_columns = [
    col for col in df_test.columns
    if col not in feature_columns
    and col not in TARGET_COLUMNS
    and col not in DROP_COLUMNS
    and col != 'Device'
]
if ignored_columns:
    # Surface schema drift instead of dropping the columns silently.
    print(f"Ignoring test columns not used in training: {ignored_columns}")

X_test = df_test[feature_columns]
y_test = df_test[TARGET_COLUMNS]

# === 3. Scaling ===
# Scaler statistics come from the training data only; the unseen-device test
# set is transformed with those same statistics.
scaler_X = StandardScaler()
scaler_y = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)
y_train_scaled = scaler_y.fit_transform(y_train)
y_test_scaled = scaler_y.transform(y_test)

joblib.dump(scaler_X, os.path.join(MODEL_DIR, "input_scaler.pkl"))
joblib.dump(scaler_y, os.path.join(MODEL_DIR, "output_scaler.pkl"))

# === 4. Train XGBoost ===
xgb_params = {
    "n_estimators": 100,
    "learning_rate": 0.05,
    "max_depth": 6,
    "n_jobs": -1,
    "verbosity": 0
}
# MultiOutputRegressor fits one independent XGBRegressor per target column.
model = MultiOutputRegressor(XGBRegressor(**xgb_params))

start_time = time.time()
model.fit(X_train_scaled, y_train_scaled)
print(f"Training completed in {time.time() - start_time:.2f} seconds")

# === 5. Evaluate ===
def evaluate(model, X_scaled, y_scaled, label, save_name):
    """Score `model` on one scaled split and write predictions and metrics to CSV.

    Predictions and targets are mapped back to their original (unscaled)
    values with the module-level `scaler_y` before scoring, so the reported
    RMSE is in each target's own units.

    Args:
        model: Fitted estimator whose `predict` output has one column per
            entry in `TARGET_COLUMNS`.
        X_scaled: Feature matrix already transformed with the input scaler.
        y_scaled: Target matrix already transformed with `scaler_y`.
        label: Split name shown in the printed report header.
        save_name: File name (inside `MODEL_DIR`) for the predictions CSV.
            The metrics CSV is written beside it with `_metrics` appended to
            the file stem.

    Returns:
        None. The report is printed and two CSV files are written to `MODEL_DIR`.
    """
    y_pred = scaler_y.inverse_transform(model.predict(X_scaled))
    y_true = scaler_y.inverse_transform(y_scaled)

    # Per-target metrics (one value per column) rather than a single average.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred, multioutput='raw_values'))
    r2 = r2_score(y_true, y_pred, multioutput='raw_values')

    print(f"\n{label} Evaluation:")
    print(f"{'Target Output':<35} {'RMSE':>15} {'R² Score':>12}")
    print("-" * 64)
    for col, rmse_val, r2_val in zip(TARGET_COLUMNS, rmse, r2):
        # Field widths mirror the header row so the RMSE column lines up.
        print(f"{col:<35} {rmse_val:>15.2E} {r2_val:>12.4f}")

    # Save predictions: all "<target>_actual" columns first, then all
    # "<target>_predicted" columns.
    result_columns = {
        f"{col}_actual": y_true[:, i] for i, col in enumerate(TARGET_COLUMNS)
    }
    result_columns.update(
        {f"{col}_predicted": y_pred[:, i] for i, col in enumerate(TARGET_COLUMNS)}
    )
    predictions_path = os.path.join(MODEL_DIR, save_name)
    pd.DataFrame(result_columns).to_csv(predictions_path, index=False)

    # Save metrics
    df_metrics = pd.DataFrame({
        "Target": TARGET_COLUMNS,
        "RMSE": [f"{val:.2E}" for val in rmse],
        "R2_Score": [round(val, 4) for val in r2]
    })
    # Derive the metrics name from the stem so it can never equal the
    # predictions file name, even when save_name lacks a ".csv" suffix.
    stem, ext = os.path.splitext(save_name)
    metrics_path = os.path.join(MODEL_DIR, f"{stem}_metrics{ext or '.csv'}")
    df_metrics.to_csv(metrics_path, index=False)
    print(f"Saved predictions to: {predictions_path}")
    print(f"Saved metrics to: {metrics_path}")

# === 6. Evaluate Generalization ===
# Score the trained model on the test file loaded above and write the
# predictions/metrics CSVs into MODEL_DIR.
evaluate(
    model,
    X_test_scaled,
    y_test_scaled,
    label="Generalization (Unseen MOSFET)",
    save_name="unseen_predictions.csv",
)

# === 7. Save Model ===
# The fitted model is stored in the same directory as the scalers dumped earlier.
model_path = os.path.join(MODEL_DIR, "model.pkl")
joblib.dump(model, model_path)
print(f"\nModel and scalers saved to: {MODEL_DIR}")


Training completed in 10.40 seconds

Generalization (Unseen MOSFET) Evaluation:
Target Output                                  RMSE     R² Score
----------------------------------------------------------------
voltage_rise_time_pulse1            2.21E-08      -0.0494
voltage_rise_time_pulse2            3.91E-07       0.0010
voltage_fall_time_pulse1            1.84E-08     -31.2556
voltage_fall_time_pulse2            8.48E-08       0.3072
current_rise_time_pulse1            8.98E-07       0.5955
current_rise_time_pulse2            8.80E-07      -0.1691
current_fall_time_pulse1            7.72E-06       0.3353
current_fall_time_pulse2            2.97E-08      -0.3047
overshoot_pulse_1                   2.13E+01       0.2962
overshoot_pulse_2                   1.72E+01       0.8268
undershoot_pulse_1                  2.71E+01       0.1009
undershoot_pulse_2                  3.92E+01       0.1674
ringing_frequency_MHz               2.49E+01     -12.9040
Saved predictions to: unseen_predict