In [None]:
import time

import numpy as np
import scipy as sp
from scipy import stats
import pandas as pd
from pandas.api.types import CategoricalDtype

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, KFold
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, mean_absolute_error, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import PCA

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# scikit-optimize: provides the BayesSearchCV used in the hyper-parameter search cell.
from skopt import BayesSearchCV

In [None]:
# Load the raw dataset; fields in the file are separated by ';'.
df = pd.read_csv('rawdata.csv', sep=';')

In [None]:
# Standardise every column of df.
# A scaler has to learn the per-column statistics before it can transform,
# so fit and transform in one step; calling transform() on a freshly
# constructed StandardScaler raises NotFittedError.
# NOTE(review): df_trans is not read by any later cell visible here — confirm
# whether the models below were meant to be trained on the scaled data.
SS = StandardScaler()
df_trans = SS.fit_transform(df)

In [None]:
# Project the two columns at positions 2 and 3 onto a single principal
# component and store the result as a new 'PCA' column on df.
model = PCA(n_components=1)
df['PCA'] = model.fit_transform(df.iloc[:, 2:4].to_numpy())

In [None]:
# 'Lifetime' is the regression target; every other column of df is a feature.
y = df['Lifetime']
X = df.drop(columns='Lifetime')

In [None]:
# Cross-validated comparison of five regressors on the full (X, y) data.
models = {
    "LR": Ridge(),
    "SVM": SVR(kernel='rbf'),
    "DT": DecisionTreeRegressor(random_state=123),
    "RF": RandomForestRegressor(n_estimators=10, random_state=123),
    # random_state is the scikit-learn-style seed argument (seed= is the
    # deprecated alias) and matches the other XGBRegressor calls in this notebook.
    "XGB": XGBRegressor(objective='reg:squarederror', n_estimators=10, random_state=123)
}

cv = KFold(n_splits=5, shuffle=True, random_state=42)

# scikit-learn scorers are "greater is better", so the error metrics are
# requested in their negated form and flipped back to positive below.
# The built-in RMSE scorer replaces a hand-written lambda wrapped in make_scorer.
scoring = {
    'r2': 'r2',
    'neg_mse': 'neg_mean_squared_error',
    'neg_rmse': 'neg_root_mean_squared_error',
    'neg_mae': 'neg_mean_absolute_error'
}

# (label stored in the results, key returned by cross_validate, sign applied)
reported_metrics = [
    ('R2', 'test_r2', 1),
    ('RMSE', 'test_neg_rmse', -1),
    ('MSE', 'test_neg_mse', -1),
    ('MAE', 'test_neg_mae', -1),
]

# Long-format records: one row per (model, metric, fold).
results_all = []

for name, model in models.items():
    scores = cross_validate(model, X, y, cv=cv, scoring=scoring)

    for metric_label, score_key, sign in reported_metrics:
        results_all.extend(
            {'Model': name, 'Metric': metric_label, 'Score': sign * fold_score}
            for fold_score in scores[score_key]
        )

df_results = pd.DataFrame(results_all)


# One horizontal box plot per metric, showing the spread of the per-fold
# scores for each model.
metrics = ['R2', 'RMSE', 'MSE', 'MAE']
fig, axes = plt.subplots(nrows=4, ncols=1, figsize=(8, 12), sharex=False)

for i, metric in enumerate(metrics):
    ax = axes[i]
    metric_rows = df_results.loc[df_results['Metric'] == metric]
    # NOTE(review): recent seaborn releases warn when `palette` is given
    # without `hue` — confirm against the installed version.
    sns.boxplot(data=metric_rows, x='Score', y='Model', palette='Set2', ax=ax)
    ax.set_title(metric)
    ax.grid(True, axis='x')

plt.tight_layout()
plt.show()

# Single Evaluation

In [None]:
# Hold out 20% of the rows as a test set; shuffled with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    shuffle=True,
    test_size=0.2,
)

def evaluate_model(model, name):
    """Fit `model` on the training split and print its test-set metrics.

    Reads the module-level X_train, y_train, X_test and y_test. Prints a
    single line with R², RMSE, MSE, MAE and the wall-clock time taken by
    fitting, predicting and scoring. Returns nothing.

    Note: a later cell in this notebook defines another `evaluate_model`
    with a different signature, which replaces this one once it has run.
    """
    t0 = time.time()

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # MSE is computed once; RMSE is derived from it.
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mae = np.abs(y_test - y_pred).mean()

    elapsed_time = time.time() - t0
    print(f"{name:<20} =>  R²: {r2:.3f}  RMSE: {rmse:.3f}  MSE: {mse:.3f}  MAE: {mae:.3f}  | Time: {elapsed_time:.2f}s")


# Baseline comparison on the single hold-out split. Each estimator uses its
# library defaults; the tree-based ones are seeded for reproducibility.
# LinearRegression is imported in the imports cell at the top of the notebook;
# without that import this cell fails with a NameError.
# Note: this rebinds `models`, replacing the dict used for cross-validation above.
models = {
    "Linear Regression": LinearRegression(),
    "SVM": SVR(),
    "Decision Tree": DecisionTreeRegressor(random_state=123),
    "Random Forest": RandomForestRegressor(random_state=123),
    "XGBoost": XGBRegressor(random_state=123)
}

# Fit each model on the training split and print one metrics line per model.
for name, model in models.items():
    evaluate_model(model, name)

## Hyperparameter Optimization

In [None]:
start_time = time.time()

# Base estimator shared by the three search strategies.
xgb_regressor = XGBRegressor(objective='reg:squarederror',
                             n_estimators=300,
                             random_state=123)

# Candidate values: 10 learning rates x 20 depths x 10 reg_alpha x 10 reg_lambda
# = 20,000 combinations.
# NOTE(review): the exhaustive grid search therefore performs 100,000 fits
# (20,000 x 5 folds) of a 300-tree model — confirm this is intended.
param = {
    'learning_rate': [i/10 for i in range(1, 11)],
    'max_depth': list(range(1, 21)),
    'reg_alpha': list(range(1, 11)),
    'reg_lambda': list(range(1, 11))
}


# All three searches share the estimator, candidate values, metric (negated
# MSE) and 5-fold CV; the random and Bayesian searches sample 50 candidates.
grid_search = GridSearchCV(estimator=xgb_regressor,
                           param_grid=param,
                           scoring='neg_mean_squared_error',
                           cv=5,
                           n_jobs=-1)
random_search = RandomizedSearchCV(estimator=xgb_regressor,
                                   param_distributions=param,
                                   scoring='neg_mean_squared_error',
                                   n_iter=50,
                                   cv=5,
                                   n_jobs=-1,
                                   random_state=123)
bayes_search = BayesSearchCV(estimator=xgb_regressor,
                             search_spaces=param,
                             scoring='neg_mean_squared_error',
                             n_iter=50,
                             cv=5,
                             n_jobs=-1,
                             random_state=123)

# The searches are fitted in the next cell, which also reports progress.
# Fitting them here as well would run every search twice.

In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test, name,
                   val_size=0.2, random_state=123):
    """Refit a search's best estimator with early stopping and score it on the test set.

    Note: this definition replaces the two-argument `evaluate_model` defined
    in an earlier cell of this notebook.

    Parameters
    ----------
    model : fitted search object exposing ``best_estimator_``.
    X_train, y_train : training data. A ``val_size`` fraction is held out
        internally and used only to decide when boosting stops.
    X_test, y_test : test data, used only for the reported metrics.
    name : label stored under the "Method" key of the result.
    val_size : fraction of the training data reserved for early stopping.
    random_state : seed for the internal train/validation split.

    Returns
    -------
    dict with keys "Method", "R²", "RMSE", "MSE", "MAE" and "Exec_Time (s)";
    metrics are rounded to 3 decimals and the time to 2.
    """
    start = time.time()
    best_model = model.best_estimator_
    best_model.set_params(n_estimators=1000, early_stopping_rounds=10)

    # Early stopping must not monitor the test set: doing so tunes the number
    # of boosting rounds on the very data used for the reported metrics.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=val_size, random_state=random_state
    )
    best_model.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=False)
    y_pred = best_model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    elapsed = time.time() - start

    return {
        "Method": name,
        "R²": round(r2, 3),
        "RMSE": round(rmse, 3),
        "MSE": round(mse, 3),
        "MAE": round(mae, 3),
        "Exec_Time (s)": round(elapsed, 2)
    }


# (progress message, search object, label used in the summary table)
search_runs = [
    ("Running Grid Search...", grid_search, "Grid Search"),
    ("Running Random Search...", random_search, "Random Search"),
    ("Running Bayesian Optimization...", bayes_search, "Bayesian Optimization"),
]

# First pass: fit every search on the training split.
for message, search, _ in search_runs:
    print(message)
    search.fit(X_train, y_train)

# Second pass: evaluate each fitted search, in the same order.
results = [
    evaluate_model(search, X_train, y_train, X_test, y_test, label)
    for _, search, label in search_runs
]

results_df = pd.DataFrame(results)
print("Model Evaluation Summary")
print(results_df.to_string(index=False))


In [None]:
# Test-set R² of each search strategy, read from the summary table.
# (`r2` only exists as a local variable inside evaluate_model, so the Grid
# entry is looked up from results_df in the same way as the other two.)
method_by_panel = {
    "Grid": "Grid Search",
    "Random": "Random Search",
    "Bayes": "Bayesian Optimization",
}
plot_results = {
    panel: [results_df.loc[results_df["Method"] == method, "R²"].values[0]]
    for panel, method in method_by_panel.items()
}

labels = ["XGB"]

fig, axes = plt.subplots(1, 3, figsize=(12, 6), sharey=True)

# Each panel holds a single value, so every "box" collapses to one line.
for i, method in enumerate(plot_results.keys()):
    axes[i].boxplot(plot_results[method])
    axes[i].set_title(method)
    axes[i].set_xticklabels(labels, rotation=45)
    axes[i].set_ylabel("R² Score")

plt.tight_layout()
plt.show()