## Imports

In [109]:
import optuna
import pandas as pd
from networkx.classes import non_neighbors
from optuna import trial

# Data Set
from sklearn.datasets import fetch_california_housing

# Data Management, metrics and model selection
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# Models
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor


#Plots
import matplotlib.pyplot as plt

#Other
import  numpy as np
import pandas as pd

## Data and Data Split

In [89]:
seed = 42
data = fetch_california_housing()

x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=seed)
y_train, y_test = y_train.reshape(-1, 1), y_test.reshape(-1, 1)


# Data standardization
scaler = StandardScaler()
x_train, y_train =scaler.fit(x_train).transform(x_train), scaler.fit(y_train).transform(y_train).ravel()
x_test, y_test =  scaler.fit(x_test).transform(x_test), scaler.fit(y_test).transform(y_test).ravel()

## Metrics Calculator

In [90]:
def metric(y_hat, y_true,features_shape):
    n,p = features_shape
    r2 = r2_score(y_true, y_hat)
    socres = {
        "MAE": mean_absolute_error(y_true, y_hat),
        "MSE": mean_squared_error(y_true, y_hat),
        "RMSE": np.sqrt(mean_squared_error(y_true, y_hat)),
        "R2": r2,
        "Adjusted R2":  1 - (1 - r2) * (n - 1) / (n - p - 1)
    }

    df_score = pd.DataFrame(socres.items(), columns=["Metric","Value"])

    return df_score


metric(y_test, y_test, x_train.shape)

Unnamed: 0,Metric,Value
0,MAE,0.0
1,MSE,0.0
2,RMSE,0.0
3,R2,1.0
4,Adjusted R2,1.0


# Regression Models without optimalization

In [91]:
def train_and_judge(model, x_train, y_train, x_test, y_test):
    model = model
    model.fit(x_train, y_train)
    y_hat_train = model.predict(x_train)
    y_hat_test = model.predict(x_test)

    features_shape = x_train.shape
    df_train = metric(y_hat_train, y_train, features_shape)

    features_shape = x_test.shape
    df_test = metric(y_hat_test, y_test, features_shape)

    print("Training Scores")
    print(df_train)
    print()
    print("Testing Scores")
    print(df_test)



## KNN Regresor

In [92]:
train_and_judge(KNeighborsRegressor(), x_train, y_train, x_test, y_test)

Training Scores
        Metric     Value
0          MAE  0.307235
1          MSE  0.203102
2         RMSE  0.450668
3           R2  0.796898
4  Adjusted R2  0.796800

Testing Scores
        Metric     Value
0          MAE  0.375238
1          MSE  0.306766
2         RMSE  0.553864
3           R2  0.693234
4  Adjusted R2  0.692639


## SVE

In [93]:
train_and_judge(SVR(), x_train, y_train, x_test, y_test)

Training Scores
        Metric     Value
0          MAE  0.329964
1          MSE  0.248902
2         RMSE  0.498901
3           R2  0.751098
4  Adjusted R2  0.750977

Testing Scores
        Metric     Value
0          MAE  0.650450
1          MSE  0.714309
2         RMSE  0.845168
3           R2  0.285691
4  Adjusted R2  0.284304


## Random Forest

In [94]:
train_and_judge(RandomForestRegressor(), x_train, y_train, x_test, y_test)

Training Scores
        Metric     Value
0          MAE  0.105605
1          MSE  0.026499
2         RMSE  0.162786
3           R2  0.973501
4  Adjusted R2  0.973488

Testing Scores
        Metric     Value
0          MAE  0.487719
1          MSE  0.471293
2         RMSE  0.686508
3           R2  0.528707
4  Adjusted R2  0.527792


## Extreme Gradient Boosting Regressor

In [95]:
train_and_judge(GradientBoostingRegressor(), x_train, y_train, x_test, y_test)

Training Scores
        Metric     Value
0          MAE  0.308455
1          MSE  0.195102
2         RMSE  0.441704
3           R2  0.804898
4  Adjusted R2  0.804803

Testing Scores
        Metric     Value
0          MAE  0.421844
1          MSE  0.326604
2         RMSE  0.571493
3           R2  0.673396
4  Adjusted R2  0.672761


# Model with optimalization

In [110]:
def optimize_train_and_judge(model_class, objective_fn, trails=100):
    global x_train, y_train, x_test, y_test

    study = optuna.create_study(direction="maximize")
    study.optimize(objective_fn, n_trials=trails, show_progress_bar=True)

    model = model_class(**study.best_params)
    model.fit(x_train, y_train)
    y_hat_train = model.predict(x_train)
    y_hat_test = model.predict(x_test)

    features_shape = x_train.shape
    df_train = metric(y_hat_train, y_train, features_shape)

    features_shape = x_test.shape
    df_test = metric(y_hat_test, y_test, features_shape)

    print("Training Scores")
    print(df_train)
    print()
    print("Testing Scores")
    print(df_test)




## KNN Regresor

In [111]:
def KNN_objective(trail):
    n_neighbors = trail.suggest_int("n_neighbors", 1, 80)
    weights = trail.suggest_categorical('weights', ['uniform', 'distance'])
    metric  = trail.suggest_categorical('metric', ['manhattan', 'euclidean','minkowski'])

    clf = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, metric=metric)
    clf.fit(x_train, y_train)
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    score = cross_val_score(clf, x_train, y_train, cv=cv, scoring= 'neg_mean_squared_error').mean()

    return score


optimize_train_and_judge(KNeighborsRegressor, KNN_objective)

[I 2025-11-16 13:22:08,779] A new study created in memory with name: no-name-b932dfb5-ab44-4fda-89a7-50141f3e7ccd


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-11-16 13:22:11,084] Trial 0 finished with value: -0.2974373890143788 and parameters: {'n_neighbors': 47, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 0 with value: -0.2974373890143788.
[I 2025-11-16 13:22:12,455] Trial 1 finished with value: -0.33635322879447327 and parameters: {'n_neighbors': 62, 'weights': 'uniform', 'metric': 'minkowski'}. Best is trial 0 with value: -0.2974373890143788.
[I 2025-11-16 13:22:14,299] Trial 2 finished with value: -0.2919563959617922 and parameters: {'n_neighbors': 37, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 2 with value: -0.2919563959617922.
[I 2025-11-16 13:22:15,739] Trial 3 finished with value: -0.33916344353889477 and parameters: {'n_neighbors': 70, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 2 with value: -0.2919563959617922.
[I 2025-11-16 13:22:16,923] Trial 4 finished with value: -0.3262317078988467 and parameters: {'n_neighbors': 39, 'weights': 'uniform', 'metric': 'euclidean'}. Best is

## SVR

In [None]:
def SVR_study(trail):
    C = trail.suggest_float("C", 0.01, 10)
    epsilon = trail.suggest_float("epsilon", 0.001, 0.05)
    kernel = trail.suggest_categorical('kernel', ['poly', 'rbf', 'sigmoid'])
    gamma = trail.suggest_float("gamma", 0.1, 10)


    clf = SVR(C=C, epsilon=epsilon, kernel=kernel, gamma=gamma)
    clf.fit(x_train, y_train)
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    score = cross_val_score(clf, x_train, y_train, cv=cv, scoring= 'neg_mean_squared_error').mean()

    return score

optimize_train_and_judge(SVR, SVR_study)

[I 2025-11-16 13:33:13,527] A new study created in memory with name: no-name-bfdd5b2a-f12d-437c-8e84-d537e57f902a


  0%|          | 0/100 [00:00<?, ?it/s]

## Random Forest