In [None]:
from optuna.visualization import plot_optimization_history, plot_intermediate_values
from sklearn import clone


import warnings


def warn(*args, **kwargs):
    """Drop-in replacement for ``warnings.warn`` that accepts any arguments and does nothing."""
    return None


# Route every call to ``warnings.warn`` to the no-op above, which silences
# those warnings for the rest of the session.
warnings.warn = warn

import numpy as np
from code.wrapper import utils

import seaborn as sns
import optuna

import matplotlib.pyplot as plt
import pandas as pd

from sklearn.linear_model import Ridge, RidgeCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.feature_selection import RFECV, RFE
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import cross_val_score

**Normalization** speeds up the computation, lowers the optimal alpha value, and also improves the score:
https://stats.stackexchange.com/a/189179

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer
# https://stackoverflow.com/a/23835410

# Load the six pre-split sheets (train/test for the full, ionizable and neutral
# sets) from one workbook; ``excel_sheet`` maps each sheet name to its DataFrame.
sheet_names = [
    "full_train", "full_test",
    "ionizable_train", "ionizable_test",
    "neutral_train", "neutral_test",
]
excel_sheet = pd.read_excel("../Data/New/unfiltered_data.xlsx", sheet_name=sheet_names)

full_train: pd.DataFrame = excel_sheet["full_train"]
full_test: pd.DataFrame = excel_sheet["full_test"]

neutral_train: pd.DataFrame = excel_sheet["neutral_train"]
neutral_test: pd.DataFrame = excel_sheet["neutral_test"]

ionizable_train: pd.DataFrame = excel_sheet["ionizable_train"]
ionizable_test: pd.DataFrame = excel_sheet["ionizable_test"]

# Both transformers are instantiated here but not applied to any frame in this cell.
Scaler = RobustScaler()
Norm = Normalizer()


def _split_features_target(frame):
    """Return ``(X, y)``: every column except ``Log_MP_RATIO``, and that column alone."""
    is_descriptor = frame.columns != "Log_MP_RATIO"
    return frame.loc[:, is_descriptor], frame["Log_MP_RATIO"]


# TRAIN
X_full_train, y_full_train = _split_features_target(full_train)
X_neutral_train, y_neutral_train = _split_features_target(neutral_train)
X_ionizable_train, y_ionizable_train = _split_features_target(ionizable_train)

# TEST
X_full_test, y_full_test = _split_features_target(full_test)
X_neutral_test, y_neutral_test = _split_features_target(neutral_test)
X_ionizable_test, y_ionizable_test = _split_features_target(ionizable_test)

# Full

In [None]:
# Baseline on the full set: wrap the full training frame in the project Utils
# helper, build its cross-validation folds (display=True), then score an
# untuned Ridge on the train and test splits.
# NOTE(review): what create_cv_folds / display_score actually output is defined
# in code.wrapper.utils — confirm there.
test_utils = utils.Utils(full_train)
test_utils.create_cv_folds(display=True)
test_utils.display_score(Ridge(), X_full_train, y_full_train, X_full_test, y_full_test)

In [None]:
def objective(trial):
    """Optuna objective: score one Ridge configuration on the full training set.

    Samples ``alpha`` (log-uniform in [1e-10, 1e10]) and ``solver`` from the
    trial and returns ``utils.Utils(full_train).cross_value_score`` for the
    resulting model.
    """
    solver_choices = ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]

    # Dict entries are evaluated in order, so alpha is suggested before solver.
    sampled = {
        "alpha": trial.suggest_float('alpha', 1e-10, 1e10, log=True),
        "solver": trial.suggest_categorical('solver', solver_choices),
    }
    model = Ridge(max_iter=100000, **sampled)

    return utils.Utils(full_train).cross_value_score(model)


# Hyper-parameter search on the full set: 1000 trials with n_jobs=-1,
# maximizing the value returned by `objective`.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1000, n_jobs=-1, show_progress_bar=True)
# Report the best trial's objective value and the parameters that produced it.
trial = study.best_trial
print(trial.value, trial.params)

In [None]:
# Evaluate the tuned Ridge on the full set.
# The search fitted every candidate with max_iter=100000, so the same cap is
# reused here; otherwise iterative solvers (e.g. "sag"/"saga") would be refit
# with their default iteration limit and could score differently from the trial.
best_ridge_params = {"max_iter": 100000, **study.best_params}

test_utils.display_score(Ridge(**best_ridge_params), X_full_train, y_full_train, X_full_test, y_full_test)
display(plot_optimization_history(study))

# Refit the best configuration and keep its predictions on both splits.
rr = Ridge(**best_ridge_params).fit(X_full_train, y_full_train)
y_full_train_pred = rr.predict(X_full_train)
y_full_test_pred = rr.predict(X_full_test)

test_utils.display_graph(rr, X_full_train, X_full_test, y_full_train, y_full_test)

# Ionizable

In [None]:
from code.wrapper import utils


# Baseline on the ionizable set: rebind `test_utils` to a Utils helper built
# from the ionizable training frame, build its cross-validation folds
# (display=True), then score an untuned Ridge on the train and test splits.
# NOTE(review): output format of these helpers lives in code.wrapper.utils — confirm there.
test_utils = utils.Utils(ionizable_train)
test_utils.create_cv_folds(display=True)
test_utils.display_score(Ridge(), X_ionizable_train, y_ionizable_train, X_ionizable_test, y_ionizable_test)

In [None]:
def objective(trial):
    """Optuna objective: score one Ridge configuration on the ionizable training set.

    Samples ``alpha`` (log-uniform in [1e-10, 1e10]) and ``solver`` from the
    trial and returns ``utils.Utils(ionizable_train).cross_value_score`` for
    the resulting model.
    """
    solver_choices = ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]

    # Dict entries are evaluated in order, so alpha is suggested before solver.
    sampled = {
        "alpha": trial.suggest_float('alpha', 1e-10, 1e10, log=True),
        "solver": trial.suggest_categorical('solver', solver_choices),
    }
    model = Ridge(max_iter=100000, **sampled)

    return utils.Utils(ionizable_train).cross_value_score(model)


# Hyper-parameter search on the ionizable set: 1000 trials with n_jobs=-1,
# maximizing the value returned by `objective`. Rebinds the global `study`.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1000, n_jobs=-1, show_progress_bar=True)
# Report the best trial's objective value and the parameters that produced it.
trial = study.best_trial
print(trial.value, trial.params)

In [None]:
# Evaluate the tuned Ridge on the ionizable set, with the same max_iter=100000
# cap the search used so the refit model matches the trial that was scored.
best_ridge_params = {"max_iter": 100000, **study.best_params}
test_utils.display_score(Ridge(**best_ridge_params), X_ionizable_train, y_ionizable_train, X_ionizable_test, y_ionizable_test)

# A bare assignment renders nothing in the notebook, so the figure is displayed
# explicitly. The previous `plt.Figure` annotation was dropped: the object comes
# from optuna.visualization, not from matplotlib.pyplot.
plot = plot_optimization_history(study, target_name="Score de validation croisée")
display(plot)


# grid_results = study.trials_dataframe()
# grid_results = grid_results[["value", "params_alpha"]].sort_values(by=["value"])
# grid_results = grid_results[grid_results["params_alpha"] < 20000]
#
# plot = sns.lineplot(data=grid_results, x="params_alpha", y="value")
# plot.set(xlabel="Alpha value", ylabel="Cross value")

# Neutral

In [None]:
from code.wrapper import utils


# Baseline on the neutral set: rebind `test_utils` to a Utils helper built from
# the neutral training frame, build its cross-validation folds (display=True),
# then score an untuned Ridge on the train and test splits.
# NOTE(review): output format of these helpers lives in code.wrapper.utils — confirm there.
test_utils = utils.Utils(neutral_train)
test_utils.create_cv_folds(display=True)
test_utils.display_score(Ridge(), X_neutral_train, y_neutral_train, X_neutral_test, y_neutral_test)

In [None]:
def objective(trial):
    """Optuna objective: score one Ridge configuration on the neutral training set.

    Samples ``alpha`` (log-uniform in [1e-10, 1e10]) and ``solver`` from the
    trial and returns ``utils.Utils(neutral_train).cross_value_score`` for the
    resulting model.
    """
    solver_choices = ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]

    # Dict entries are evaluated in order, so alpha is suggested before solver.
    sampled = {
        "alpha": trial.suggest_float('alpha', 1e-10, 1e10, log=True),
        "solver": trial.suggest_categorical('solver', solver_choices),
    }
    model = Ridge(max_iter=100000, **sampled)

    return utils.Utils(neutral_train).cross_value_score(model)


# Hyper-parameter search on the neutral set: 1000 trials with n_jobs=-1,
# maximizing the value returned by `objective`. Rebinds the global `study`.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1000, n_jobs=-1, show_progress_bar=True)
# Report the best trial's objective value and the parameters that produced it.
trial = study.best_trial
print(trial.value, trial.params)

In [None]:
# Evaluate the tuned Ridge on the NEUTRAL split. This cell previously passed the
# ionizable frames (copy-paste slip), so the neutral study was scored on the
# wrong dataset. The max_iter=100000 cap matches the one used during the search.
best_ridge_params = {"max_iter": 100000, **study.best_params}
test_utils.display_score(Ridge(**best_ridge_params), X_neutral_train, y_neutral_train, X_neutral_test, y_neutral_test)
plot_optimization_history(study, target_name="Score de validation croisée")

# Previous work

In [None]:
# Recursive feature elimination with cross-validation (RFECV) on the full set,
# using a Ridge with default alpha as the ranking estimator.
rr = Ridge(max_iter=100000)
rr.fit(X_full_train, y_full_train)
# Unshuffled 5-fold split, passed to RFECV below.
cv = KFold(5)
min_features_to_select: int = 1

# Shuffled, seeded 5-fold split, used only by the final cross_val_score call.
folds = KFold(n_splits=5, shuffle=True, random_state=100)


rfecv = RFECV(
    estimator=rr,
    step=1,
    cv=cv,
    scoring="r2",
    min_features_to_select=min_features_to_select,
    n_jobs=-1
)

rfecv.fit(X_full_train, y_full_train)

print("Optimal number of features: ", rfecv.n_features_)

# Plot the mean CV score against the number of features kept.
# NOTE(review): scoring is "r2", so the "accuracy" y-axis label looks mislabeled — confirm.
n_scores= len(rfecv.cv_results_["mean_test_score"])
plt.figure()
plt.plot(range(min_features_to_select, n_scores+min_features_to_select), rfecv.cv_results_["mean_test_score"])
plt.xlabel("Number of feature selected")
plt.ylabel("Mean test of accuracy")
plt.title("Recursive Feature Elimination \nwith correlated features")
plt.show()



# Summary: RFECV parameters, train score, cross-validated score (shuffled folds),
# test score and the names of the retained features.
print("best model\t:\t",rfecv.get_params(), "\n\tR2\t:\t", rfecv.score(X_full_train, y_full_train),
      "\n\tCross-val\t:\t", cross_val_score(rfecv, X_full_train, y_full_train, scoring="r2", cv=folds).mean(),
      "\n\tQ2\t:\t", rfecv.score(X_full_test, y_full_test),
      "\nSelect features:", rfecv.get_feature_names_out())

In [None]:
# Tune the Ridge wrapped in an RFE selector on the full set.
param_grid = {
    "estimator__alpha": np.linspace(1e-10, 100, 10),
    # "lbfgs" is left out: Ridge only accepts it together with positive=True,
    # so every fit using it fails (silently here, because warnings are disabled
    # in this notebook) and one eighth of the search is wasted.
    "estimator__solver": ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]
}

cv = KFold(5)

# Build the selector explicitly from the RFECV result for the full set. This
# cell used to reference `best_rfe`, which is only assigned further down the
# notebook (and for other datasets), so it raised NameError on a fresh kernel.
full_rfe = RFE(Ridge(max_iter=100000), n_features_to_select=rfecv.n_features_)

grid = GridSearchCV(estimator=full_rfe, param_grid=param_grid, scoring="r2", verbose=2, n_jobs=-1, cv=cv)
grid.fit(X_full_train, y_full_train)


print("Best cross-val score\t:\t", grid.best_score_)
print("Best hyperparam score\t:\t", grid.best_estimator_)
print("R2\t: ", grid.best_estimator_.score(X_full_train, y_full_train))
print("Q2\t: ", grid.best_estimator_.score(X_full_test, y_full_test))

In [None]:
# Keep only the low-alpha part of the grid (alpha < 2) and draw the mean CV
# score against alpha, one line per solver.
alpha_column = "param_estimator__alpha"
grid_results = pd.DataFrame(grid.cv_results_).loc[lambda table: table[alpha_column] < 2]

plot = sns.lineplot(
    data=grid_results,
    x=alpha_column,
    y="mean_test_score",
    hue="param_estimator__solver",
)
plot.set(xlabel="Alpha value", ylabel="Cross value")

In [None]:
# Tune the Ridge wrapped in the RFECV selector on the full set.
param_grid = {
    "estimator__alpha": np.linspace(1e-5, 100, 1000),
    # "lbfgs" is left out: Ridge only accepts it together with positive=True,
    # so every fit using it fails (silently here, because warnings are disabled
    # in this notebook) and one eighth of the search is wasted.
    "estimator__solver": ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]
}
cv = KFold(5)

grid_cv = GridSearchCV(estimator=rfecv, param_grid=param_grid, scoring="r2", verbose=2, n_jobs=-1, cv=cv)
grid_cv.fit(X_full_train, y_full_train)


print("Best cross-val score\t:\t", grid_cv.best_score_)
print("Best hyperparam score\t:\t", grid_cv.best_estimator_)
print("R2\t: ", grid_cv.best_estimator_.score(X_full_train, y_full_train))
print("Q2\t: ", grid_cv.best_estimator_.score(X_full_test, y_full_test))

## Ionizable

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer

# https://stackoverflow.com/a/23835410
# Reload the ionizable split from the filtered, semicolon-separated CSV files
# (this rebinds the frames loaded earlier from the Excel workbook).
ionizable_train = pd.read_csv("../../Data/Filtered/ionizable_train.csv", sep=";")
ionizable_test = pd.read_csv("../../Data/Filtered/ionizable_test.csv", sep=";")

# Scaler is instantiated but only used in the commented-out lines below.
Scaler = RobustScaler()
Norm = Normalizer()
# TRAIN: every column except the target is a feature.
X_ionizable_train = ionizable_train.loc[:, ionizable_train.columns != "Log_MP_RATIO"]
y_ionizable_train = ionizable_train["Log_MP_RATIO"]
#
# Scaler.fit(X_ionizable_train)
# X_ionizable_train = pd.DataFrame(Scaler.transform(X_ionizable_train), columns = X_ionizable_train.columns)

# Normalize the training features and rebuild a DataFrame with the same column names.
Norm.fit(X_ionizable_train)
X_ionizable_train = pd.DataFrame(Norm.transform(X_ionizable_train), columns=X_ionizable_train.columns)

# TEST
X_ionizable_test = ionizable_test.loc[:, ionizable_test.columns != "Log_MP_RATIO"]
y_ionizable_test = ionizable_test["Log_MP_RATIO"]
#
# Scaler.fit(X_ionizable_test)
# X_ionizable_test = pd.DataFrame(Scaler.transform(X_ionizable_test), columns = X_ionizable_test.columns)

# NOTE(review): the transformer is re-fitted on the test frame. Normalizer
# presumably rescales each row independently, so this should not leak test
# statistics — but the commented-out RobustScaler variant would; confirm before
# re-enabling it.
Norm.fit(X_ionizable_test)
X_ionizable_test = pd.DataFrame(Norm.transform(X_ionizable_test), columns=X_ionizable_test.columns)

In [None]:
# Ridge baseline with default alpha on the ionizable features.
rr = Ridge(max_iter=100000).fit(X_ionizable_train, y_ionizable_train)
folds = KFold(n_splits=5)

# Compute the three figures first, then report them in the usual order.
r2_train = rr.score(X_ionizable_train, y_ionizable_train)
cv_mean = cross_val_score(Ridge(), X_ionizable_train, y_ionizable_train, cv=folds).mean()
q2_test = rr.score(X_ionizable_test, y_ionizable_test)

print("R2 train\t\t:\t", r2_train)
print("Cross-val train\t:\t", cv_mean)
print("Q2 test\t\t\t:\t", q2_test)

In [None]:
# Grid-search the number of features kept by RFE (Ridge ranking estimator)
# on the ionizable set.
rr = Ridge(max_iter=100000)
rr.fit(X_ionizable_train, y_ionizable_train)

folds = KFold(n_splits=5)


# Candidates are 1 .. n_columns-1; keeping every column is not tried.
# NOTE(review): confirm the upper bound is intentional and not an off-by-one.
hyper_params = [{"n_features_to_select": list(range(1,len(X_ionizable_train.columns)))}]

rfe = RFE(rr, step=1)

model_cv = GridSearchCV(estimator=rfe,
                        param_grid=hyper_params,
                        scoring="r2",
                        cv=folds,
                        verbose=2,
                        return_train_score=True,
                        n_jobs=-1)


model_cv.fit(X_ionizable_train, y_ionizable_train)

cv_results = pd.DataFrame(model_cv.cv_results_)

# Rebinds the global `best_rfe`, which the following grid-search cell tunes.
best_rfe = model_cv.best_estimator_


# Mean validation ("test score") and training R2 against the number of features kept.
plt.figure(figsize=(16,6))
plt.plot(cv_results["param_n_features_to_select"], cv_results["mean_test_score"])
plt.plot(cv_results["param_n_features_to_select"], cv_results["mean_train_score"])
plt.xlabel("number of features")
plt.ylabel("R2")
plt.title("Optimal number of features")
plt.legend(["test score", "train score"], loc="upper left")
plt.show()


# Summary: selector parameters, train score, cross-validated score, test score
# and the names of the retained features.
print("best model\t:\t",best_rfe.get_params(), "\n\tR2\t:\t", best_rfe.score(X_ionizable_train, y_ionizable_train),
      "\n\tCross-val\t:\t", cross_val_score(best_rfe, X_ionizable_train, y_ionizable_train, scoring="r2", cv=folds).mean(),
      "\n\tQ2\t:\t", best_rfe.score(X_ionizable_test, y_ionizable_test),
      "\nSelect features:", best_rfe.get_feature_names_out())

In [None]:
# Tune the Ridge wrapped in the selected RFE (`best_rfe`) on the ionizable set.
param_grid = {
    "estimator__alpha": np.linspace(1e-5, 100, 1000),
    # "lbfgs" is left out: Ridge only accepts it together with positive=True,
    # so every fit using it fails (silently here, because warnings are disabled
    # in this notebook) and one eighth of the search is wasted.
    "estimator__solver": ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]
}

cv = KFold(5)

grid = GridSearchCV(estimator=best_rfe, param_grid=param_grid, scoring="r2", verbose=2, n_jobs=-1, cv=cv)
grid.fit(X_ionizable_train, y_ionizable_train)


print("Best cross-val score\t:\t", grid.best_score_)
print("Best hyperparam score\t:\t", grid.best_estimator_)
print("R2\t: ", grid.best_estimator_.score(X_ionizable_train, y_ionizable_train))
print("Q2\t: ", grid.best_estimator_.score(X_ionizable_test, y_ionizable_test))

In [None]:
# Keep only the low-alpha part of the grid (alpha < 20) and draw the mean CV
# score against alpha, one line per solver.
alpha_column = "param_estimator__alpha"
grid_results = pd.DataFrame(grid.cv_results_).loc[lambda table: table[alpha_column] < 20]

plot = sns.lineplot(
    data=grid_results,
    x=alpha_column,
    y="mean_test_score",
    hue="param_estimator__solver",
)
plot.set(xlabel="Alpha value", ylabel="Cross value")

## Neutral

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer

# https://stackoverflow.com/a/23835410
# Reload the neutral split from the filtered, semicolon-separated CSV files
# (this rebinds the frames loaded earlier from the Excel workbook).
neutral_train = pd.read_csv("../../Data/Filtered/neutral_train.csv", sep=";")
neutral_test = pd.read_csv("../../Data/Filtered/neutral_test.csv", sep=";")

# Scaler is instantiated but only used in the commented-out lines below.
Scaler = RobustScaler()
Norm = Normalizer()
# TRAIN: every column except the target is a feature.
X_neutral_train = neutral_train.loc[:, neutral_train.columns != "Log_MP_RATIO"]
y_neutral_train = neutral_train["Log_MP_RATIO"]
#
# Scaler.fit(X_neutral_train)
# X_neutral_train = pd.DataFrame(Scaler.transform(X_neutral_train), columns = X_neutral_train.columns)

# Normalize the training features and rebuild a DataFrame with the same column names.
Norm.fit(X_neutral_train)
X_neutral_train = pd.DataFrame(Norm.transform(X_neutral_train), columns=X_neutral_train.columns)

# TEST
X_neutral_test = neutral_test.loc[:, neutral_test.columns != "Log_MP_RATIO"]
y_neutral_test = neutral_test["Log_MP_RATIO"]
#
# Scaler.fit(X_neutral_test)
# X_neutral_test = pd.DataFrame(Scaler.transform(X_neutral_test), columns = X_neutral_test.columns)

# NOTE(review): the transformer is re-fitted on the test frame. Normalizer
# presumably rescales each row independently, so this should not leak test
# statistics — but the commented-out RobustScaler variant would; confirm before
# re-enabling it.
Norm.fit(X_neutral_test)
X_neutral_test = pd.DataFrame(Norm.transform(X_neutral_test), columns=X_neutral_test.columns)

In [None]:
# Ridge baseline with default alpha on the neutral features.
rr = Ridge(max_iter=100000).fit(X_neutral_train, y_neutral_train)
folds = KFold(n_splits=5)

# Compute the three figures first, then report them in the usual order.
r2_train = rr.score(X_neutral_train, y_neutral_train)
cv_mean = cross_val_score(Ridge(), X_neutral_train, y_neutral_train, cv=folds).mean()
q2_test = rr.score(X_neutral_test, y_neutral_test)

print("R2 train\t\t:\t", r2_train)
print("Cross-val train\t:\t", cv_mean)
print("Q2 test\t\t\t:\t", q2_test)

In [None]:
# Grid-search the number of features kept by RFE (Ridge ranking estimator)
# on the neutral set.
rr = Ridge(max_iter=100000)
rr.fit(X_neutral_train, y_neutral_train)

folds = KFold(n_splits=5)


# Candidates are 1 .. n_columns-1; keeping every column is not tried.
# NOTE(review): confirm the upper bound is intentional and not an off-by-one.
hyper_params = [{"n_features_to_select": list(range(1,len(X_neutral_train.columns)))}]

rfe = RFE(rr, step=1)

model_cv = GridSearchCV(estimator=rfe,
                        param_grid=hyper_params,
                        scoring="r2",
                        cv=folds,
                        verbose=2,
                        return_train_score=True,
                        n_jobs=-1)


model_cv.fit(X_neutral_train, y_neutral_train)

cv_results = pd.DataFrame(model_cv.cv_results_)

# Rebinds the global `best_rfe`, which the following grid-search cell tunes.
best_rfe = model_cv.best_estimator_


# Mean validation ("test score") and training R2 against the number of features kept.
plt.figure(figsize=(16,6))
plt.plot(cv_results["param_n_features_to_select"], cv_results["mean_test_score"])
plt.plot(cv_results["param_n_features_to_select"], cv_results["mean_train_score"])
plt.xlabel("number of features")
plt.ylabel("R2")
plt.title("Optimal number of features")
plt.legend(["test score", "train score"], loc="upper left")
plt.show()


# Summary: selector parameters, train score, cross-validated score, test score
# and the names of the retained features.
print("best model\t:\t",best_rfe.get_params(), "\n\tR2\t:\t", best_rfe.score(X_neutral_train, y_neutral_train),
      "\n\tCross-val\t:\t", cross_val_score(best_rfe, X_neutral_train, y_neutral_train, scoring="r2", cv=folds).mean(),
      "\n\tQ2\t:\t", best_rfe.score(X_neutral_test, y_neutral_test),
      "\nSelect features:", best_rfe.get_feature_names_out())

In [None]:
# Tune the Ridge wrapped in the selected RFE (`best_rfe`) on the neutral set.
param_grid = {
    "estimator__alpha": np.linspace(1e-5, 100, 1000),
    # "lbfgs" is left out: Ridge only accepts it together with positive=True,
    # so every fit using it fails (silently here, because warnings are disabled
    # in this notebook) and one eighth of the search is wasted.
    "estimator__solver": ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]
}

cv = KFold(5)

grid = GridSearchCV(estimator=best_rfe, param_grid=param_grid, scoring="r2", verbose=2, n_jobs=-1, cv=cv)
grid.fit(X_neutral_train, y_neutral_train)


print("Best cross-val score\t:\t", grid.best_score_)
print("Best hyperparam score\t:\t", grid.best_estimator_)
print("R2\t: ", grid.best_estimator_.score(X_neutral_train, y_neutral_train))
print("Q2\t: ", grid.best_estimator_.score(X_neutral_test, y_neutral_test))

In [None]:
# Keep only the low-alpha part of the grid (alpha < 3) and draw the mean CV
# score against alpha, one line per solver.
alpha_column = "param_estimator__alpha"
grid_results = pd.DataFrame(grid.cv_results_).loc[lambda table: table[alpha_column] < 3]

plot = sns.lineplot(
    data=grid_results,
    x=alpha_column,
    y="mean_test_score",
    hue="param_estimator__solver",
)
plot.set(xlabel="Alpha value", ylabel="Cross value")

# Demo of good R2 Q2 but bad cross val

In [None]:
from sklearn.model_selection import LeaveOneOut, ShuffleSplit, StratifiedShuffleSplit

# Example of a model with "good" R2 and Q2 that a CV-driven search will not
# find, because its cross-validation score is low.
rr = Ridge().fit(X_full_train, y_full_train)

# Keep 20 features, ranked by the Ridge above.
rfe = RFE(rr, n_features_to_select=20).fit(X_full_train, y_full_train)

# Four random 90/10 splits with a fixed seed.
shuffle_cv = ShuffleSplit(n_splits=4, test_size=0.1, random_state=0)

print(rfe.score(X_full_train, y_full_train))
print(rfe.score(X_full_test, y_full_test))
print(cross_val_score(rfe, X_full_train, y_full_train, scoring="r2", cv=shuffle_cv).mean())


# Kernel Ridge

In [None]:
# https://stackoverflow.com/a/23835410
# Reload the full split from the filtered, semicolon-separated CSV files
# (this rebinds the frames loaded earlier from the Excel workbook).
full_train = pd.read_csv("../../Data/Filtered/full_train.csv", sep=";")
full_test = pd.read_csv("../../Data/Filtered/full_test.csv", sep=";")

# TRAIN: every column except the target is a feature.
X_full_train = full_train.loc[:, full_train.columns != "Log_MP_RATIO"]
y_full_train = full_train["Log_MP_RATIO"]

# TEST
X_full_test = full_test.loc[:, full_test.columns != "Log_MP_RATIO"]
y_full_test = full_test["Log_MP_RATIO"]

In [None]:
# KernelRidge baseline with default settings on the full set.
krr = KernelRidge().fit(X_full_train, y_full_train)

# Compute the three figures first, then report them in the usual order.
r2_train = krr.score(X_full_train, y_full_train)
cv_mean = cross_val_score(krr, X_full_train, y_full_train, cv=3).mean()
q2_test = krr.score(X_full_test, y_full_test)

print("R2 train\t\t:\t", r2_train)
print("Cross-val train\t:\t", cv_mean)
print("Q2 test\t\t\t:\t", q2_test)

In [None]:
# Grid-search the number of features kept by RFE around a KernelRidge
# estimator on the full set.
krr = KernelRidge()

krr.fit(X_full_train, y_full_train)


def _linear_krr_feature_weights(fitted_krr):
    """Per-feature weights of a fitted linear-kernel KernelRidge.

    KernelRidge exposes neither ``coef_`` nor ``feature_importances_``, which
    RFE requires by default, so RFE could not rank features and every fit
    failed. With the linear kernel (the KernelRidge default) the prediction is
    ``X @ X_fit_.T @ dual_coef_``, so ``X_fit_.T @ dual_coef_`` is the
    equivalent primal weight vector. Only valid for ``kernel="linear"``.
    """
    return np.asarray(fitted_krr.X_fit_).T @ np.asarray(fitted_krr.dual_coef_)


# Defined explicitly so the summary below does not rely on the `folds` object
# left behind by an earlier cell (which was also an unshuffled 5-fold split).
folds = KFold(n_splits=5)


# Candidates are 1 .. n_columns-1; keeping every column is not tried.
hyper_params = [{"n_features_to_select": list(range(1,len(X_full_train.columns)))}]

rfe = RFE(krr, step=1, importance_getter=_linear_krr_feature_weights)

model_cv = GridSearchCV(estimator=rfe,
                        param_grid=hyper_params,
                        scoring="r2",
                        cv=None,
                        verbose=2,
                        return_train_score=True,
                        n_jobs=-1)


model_cv.fit(X_full_train, y_full_train)

cv_results = pd.DataFrame(model_cv.cv_results_)

best_rfe = model_cv.best_estimator_


# Mean validation ("test score") and training R2 against the number of features kept.
plt.figure(figsize=(16,6))
plt.plot(cv_results["param_n_features_to_select"], cv_results["mean_test_score"])
plt.plot(cv_results["param_n_features_to_select"], cv_results["mean_train_score"])
plt.xlabel("number of features")
plt.ylabel("R2")
plt.title("Optimal number of features")
plt.legend(["test score", "train score"], loc="upper left")
plt.show()


# Summary: selector parameters, train score, cross-validated score, test score
# and the names of the retained features.
print("best model\t:\t",best_rfe.get_params(), "\n\tR2\t:\t", best_rfe.score(X_full_train, y_full_train),
      "\n\tCross-val\t:\t", cross_val_score(best_rfe, X_full_train, y_full_train, scoring="r2", cv=folds).mean(),
      "\n\tQ2\t:\t", best_rfe.score(X_full_test, y_full_test),
      "\nSelect features:", best_rfe.get_feature_names_out())

In [None]:
# Exhaustive search over alpha, kernel and gamma for KernelRidge on the full set.
# NOTE(review): 200 x 4 x 200 candidates, each fitted on 5 folds, is a very
# large search on linearly spaced values — confirm this is intended.
krr = KernelRidge()
param_grid = {
    "alpha": np.linspace(1e-5, 10000, 200),
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "gamma": np.linspace(1e-5, 10000, 200)
}
cv = KFold(5)

kernel_grid = GridSearchCV(estimator=krr, param_grid=param_grid, scoring="r2",verbose=2, n_jobs=-1, cv=cv)
kernel_grid.fit(X_full_train, y_full_train)

In [None]:
# Report the KernelRidge search: best mean CV score, the best estimator itself
# (printed under the "hyperparam score" label), and its train / test scores.
print("Best cross-val score\t:\t", kernel_grid.best_score_)
print("Best hyperparam score\t:\t", kernel_grid.best_estimator_)
print("R2\t: ", kernel_grid.best_estimator_.score(X_full_train, y_full_train))
print("Q2\t: ", kernel_grid.best_estimator_.score(X_full_test, y_full_test))

In [None]:
# Fit a KernelRidge with hand-picked parameters and print its train score.
# NOTE(review): gamma is presumably ignored by the linear kernel — confirm in the
# scikit-learn documentation.
ridge = KernelRidge(alpha=10000.0, gamma=10.0001, kernel='linear')
ridge.fit(X_full_train, y_full_train)
print(ridge.score(X_full_train, y_full_train))

# Train and test stacked together; not used again within this cell.
X_full = pd.concat([X_full_train, X_full_test])
y_full = pd.concat([y_full_train, y_full_test])

# Test score of the same model.
print(ridge.score(X_full_test, y_full_test))


# Shapes of the two feature matrices.
print(X_full_train.shape)
print(X_full_test.shape)

In [None]:

from sklearn.svm import SVR

# RBF-kernel SVR with hand-picked gamma / epsilon, fitted on the full training
# set; prints its score on the train split, then on the test split.
clf = SVR(kernel='rbf', gamma=0.089, epsilon=0.00001).fit(X_full_train, y_full_train)
print(clf.score(X_full_train, y_full_train))
print(clf.score(X_full_test, y_full_test))

In [None]:
from xgboost import XGBRegressor

# XGBoost regressor with default settings: train score, test score, then the
# mean 5-fold cross-validation score of a fresh default model.
xgb = XGBRegressor().fit(X_full_train, y_full_train)
print("R2: ", xgb.score(X_full_train, y_full_train))
print("Q2: ", xgb.score(X_full_test, y_full_test))
print(cross_val_score(XGBRegressor(), X_full_train, y_full_train, cv=5).mean())


In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer

# https://stackoverflow.com/a/23835410
# Reload the full split from the filtered, semicolon-separated CSV files.
full_train = pd.read_csv("../../Data/Filtered/full_train.csv", sep=";")
full_test = pd.read_csv("../../Data/Filtered/full_test.csv", sep=";")

# Both transformers are instantiated, but every line that applies them is
# commented out, so the features below are left unscaled.
Scaler = RobustScaler()
Norm = Normalizer()
# TRAIN: every column except the target is a feature.
X_full_train = full_train.loc[:, full_train.columns != "Log_MP_RATIO"]
y_full_train = full_train["Log_MP_RATIO"]
#
# Scaler.fit(X_full_train)
# X_full_train = pd.DataFrame(Scaler.transform(X_full_train), columns = X_full_train.columns)

# Norm.fit(X_full_train)
# X_full_train = pd.DataFrame(Norm.transform(X_full_train), columns=X_full_train.columns)

# TEST
X_full_test = full_test.loc[:, full_test.columns != "Log_MP_RATIO"]
y_full_test = full_test["Log_MP_RATIO"]
#
# Scaler.fit(X_full_test)
# X_full_test = pd.DataFrame(Scaler.transform(X_full_test), columns = X_full_test.columns)
#
# Norm.fit(X_full_test)
# X_full_test = pd.DataFrame(Norm.transform(X_full_test), columns=X_full_test.columns)

In [None]:
from sklearn.model_selection import StratifiedKFold

# Example of a model with "good" R2 and Q2 that a CV-driven search will not
# find, because its cross-validation score is low.

rr = Ridge()
rr.fit(X_full_train, y_full_train)

# Keep 20 features, ranked by the Ridge above.
rfe = RFE(rr, n_features_to_select=20)
rfe = rfe.fit(X_full_train, y_full_train)

# StratifiedKFold only supports classification targets; splitting on the
# Log_MP_RATIO regression target raises ValueError, so a plain KFold is used.
cv = KFold(5)

print(rfe.score(X_full_train, y_full_train))
print(rfe.score(X_full_test, y_full_test))
print(cross_val_score(rfe, X_full_train, y_full_train, scoring="r2", cv=cv).mean())
