In [None]:
from ktools.utils.load_dataframes import load_all_dataframes
from ktools.modelling.ensemble.ridge_regression_blending_ensemble import RidgeRegressionBlendingEnsemble

In [None]:
# Both prediction tables live in sibling folders under the same competition directory.
# NOTE(review): the code below indexes `oofs` and `test_preds` like DataFrames whose
# columns are model names — confirm that is what load_all_dataframes returns.
DATA_ROOT = "./data/diabetes_prediction"
oofs = load_all_dataframes(f"{DATA_ROOT}/oofs/")
test_preds = load_all_dataframes(f"{DATA_ROOT}/test_preds/")

In [None]:
import pandas as pd
from sklearn.metrics import roc_auc_score


def plot_test_dist_performance_against_nontest(oofs: pd.DataFrame, test_preds: pd.DataFrame) -> None:
    """Scatter every model's ROC AUC on the ``data == 1`` rows against the ``data == 0`` rows.

    For each column name in ``test_preds`` the matching column of ``oofs`` is
    scored with ``roc_auc_score`` twice: once on the rows where
    ``oofs["data"] == 1`` (x axis, labelled "train-like") and once on the rows
    where ``oofs["data"] == 0`` (y axis, labelled "test-like"). A red dashed
    diagonal from (0, 0) to (1, 1) marks equal scores on both subsets.

    Args:
        oofs: Frame holding a ``"data"`` column, the ``"diagnosed_diabetes"``
            label column, and one prediction column per model.
        test_preds: Only its column names are used; they select which
            prediction columns of ``oofs`` get scored.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Imported lazily so the notebook does not need matplotlib unless a plot is requested.
    import matplotlib.pyplot as plt

    # The row masks and label vectors are the same for every model, so build them once.
    test_like_rows = oofs["data"] == 0
    train_like_rows = oofs["data"] == 1
    true_labels_test = oofs.loc[test_like_rows, "diagnosed_diabetes"]
    true_labels_train = oofs.loc[train_like_rows, "diagnosed_diabetes"]

    auc_tests = []
    auc_trains = []
    for model_name in test_preds.columns:
        auc_tests.append(roc_auc_score(true_labels_test, oofs.loc[test_like_rows, model_name]))
        auc_trains.append(roc_auc_score(true_labels_train, oofs.loc[train_like_rows, model_name]))

    plt.figure(figsize=(12, 6))
    plt.scatter(auc_trains, auc_tests)
    # Reference line: points on it score identically on both subsets.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlabel('Train-like OOF ROC AUC')
    plt.ylabel('Test-like OOF ROC AUC')
    plt.title('Train vs Test ROC AUC Scores')
    plt.grid()
    plt.show()

In [None]:
# plot_test_dist_performance_against_nontest(oofs, test_preds)

In [None]:
# From here on, work only with the rows flagged data == 0 — the subset the plotting
# helper above labels "test-like". The name is rebound, so the data == 1 rows are no
# longer reachable through `oofs` after this cell.
oofs = oofs.loc[oofs["data"].eq(0)]

In [None]:
# oofs.drop(columns=[f for f in oofs.columns if f.endswith("BAG_L1")], inplace=True)
# test_preds.drop(columns=[f for f in test_preds.columns if f.endswith("BAG_L1")], inplace=True)

In [None]:
# Print each model's ROC AUC on the retained out-of-fold rows.
true_labels = oofs["diagnosed_diabetes"]
for model in test_preds.columns:
    auc = roc_auc_score(true_labels, oofs[model])
    print(f"{model}: ROC AUC = {auc:.4f}")

In [None]:
from sklearn.metrics import roc_auc_score


# Blend inputs: one out-of-fold prediction column per model that also has test predictions,
# so the fitted blend can later be applied to `test_preds` column-for-column.
blend_inputs = oofs[test_preds.columns]
blend_targets = oofs["diagnosed_diabetes"]

# NOTE(review): alpha is presumably the ridge regularisation strength (very small here) —
# confirm against RidgeRegressionBlendingEnsemble.
ensemble = RidgeRegressionBlendingEnsemble(
    oof_dataframe=blend_inputs,
    train_labels=blend_targets,
    metric=roc_auc_score,
    alpha=1e-6,
)
ensemble.fit_weights()

In [None]:
# flaml_test_pred = pd.read_csv("/workspaces/Kaggle-tools/data/diabetes_prediction/test_preds/test_preds_ef225044-04a9-4761-83b3-eb7bb507da92.csv", index_col=0)

# sub_name = f"submissions/flaml_test_pred.csv"
# sample_sub = pd.read_csv("data/diabetes_prediction/sample_submission.csv", index_col=0)
# sample_sub["diagnosed_diabetes"] = flaml_test_pred.to_numpy()
# sample_sub.to_csv(sub_name)

In [None]:
# Apply the fitted blend to the per-model test predictions. `test_preds` has exactly the
# columns the ensemble was fitted on (the OOF frame was selected with test_preds.columns).
# NOTE(review): the return type of predict() is not visible here; later cells assign it
# straight into a DataFrame column and scale it by a float — confirm it is 1-D array-like.
final_test_preds = ensemble.predict(test_preds)

In [None]:
from datetime import datetime
from pathlib import Path

import pandas as pd

# Timestamped file name so each run writes its own submission file.
sub_name = f"submissions/ridge_submission_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
# DataFrame.to_csv does not create missing parent folders; make sure the output
# directory exists so a fresh checkout does not fail on the final write.
Path(sub_name).parent.mkdir(parents=True, exist_ok=True)

# Use the sample submission as the template: its first column becomes the index,
# and the target column is overwritten with the ensemble's predictions.
sample_sub = pd.read_csv("data/diabetes_prediction/sample_submission.csv", index_col=0)
sample_sub["diagnosed_diabetes"] = final_test_preds
sample_sub.to_csv(sub_name)

In [None]:
# External submission that gets blended with the ridge ensemble in the next cell.
# Loaded relative to the working directory, like the sample submission and the
# prediction folders, instead of through a machine-specific absolute path.
other = pd.read_csv("data/diabetes_prediction/submission (52).csv")

In [None]:
# Fixed-weight blend of the ridge ensemble with the external submission.
RIDGE_WEIGHT = 0.6
OTHER_WEIGHT = 0.4

# The external predictions are taken positionally (to_numpy), so both sources must
# list the test rows in the same order — TODO confirm for the external file.
ridge_part = final_test_preds * RIDGE_WEIGHT
other_part = other["diagnosed_diabetes"].to_numpy() * OTHER_WEIGHT
sample_sub["diagnosed_diabetes"] = ridge_part + other_part

# NOTE(review): this writes to the same `sub_name` as the ridge-only submission and so
# replaces that file with the blended predictions — confirm the overwrite is intended.
sample_sub.to_csv(sub_name)