<a href="https://colab.research.google.com/github/virbickt/default-risk-prediction/blob/main/modeling_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Optimization of the data-processing steps

So far we have been rushing towards establishing a baseline without giving much thought to the steps we took along the way. In what follows we will look back in order to determine whether those steps were optimal.

### GridSearch (best numerical (housing) features imputer)

- Having a pipeline at hand allows us to perform experiments by changing a single element of the pipeline while keeping all the remaining steps in place. In this case, we are only changing the imputer for the housing features, which is one of the building blocks of a pipeline that also comprises other imputers, an encoder and a scaler.
- Also, having all the preprocessing steps sitting inside a pipeline assures that all the steps are executed when producing an output file.

Here our search space is defined by the following imputation methods:

In [None]:
# Column-name stems that repeat with the _AVG, _MODE and _MEDI suffixes,
# listed in the order they appear within each suffix group.
_housing_stems = [
    "APARTMENTS",
    "BASEMENTAREA",
    "YEARS_BEGINEXPLUATATION",
    "YEARS_BUILD",
    "COMMONAREA",
    "ELEVATORS",
    "ENTRANCES",
    "FLOORSMAX",
    "FLOORSMIN",
    "LANDAREA",
    "LIVINGAPARTMENTS",
    "LIVINGAREA",
    "NONLIVINGAPARTMENTS",
    "NONLIVINGAREA",
]

# Columns handed to the dedicated "housing" imputer: two standalone columns,
# every stem under each of the three suffixes, then TOTALAREA_MODE.
housing = [
    "DAYS_EMPLOYED",
    "CNT_FAM_MEMBERS",
    *(
        f"{stem}_{suffix}"
        for suffix in ("AVG", "MODE", "MEDI")
        for stem in _housing_stems
    ),
    "TOTALAREA_MODE",
]

# Candidate imputers for the housing columns; the grid search swaps each one
# into the pipeline step addressed by the parameter key below.
list_imputers = [
    MeanMedianImputer(variables=housing),
    ArbitraryNumberImputer(arbitrary_number=0, variables=housing),
    RandomSampleImputer(variables=housing, seed="general"),
    EndTailImputer(variables=housing),
]

# Search space: a single pipeline parameter with one value per candidate.
params = dict(preprocessor__numerical__housing_imputer=list_imputers)

# Pipeline under test, built by pipeline_vendor around a default LightGBM model.
lgbm = LGBMClassifier()
pipe = pipeline_vendor(WORKING_train, lgbm)

# 5-fold cross-validated search scored by ROC AUC, using all available cores.
gscv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    verbose=0,
)

gscv.fit(WORKING_train, WORKING_y)

In [None]:
# Per-fold test scores for each candidate. The trailing index picks the
# candidate; this assumes cv_results_ rows follow the order of list_imputers.
mm_scores = [gscv.cv_results_[f"split{fold}_test_score"][0] for fold in range(5)]
an_scores = [gscv.cv_results_[f"split{fold}_test_score"][1] for fold in range(5)]
rs_scores = [gscv.cv_results_[f"split{fold}_test_score"][2] for fold in range(5)]
et_scores = [gscv.cv_results_[f"split{fold}_test_score"][3] for fold in range(5)]

In [None]:
# Line plot of the per-fold ROC AUC for every housing-feature imputer tried
# in the grid search (one trace per imputer, folds numbered from 1).
fig = go.Figure()
for _label, _scores in (
    ("MeanMedianImputer", mm_scores),
    ("ArbitraryNumberImputer", an_scores),
    ("RandomSampleImputer", rs_scores),
    # et_scores holds the EndTailImputer results; it was previously
    # mislabelled "constant (0)".
    ("EndTailImputer", et_scores),
):
    fig.add_trace(
        go.Scatter(x=np.arange(1, len(_scores) + 1), y=_scores, name=_label)
    )

fig.update_layout(
    # The title previously referred to SimpleImputer strategies, which belong
    # to a different experiment.
    title="Cross-validation scores on a test set on 5 folds for housing-feature imputers",
    xaxis={"tickformat": ",d"},  # integer tick labels for the fold number
    xaxis_title="Fold",
    yaxis_title="ROC AUC",
    title_font_family="Raleway",
)
fig.update_xaxes(title_font_family="Raleway")


fig.show()

### GridSearch (best numerical features imputer)

In [None]:
# Load the application table, then split it into the feature frame X (without
# the id and target columns) and the target vector y.
app_df = pd.read_parquet("/content/drive/MyDrive/341/app_df.gzip")
X = app_df.drop(columns=["SK_ID_CURR", "TARGET"])
y = app_df["TARGET"].values

In [None]:
# Strategies to try on the numerical imputer step of the pipeline.
list_strategies = ["mean", "median", "most_frequent", "constant"]

# Search space: only the imputer's `strategy` parameter varies.
params = dict(preprocessor__numerical__imputer__strategy=list_strategies)

# Pipeline under test, built by pipeline_vendor around a default LightGBM model.
lgbm = LGBMClassifier()
pipe = pipeline_vendor(WORKING_train, lgbm)

# 5-fold cross-validated search scored by ROC AUC, using all available cores.
gscv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    verbose=0,
)

gscv.fit(WORKING_train, WORKING_y)

# Best mean cross-validated score and the parameter value that produced it.
print(gscv.best_score_, gscv.best_params_, sep="\n")

In [None]:
# Per-fold test scores for each strategy. The trailing index picks the
# candidate; this assumes cv_results_ rows follow the order of list_strategies.
mean_scores = [gscv.cv_results_[f"split{fold}_test_score"][0] for fold in range(5)]
median_scores = [gscv.cv_results_[f"split{fold}_test_score"][1] for fold in range(5)]
mode_scores = [gscv.cv_results_[f"split{fold}_test_score"][2] for fold in range(5)]
constant_scores = [gscv.cv_results_[f"split{fold}_test_score"][3] for fold in range(5)]

In [None]:
# Line plot of the per-fold ROC AUC for each imputation strategy
# (one trace per strategy, folds numbered from 1).
fig = go.Figure()
for _label, _scores in (
    ("mean", mean_scores),
    ("median", median_scores),
    ("most_frequent", mode_scores),
    ("constant (0)", constant_scores),
):
    fig.add_trace(
        go.Scatter(x=np.arange(1, len(_scores) + 1), y=_scores, name=_label)
    )

fig.update_layout(
    title="Cross-validation scores on a test set on 5 folds for SimpleImputer strategy",
    title_font_family="Raleway",
    xaxis={"tickformat": ",d"},  # integer tick labels for the fold number
    xaxis_title="Fold",
    yaxis_title="ROC AUC",
)
fig.update_xaxes(title_font_family="Raleway")


fig.show()

### GridSearch (best encoder)

In [None]:
# Encoders to compare on the categorical branch of the preprocessor.
list_encoders = [
    OneHotEncoder(handle_unknown="ignore"),
    CountFrequencyEncoder(encoding_method="frequency"),
    OrdinalEncoder(),
    MeanEncoder(),
    WoEEncoder(),
    PRatioEncoder(),
    DecisionTreeEncoder(scoring="roc_auc"),
]

# Search space: the whole encoder step is swapped for each candidate.
params = dict(preprocessor__categorical__encoder=list_encoders)

# Pipeline under test, built by pipeline_vendor around a default LightGBM model.
lgbm = LGBMClassifier()
pipe = pipeline_vendor(app_df, lgbm)


# 5-fold cross-validated search scored by ROC AUC, using all available cores.
gscv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    verbose=0,
)

gscv.fit(X, y)

# Best mean cross-validated score and the encoder that produced it.
print(gscv.best_score_, gscv.best_params_, sep="\n")

In [None]:
# Per-fold test scores for each encoder. The trailing index picks the
# candidate; this assumes cv_results_ rows follow the order of list_encoders.
ohe_scores = [gscv.cv_results_[f"split{fold}_test_score"][0] for fold in range(5)]
cfe_scores = [gscv.cv_results_[f"split{fold}_test_score"][1] for fold in range(5)]
oe_scores = [gscv.cv_results_[f"split{fold}_test_score"][2] for fold in range(5)]
me_scores = [gscv.cv_results_[f"split{fold}_test_score"][3] for fold in range(5)]
woee_scores = [gscv.cv_results_[f"split{fold}_test_score"][4] for fold in range(5)]
pre_scores = [gscv.cv_results_[f"split{fold}_test_score"][5] for fold in range(5)]
dte_scores = [gscv.cv_results_[f"split{fold}_test_score"][6] for fold in range(5)]

In [None]:
# Line plot of the per-fold ROC AUC for every categorical encoder tried in the
# grid search (one trace per encoder, folds numbered from 1).
fig = go.Figure()
for _label, _scores in (
    ("OneHotEncoder", ohe_scores),
    ("CountFrequencyEncoder", cfe_scores),
    ("OrdinalEncoder", oe_scores),
    ("MeanEncoder", me_scores),
    # woee_scores was extracted alongside the others but never plotted.
    ("WoEEncoder", woee_scores),
    ("PRatioEncoder", pre_scores),
    ("DecisionTreeEncoder", dte_scores),
):
    fig.add_trace(
        go.Scatter(x=np.arange(1, len(_scores) + 1), y=_scores, name=_label)
    )

fig.update_layout(
    title=" Cross-validation scores on a test set for 5 folds (Various encodings for categorical variables)",
    xaxis={"tickformat": ",d"},  # integer tick labels for the fold number
    xaxis_title="Fold",
    yaxis_title="ROC AUC",
    title_font_family="Raleway",
)
fig.update_xaxes(title_font_family="Raleway")


fig.show()

### GridSearch (best scaler)

In [None]:
# Scalers to compare on the numerical branch of the preprocessor.
list_scalers = [
    StandardScaler(),
    RobustScaler(),
    MinMaxScaler(),
    QuantileTransformer(output_distribution="normal"),
]

# Search space: the whole scaler step is swapped for each candidate.
params = dict(preprocessor__numerical__scaler=list_scalers)

# Pipeline under test, built by pipeline_vendor around a default LightGBM model.
lgbm = LGBMClassifier()
pipe = pipeline_vendor(app_df, lgbm)

# 5-fold cross-validated search scored by ROC AUC, using all available cores.
gscv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    verbose=0,
)

gscv.fit(X, y)

# Best mean cross-validated score and the scaler that produced it.
print(gscv.best_score_, gscv.best_params_, sep="\n")

NameError: ignored

In [None]:
# Per-fold test scores for each scaler. The trailing index picks the
# candidate; this assumes cv_results_ rows follow the order of list_scalers.
# NOTE: rs_scores and mm_scores reuse names from the housing-imputer
# experiment and therefore replace those earlier values.
ss_scores = [gscv.cv_results_[f"split{fold}_test_score"][0] for fold in range(5)]
rs_scores = [gscv.cv_results_[f"split{fold}_test_score"][1] for fold in range(5)]
mm_scores = [gscv.cv_results_[f"split{fold}_test_score"][2] for fold in range(5)]
qt_scores = [gscv.cv_results_[f"split{fold}_test_score"][3] for fold in range(5)]

NameError: ignored

In [None]:
# Line plot of the per-fold ROC AUC for each scaler
# (one trace per scaler, folds numbered from 1).
fig = go.Figure()
for _label, _scores in (
    ("StandardScaler", ss_scores),
    ("RobustScaler", rs_scores),
    ("MinMaxScaler", mm_scores),
    ("QuantileTransformer", qt_scores),
):
    fig.add_trace(
        go.Scatter(x=np.arange(1, len(_scores) + 1), y=_scores, name=_label)
    )

fig.update_layout(
    title="Cross-validation scores on a test set on 5 folds",
    title_font_family="Raleway",
    xaxis={"tickformat": ",d"},  # integer tick labels for the fold number
    xaxis_title="Fold",
    yaxis_title="ROC AUC",
)
fig.update_xaxes(title_font_family="Raleway")


fig.show()

## Testing alternative models

In [None]:
# Attach the labels as a "target" column so that features and label live in
# the single frame passed to the PyCaret experiment.
# NOTE(review): this adds the column to WORKING_train in place; re-running the
# grid-search cells afterwards may expose "target" to the pipeline - confirm.
WORKING_train["target"] = WORKING_y.values
pycaret_experiment = pycr.setup(
    data=WORKING_train,
    target="target",
    session_id=1013,
)

# Compare the candidate models, ranked by AUC, and keep what compare_models returns.
best = pycr.compare_models(sort="AUC")

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9203,0.7798,0.0296,0.5541,0.0562,0.0485,0.1144,6.363
gbc,Gradient Boosting Classifier,0.9201,0.7732,0.0201,0.5518,0.0388,0.0333,0.0939,173.207
lda,Linear Discriminant Analysis,0.9127,0.7719,0.1331,0.375,0.1964,0.1612,0.1858,7.303
ada,Ada Boost Classifier,0.9199,0.7691,0.0224,0.5188,0.0429,0.0364,0.0948,31.874
et,Extra Trees Classifier,0.92,0.7445,0.0045,0.6584,0.009,0.0079,0.0494,49.637
rf,Random Forest Classifier,0.92,0.7438,0.0079,0.5633,0.0157,0.0134,0.0597,50.579
lr,Logistic Regression,0.9198,0.6583,0.0002,0.1583,0.0003,0.0002,0.0024,39.586
nb,Naive Bayes,0.6984,0.6243,0.4353,0.1273,0.1849,0.075,0.0988,0.35
dt,Decision Tree Classifier,0.8572,0.5473,0.1783,0.1567,0.1668,0.089,0.0893,21.997
knn,K Neighbors Classifier,0.9143,0.5465,0.0141,0.1437,0.0256,0.0116,0.0209,80.756


In [None]:
# Create the model registered under the "lr" id in the PyCaret experiment.
lr = pycr.create_model("lr")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9198,0.6659,0.0,0.0,0.0,-0.0001,-0.002
1,0.9199,0.6568,0.0006,1.0,0.0012,0.0011,0.0231
2,0.9198,0.6629,0.0,0.0,0.0,-0.0001,-0.002
3,0.9197,0.6508,0.0006,0.25,0.0012,0.0008,0.0085
4,0.9197,0.657,0.0,0.0,0.0,-0.0002,-0.0028
5,0.9198,0.6419,0.0006,0.3333,0.0012,0.0009,0.011
6,0.9197,0.6742,0.0,0.0,0.0,-0.0002,-0.0028
7,0.9198,0.6634,0.0,0.0,0.0,-0.0001,-0.002
8,0.9197,0.6508,0.0,0.0,0.0,-0.0003,-0.0035
9,0.9197,0.6597,0.0,0.0,0.0,-0.0003,-0.0035


In [None]:
# Take the most recent PyCaret results grid and keep the AUC column without its
# last two rows (presumably the Mean/SD summary rows - confirm).
lr_results = pull().iloc[:-2]["AUC"].values

In [None]:
# Create the model registered under the "gbc" id in the PyCaret experiment.
gbc = pycr.create_model("gbc")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9203,0.7785,0.0203,0.5932,0.0392,0.0341,0.099
1,0.9206,0.7706,0.0214,0.6379,0.0415,0.0365,0.1067
2,0.9203,0.7732,0.0191,0.6,0.0371,0.0323,0.0969
3,0.9203,0.7771,0.0226,0.5735,0.0435,0.0376,0.1023
4,0.9204,0.7776,0.0243,0.5833,0.0467,0.0406,0.1073
5,0.9196,0.7606,0.0151,0.4643,0.0292,0.0243,0.0722
6,0.9196,0.7793,0.0174,0.4688,0.0335,0.0279,0.0781
7,0.9199,0.7726,0.0203,0.5072,0.039,0.0331,0.0892
8,0.92,0.771,0.018,0.5167,0.0347,0.0295,0.085
9,0.9203,0.772,0.0226,0.5735,0.0435,0.0376,0.1023


In [None]:
# Take the most recent PyCaret results grid and keep the AUC column without its
# last two rows (presumably the Mean/SD summary rows - confirm).
gbc_results = pull().iloc[:-2]["AUC"].values

In [None]:
# Create the model registered under the "lightgbm" id in the PyCaret
# experiment; this rebinds `lgbm`, replacing the earlier LGBMClassifier().
lgbm = pycr.create_model(
    "lightgbm",
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9208,0.7874,0.0313,0.6207,0.0596,0.0523,0.1268
1,0.9203,0.7779,0.029,0.5556,0.0551,0.0475,0.1134
2,0.9199,0.7764,0.0272,0.5054,0.0517,0.0438,0.1031
3,0.9201,0.783,0.0295,0.5312,0.056,0.0479,0.1112
4,0.9207,0.7837,0.0336,0.6042,0.0637,0.0557,0.1291
5,0.9201,0.7678,0.0249,0.5309,0.0476,0.0407,0.102
6,0.9203,0.785,0.029,0.5618,0.0551,0.0476,0.1143
7,0.92,0.7791,0.0313,0.5192,0.059,0.0504,0.1127
8,0.9206,0.7782,0.0313,0.5806,0.0594,0.0516,0.1214
9,0.9201,0.779,0.029,0.5319,0.0549,0.0471,0.1102


In [None]:
# Take the most recent PyCaret results grid and keep the AUC column without its
# last two rows (presumably the Mean/SD summary rows - confirm).
lgbm_results = pull().iloc[:-2]["AUC"].values

In [None]:
# @title scores for mingling with the plot

# Hard-coded per-fold AUC snapshots, ten folds per model. Assigning them here
# replaces whatever the *_results names held before this cell ran.
lgbm_results = [
    0.757, 0.7514, 0.7513, 0.7569, 0.762,
    0.737, 0.7571, 0.7546, 0.7509, 0.7511,
]
gbc_results = [
    0.752, 0.7458, 0.7502, 0.7544, 0.7559,
    0.734, 0.7554, 0.751, 0.748, 0.7474,
]
lr_results = [
    0.7416, 0.7402, 0.75, 0.7477, 0.753,
    0.7296, 0.7509, 0.749, 0.7451, 0.7414,
]

In [None]:
# Line plot of the per-fold ROC AUC for the three candidate models
# (one trace per model, folds numbered from 1).
fig = go.Figure()
for _label, _scores in (
    ("LGBMClassifier", lgbm_results),
    ("GradientBoostingClassifier", gbc_results),
    ("LogisticRegression", lr_results),
):
    fig.add_trace(
        go.Scatter(x=np.arange(1, len(_scores) + 1), y=_scores, name=_label)
    )

fig.update_layout(
    # The fold count comes from the data: the title used to claim 5 folds
    # while the result series plotted here hold one score per fold for 10 folds.
    title=(
        f"Cross-validation scores with {len(lgbm_results)} folds for "
        "LGBMClassifier, GradientBoostingClassifier and LogisticRegression"
    ),
    xaxis=dict(tickmode="linear", tick0=0, dtick=1),  # one tick per fold
    xaxis_title="Fold",
    yaxis_title="ROC AUC",
    title_font_family="Raleway",
)

fig.update_xaxes(title_font_family="Raleway")


fig.show()

## Tuning the models

### LogisticRegression

In [None]:
# Tune the logistic regression for AUC; `lr` is rebound to the call's result.
lr = pycr.tune_model(lr, optimize="AUC")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9198,0.6659,0.0,0.0,0.0,-0.0001,-0.002
1,0.9198,0.6555,0.0,0.0,0.0,0.0,0.0
2,0.9198,0.6622,0.0,0.0,0.0,-0.0001,-0.002
3,0.9197,0.6514,0.0006,0.25,0.0012,0.0008,0.0085
4,0.9197,0.6569,0.0,0.0,0.0,-0.0002,-0.0028
5,0.9198,0.6422,0.0006,0.3333,0.0012,0.0009,0.011
6,0.9197,0.6746,0.0,0.0,0.0,-0.0003,-0.0035
7,0.9198,0.6637,0.0,0.0,0.0,-0.0001,-0.002
8,0.9196,0.6508,0.0006,0.1429,0.0012,0.0005,0.0042
9,0.9197,0.6599,0.0,0.0,0.0,-0.0003,-0.0035


In [None]:
# Take the most recent PyCaret results grid and keep the AUC column without its
# last two rows (presumably the Mean/SD summary rows - confirm).
lr_results = pull().iloc[:-2]["AUC"].values

### GradientBoostingClassifier

In [None]:
# Tune the gradient boosting model for AUC; `gbc` is rebound to the call's result.
gbc = pycr.tune_model(gbc, optimize="AUC")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9201,0.7821,0.0353,0.5214,0.0662,0.0566,0.1201
1,0.9201,0.7715,0.029,0.5263,0.0549,0.0469,0.1094
2,0.9202,0.7754,0.033,0.5377,0.0622,0.0534,0.1185
3,0.9201,0.7813,0.0342,0.5315,0.0642,0.0551,0.1197
4,0.92,0.7795,0.0272,0.5222,0.0518,0.0442,0.1055
5,0.9202,0.7623,0.0319,0.5392,0.0602,0.0517,0.1166
6,0.9199,0.7801,0.0295,0.505,0.0558,0.0474,0.1074
7,0.9194,0.7771,0.0383,0.4681,0.0707,0.0593,0.116
8,0.9207,0.7745,0.0336,0.5859,0.0636,0.0554,0.1266
9,0.9201,0.776,0.0313,0.5294,0.0591,0.0506,0.1141


In [None]:
# Take the most recent PyCaret results grid and keep the AUC column without its
# last two rows (presumably the Mean/SD summary rows - confirm).
gbc_results = pull().iloc[:-2]["AUC"].values

### LightGBM

In [None]:
# Tune the LightGBM model for AUC; `lgbm` is rebound to the call's result.
lgbm = pycr.tune_model(lgbm, optimize="AUC")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9207,0.7858,0.033,0.5938,0.0626,0.0546,0.1266
1,0.9205,0.7777,0.0301,0.5843,0.0573,0.0498,0.1196
2,0.9206,0.7769,0.0348,0.5769,0.0656,0.057,0.1274
3,0.9206,0.7805,0.0342,0.5842,0.0646,0.0562,0.1274
4,0.9205,0.7853,0.0348,0.566,0.0655,0.0568,0.1259
5,0.9196,0.7669,0.0249,0.4778,0.0474,0.0397,0.0949
6,0.9202,0.7833,0.0342,0.5364,0.0643,0.0552,0.1204
7,0.9201,0.7757,0.0359,0.5254,0.0673,0.0576,0.1218
8,0.9213,0.7777,0.0354,0.6778,0.0672,0.0597,0.1426
9,0.9204,0.7796,0.0313,0.5684,0.0593,0.0514,0.1197


In [None]:
# Take the most recent PyCaret results grid and keep the AUC column without its
# last two rows (presumably the Mean/SD summary rows - confirm).
lgbm_results = pull().iloc[:-2]["AUC"].values

### Results

In [None]:
# Line plot of the per-fold ROC AUC for the three models after tuning
# (one trace per model, folds numbered from 1).
fig = go.Figure()
for _label, _scores in (
    ("LGBMClassifier", lgbm_results),
    ("GradientBoostingClassifier", gbc_results),
    ("LogisticRegression", lr_results),
):
    fig.add_trace(
        go.Scatter(x=np.arange(1, len(_scores) + 1), y=_scores, name=_label)
    )

fig.update_layout(
    # The fold count comes from the data: the title used to claim 5 folds
    # while the tuning runs report one score per fold for 10 folds.
    title=(
        f"Cross-validation scores with {len(lgbm_results)} folds (after tuning) for "
        "LGBMClassifier, GradientBoostingClassifier and LogisticRegression"
    ),
    xaxis=dict(tickmode="linear", tick0=0, dtick=1),  # one tick per fold
    xaxis_title="Fold",
    yaxis_title="ROC AUC",
    title_font_family="Raleway",
)

fig.update_xaxes(title_font_family="Raleway")


fig.show()

## Tuning LightGBM using alternative hyperoptimization techniques

In [None]:
# Tune LightGBM for AUC through the scikit-learn search backend.
lgbm_gscv = pycr.tune_model(
    lgbm,
    search_library="scikit-learn",
    optimize="AUC",
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9207,0.7858,0.033,0.5938,0.0626,0.0546,0.1266
1,0.9205,0.7777,0.0301,0.5843,0.0573,0.0498,0.1196
2,0.9206,0.7769,0.0348,0.5769,0.0656,0.057,0.1274
3,0.9206,0.7805,0.0342,0.5842,0.0646,0.0562,0.1274
4,0.9205,0.7853,0.0348,0.566,0.0655,0.0568,0.1259
5,0.9196,0.7669,0.0249,0.4778,0.0474,0.0397,0.0949
6,0.9202,0.7833,0.0342,0.5364,0.0643,0.0552,0.1204
7,0.9201,0.7757,0.0359,0.5254,0.0673,0.0576,0.1218
8,0.9213,0.7777,0.0354,0.6778,0.0672,0.0597,0.1426
9,0.9204,0.7796,0.0313,0.5684,0.0593,0.0514,0.1197


In [None]:
# Take the most recent PyCaret results grid and keep the AUC column without its
# last two rows (presumably the Mean/SD summary rows - confirm).
lgbm_gscv_results = pull().iloc[:-2]["AUC"].values

In [None]:
#@title for future reference

# Hard-coded per-fold AUC snapshots for the two tuning back-ends, ten folds each.
lgbm_gscv_results = [
    0.7552,
    0.7508,
    0.7558,
    0.756,
    0.7604,
    0.7348,
    0.7579,
    0.7509,
    0.749,
    0.749,
]

lgbm_hyperopt_results = [
    0.7584,
    0.7504,
    0.7534,
    0.7561,
    0.764,
    0.7398,
    0.7579,
    0.7529,
    0.7496,
    0.7503,
]

In [None]:
# Tune LightGBM for AUC with Hyperopt through the tune-sklearn backend.
# NOTE(review): the result is bound to `lgbm_gscv`, replacing the model tuned
# with the scikit-learn backend - a separate name may have been intended.
lgbm_gscv = pycr.tune_model(
    lgbm,
    search_library="tune-sklearn",
    search_algorithm="hyperopt",
    optimize="AUC",
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9204,0.7884,0.0278,0.5783,0.0531,0.046,0.1141
1,0.9203,0.7812,0.0238,0.5775,0.0456,0.0395,0.1053
2,0.9201,0.7802,0.0278,0.5275,0.0528,0.0452,0.1073
3,0.9209,0.7846,0.0301,0.65,0.0576,0.0508,0.1281
4,0.9204,0.7855,0.0278,0.5783,0.0531,0.046,0.1141
5,0.9202,0.7676,0.0249,0.5513,0.0477,0.041,0.1046
6,0.9204,0.7875,0.0284,0.5765,0.0541,0.0469,0.1151
7,0.9204,0.7799,0.0325,0.5545,0.0613,0.0529,0.1199
8,0.9202,0.7787,0.0232,0.5479,0.0445,0.0382,0.1005
9,0.9199,0.78,0.0243,0.5122,0.0465,0.0395,0.0984


In [None]:
# Take the results grid of the Hyperopt tuning run and keep the AUC column
# without its last two rows (presumably the Mean/SD summary rows - confirm).
# The grid used to be stored in lgbm_gscv_results, which overwrote the
# scikit-learn scores, and `.iloc` was then called on the stale
# lgbm_hyperopt_results value instead of on the freshly pulled grid.
lgbm_hyperopt_results = pull().iloc[:-2, :]["AUC"].values

### Results

In [None]:
# Line plot of the per-fold ROC AUC for the two tuning back-ends
# (one trace per back-end, folds numbered from 1).
fig = go.Figure()
for _label, _scores in (
    ("Random Search via sklearn", lgbm_gscv_results),
    ("Hyperopt via tune-sklearn", lgbm_hyperopt_results),
):
    fig.add_trace(
        go.Scatter(x=np.arange(1, len(_scores) + 1), y=_scores, name=_label)
    )

fig.update_layout(
    # The title used to say "GridSearch", contradicting the trace label: the
    # scikit-learn run did not request a grid search explicitly.
    title="Hyperoptimization results using Random Search and Hyperopt",
    xaxis=dict(tickmode="linear", tick0=0, dtick=1),  # one tick per fold
    xaxis_title="Fold",
    yaxis_title="ROC AUC",
    title_font_family="Raleway",
)

fig.update_xaxes(title_font_family="Raleway")


fig.show()