In [13]:
import pandas as pd
import numpy as np
import seaborn as sns

import os
import sys
sys.path.append(os.path.join(os.pardir))

from sklearn.datasets import make_classification

from sklearn.datasets import load_breast_cancer

from functions import *

from src.model_tuner import Model
from src.model_tuner.bootstrapper import evaluate_bootstrap_metrics
from src.model_tuner.pickleObjects import dumpObjects, loadObjects

In [2]:
bc = load_breast_cancer(as_frame=True)["frame"]

In [3]:
bc_cols = [cols for cols in bc.columns if "target" not in cols]

In [4]:
X = bc[bc_cols]

In [5]:
y = bc["target"]

In [6]:
from sklearn.linear_model import LogisticRegression


lr = LogisticRegression(class_weight="balanced", max_iter=1000)

estimator_name = "lg"
# Set the parameters by cross-validation
tuned_parameters = [
    {
        estimator_name + "__C": np.logspace(-4, 0, 3),
        "selectKBest__k": [5, 10, 11, 12, 13, 8, 6, 9, 20],
    }
]

In [7]:
kfold = False
calibrate = False

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = Model(
    name="Logistic Regression",
    estimator_name=estimator_name,
    calibrate=calibrate,
    estimator=lr,
    kfold=kfold,
    stratify_y=True,
    grid=tuned_parameters,
    randomized_grid=True,
    n_iter=40,
    scoring=["roc_auc"],
    n_splits=10,
    selectKBest=True,
    n_jobs=-2,
    random_state=42,
)


model.grid_search_param_tuning(X, y)

X_train, y_train = model.get_train_data(X, y)
X_test, y_test = model.get_test_data(X, y)
X_valid, y_valid = model.get_valid_data(X, y)

model.fit(X_train, y_train)

print("Validation Metrics")
model.return_metrics(X_valid, y_valid)
print("Test Metrics")
model.return_metrics(X_test, y_test)

y_prob = model.predict_proba(X_test)

### F1 Weighted
y_pred = model.predict(X_test, optimal_threshold=True)

100%|██████████| 27/27 [00:00<00:00, 94.45it/s]

Best score/param set found on validation set:
{'params': {'lg__C': np.float64(1.0), 'selectKBest__k': 20},
 'score': np.float64(0.9983465608465609)}
Best roc_auc: 0.998 

Validation Metrics
Confusion matrix on set provided: 
--------------------------------------------------------------------------------
          Predicted:
            Pos  Neg
--------------------------------------------------------------------------------
Actual: Pos 41 (tp)   1 (fn)
        Neg  1 (fp)  71 (tn)
--------------------------------------------------------------------------------

              precision    recall  f1-score   support

           0       0.98      0.98      0.98        42
           1       0.99      0.99      0.99        72

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

--------------------------------------------------------------------------------

Feature names selected




In [8]:
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1])

In [9]:
from xgboost import XGBClassifier


estimator = XGBClassifier(
    objective="binary:logistic",
)

estimator_name = "xgb"
xgbearly = True

tuned_parameters = {
    f"{estimator_name}__max_depth": [3],
    f"{estimator_name}__learning_rate": [1e-4],
    f"{estimator_name}__n_estimators": [100000],
    f"{estimator_name}__early_stopping_rounds": [2],
    f"{estimator_name}__verbose": [True],
    f"{estimator_name}__eval_metric": ["logloss"],
}

In [10]:
kfold = False
calibrate = False

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = Model(
    name="XGBoost Early",
    estimator_name=estimator_name,
    calibrate=calibrate,
    estimator=estimator,
    kfold=kfold,
    stratify_y=True,
    grid=tuned_parameters,
    randomized_grid=True,
    n_iter=1,
    xgboost_early=True,
    scoring=["roc_auc"],
    n_splits=10,
    selectKBest=False,
    n_jobs=-2,
    random_state=42,
)


model.grid_search_param_tuning(X, y)

X_train, y_train = model.get_train_data(X, y)
X_test, y_test = model.get_test_data(X, y)
X_valid, y_valid = model.get_valid_data(X, y)

model.fit(X_train, y_train, validation_data=[X_valid, y_valid])

print("Validation Metrics")
model.return_metrics(X_valid, y_valid)
print("Test Metrics")
model.return_metrics(X_test, y_test)

y_prob = model.predict_proba(X_test)

### F1 Weighted
y_pred = model.predict(X_test, optimal_threshold=True)

  0%|          | 0/1 [00:00<?, ?it/s]

[0]	validation_0-logloss:0.65812
[1]	validation_0-logloss:0.65804
[2]	validation_0-logloss:0.65796
[3]	validation_0-logloss:0.65787
[4]	validation_0-logloss:0.65779
[5]	validation_0-logloss:0.65771
[6]	validation_0-logloss:0.65763
[7]	validation_0-logloss:0.65755
[8]	validation_0-logloss:0.65746
[9]	validation_0-logloss:0.65738
[10]	validation_0-logloss:0.65730
[11]	validation_0-logloss:0.65721
[12]	validation_0-logloss:0.65713
[13]	validation_0-logloss:0.65705
[14]	validation_0-logloss:0.65697
[15]	validation_0-logloss:0.65688
[16]	validation_0-logloss:0.65680
[17]	validation_0-logloss:0.65672
[18]	validation_0-logloss:0.65664
[19]	validation_0-logloss:0.65655
[20]	validation_0-logloss:0.65647
[21]	validation_0-logloss:0.65639
[22]	validation_0-logloss:0.65631
[23]	validation_0-logloss:0.65622
[24]	validation_0-logloss:0.65614
[25]	validation_0-logloss:0.65606
[26]	validation_0-logloss:0.65598
[27]	validation_0-logloss:0.65589
[28]	validation_0-logloss:0.65581
[29]	validation_0-loglos

100%|██████████| 1/1 [05:35<00:00, 335.44s/it]

Best score/param set found on validation set:
{'params': {'xgb__early_stopping_rounds': 2,
            'xgb__eval_metric': 'logloss',
            'xgb__learning_rate': 0.0001,
            'xgb__max_depth': 3,
            'xgb__n_estimators': 30506},
 'score': np.float64(0.9957010582010581)}
Best roc_auc: 0.996 

[0]	validation_0-logloss:0.65812
[1]	validation_0-logloss:0.65804
[2]	validation_0-logloss:0.65796
[3]	validation_0-logloss:0.65787
[4]	validation_0-logloss:0.65779
[5]	validation_0-logloss:0.65771
[6]	validation_0-logloss:0.65763
[7]	validation_0-logloss:0.65755
[8]	validation_0-logloss:0.65746
[9]	validation_0-logloss:0.65738
[10]	validation_0-logloss:0.65730
[11]	validation_0-logloss:0.65721
[12]	validation_0-logloss:0.65713
[13]	validation_0-logloss:0.65705
[14]	validation_0-logloss:0.65697
[15]	validation_0-logloss:0.65688
[16]	validation_0-logloss:0.65680
[17]	validation_0-logloss:0.65672
[18]	validation_0-logloss:0.65664
[19]	validation_0-logloss:0.65655
[20]	validation_




[52]	validation_0-logloss:0.65384
[53]	validation_0-logloss:0.65376
[54]	validation_0-logloss:0.65368
[55]	validation_0-logloss:0.65359
[56]	validation_0-logloss:0.65351
[57]	validation_0-logloss:0.65343
[58]	validation_0-logloss:0.65335
[59]	validation_0-logloss:0.65327
[60]	validation_0-logloss:0.65319
[61]	validation_0-logloss:0.65310
[62]	validation_0-logloss:0.65302
[63]	validation_0-logloss:0.65294
[64]	validation_0-logloss:0.65286
[65]	validation_0-logloss:0.65278
[66]	validation_0-logloss:0.65270
[67]	validation_0-logloss:0.65261
[68]	validation_0-logloss:0.65253
[69]	validation_0-logloss:0.65245
[70]	validation_0-logloss:0.65237
[71]	validation_0-logloss:0.65229
[72]	validation_0-logloss:0.65221
[73]	validation_0-logloss:0.65212
[74]	validation_0-logloss:0.65204
[75]	validation_0-logloss:0.65196
[76]	validation_0-logloss:0.65188
[77]	validation_0-logloss:0.65180
[78]	validation_0-logloss:0.65172
[79]	validation_0-logloss:0.65164
[80]	validation_0-logloss:0.65155
[81]	validatio

In [11]:
model.best_params_per_score

{'roc_auc': {'params': {'xgb__n_estimators': 30506,
   'xgb__max_depth': 3,
   'xgb__learning_rate': 0.0001,
   'xgb__eval_metric': 'logloss',
   'xgb__early_stopping_rounds': 2},
  'score': np.float64(0.9957010582010581)}}

In [15]:
from sklearn.ensemble import RandomForestClassifier

titanic = sns.load_dataset("titanic")
titanic.head()

X = titanic[[col for col in titanic.columns if col != "survived"]]
### Removing repeated data
X = X.drop(columns=["alive", "class", "embarked"])
y = titanic["survived"]

rf = RandomForestClassifier(class_weight="balanced")

estimator_name = "rf"

tuned_parameters = {
    f"{estimator_name}__max_depth": [3, 5, 10, None],
    f"{estimator_name}__n_estimators": [10, 100, 200],
    f"{estimator_name}__max_features": [1, 3, 5, 7],
    f"{estimator_name}__min_samples_leaf": [1, 2, 3],
}

X.head()

### Defining columns to be scaled and columns to be onehotencoded
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler

ohencoder = OneHotEncoder(handle_unknown="ignore")

ohcols = ["parch", "embark_town", "who", "sex", "adult_male"]

ordencoder = OrdinalEncoder()

ordcols = ["deck"]

minmaxscaler = MinMaxScaler()

scalercols = ["fare", "age", "pclass"]


ct = ColumnTransformer(
    [
        ("OneHotEncoder", ohencoder, ohcols),
        ("OrdinalEncoder", ordencoder, ordcols),
        ("MinMaxScaler", minmaxscaler, scalercols),
    ],
    remainder="passthrough",
)

# Initialize ModelTuner
ModelTuner = Model(
    name="RandomForest_Titanic",
    estimator_name=estimator_name,
    calibrate=True,
    estimator=rf,
    kfold=True,
    impute=True,
    pipeline_steps=[("ColumnTransformer", ct)],
    stratify_y=False,
    n_splits=10,
    grid=tuned_parameters,
    randomized_grid=True,
    n_iter=1,
    scoring=["roc_auc"],
    random_state=42,
    n_jobs=-1,
)

In [16]:
ModelTuner.grid_search_param_tuning(X, y, f1_beta_tune=True)

ModelTuner.return_metrics(X, y)

# Tuning hyper-parameters for roc_auc
Fitting 10 folds for each of 1 candidates, totalling 10 fits

Best score/param set found on development set:
{np.float64(0.8661848734252757): {'rf__max_depth': 10,
                                  'rf__max_features': 7,
                                  'rf__min_samples_leaf': 2,
                                  'rf__n_estimators': 10}}

Grid scores on development set:
0.866 (+/-0.075) for {'rf__n_estimators': 10, 'rf__min_samples_leaf': 2, 'rf__max_features': 7, 'rf__max_depth': 10}
Fitting model with best params and tuning for best threshold ...


100%|██████████| 2/2 [00:00<00:00,  4.52it/s]


Fitting model with best params and tuning for best threshold ...


100%|██████████| 2/2 [00:00<00:00,  4.97it/s]


Fitting model with best params and tuning for best threshold ...


100%|██████████| 2/2 [00:00<00:00,  4.95it/s]


Fitting model with best params and tuning for best threshold ...


100%|██████████| 2/2 [00:00<00:00,  4.55it/s]


Fitting model with best params and tuning for best threshold ...


100%|██████████| 2/2 [00:00<00:00,  5.31it/s]


Fitting model with best params and tuning for best threshold ...


100%|██████████| 2/2 [00:00<00:00,  5.11it/s]


Fitting model with best params and tuning for best threshold ...


100%|██████████| 2/2 [00:00<00:00,  4.98it/s]


Fitting model with best params and tuning for best threshold ...


100%|██████████| 2/2 [00:00<00:00,  4.30it/s]


Fitting model with best params and tuning for best threshold ...


100%|██████████| 2/2 [00:00<00:00,  4.90it/s]


Fitting model with best params and tuning for best threshold ...


100%|██████████| 2/2 [00:00<00:00,  5.48it/s]

Confusion matrix on set provided: 
--------------------------------------------------------------------------------
          Predicted:
             Pos   Neg
--------------------------------------------------------------------------------
Actual: Pos 500 (tp)   49 (fn)
        Neg  53 (fp)  289 (tn)
--------------------------------------------------------------------------------

              precision    recall  f1-score   support

           0       0.90      0.91      0.91       549
           1       0.86      0.85      0.85       342

    accuracy                           0.89       891
   macro avg       0.88      0.88      0.88       891
weighted avg       0.89      0.89      0.89       891

--------------------------------------------------------------------------------





{'Classification Report': {'0': {'precision': 0.9041591320072333,
   'recall': 0.9107468123861566,
   'f1-score': 0.9074410163339383,
   'support': 549.0},
  '1': {'precision': 0.8550295857988166,
   'recall': 0.8450292397660819,
   'f1-score': 0.85,
   'support': 342.0},
  'accuracy': 0.8855218855218855,
  'macro avg': {'precision': 0.879594358903025,
   'recall': 0.8778880260761193,
   'f1-score': 0.8787205081669691,
   'support': 891.0},
  'weighted avg': {'precision': 0.8853013263918814,
   'recall': 0.8855218855218855,
   'f1-score': 0.8853929494582851,
   'support': 891.0}},
 'Confusion Matrix': array([[500,  49],
        [ 53, 289]])}