In [None]:
import pandas as pd
import numpy as np
import os
import sys

from sklearn.datasets import make_classification

from sklearn.datasets import load_breast_cancer

from functions import *

from model_tuner.model_tuner_utils import Model
from model_tuner.bootstrapper import evaluate_bootstrap_metrics
from model_tuner.pickleObjects import dumpObjects, loadObjects

In [None]:
# Load the breast-cancer dataset as a single DataFrame (the Bunch's "frame" entry).
bc = load_breast_cancer(as_frame=True).frame

In [None]:
# Feature columns: every column whose name does not contain "target".
bc_cols = [name for name in bc.columns if "target" not in name]

In [None]:
# Feature matrix: all rows, restricted to the non-target columns.
X = bc.loc[:, bc_cols]

In [None]:
# Preview the first rows only, rather than rendering the whole feature frame.
X.head()

In [None]:
# Target vector: the "target" column of the loaded frame.
y = bc.loc[:, "target"]

In [None]:
from sklearn.linear_model import LogisticRegression

# Class-weight-balanced logistic regression with the iteration cap set to 1000.
lr = LogisticRegression(max_iter=1000, class_weight="balanced")

# Prefix used to address the estimator's hyperparameters in the search grid.
estimator_name = "lg"

# Search space: three C values spaced logarithmically from 1e-4 to 1,
# combined with candidate feature counts for the selectKBest step.
tuned_parameters = [
    {
        f"{estimator_name}__C": np.logspace(-4, 0, 3),
        "selectKBest__k": [5, 10, 11, 12, 13, 8, 6, 9, 20],
    }
]

In [None]:
# Run without k-fold cross-validation and without calibration.
kfold = False
calibrate = False

model = Model(
    name="Logistic Regression",
    estimator_name=estimator_name,
    estimator=lr,
    grid=tuned_parameters,
    # NOTE(review): the grid defined for this model holds 3 x 9 = 27
    # combinations, fewer than n_iter=40 — confirm how the randomized search
    # handles a budget larger than the search space.
    randomized_grid=True,
    n_iter=40,
    scoring=["roc_auc"],
    selectKBest=True,
    kfold=kfold,
    n_splits=10,
    calibrate=calibrate,
    stratify_y=True,
    # NOTE(review): "mean radius" looks like a continuous feature — verify
    # that stratifying on it is intended and supported.
    stratify_cols=["mean radius"],
    n_jobs=-2,
    random_state=42,
)

# Tune hyperparameters; the full X and y are handed to the model.
model.grid_search_param_tuning(X, y)

# Retrieve the train / test / validation partitions from the model.
X_train, y_train = model.get_train_data(X, y)
X_test, y_test = model.get_test_data(X, y)
X_valid, y_valid = model.get_valid_data(X, y)

model.fit(X_train, y_train)

print("Validation Metrics")
model.return_metrics(X_valid, y_valid)
print("Test Metrics")
model.return_metrics(X_test, y_test)

# Predicted probabilities for the test partition.
y_prob = model.predict_proba(X_test)

# Hard predictions for the test partition with optimal_threshold enabled.
y_pred = model.predict(X_test, optimal_threshold=True)

In [None]:
# Keep only column 1 of the predicted probabilities. The array is recomputed
# from the model instead of slicing `y_prob` in place, so re-running this cell
# no longer fails by indexing an already one-dimensional array with [:, 1].
y_prob = model.predict_proba(X_test)[:, 1]

In [None]:
# Fail fast if the probabilities are not a NumPy array, instead of merely
# displaying the result of the check.
assert isinstance(y_prob, np.ndarray), (
    f"expected y_prob to be np.ndarray, got {type(y_prob).__name__}"
)

In [None]:
# Bootstrap evaluation of the test labels against the predicted probabilities.
# NOTE(review): n_samples=2 and num_resamples=10 look like smoke-test values —
# confirm before relying on the results.
evaluate_bootstrap_metrics(
    y=y_test,
    y_pred_prob=y_prob,
    n_samples=2,
    num_resamples=10,
)

In [None]:
from xgboost import XGBClassifier

# XGBoost classifier configured with the binary logistic objective.
estimator = XGBClassifier(objective="binary:logistic")

estimator_name = "xgb"
xgbearly = True

# Single-point grid; every key is prefixed with "<estimator_name>__".
tuned_parameters = {
    f"{estimator_name}__{param}": candidates
    for param, candidates in {
        "max_depth": [3],
        "learning_rate": [1e-4],
        "n_estimators": [100000],
        "early_stopping_rounds": [2],
        "verbose": [True],
        "eval_metric": ["logloss"],
    }.items()
}

In [None]:
# Run without k-fold cross-validation and without calibration.
kfold = False
calibrate = False

model = Model(
    name="XGBoost Early",
    estimator_name=estimator_name,
    calibrate=calibrate,
    estimator=estimator,
    kfold=kfold,
    stratify_y=True,
    grid=tuned_parameters,
    randomized_grid=True,
    n_iter=1,
    # Driven by the `xgbearly` flag set alongside the estimator, which was
    # previously defined but never used (the value here was hard-coded).
    xgboost_early=xgbearly,
    scoring=["roc_auc"],
    n_splits=10,
    selectKBest=False,
    n_jobs=-2,
    random_state=42,
)

# Tune hyperparameters; the full X and y are handed to the model.
model.grid_search_param_tuning(X, y)

# Retrieve the train / test / validation partitions from the model.
X_train, y_train = model.get_train_data(X, y)
X_test, y_test = model.get_test_data(X, y)
X_valid, y_valid = model.get_valid_data(X, y)

# The validation partition is passed to fit via `validation_data`
# (presumably the evaluation set for early stopping — confirm).
model.fit(X_train, y_train, validation_data=[X_valid, y_valid])

print("Validation Metrics")
model.return_metrics(X_valid, y_valid)
print("Test Metrics")
model.return_metrics(X_test, y_test)

# Predicted probabilities for the test partition.
y_prob = model.predict_proba(X_test)

# Hard predictions for the test partition with optimal_threshold enabled.
y_pred = model.predict(X_test, optimal_threshold=True)

In [None]:
# Display the model's `best_params_per_score` attribute (presumably the
# selected hyperparameters keyed by scoring metric — confirm).
model.best_params_per_score

In [None]:
from model_tuner import Model
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder

titanic = sns.load_dataset("titanic")

# Features: everything except the target, minus "alive", "class" and
# "embarked", which are removed as repeated data.
X = titanic.drop(columns=["survived", "alive", "class", "embarked"])
y = titanic["survived"]

# Seed the forest so repeated runs of the notebook give the same model.
rf = RandomForestClassifier(class_weight="balanced", random_state=42)

estimator_name = "rf"

# Candidate hyperparameters, keyed by "<estimator_name>__<param>".
tuned_parameters = {
    f"{estimator_name}__max_depth": [3, 5, 10, None],
    f"{estimator_name}__n_estimators": [10, 100, 200],
    f"{estimator_name}__max_features": [1, 3, 5, 7],
    f"{estimator_name}__min_samples_leaf": [1, 2, 3],
}

# Column groups: one-hot encoded, ordinal encoded, and min-max scaled.
ohencoder = OneHotEncoder(handle_unknown="ignore")
ohcols = ["parch", "embark_town", "who", "sex", "adult_male"]

ordencoder = OrdinalEncoder()
ordcols = ["deck"]

minmaxscaler = MinMaxScaler()
scalercols = ["fare", "age", "pclass"]

# Columns not listed above are passed through unchanged.
ct = ColumnTransformer(
    [
        ("OneHotEncoder", ohencoder, ohcols),
        ("OrdinalEncoder", ordencoder, ordcols),
        ("MinMaxScaler", minmaxscaler, scalercols),
    ],
    remainder="passthrough",
)

# Initialize ModelTuner
ModelTuner = Model(
    name="RandomForest_Titanic",
    estimator_name=estimator_name,
    calibrate=True,
    estimator=rf,
    kfold=True,
    impute=True,
    pipeline_steps=[("ColumnTransformer", ct)],
    stratify_y=False,
    n_splits=10,
    grid=tuned_parameters,
    randomized_grid=True,
    n_iter=1,
    scoring=["roc_auc"],
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Tune the Titanic random-forest model on the full X / y with f1_beta_tune enabled.
ModelTuner.grid_search_param_tuning(X, y, f1_beta_tune=True)

# Report metrics on the same X / y that were used for tuning.
# NOTE(review): no explicit fit() call precedes this — confirm that the
# k-fold configuration of this model does not require one.
ModelTuner.return_metrics(X, y)