In [1]:
import pandas as pd
import numpy as np
from model_tuner import *

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

In [2]:
# fetch_california_housing is only referenced by a commented-out cell further
# down in this notebook.
from sklearn.datasets import fetch_california_housing
import sklearn
# Bare last expression: displays the installed scikit-learn version as the
# cell output, recording the environment the notebook was run in.
sklearn.__version__

'1.4.1.post1'

In [18]:
# Generate a synthetic classification dataset (seeded, so X and y are the
# same on every run).
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=2,
    n_redundant=10,
    random_state=3,
)

# Specify the fraction of missing values
missing_rate = 0.1 # 10% of the data

# Dedicated seeded generator: the NaN mask is now reproducible under
# Restart & Run All instead of depending on the global np.random state.
rng = np.random.default_rng(3)

# Number of individual cells (not rows) to blank out.
n_missing_samples = int(np.floor(X.size * missing_rate))

# Draw flat cell positions WITHOUT replacement. Drawing row and column
# indices independently can pick the same (row, column) pair more than once,
# which leaves fewer NaNs than requested.
missing_cells = rng.choice(X.size, size=n_missing_samples, replace=False)
missing_samples, missing_features = np.unravel_index(missing_cells, X.shape)

# In-place edit of X; re-running this cell regenerates X first, so the cell
# stays idempotent.
X[missing_samples, missing_features] = np.nan

# Now exactly n_missing_samples cells of X (missing_rate of all cells,
# rounded down) are NaN, spread uniformly at random across rows and features.


In [4]:
# Wrap the feature matrix in a DataFrame so the notebook renders it as a table.
pd.DataFrame(data=X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.414376,0.182521,0.789585,0.497341,-0.849351,1.760105,-0.090954,0.285908,-0.052657,0.941365,0.479735,0.613002,1.167744,0.111211,0.651495,-0.133587,-0.394564,1.760317,0.490611,-1.158292
1,0.079135,-0.023836,1.269232,0.973063,-1.369758,0.274984,1.303002,0.346709,0.177431,1.148324,0.475655,1.031931,-0.853170,0.545008,1.077294,-0.175634,-0.259002,1.344684,-1.354232,-0.360856
2,0.024512,0.645435,0.985949,-0.338396,-0.919803,0.173138,-0.772209,-0.492844,0.706460,0.233349,-0.639109,-0.705763,1.617775,-0.798275,-0.135761,1.383020,0.955536,-0.358872,-0.113118,-0.456533
3,-1.909487,-0.989729,-1.539187,-0.550363,1.465097,0.066775,-0.889806,0.740707,-0.359541,0.308907,-1.191374,0.796908,0.038601,-0.707844,0.015224,0.964052,1.964292,-0.327374,-0.543510,0.367453
4,1.753052,-1.336489,-0.801691,1.069555,0.922482,0.700002,2.297450,-0.986477,0.511885,-1.380003,-2.667678,-1.250577,0.080804,-0.543195,-1.066806,0.038219,-1.485789,1.345724,-0.553978,0.146755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.623647,0.378110,1.123586,-1.028756,-0.888197,-0.284162,-0.011127,0.741618,-1.112942,-0.068952,-1.529410,-2.476514,-0.035311,-0.287218,-1.233692,-0.623098,0.068834,-0.820280,-1.036851,-0.042465
996,0.144369,-0.443870,-0.157825,-0.114114,0.016665,0.723121,1.393852,-0.249366,0.572946,0.561545,0.297401,1.477560,1.493135,-1.180944,0.902210,1.634516,-1.225584,3.048346,0.566492,-2.346420
997,-1.189782,-1.588756,-1.503617,1.769899,1.540181,-0.171873,-0.490112,-0.176556,-0.794228,-1.361682,0.858887,-0.360031,1.707408,0.319989,-0.719743,0.095633,0.585711,-0.238498,-0.477121,-0.667484
998,-1.927024,1.275973,-0.599173,-0.744975,0.724607,-1.093800,-0.727646,0.106605,-0.598735,0.904535,0.216272,-1.302088,0.945036,1.925173,-1.034391,-1.916527,-0.023381,-0.653098,1.167552,0.381811


In [5]:
# Label for this estimator; elsewhere in the notebook the same variable is
# used as the prefix of hyperparameter names ("<estimator_name>__<param>").
estimator_name = "rf_makeclass"

# Random-forest classifier with a fixed random_state (3).
rf_makeclass = RandomForestClassifier(random_state=3)

In [6]:
# # Define the hyperparameters for Random Forest
# rf_n_estimators = [100, 200, 300]
# rf_max_depths = [None, 5, 10]
# rf_criterions = ['gini', 'entropy']

# rf_parameters = [{'RF__n_estimators': [n_estimator], 'RF__max_depth': [max_depth], 
#                   'RF__criterion': [criterion]}
#                  for n_estimator in rf_n_estimators
#                  for max_depth in rf_max_depths
#                  for criterion in rf_criterions]


In [7]:
# Name used as the prefix for this estimator's hyperparameters in the grid.
estimator_name = "lr"

# Logistic regression with balanced class weights; C=1 is only the starting
# value, the grid below supplies the candidates that are actually searched.
lr = LogisticRegression(
    C=1,
    max_iter=1000,
    class_weight="balanced",
)

# Candidate values for cross-validated tuning: ten values of C, log-spaced
# from 1e-4 to 1 (inclusive).
tuned_parameters = [{f"{estimator_name}__C": np.logspace(-4, 0, num=10)}]

In [17]:
# Run switches. Kept as notebook-level names because the reporting cell
# further down branches on `kfold` and `calibrate`.
kfold, calibrate = True, True

model = Model(
    # --- identification ---
    name="Random Logistic Regression",
    estimator_name=estimator_name,
    estimator=lr,
    # --- hyperparameter search ---
    grid=tuned_parameters,
    randomized_grid=False,
    # presumably only consulted when randomized_grid=True — TODO confirm
    n_iter=3,
    scoring=["roc_auc", "precision_macro"],
    # --- resampling / calibration ---
    kfold=kfold,
    stratify=True,
    calibrate=calibrate,
    # n_splits=2,
    random_state=3,
)

# Search the grid on (X, y), then fit on the same data.
model.grid_search_param_tuning(X, y)

model.fit(X, y)

# Tuning hyper-parameters for roc_auc
Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END .......................................lr__C=0.0001; total time=   0.0s
[CV] END .......................................lr__C=0.0001; total time=   0.0s
[CV] END .......................................lr__C=0.0001; total time=   0.0s
[CV] END .......................................lr__C=0.0001; total time=   0.0s
[CV] END .......................................lr__C=0.0001; total time=   0.0s
[CV] END .......................................lr__C=0.0001; total time=   0.0s
[CV] END .......................................lr__C=0.0001; total time=   0.0s
[CV] END .......................................lr__C=0.0001; total time=   0.0s
[CV] END .......................................lr__C=0.0001; total time=   0.0s
[CV] END .......................................lr__C=0.0001; total time=   0.0s
[CV] END ........................lr__C=0.0002782559402207126; total time=   0.0s
[CV] END

Traceback (most recent call last):
  File "/mnt/c/Users/lshpaner/Documents/dev_ops_repos/model_tuner/model_tuner/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/mnt/c/Users/lshpaner/Documents/dev_ops_repos/model_tuner/model_tuner/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/mnt/c/Users/lshpaner/Documents/dev_ops_repos/model_tuner/model_tuner/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 350, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/mnt/c/Users/lshpaner/Documents/dev_ops_repos/model_tuner/model_tuner/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/mnt/c/Users/lshpaner/Documents/dev_ops_repos/model_tu

[CV] END ..........................lr__C=0.12915496650148828; total time=   0.0s
[CV] END ..........................lr__C=0.12915496650148828; total time=   0.0s
[CV] END ..........................lr__C=0.12915496650148828; total time=   0.0s
[CV] END ..........................lr__C=0.12915496650148828; total time=   0.0s
[CV] END ..........................lr__C=0.12915496650148828; total time=   0.0s
[CV] END ..........................lr__C=0.12915496650148828; total time=   0.0s
[CV] END ..........................lr__C=0.12915496650148828; total time=   0.0s
[CV] END ...........................lr__C=0.3593813663804626; total time=   0.0s
[CV] END ...........................lr__C=0.3593813663804626; total time=   0.0s
[CV] END ...........................lr__C=0.3593813663804626; total time=   0.0s
[CV] END ...........................lr__C=0.3593813663804626; total time=   0.0s
[CV] END ...........................lr__C=0.3593813663804626; total time=   0.0s
[CV] END ...................

Traceback (most recent call last):
  File "/mnt/c/Users/lshpaner/Documents/dev_ops_repos/model_tuner/model_tuner/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/mnt/c/Users/lshpaner/Documents/dev_ops_repos/model_tuner/model_tuner/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/mnt/c/Users/lshpaner/Documents/dev_ops_repos/model_tuner/model_tuner/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 350, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/mnt/c/Users/lshpaner/Documents/dev_ops_repos/model_tuner/model_tuner/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/mnt/c/Users/lshpaner/Documents/dev_ops_repos/model_tu

[CV] END .........................lr__C=0.005994842503189409; total time=   0.0s
[CV] END .........................lr__C=0.016681005372000592; total time=   0.0s
[CV] END .........................lr__C=0.016681005372000592; total time=   0.0s
[CV] END .........................lr__C=0.016681005372000592; total time=   0.0s
[CV] END .........................lr__C=0.016681005372000592; total time=   0.0s
[CV] END .........................lr__C=0.016681005372000592; total time=   0.0s
[CV] END .........................lr__C=0.016681005372000592; total time=   0.0s
[CV] END .........................lr__C=0.016681005372000592; total time=   0.0s
[CV] END .........................lr__C=0.016681005372000592; total time=   0.0s
[CV] END .........................lr__C=0.016681005372000592; total time=   0.0s
[CV] END .........................lr__C=0.016681005372000592; total time=   0.0s
[CV] END .........................lr__C=0.046415888336127774; total time=   0.0s
[CV] END ...................

Traceback (most recent call last):
  File "/mnt/c/Users/lshpaner/Documents/dev_ops_repos/model_tuner/model_tuner/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/mnt/c/Users/lshpaner/Documents/dev_ops_repos/model_tuner/model_tuner/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/mnt/c/Users/lshpaner/Documents/dev_ops_repos/model_tuner/model_tuner/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 350, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/mnt/c/Users/lshpaner/Documents/dev_ops_repos/model_tuner/model_tuner/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/mnt/c/Users/lshpaner/Documents/dev_ops_repos/model_tu

In [None]:
# # Load the California housing dataset
# california_housing = fetch_california_housing(as_frame=True)

In [None]:
# df = california_housing["frame"]

In [13]:
# Load iris and flatten it into one DataFrame: the measurement columns
# followed by a "target" column holding the class label.
iris_bunch = load_iris()
iris = pd.DataFrame(
    data=np.c_[iris_bunch["data"], iris_bunch["target"]],
    columns=[*iris_bunch["feature_names"], "target"],
)
target = "target"
features = [name for name in iris.columns if name != target]

# Independent variables (feature matrix).
X = iris[features].values
# Dependent variable; np.c_ stored the labels as floats, so cast back to int.
y = iris[target].values.astype(int)

# Alternative dataset (breast cancer), kept disabled:
# breast_sk = load_breast_cancer()
# breast = pd.DataFrame(
#     data=np.c_[breast_sk.data, breast_sk.target],
# )
# breast.columns = list(breast_sk.feature_names) + ["target"]
# features = [col for col in breast.columns if col != "target"]
# target = "target"

# X = breast[features].values  # independent variables
# y = breast[target].values.astype(int)  # dependent variable

# Name used as the prefix for this estimator's hyperparameters in the grid.
estimator_name = "lr"

# Logistic regression with balanced class weights.
lr = LogisticRegression(
    C=1,
    max_iter=1000,
    class_weight="balanced",
)

# Candidate values for cross-validated tuning: ten values of C, log-spaced
# from 1e-4 to 1 (inclusive).
tuned_parameters = [{f"{estimator_name}__C": np.logspace(-4, 0, num=10)}]

In [14]:
# Run switches. Kept as notebook-level names because the reporting cell
# that follows branches on `kfold` and `calibrate`.
kfold, calibrate = True, True

model = Model(
    # --- identification ---
    name="Iris_model",
    estimator_name=estimator_name,
    estimator=lr,
    # --- hyperparameter search ---
    grid=tuned_parameters,
    randomized_grid=False,
    # presumably only consulted when randomized_grid=True — TODO confirm
    n_iter=3,
    # one-vs-rest ROC AUC because the iris target has more than two classes
    scoring=["roc_auc_ovr", "precision_macro"],
    # --- resampling / calibration ---
    kfold=kfold,
    n_splits=2,
    stratify=True,
    calibrate=calibrate,
    random_state=3,
)

# Search the grid on (X, y), then fit on the same data.
model.grid_search_param_tuning(X, y)

model.fit(X, y)

# Tuning hyper-parameters for roc_auc_ovr
Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV] END .......................................lr__C=0.0001; total time=   0.0s
[CV] END .......................................lr__C=0.0001; total time=   0.0s
[CV] END ........................lr__C=0.0002782559402207126; total time=   0.0s
[CV] END ........................lr__C=0.0002782559402207126; total time=   0.0s
[CV] END .........................lr__C=0.000774263682681127; total time=   0.0s
[CV] END .........................lr__C=0.000774263682681127; total time=   0.0s
[CV] END .........................lr__C=0.002154434690031882; total time=   0.0s
[CV] END .........................lr__C=0.002154434690031882; total time=   0.0s
[CV] END .........................lr__C=0.005994842503189409; total time=   0.0s
[CV] END .........................lr__C=0.005994842503189409; total time=   0.0s
[CV] END .........................lr__C=0.016681005372000592; total time=   0.0s
[CV] E

In [15]:
def _print_ranked_coefficients(coefficients, feature_names):
    """Print every feature with its coefficient, largest coefficient first.

    Parameters
    ----------
    coefficients : array-like of float
        One value per feature, positionally aligned with ``feature_names``.
    feature_names : sequence of str
        Labels printed next to each coefficient.
    """
    # argsort is ascending; reversing it ranks by signed value (not by
    # absolute magnitude), so negative coefficients are listed last.
    for feat_idx in np.argsort(coefficients)[::-1]:
        print(f"Feature: {feature_names[feat_idx]}, Score: {coefficients[feat_idx]:.5f}")


# Calibrate first (when enabled) so the estimators inspected below are the
# calibrated ones.
if model.calibrate:
    model.calibrateModel(X, y)

# NOTE(review): `.steps[1][1]` assumes the classifier is the second pipeline
# step, and `.coef_[0]` takes only the first row of the coefficient matrix —
# for a multiclass target such as iris that is presumably the first class
# only. Confirm this is the intended "importance".
if kfold:
    print(model.xval_output["train_score"], model.xval_output["test_score"])
    # A distinct name for the fold index: the original reused `i` for both
    # the fold loop and the feature loop nested inside it.
    for fold_idx, fold_estimator in enumerate(model.xval_output["estimator"]):
        print(f"\n{fold_idx} Fold: ")
        if model.calibrate:
            # NOTE(review): the inner list is indexed with the OUTER fold
            # index. This raises IndexError as soon as there are more folds
            # than entries in calibrated_classifiers_ — confirm which
            # calibrated classifier is actually wanted here.
            fold_pipeline = fold_estimator.calibrated_classifiers_[fold_idx].estimator
        else:
            fold_pipeline = fold_estimator
        _print_ranked_coefficients(fold_pipeline.steps[1][1].coef_[0], features)
else:
    # Single fit: unwrap the calibration wrapper when there is one.
    fitted_pipeline = model.estimator.estimator if model.calibrate else model.estimator
    # summarize feature importance
    _print_ranked_coefficients(fitted_pipeline.steps[1][1].coef_[0], features)


[0.97893333 0.96666667] [0.95786667 0.97893333]

0 Fold: 
Feature: sepal width (cm), Score: 0.66715
Feature: sepal length (cm), Score: -0.95095
Feature: petal width (cm), Score: -1.34057
Feature: petal length (cm), Score: -1.65491

1 Fold: 
Feature: sepal width (cm), Score: 0.85867
Feature: sepal length (cm), Score: -0.94016
Feature: petal width (cm), Score: -1.49184
Feature: petal length (cm), Score: -1.54397
