In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import ray
from ray import tune, train
from ray.tune.schedulers import ASHAScheduler
from evaluators import evaluate_model_ray
from ray.tune.search.hebo import HEBOSearch

load_dotenv()


  from .autonotebook import tqdm as notebook_tqdm
2024-10-01 01:15:56,182	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-10-01 01:15:56,305	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


True

In [2]:
# Assuming feature_df and targets_df are already defined
data_dir = os.getenv("DATA")
encoded_df = pd.read_csv(Path(data_dir) / "encoded_df.csv")
targets_df = pd.read_csv(Path(data_dir) / "target.csv")
encoded_df.head()

Unnamed: 0,id,Geschlecht,Alter,Fahrerlaubnis,Vorversicherung,Alter_Fzg,Vorschaden,Jahresbeitrag,Kundentreue,Regional_Code_0,...,Vertriebskanal_152.0,Vertriebskanal_153.0,Vertriebskanal_154.0,Vertriebskanal_155.0,Vertriebskanal_156.0,Vertriebskanal_157.0,Vertriebskanal_158.0,Vertriebskanal_159.0,Vertriebskanal_160.0,Vertriebskanal_163.0
0,1.0,1.0,44.0,1.0,0.0,2.0,1.0,40454.0,217.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,1.0,76.0,1.0,0.0,0.0,0.0,33536.0,183.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,1.0,47.0,1.0,0.0,2.0,1.0,38294.0,27.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,1.0,21.0,1.0,1.0,1.0,0.0,28619.0,203.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5.0,0.0,29.0,1.0,1.0,1.0,0.0,27496.0,39.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
sub_encoded_df = encoded_df.sample(n=10000, random_state=42)

normal_df = pd.DataFrame(scaler.fit_transform(sub_encoded_df.drop(columns="id")),
                         columns=encoded_df.columns.difference(["id"]))

In [4]:
from sklearn.preprocessing import PolynomialFeatures

# Generate polynomial features (degree 2 is a common starting point)
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(sub_encoded_df)
cols = poly.get_feature_names_out(sub_encoded_df.columns)
poly_df = pd.DataFrame(poly.transform(sub_encoded_df.copy()), columns=cols)

In [5]:
ids = sub_encoded_df.id
y = targets_df.set_index("id").loc[ids]["Interesse"].values
X = normal_df.values
X.shape

(10000, 216)

In [6]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

In [7]:
X_id = ray.put(X)
y_id = ray.put(y)


def train_ensemble(config):
    X = ray.get(X_id)
    y = ray.get(y_id)
    estimators = []

    # Decision Tree
    if config['use_dt']:
        dt = DecisionTreeClassifier(max_depth=config['dt.max_depth'],
                                    min_samples_split=config['dt.min_samples_split'])
        estimators.append(('dt', dt))

    # SVM
    if config['use_svm']:
        svm = SVC(C=config['svm.C'], kernel=config['svm.kernel'], probability=True)
        estimators.append(('svm', svm))

    # Random Forest
    if config['use_rf']:
        rf = RandomForestClassifier(n_estimators=config['rf.n_estimators'],
                                    max_depth=config['rf.max_depth'],
                                    min_samples_split=config['rf.min_samples_split'])
        estimators.append(('rf', rf))

    # Gaussian Naive Bayes
    if config['use_gnb']:
        gnb = GaussianNB(var_smoothing=config['gnb.var_smoothing'])
        estimators.append(('gnb', gnb))

    # Logistic Regression
    if config['use_lr']:
        solver = config['lr.solver']
        penalty = config['lr.penalty']
        if solver == 'liblinear' and penalty == None:
            penalty = 'l2'
        elif solver in ["lbfgs", "newton-cg"] and penalty not in ['l2', None]:
            penalty = 'l2'
        if penalty == "elasticnet" and solver not in ['saga', 'sag']:
            penalty = 'l2'
        lr = LogisticRegression(C=config['lr.C'],
                                penalty=penalty,
                                solver=solver)
        estimators.append(('lr', lr))

    # AdaBoost
    if config['use_adaboost']:
        adaboost = AdaBoostClassifier(n_estimators=config['adaboost.n_estimators'],
                                      learning_rate=config['adaboost.learning_rate'])
        estimators.append(('adaboost', adaboost))

    # Create ensemble model
    if estimators:
        ensemble = VotingClassifier(estimators=estimators, voting='soft')
        ensemble.fit(X, y)
        # Here you'd typically evaluate your ensemble
        # Assuming 'evaluate_network_ray' is replaced with a suitable evaluation method for classifiers:
        evaluate_model_ray(X, y, ensemble)  # This needs to be defined
    else:
        # Handle case where no classifiers are enabled
        train.report({
            "roc_auc": 0,
            "pr_auc": 0,
            "f1_score": 0
        })


def train_model(config):
    train_ensemble(config)


search_space = {
    # Decision Tree Classifier
    "use_dt": tune.choice([True, False]),
    "dt.max_depth": tune.choice([5, 10, 15, 20, None]),
    "dt.min_samples_split": tune.choice([2, 5, 10]),

    # Support Vector Machine Classifier
    "use_svm": tune.choice([True, False]),
    "svm.C": tune.loguniform(0.1, 10),
    "svm.kernel": tune.choice(['linear', 'poly', 'rbf', 'sigmoid']),

    # Random Forest Classifier
    "use_rf": tune.choice([True, False]),
    "rf.n_estimators": tune.choice([10, 50, 100, 200]),
    "rf.max_depth": tune.choice([5, 10, 15, 20, None]),
    "rf.min_samples_split": tune.choice([2, 5, 10]),

    # Gaussian Naive Bayes
    "use_gnb": tune.choice([True, False]),
    "gnb.var_smoothing": tune.loguniform(1e-10, 1e-2),

    # Logistic Regression
    "use_lr": tune.choice([True, False]),
    "lr.C": tune.loguniform(0.1, 10),
    "lr.penalty": tune.choice(['l2','l1','elasticnet', None]),
    "lr.solver": tune.choice(['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']),

    # AdaBoost Classifier
    "use_adaboost": tune.choice([True, False]),
    "adaboost.n_estimators": tune.choice([50, 100, 200]),
    "adaboost.learning_rate": tune.loguniform(0.01, 1),
}


ray.init(num_cpus=8, ignore_reinit_error=True)

analysis = tune.run(train_model,
                    config=search_space,
                    num_samples=50,
                    max_failures=50,
                    search_alg=HEBOSearch(metric="f1_score", mode="max"),
                    scheduler=ASHAScheduler(metric="f1_score", mode="max"),
                    time_budget_s=3600,
                    resources_per_trial={"cpu": 1})

best_config = analysis.get_best_config(metric="f1_score", mode="max")
best_trial = analysis.get_best_trial(metric="f1_score", mode="max")

print("Best config:", best_config)
print("Best F1 Score:", best_trial.last_result["f1_score"])


2024-10-01 01:16:03,184	INFO worker.py:1777 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
2024-10-01 01:16:03,908	INFO worker.py:1619 -- Calling ray.init() again after it has already been called.
2024-10-01 01:16:03,910	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2024-10-01 01:19:56
Running for:,00:03:52.09
Memory:,19.9/47.0 GiB

Trial name,# failures,error file
train_model_d0fbc3bc,51,"/tmp/ray/session_2024-10-01_01-16-02_010623_90684/artifacts/2024-10-01_01-16-03/train_model_2024-10-01_01-16-03/driver_artifacts/train_model_d0fbc3bc_2_adaboost_learning_rate=0.0456,adaboost_n_estimators=100,dt_max_depth=20,dt_min_samples_split=2,gnb_var_smoo_2024-10-01_01-16-06/error.txt"

Trial name,status,loc,adaboost.learning_ra te,adaboost.n_estimator s,dt.max_depth,dt.min_samples_split,gnb.var_smoothing,lr.C,lr.penalty,lr.solver,rf.max_depth,rf.min_samples_split,rf.n_estimators,svm.C,svm.kernel,use_adaboost,use_dt,use_gnb,use_lr,use_rf,use_svm,iter,total time (s),roc_auc,pr_auc,f1_score
train_model_81f0876f,RUNNING,172.17.0.2:92077,0.173235,100,15.0,5,2.09842e-07,0.23763,,lbfgs,20.0,5,50,8.80945,linear,False,False,False,True,False,True,,,,,
train_model_1210ba1d,TERMINATED,172.17.0.2:91559,0.806988,50,15.0,10,3.55336e-06,0.174723,elasticnet,sag,15.0,5,10,0.157903,poly,False,True,False,False,False,True,1.0,145.644,0.806805,0.313694,0.117863
train_model_da56ff4c,TERMINATED,172.17.0.2:91727,0.189273,200,,5,0.00903815,0.654469,elasticnet,liblinear,,10,200,3.8384,sigmoid,False,False,False,True,True,False,1.0,21.1898,0.854088,0.375393,0.0554083
train_model_1365c1db,TERMINATED,172.17.0.2:91839,0.0105568,100,10.0,5,3.62707e-08,2.24022,l2,sag,5.0,5,50,0.930709,linear,True,True,True,False,False,True,1.0,154.608,0.826652,0.332604,0.173968
train_model_76cf1ba3,TERMINATED,172.17.0.2:91961,0.031057,200,10.0,5,0.000145376,8.15229,l2,saga,10.0,10,100,0.408269,sigmoid,True,True,True,False,True,False,1.0,48.2802,0.829937,0.337803,0.442621
train_model_0c87d5ef,TERMINATED,172.17.0.2:92192,0.0668481,50,20.0,10,2.0848e-05,1.07088,l1,lbfgs,20.0,5,50,1.70233,poly,True,False,True,True,False,True,1.0,120.744,0.837049,0.36998,0.229863
train_model_d3e35af5,TERMINATED,172.17.0.2:92309,0.377919,100,5.0,2,1.46316e-10,0.361696,elasticnet,liblinear,10.0,2,100,0.218049,rbf,False,True,False,False,True,False,1.0,7.00891,0.850254,0.365894,0.00150489
train_model_d0fbc3bc,ERROR,172.17.0.2:95080,0.0456242,100,20.0,2,9.22499e-09,5.20806,l1,newton-cg,15.0,2,100,2.35605,rbf,True,False,True,True,True,False,,,,,


2024-10-01 01:16:09,763	ERROR tune_controller.py:1331 -- Trial task failed for trial train_model_d0fbc3bc
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/worker.py", line 2664, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/worker.py", line 871, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::ImplicitFunc.train()[39m (pid=91618, ip=172.17.0.2, actor_id=28ea89d415

Trial name
train_model_0c87d5ef
train_model_1210ba1d
train_model_1365c1db
train_model_76cf1ba3
train_model_d0fbc3bc
train_model_d3e35af5
train_model_da56ff4c


[36m(train_model pid=91727)[0m Average ROC AUC: 0.8540876398993807
[36m(train_model pid=91727)[0m Average PR AUC: 0.3753925540098773
[36m(train_model pid=91727)[0m Average F1 score: 0.05540826957239117


2024-10-01 01:16:33,473	ERROR tune_controller.py:1331 -- Trial task failed for trial train_model_d0fbc3bc
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/worker.py", line 2664, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/worker.py", line 871, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::ImplicitFunc.train()[39m (pid=92242, ip=172.17.0.2, actor_id=7878cd04b6

[36m(train_model pid=92309)[0m Average ROC AUC: 0.8502538605583032
[36m(train_model pid=92309)[0m Average PR AUC: 0.3658940599345834
[36m(train_model pid=92309)[0m Average F1 score: 0.0015048908954100827


2024-10-01 01:16:44,364	ERROR tune_controller.py:1331 -- Trial task failed for trial train_model_d0fbc3bc
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/worker.py", line 2664, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/worker.py", line 871, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::ImplicitFunc.train()[39m (pid=92437, ip=172.17.0.2, actor_id=ff8aea70ab

[36m(train_model pid=91961)[0m Average ROC AUC: 0.8299369873594057
[36m(train_model pid=91961)[0m Average PR AUC: 0.33780280612338637
[36m(train_model pid=91961)[0m Average F1 score: 0.44262081107333656


2024-10-01 01:17:12,723	ERROR tune_controller.py:1331 -- Trial task failed for trial train_model_d0fbc3bc
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/worker.py", line 2664, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/worker.py", line 871, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::ImplicitFunc.train()[39m (pid=92826, ip=172.17.0.2, actor_id=16dcd98bd1

[36m(train_model pid=92192)[0m Average ROC AUC: 0.837049221826497
[36m(train_model pid=92192)[0m Average PR AUC: 0.36998037656556143
[36m(train_model pid=92192)[0m Average F1 score: 0.22986320029674878
[36m(train_model pid=91559)[0m Average ROC AUC: 0.806804793146085
[36m(train_model pid=91559)[0m Average PR AUC: 0.3136944961060616
[36m(train_model pid=91559)[0m Average F1 score: 0.11786258221719814


2024-10-01 01:18:33,714	ERROR tune_controller.py:1331 -- Trial task failed for trial train_model_d0fbc3bc
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/worker.py", line 2664, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/worker.py", line 871, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::ImplicitFunc.train()[39m (pid=94122, ip=172.17.0.2, actor_id=09c46745db

[36m(train_model pid=91839)[0m Average ROC AUC: 0.826652413559605
[36m(train_model pid=91839)[0m Average PR AUC: 0.3326035521229533
[36m(train_model pid=91839)[0m Average F1 score: 0.17396810392295112


2024-10-01 01:18:50,671	ERROR tune_controller.py:1331 -- Trial task failed for trial train_model_d0fbc3bc
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/worker.py", line 2664, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/worker.py", line 871, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::ImplicitFunc.train()[39m (pid=94425, ip=172.17.0.2, actor_id=36a946fc8f