In [15]:
import os
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import ray
from ray import tune, train
from ray.tune.schedulers import ASHAScheduler
from evaluators import evaluate_model_ray
from ray.tune.search.hebo import HEBOSearch
from ray.train import CheckpointConfig


# Read a .env file into the process environment. The os.getenv("DATA") and
# os.getenv("WORKINGDIR") lookups further down presumably depend on it — TODO confirm.
load_dotenv()


True

In [16]:
# Load the encoded feature table and the target table from the directory named by the
# DATA environment variable.
data_dir = os.getenv("DATA")
if data_dir is None:
    # Path(None) would otherwise fail with an opaque TypeError.
    raise RuntimeError("Environment variable 'DATA' is not set; add it to .env or export it.")

data_path = Path(data_dir)
encoded_df = pd.read_csv(data_path / "encoded_df.csv")
targets_df = pd.read_csv(data_path / "target.csv")
encoded_df.head()

Unnamed: 0,id,Geschlecht,Alter,Fahrerlaubnis,Vorversicherung,Alter_Fzg,Vorschaden,Jahresbeitrag,Kundentreue,Regional_Code_0,...,Vertriebskanal_152.0,Vertriebskanal_153.0,Vertriebskanal_154.0,Vertriebskanal_155.0,Vertriebskanal_156.0,Vertriebskanal_157.0,Vertriebskanal_158.0,Vertriebskanal_159.0,Vertriebskanal_160.0,Vertriebskanal_163.0
0,1.0,1.0,44.0,1.0,0.0,2.0,1.0,40454.0,217.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,1.0,76.0,1.0,0.0,0.0,0.0,33536.0,183.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,1.0,47.0,1.0,0.0,2.0,1.0,38294.0,27.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,1.0,21.0,1.0,1.0,1.0,0.0,28619.0,203.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5.0,0.0,29.0,1.0,1.0,1.0,0.0,27496.0,39.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
from sklearn.preprocessing import StandardScaler

# Standardise a reproducible 10,000-row sample of the encoded features.
scaler = StandardScaler()
sub_encoded_df = encoded_df.sample(n=10000, random_state=42)

# Take the column labels from the exact frame that gets scaled. Index.difference() returns
# a *sorted* index, so using it for the labels attaches names to the wrong columns.
scaler_input_df = sub_encoded_df.drop(columns="id")
normal_df = pd.DataFrame(scaler.fit_transform(scaler_input_df),
                         columns=scaler_input_df.columns)

In [18]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the sampled frame with every degree-2 polynomial / interaction term (no bias column).
# NOTE(review): the expansion is fitted on sub_encoded_df as-is, so the `id` column is
# expanded too — confirm that is intended.
poly = PolynomialFeatures(degree=2, include_bias=False).fit(sub_encoded_df)
cols = poly.get_feature_names_out(sub_encoded_df.columns)
poly_df = pd.DataFrame(data=poly.transform(sub_encoded_df), columns=cols)

In [19]:
# Look up the target of every sampled row through its id, then expose plain NumPy arrays.
ids = sub_encoded_df["id"]
targets_by_id = targets_df.set_index("id")
y = targets_by_id.loc[ids]["Interesse"].values
X = normal_df.values
X.shape

(10000, 216)

In [20]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

In [21]:
# Start Ray explicitly *before* the first ray.put(). Ray auto-initialises itself with default
# resources on first API use, after which a later ray.init(num_cpus=8, ...) is only a
# logged no-op ("Calling ray.init() again after it has already been called").
ray.init(num_cpus=8, ignore_reinit_error=True)

# Place the training arrays in Ray's object store once; trials retrieve them with ray.get().
X_id = ray.put(X)
y_id = ray.put(y)


def _resolve_lr_penalty(solver, penalty, l1_ratio):
    """Return a ``(penalty, l1_ratio)`` pair that the given LogisticRegression solver accepts.

    Unsupported solver/penalty combinations fall back to ``'l2'``:

    * ``liblinear`` cannot run without a penalty,
    * ``lbfgs`` / ``newton-cg`` / ``sag`` only support ``'l2'`` or no penalty,
    * ``'elasticnet'`` is only available with ``saga``.

    ``l1_ratio`` is kept only for the elastic-net penalty; scikit-learn ignores it (and
    warns) for every other penalty, so it is reset to ``None`` there.
    """
    if solver == "liblinear" and penalty is None:
        penalty = "l2"
    elif solver in ("lbfgs", "newton-cg", "sag") and penalty not in ("l2", None):
        penalty = "l2"
    if penalty == "elasticnet" and solver != "saga":
        penalty = "l2"
    if penalty != "elasticnet":
        l1_ratio = None
    return penalty, l1_ratio


def train_ensemble(config):
    """Build, fit and evaluate a soft-voting ensemble described by ``config``.

    Args:
        config: flat dict sampled from the search space. Each ``use_<model>`` flag
            switches one base classifier on or off; the ``<model>.<param>`` entries
            carry that classifier's hyper-parameters.

    The training data is fetched from Ray's object store through the module-level
    ``X_id`` / ``y_id`` references. When no classifier is enabled, zero scores are
    reported so that the trial still yields a result.
    """
    X = ray.get(X_id)
    y = ray.get(y_id)
    estimators = []

    # Decision Tree
    if config['use_dt']:
        dt = DecisionTreeClassifier(max_depth=config['dt.max_depth'],
                                    min_samples_split=config['dt.min_samples_split'])
        estimators.append(('dt', dt))

    # SVM (probability=True is required for soft voting)
    if config['use_svm']:
        svm = SVC(C=config['svm.C'], kernel=config['svm.kernel'], probability=True)
        estimators.append(('svm', svm))

    # Random Forest
    if config['use_rf']:
        rf = RandomForestClassifier(n_estimators=config['rf.n_estimators'],
                                    max_depth=config['rf.max_depth'],
                                    min_samples_split=config['rf.min_samples_split'])
        estimators.append(('rf', rf))

    # Gaussian Naive Bayes
    if config['use_gnb']:
        gnb = GaussianNB(var_smoothing=config['gnb.var_smoothing'])
        estimators.append(('gnb', gnb))

    # Logistic Regression
    if config['use_lr']:
        # Solver/penalty compatibility:
        # https://scikit-learn.org/1.5/modules/generated/sklearn.linear_model.LogisticRegression.html
        solver = config['lr.solver']
        penalty, l1_ratio = _resolve_lr_penalty(solver,
                                                config['lr.penalty'],
                                                config['lr.l1_ratio'])
        lr = LogisticRegression(C=config['lr.C'],
                                penalty=penalty,
                                solver=solver,
                                l1_ratio=l1_ratio)
        estimators.append(('lr', lr))

    # AdaBoost
    if config['use_adaboost']:
        adaboost = AdaBoostClassifier(n_estimators=config['adaboost.n_estimators'],
                                      learning_rate=config['adaboost.learning_rate'])
        estimators.append(('adaboost', adaboost))

    if estimators:
        ensemble = VotingClassifier(estimators=estimators, voting='soft')
        ensemble.fit(X, y)
        # evaluate_model_ray comes from the project's `evaluators` module; presumably it
        # reports roc_auc / pr_auc / f1_score to Ray Tune — TODO confirm.
        evaluate_model_ray(X, y, ensemble)
    else:
        # No classifier enabled: report the worst possible scores for this trial.
        train.report({
            "roc_auc": 0,
            "pr_auc": 0,
            "f1_score": 0
        })


def train_model(config):
    """Trainable handed to ``tune.run``: runs one ensemble trial for ``config``."""
    train_ensemble(config=config)


# Hyper-parameter search space. Every base classifier has a `use_<model>` switch plus its
# own `<model>.<param>` entries; parameters of a disabled classifier are sampled but unused.
search_space = {
    # Decision Tree Classifier
    "use_dt": tune.choice([True, False]),
    "dt.max_depth": tune.choice([5, 10, 15, 20, None]),
    "dt.min_samples_split": tune.choice([2, 5, 10]),

    # Support Vector Machine Classifier
    "use_svm": tune.choice([True, False]),
    "svm.C": tune.loguniform(0.1, 10),
    "svm.kernel": tune.choice(['linear', 'poly', 'rbf', 'sigmoid']),

    # Random Forest Classifier
    "use_rf": tune.choice([True, False]),
    "rf.n_estimators": tune.choice([10, 50, 100, 200]),
    "rf.max_depth": tune.choice([5, 10, 15, 20, None]),
    "rf.min_samples_split": tune.choice([2, 5, 10]),

    # Gaussian Naive Bayes
    "use_gnb": tune.choice([True, False]),
    "gnb.var_smoothing": tune.loguniform(1e-10, 1e-2),

    # Logistic Regression
    "use_lr": tune.choice([True, False]),
    "lr.C": tune.loguniform(0.1, 10),
    "lr.penalty": tune.choice(['l2', 'l1', 'elasticnet', None]),
    "lr.solver": tune.choice(['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']),
    # l1_ratio is a mixing weight in [0, 1]. A log-uniform domain needs a strictly positive
    # lower bound, so 0.0 is not a valid bound for it; sample uniformly instead.
    "lr.l1_ratio": tune.uniform(0.0, 1.0),

    # AdaBoost Classifier
    "use_adaboost": tune.choice([True, False]),
    "adaboost.n_estimators": tune.choice([50, 100, 200]),
    "adaboost.learning_rate": tune.loguniform(0.01, 1),
}


# Make sure Ray is running; ignore_reinit_error turns a repeated call into a logged no-op.
ray.init(num_cpus=8, ignore_reinit_error=True)

# Tune artifacts are written below $WORKINGDIR/artifacts.
working_dir = os.getenv("WORKINGDIR")
if working_dir is None:
    # Path(None, ...) would otherwise fail with an opaque TypeError.
    raise RuntimeError("Environment variable 'WORKINGDIR' is not set; add it to .env or export it.")
storage_path = str(Path(working_dir, "artifacts"))  # storage_path is documented as a str

# Search the ensemble space with HEBO + ASHA, maximising f1_score, for at most
# 300 samples or 300 seconds, one CPU per trial.
analysis = tune.run(train_model,
                    name="garbadge_ensemble",
                    config=search_space,
                    num_samples=300,
                    storage_path=storage_path,
                    max_failures=300,
                    search_alg=HEBOSearch(metric="f1_score", mode="max"),
                    scheduler=ASHAScheduler(metric="f1_score", mode="max"),
                    time_budget_s=300,
                    checkpoint_config=CheckpointConfig(
                        num_to_keep=4,
                        checkpoint_score_attribute="f1_score",
                        checkpoint_score_order='max',
                    ),
                    resources_per_trial={"cpu": 1})

best_config = analysis.get_best_config(metric="f1_score", mode="max")
best_trial = analysis.get_best_trial(metric="f1_score", mode="max")

# get_best_trial returns None when no trial reported the metric (e.g. every trial failed).
if best_trial is None:
    print("No trial reported f1_score; no best config available.")
else:
    print("Best config:", best_config)
    print("Best F1 Score:", best_trial.last_result["f1_score"])


2024-10-01 06:34:12,125	INFO worker.py:1619 -- Calling ray.init() again after it has already been called.
2024-10-01 06:34:12,126	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


Trial name,f1_score,pr_auc,roc_auc
train_model_08226fa5,0.210675,0.284604,0.760274
train_model_0abc4f2f,0.0959459,0.368658,0.848391
train_model_0cd930d6,0.415496,0.366767,0.835213
train_model_2dbe6828,0.00595583,0.345249,0.843501
train_model_37510bb5,0.400821,0.341881,0.823114
train_model_4630a0ff,0.142185,0.3409,0.828884
train_model_8ded96a4,0.270352,0.355614,0.841213
train_model_ae6d82d9,0.391971,0.334022,0.825959


2024-10-01 06:35:08,338	ERROR tune_controller.py:1331 -- Trial task failed for trial train_model_689c42f2
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/worker.py", line 2664, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/opt/conda/lib/python3.9/site-packages/ray/_private/worker.py", line 871, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::ImplicitFunc.train()[39m (pid=87302, ip=172.17.0.2, actor_id=26e396e8a5