In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import ray
from ray import tune, train
from ray.tune.schedulers import ASHAScheduler
from evaluators import evaluate_model_ray
from ray.tune.search.hebo import HEBOSearch
from ray.train import CheckpointConfig


load_dotenv()


  from .autonotebook import tqdm as notebook_tqdm
2024-10-01 01:20:07,185	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-10-01 01:20:07,315	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


True

In [2]:
# Assuming feature_df and targets_df are already defined
data_dir = os.getenv("DATA")
encoded_df = pd.read_csv(Path(data_dir) / "encoded_df.csv")
targets_df = pd.read_csv(Path(data_dir) / "target.csv")
encoded_df.head()

Unnamed: 0,id,Geschlecht,Alter,Fahrerlaubnis,Vorversicherung,Alter_Fzg,Vorschaden,Jahresbeitrag,Kundentreue,Regional_Code_0,...,Vertriebskanal_152.0,Vertriebskanal_153.0,Vertriebskanal_154.0,Vertriebskanal_155.0,Vertriebskanal_156.0,Vertriebskanal_157.0,Vertriebskanal_158.0,Vertriebskanal_159.0,Vertriebskanal_160.0,Vertriebskanal_163.0
0,1.0,1.0,44.0,1.0,0.0,2.0,1.0,40454.0,217.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,1.0,76.0,1.0,0.0,0.0,0.0,33536.0,183.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,1.0,47.0,1.0,0.0,2.0,1.0,38294.0,27.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,1.0,21.0,1.0,1.0,1.0,0.0,28619.0,203.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5.0,0.0,29.0,1.0,1.0,1.0,0.0,27496.0,39.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
sub_encoded_df = encoded_df.sample(n=10000, random_state=42)

normal_df = pd.DataFrame(scaler.fit_transform(sub_encoded_df.drop(columns="id")),
                         columns=encoded_df.columns.difference(["id"]))

In [4]:
from sklearn.preprocessing import PolynomialFeatures

# Generate polynomial features (degree 2 is a common starting point)
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(sub_encoded_df)
cols = poly.get_feature_names_out(sub_encoded_df.columns)
poly_df = pd.DataFrame(poly.transform(sub_encoded_df.copy()), columns=cols)

In [5]:
ids = sub_encoded_df.id
y = targets_df.set_index("id").loc[ids]["Interesse"].values
X = normal_df.values
X.shape

(10000, 216)

In [6]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

In [7]:
X_id = ray.put(X)
y_id = ray.put(y)


def train_ensemble(config):
    X = ray.get(X_id)
    y = ray.get(y_id)
    estimators = []

    # Decision Tree
    if config['use_dt']:
        dt = DecisionTreeClassifier(max_depth=config['dt.max_depth'],
                                    min_samples_split=config['dt.min_samples_split'])
        estimators.append(('dt', dt))

    # SVM
    if config['use_svm']:
        svm = SVC(C=config['svm.C'], kernel=config['svm.kernel'], probability=True)
        estimators.append(('svm', svm))

    # Random Forest
    if config['use_rf']:
        rf = RandomForestClassifier(n_estimators=config['rf.n_estimators'],
                                    max_depth=config['rf.max_depth'],
                                    min_samples_split=config['rf.min_samples_split'])
        estimators.append(('rf', rf))

    # Gaussian Naive Bayes
    if config['use_gnb']:
        gnb = GaussianNB(var_smoothing=config['gnb.var_smoothing'])
        estimators.append(('gnb', gnb))

    # Logistic Regression
    if config['use_lr']:
        solver = config['lr.solver']
        penalty = config['lr.penalty']
        if solver == 'liblinear' and penalty == None:
            penalty = 'l2'
        elif solver in ["lbfgs", "newton-cg"] and penalty not in ['l2', None]:
            penalty = 'l2'
        if penalty == "elasticnet" and solver not in ['saga', 'sag']:
            penalty = 'l2'
        lr = LogisticRegression(C=config['lr.C'],
                                penalty=penalty,
                                solver=solver)
        estimators.append(('lr', lr))

    # AdaBoost
    if config['use_adaboost']:
        adaboost = AdaBoostClassifier(n_estimators=config['adaboost.n_estimators'],
                                      learning_rate=config['adaboost.learning_rate'])
        estimators.append(('adaboost', adaboost))

    # Create ensemble model
    if estimators:
        ensemble = VotingClassifier(estimators=estimators, voting='soft')
        ensemble.fit(X, y)
        # Here you'd typically evaluate your ensemble
        # Assuming 'evaluate_network_ray' is replaced with a suitable evaluation method for classifiers:
        evaluate_model_ray(X, y, ensemble)  # This needs to be defined
    else:
        # Handle case where no classifiers are enabled
        train.report({
            "roc_auc": 0,
            "pr_auc": 0,
            "f1_score": 0
        })


def train_model(config):
    train_ensemble(config)


search_space = {
    # Decision Tree Classifier
    "use_dt": tune.choice([True, False]),
    "dt.max_depth": tune.choice([5, 10, 15, 20, None]),
    "dt.min_samples_split": tune.choice([2, 5, 10]),

    # Support Vector Machine Classifier
    "use_svm": tune.choice([True, False]),
    "svm.C": tune.loguniform(0.1, 10),
    "svm.kernel": tune.choice(['linear', 'poly', 'rbf', 'sigmoid']),

    # Random Forest Classifier
    "use_rf": tune.choice([True, False]),
    "rf.n_estimators": tune.choice([10, 50, 100, 200]),
    "rf.max_depth": tune.choice([5, 10, 15, 20, None]),
    "rf.min_samples_split": tune.choice([2, 5, 10]),

    # Gaussian Naive Bayes
    "use_gnb": tune.choice([True, False]),
    "gnb.var_smoothing": tune.loguniform(1e-10, 1e-2),

    # Logistic Regression
    "use_lr": tune.choice([True, False]),
    "lr.C": tune.loguniform(0.1, 10),
    "lr.penalty": tune.choice(['l2','l1','elasticnet', None]),
    "lr.solver": tune.choice(['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']),

    # AdaBoost Classifier
    "use_adaboost": tune.choice([True, False]),
    "adaboost.n_estimators": tune.choice([50, 100, 200]),
    "adaboost.learning_rate": tune.loguniform(0.01, 1),
}


ray.init(num_cpus=8, ignore_reinit_error=True)

analysis = tune.run(train_model,
                    config=search_space,
                    num_samples=50,
                    storage_path=Path(os.getenv("WORKINGDIR"), "artifacts"),
                    max_failures=50,
                    search_alg=HEBOSearch(metric="f1_score", mode="max"),
                    scheduler=ASHAScheduler(metric="f1_score", mode="max"),
                    time_budget_s=3600,
                    checkpoint_config=CheckpointConfig(
                        num_to_keep=4,
                        checkpoint_score_attribute="f1_score",
                        checkpoint_score_order='max',
                    ),
                    resources_per_trial={"cpu": 1})

best_config = analysis.get_best_config(metric="f1_score", mode="max")
best_trial = analysis.get_best_trial(metric="f1_score", mode="max")

print("Best config:", best_config)
print("Best F1 Score:", best_trial.last_result["f1_score"])


2024-10-01 01:20:14,258	INFO worker.py:1777 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
2024-10-01 01:20:14,945	INFO worker.py:1619 -- Calling ray.init() again after it has already been called.
2024-10-01 01:20:14,948	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2024-10-01 01:43:59
Running for:,00:23:44.63
Memory:,20.5/47.0 GiB

Trial name,status,loc,adaboost.learning_ra te,adaboost.n_estimator s,dt.max_depth,dt.min_samples_split,gnb.var_smoothing,lr.C,lr.penalty,lr.solver,rf.max_depth,rf.min_samples_split,rf.n_estimators,svm.C,svm.kernel,use_adaboost,use_dt,use_gnb,use_lr,use_rf,use_svm,iter,total time (s),roc_auc,pr_auc,f1_score
train_model_43658677,RUNNING,172.17.0.2:96550,0.174294,200,,5,0.0060463,0.137112,elasticnet,lbfgs,10.0,2,200,9.02593,linear,True,False,True,True,True,True,,,,,
train_model_014c9fed,TERMINATED,172.17.0.2:96135,0.0120783,200,15.0,5,0.000136919,1.11397,l1,sag,20.0,2,100,0.960303,poly,False,True,False,False,True,True,1.0,156.831,0.83224,0.339621,0.0804266
train_model_a505ace0,TERMINATED,172.17.0.2:96194,0.234154,100,15.0,5,9.98792e-07,0.549289,elasticnet,liblinear,5.0,5,10,3.244,rbf,True,False,True,True,False,False,1.0,20.943,0.836267,0.366107,0.411677
train_model_e6cbeacd,TERMINATED,172.17.0.2:96309,0.0631298,50,20.0,10,1.13528e-05,7.16202,l1,lbfgs,10.0,5,50,1.08261,sigmoid,False,False,False,True,False,False,1.0,0.867354,0.847254,0.367282,0.112832
train_model_6a7d61be,TERMINATED,172.17.0.2:96365,0.335903,100,10.0,2,6.68118e-10,0.26298,,saga,15.0,10,100,0.277508,poly,True,True,True,False,True,True,1.0,194.102,0.838391,0.347067,0.133305
train_model_78822d89,TERMINATED,172.17.0.2:96431,0.790777,50,10.0,10,7.84271e-06,0.913627,,sag,20.0,5,50,0.106706,rbf,True,True,True,False,False,False,1.0,12.1491,0.823772,0.33622,0.414145
train_model_15f544e2,TERMINATED,172.17.0.2:96491,0.0485039,100,20.0,2,1.68163e-09,1.85326,l1,newton-cg,15.0,10,100,3.12322,poly,False,False,False,True,True,True,1.0,178.139,0.853657,0.383776,0.00737485
train_model_459b68bf,TERMINATED,172.17.0.2:96617,0.0293419,100,5.0,5,1.21051e-08,3.73334,l2,liblinear,,5,10,0.356079,rbf,False,True,False,False,False,False,1.0,0.491547,0.837035,0.332477,0.00589086
train_model_1eb6cdce,TERMINATED,172.17.0.2:96681,0.0376826,50,10.0,10,4.86248e-05,0.10579,elasticnet,lbfgs,20.0,10,10,0.463819,rbf,True,False,True,True,True,False,1.0,17.0686,0.840388,0.368917,0.435631
train_model_9370e5f6,TERMINATED,172.17.0.2:96745,0.704636,100,20.0,2,2.81087e-10,5.19229,l1,saga,5.0,5,200,6.68573,poly,False,True,False,False,False,True,1.0,168.546,0.781142,0.313348,0.245783




[36m(train_model pid=96309)[0m Average ROC AUC: 0.8472539686152497
[36m(train_model pid=96309)[0m Average PR AUC: 0.3672817623977151
[36m(train_model pid=96309)[0m Average F1 score: 0.11283218628407994


Trial name,f1_score,pr_auc,roc_auc
train_model_014c9fed,0.0804266,0.339621,0.83224
train_model_15f544e2,0.00737485,0.383776,0.853657
train_model_1eb6cdce,0.435631,0.368917,0.840388
train_model_459b68bf,0.00589086,0.332477,0.837035
train_model_6a7d61be,0.133305,0.347067,0.838391
train_model_78822d89,0.414145,0.33622,0.823772
train_model_9370e5f6,0.245783,0.313348,0.781142
train_model_a505ace0,0.411677,0.366107,0.836267
train_model_c1daa126,0.0189461,0.377429,0.853928
train_model_d96db4cb,0.309246,0.320484,0.811956




[36m(train_model pid=96431)[0m Average ROC AUC: 0.8237715087597192
[36m(train_model pid=96431)[0m Average PR AUC: 0.336220363824213
[36m(train_model pid=96431)[0m Average F1 score: 0.41414473271597085
[36m(train_model pid=96194)[0m Average ROC AUC: 0.8362672062415051
[36m(train_model pid=96194)[0m Average PR AUC: 0.36610673202847654
[36m(train_model pid=96194)[0m Average F1 score: 0.41167740117237583




[36m(train_model pid=96681)[0m Average ROC AUC: 0.8403875998181002[32m [repeated 2x across cluster][0m
[36m(train_model pid=96681)[0m Average PR AUC: 0.3689170595699231[32m [repeated 2x across cluster][0m
[36m(train_model pid=96681)[0m Average F1 score: 0.4356312686349659[32m [repeated 2x across cluster][0m
[36m(train_model pid=96872)[0m Average ROC AUC: 0.8539277111417424
[36m(train_model pid=96872)[0m Average PR AUC: 0.37742920000513697
[36m(train_model pid=96872)[0m Average F1 score: 0.018946109108273337




[36m(train_model pid=96135)[0m Average ROC AUC: 0.8322404752364951
[36m(train_model pid=96135)[0m Average PR AUC: 0.33962086646097306
[36m(train_model pid=96135)[0m Average F1 score: 0.08042661264072919




[36m(train_model pid=96491)[0m Average ROC AUC: 0.8536570958204605
[36m(train_model pid=96491)[0m Average PR AUC: 0.3837758582768098
[36m(train_model pid=96491)[0m Average F1 score: 0.007374848333218868




[36m(train_model pid=96745)[0m Average ROC AUC: 0.7811423370617115
[36m(train_model pid=96745)[0m Average PR AUC: 0.313348398502391
[36m(train_model pid=96745)[0m Average F1 score: 0.2457828680012147




[36m(train_model pid=96808)[0m Average ROC AUC: 0.8119555146587437[32m [repeated 2x across cluster][0m
[36m(train_model pid=96808)[0m Average PR AUC: 0.3204840304786299[32m [repeated 2x across cluster][0m
[36m(train_model pid=96808)[0m Average F1 score: 0.3092463564301832[32m [repeated 2x across cluster][0m
