In [1]:
from pathlib import Path

import optuna
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_validate

from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
import mlflow
from optuna.integration import MLflowCallback

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [2]:
# Point MLflow at a local, file-based tracking store in ./experiments
# (relative to the notebook's working directory).
MODEL_REGISTRY = Path("experiments")
# Path.as_uri() produces a well-formed, percent-encoded file URI on every platform.
# Gluing "file:///" onto an absolute path yields "file:////..." on POSIX and leaves
# backslashes / unescaped characters in the URI on Windows.
mlflow.set_tracking_uri(MODEL_REGISTRY.absolute().as_uri())

In [3]:
# Full processed dataset; the first CSV column becomes the index.
# NOTE(review): no later cell visible in this section uses `df` outside of
# commented-out code - confirm whether this load is still needed.
df = pd.read_csv(
    filepath_or_buffer="../data/proccessed/globalterrordb_proccesed.csv",
    index_col=0,
)

Podział danych

In [3]:
# Load the pre-split modelling tables (train first, then test).
train, test = map(
    pd.read_csv,
    ("../data/modeling/train.csv", "../data/modeling/test.csv"),
)

In [4]:
# Separate the "cas_class" target column from the feature columns in each split.
X_train, y_train = train.drop(columns="cas_class"), train["cas_class"]
X_test, y_test = test.drop(columns="cas_class"), test["cas_class"]

In [4]:
# df.fillna(value=-9, inplace=True)

In [5]:
# df.drop(["countries_count", "group_count"], axis=1, inplace=True)

In [6]:
# y = df["cas_class"]
# X = df.drop("cas_class", axis=1)

In [7]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

In [45]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)
# X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=42, test_size=0.25, stratify=y_train)

Transformacje danych

In [8]:
# impute_value = -9

In [9]:
# pipeline = make_pipeline(SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=impute_value),
#                          StandardScaler())

In [10]:
# pipeline.fit_transform(X_train)

array([[-0.20454128,  0.96113541,  0.55877667, ...,  0.92544313,
        -0.14801657,  0.84629142],
       [-0.20454128, -0.45858041,  0.37822996, ..., -1.01618403,
        -0.14801657, -1.08281702],
       [-0.20454128, -1.52336728, -3.19162718, ..., -1.01618403,
        -0.14801657, -1.08281702],
       ...,
       [-0.20454128, -2.2332252 ,  0.96673904, ...,  1.14117949,
         1.97519523,  1.0606368 ],
       [-0.20454128, -1.87829624, -0.53769926, ...,  0.92544313,
        -0.14801657,  0.84629142],
       [-0.20454128,  0.60620645,  1.48783997, ..., -1.01618403,
        -0.14801657, -1.08281702]])

In [11]:
# pipeline.transform(X_val)
# pipeline.transform(X_test)

array([[-0.20454128,  0.2512775 ,  1.11346144, ...,  0.92544313,
        -0.14801657,  0.84629142],
       [-0.20454128, -1.52336728, -2.05252351, ..., -1.01618403,
        -0.14801657, -1.08281702],
       [-0.20454128, -0.45858041,  0.01885846, ..., -1.01618403,
        -0.14801657, -1.08281702],
       ...,
       [-0.20454128,  1.67099332, -3.10661319, ..., -1.01618403,
        -0.14801657, -1.08281702],
       [-0.20454128, -1.52336728, -1.18432961, ...,  0.92544313,
        -0.14801657,  0.84629142],
       [-0.20454128, -0.45858041,  0.45137931, ..., -1.01618403,
        -0.14801657, -1.08281702]])

Konfiguracja callbacku MLflow (wspólnego dla wszystkich wyszukiwań hiperparametrów)

In [5]:
# Single callback reused by every study below. The metric names must stay in the
# same order as the tuple returned by the `objective` functions
# (accuracy, precision, recall, f1).
mlflow_callback = MLflowCallback(
    metric_name=["accuracy", "precision", "recall", "f1"],
    tracking_uri=mlflow.get_tracking_uri(),
)

  mlflow_callback = MLflowCallback(


Wyszukiwanie hiperparametrów dla maszyn wektorów nośnych

In [14]:
def objective(trial):
    """Optuna objective for LinearSVC: mean 5-fold cross-validation scores on the training split.

    Args:
        trial: Optuna trial used to sample ``C``. The other hyperparameters are
            single-choice categoricals, so they are fixed but still go through the
            trial's ``suggest_*`` API alongside ``C``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` holding the mean test-fold score
        of each metric, in that order.
    """
    params = {
        # The range spans three orders of magnitude. Sampling on a log scale explores
        # small regularisation strengths as often as large ones, and matches how the
        # logistic-regression objective in this notebook samples C.
        "C": trial.suggest_float("C", 0.0001, 0.1, log=True),
        "class_weight": trial.suggest_categorical("class_weight", ["balanced"]),
        "dual": trial.suggest_categorical("dual", [False]),
        "penalty": trial.suggest_categorical("penalty", ["l1"]),
    }

    model = LinearSVC(**params)

    scoring = ["accuracy", "precision", "recall", "f1"]
    cv_scores = cross_validate(model, X_train, y_train, cv=5, n_jobs=-1, scoring=scoring)

    # cross_validate stores the per-fold scores under "test_<metric>".
    return tuple(np.mean(cv_scores[f"test_{metric}"]) for metric in scoring)

In [15]:
# Multi-objective study for the linear SVM: one "maximize" direction per value
# returned by `objective` (accuracy, precision, recall, f1).
# `objective` is whichever definition was executed most recently in the kernel.
study = optuna.create_study(
    directions=["maximize"] * 4,
    study_name="support_vector_classifier",
)
study.optimize(objective, callbacks=[mlflow_callback], n_trials=100)

[32m[I 2022-05-31 11:42:38,671][0m Trial 92 finished with values: [0.7640555906506632, 0.8230386717771919, 0.7615450095948864, 0.7910969697893417] and parameters: {'C': 0.40202876055721787, 'class_weight': 'balanced', 'dual': False, 'penalty': 'l1'}. [0m
[32m[I 2022-05-31 11:42:40,708][0m Trial 93 finished with values: [0.7640646151069397, 0.8230525019674216, 0.7615450107780452, 0.7911033429851506] and parameters: {'C': 0.30421067375306216, 'class_weight': 'balanced', 'dual': False, 'penalty': 'l2'}. [0m
[32m[I 2022-05-31 11:42:42,621][0m Trial 94 finished with values: [0.7640555906506632, 0.8230495433002503, 0.7615296273460015, 0.7910936824189908] and parameters: {'C': 0.6124182299946374, 'class_weight': 'balanced', 'dual': False, 'penalty': 'l2'}. [0m
[32m[I 2022-05-31 11:42:49,405][0m Trial 95 finished with values: [0.7640916884757694, 0.8230719357298424, 0.7615757776421325, 0.7911289568482511] and parameters: {'C': 0.6675357020811684, 'class_weight': 'balanced', 'dual': 

Wyszukiwanie hiperparametrów dla regresji logistycznej

In [10]:
def objective(trial):
    """Optuna objective for LogisticRegression: mean 5-fold cross-validation scores on the training split.

    Args:
        trial: Optuna trial used to sample ``C`` on a log scale. ``class_weight`` and
            ``max_iter`` are single-choice categoricals, so they are fixed but still
            go through the trial's ``suggest_*`` API.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` holding the mean test-fold score
        of each metric, in that order.
    """
    params = {
        # suggest_loguniform is deprecated in Optuna; suggest_float(..., log=True)
        # samples from the same log-uniform distribution.
        "C": trial.suggest_float("C", 0.1, 10, log=True),
        "class_weight": trial.suggest_categorical("class_weight", ["balanced"]),
        "max_iter": trial.suggest_categorical("max_iter", [500]),
    }

    model = LogisticRegression(**params)

    scoring = ["accuracy", "precision", "recall", "f1"]
    cv_scores = cross_validate(model, X_train, y_train, cv=5, n_jobs=-1, scoring=scoring)

    # cross_validate stores the per-fold scores under "test_<metric>".
    return tuple(np.mean(cv_scores[f"test_{metric}"]) for metric in scoring)


In [11]:
# Multi-objective study for logistic regression: one "maximize" direction per value
# returned by `objective` (accuracy, precision, recall, f1).
# `objective` is whichever definition was executed most recently in the kernel.
study = optuna.create_study(
    directions=["maximize"] * 4,
    study_name="logistic_regression",
)
study.optimize(objective, callbacks=[mlflow_callback], n_trials=20)

[32m[I 2022-05-31 11:21:34,961][0m A new study created in memory with name: logistic_regression[0m
[32m[I 2022-05-31 11:21:38,228][0m Trial 0 finished with values: [0.7640736395632163, 0.8232505310795475, 0.7612834924333025, 0.7910523205943465] and parameters: {'C': 4.127485834816929, 'class_weight': 'balanced', 'max_iter': 500}. [0m
[32m[I 2022-05-31 11:21:40,722][0m Trial 1 finished with values: [0.7640375417381103, 0.8231959627757112, 0.7612834924333025, 0.7910270764201621] and parameters: {'C': 9.035317495964168, 'class_weight': 'balanced', 'max_iter': 500}. [0m
[32m[I 2022-05-31 11:21:42,449][0m Trial 2 finished with values: [0.7640465661943867, 0.823209507391752, 0.7612834924333025, 0.7910333391047828] and parameters: {'C': 0.2758855819905255, 'class_weight': 'balanced', 'max_iter': 500}. [0m
[32m[I 2022-05-31 11:21:44,069][0m Trial 3 finished with values: [0.7640465661943867, 0.8231989133138761, 0.7612988746821873, 0.7910366775184539] and parameters: {'C': 0.140069

Wyszukiwanie hiperparametrów dla drzewa decyzyjnego

In [83]:
def objective(trial, random_state=42):
    """Optuna objective for DecisionTreeClassifier: mean 5-fold cross-validation scores on the training split.

    Args:
        trial: Optuna trial used to sample ``max_depth``, ``min_samples_leaf`` and
            ``criterion``. ``class_weight`` is a single-choice categorical, so it is
            fixed but still goes through the trial's ``suggest_*`` API.
        random_state: Seed passed to the tree so that repeated runs of the same
            hyperparameters give the same scores. Optuna calls the objective with
            ``trial`` only, so the default is what a study uses.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` holding the mean test-fold score
        of each metric, in that order.
    """
    params = {
        "max_depth": trial.suggest_int("max_depth", 15, 50),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 40),
        "class_weight": trial.suggest_categorical("class_weight", ["balanced"]),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
    }

    # Without a fixed seed, scikit-learn may break ties between equally good splits
    # differently on each fit, which makes trial scores irreproducible.
    model = DecisionTreeClassifier(random_state=random_state, **params)

    scoring = ["accuracy", "precision", "recall", "f1"]
    cv_scores = cross_validate(model, X_train, y_train, cv=5, n_jobs=-1, scoring=scoring)

    # cross_validate stores the per-fold scores under "test_<metric>".
    return tuple(np.mean(cv_scores[f"test_{metric}"]) for metric in scoring)

In [84]:
# Multi-objective study for the decision tree: one "maximize" direction per value
# returned by `objective` (accuracy, precision, recall, f1).
# `objective` is whichever definition was executed most recently in the kernel.
study = optuna.create_study(
    directions=["maximize"] * 4,
    study_name="decision_tree",
)
study.optimize(objective, callbacks=[mlflow_callback], n_trials=20)

[32m[I 2022-05-30 22:17:49,184][0m A new study created in memory with name: decision_tree[0m
[32m[I 2022-05-30 22:17:53,309][0m Trial 0 finished with values: [0.8379929609241044, 0.8664138853143701, 0.8558132240010412, 0.8610735043123647] and parameters: {'max_depth': 47, 'min_samples_leaf': 3, 'class_weight': 'balanced', 'criterion': 'gini'}. [0m
[32m[I 2022-05-30 22:17:55,657][0m Trial 1 finished with values: [0.8463135096110459, 0.8880286157415048, 0.8445219268041602, 0.8657194570056808] and parameters: {'max_depth': 15, 'min_samples_leaf': 13, 'class_weight': 'balanced', 'criterion': 'gini'}. [0m
[32m[I 2022-05-30 22:17:57,140][0m Trial 2 finished with values: [0.846512047649129, 0.8915249644477559, 0.8406607622495947, 0.8653344094614086] and parameters: {'max_depth': 16, 'min_samples_leaf': 20, 'class_weight': 'balanced', 'criterion': 'gini'}. [0m
[32m[I 2022-05-30 22:17:59,144][0m Trial 3 finished with values: [0.8481003519537949, 0.8872340326220195, 0.8489830747932

Wyszukiwanie hiperparametrów lasu losowego

In [17]:
def objective(trial, random_state=42):
    """Optuna objective for RandomForestClassifier: mean 5-fold cross-validation scores on the training split.

    Args:
        trial: Optuna trial used to sample ``max_depth``, ``min_samples_split``,
            ``criterion``, ``max_features`` and ``n_estimators``. ``class_weight`` is
            a single-choice categorical, so it is fixed but still goes through the
            trial's ``suggest_*`` API.
        random_state: Seed passed to the forest so that score differences between
            trials come from the hyperparameters rather than from bootstrap or
            feature-sampling noise. Optuna calls the objective with ``trial`` only,
            so the default is what a study uses.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` holding the mean test-fold score
        of each metric, in that order.
    """
    params = {
        # An earlier variant fixed max_depth to None via a single-choice categorical.
        "max_depth": trial.suggest_int("max_depth", 30, 60),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 30),
        "class_weight": trial.suggest_categorical("class_weight", ["balanced"]),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
        "max_features": trial.suggest_categorical("max_features", [None, "sqrt"]),
        "n_estimators": trial.suggest_int("n_estimators", 250, 1000),
    }

    model = RandomForestClassifier(random_state=random_state, **params)

    scoring = ["accuracy", "precision", "recall", "f1"]
    # n_jobs=-2 (all cores but one) differs from the -1 used by the other objectives
    # in this notebook; kept as in the original.
    cv_scores = cross_validate(model, X_train, y_train, cv=5, n_jobs=-2, scoring=scoring)

    # cross_validate stores the per-fold scores under "test_<metric>".
    return tuple(np.mean(cv_scores[f"test_{metric}"]) for metric in scoring)

In [None]:
# Multi-objective study for the random forest: one "maximize" direction per value
# returned by `objective` (accuracy, precision, recall, f1).
# `objective` is whichever definition was executed most recently in the kernel.
study = optuna.create_study(
    directions=["maximize"] * 4,
    study_name="random_forest",
)
study.optimize(objective, callbacks=[mlflow_callback], n_trials=10)

[32m[I 2022-05-31 10:54:40,987][0m A new study created in memory with name: random_forest[0m
[32m[I 2022-05-31 10:56:18,639][0m Trial 0 finished with values: [0.8754534789278946, 0.8934953523811018, 0.8943020179449472, 0.8938958740893144] and parameters: {'max_depth': 51, 'min_samples_split': 8, 'class_weight': 'balanced', 'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 821}. [0m
[32m[I 2022-05-31 11:01:57,781][0m Trial 1 finished with values: [0.8697229491923112, 0.8968365132649458, 0.8790573384951716, 0.8878540178450292] and parameters: {'max_depth': 53, 'min_samples_split': 29, 'class_weight': 'balanced', 'criterion': 'entropy', 'max_features': None, 'n_estimators': 710}. [0m
[32m[I 2022-05-31 11:03:47,059][0m Trial 2 finished with values: [0.8732605360527028, 0.8965590105190341, 0.8862104782185118, 0.8913525787491737] and parameters: {'max_depth': 33, 'min_samples_split': 17, 'class_weight': 'balanced', 'criterion': 'gini', 'max_features': 'sqrt', 'n_estimat

Wyszukiwanie hiperparametrów dla XGBoost