In [8]:
from pathlib import Path

import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.model_selection import cross_validate

from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
import mlflow
from optuna.integration import MLflowCallback

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [9]:
# Set tracking URI
MODEL_REGISTRY = Path("experiments")
#Path(MODEL_REGISTRY).mkdir(exist_ok=True)  # create experiments dir
# Build the file URI with Path.as_uri() instead of concatenating "file:///"
# with the path string: concatenation leaves backslashes in the URI on
# Windows and yields four leading slashes on POSIX systems.
mlflow.set_tracking_uri(MODEL_REGISTRY.absolute().as_uri())

In [3]:
# Load the processed dataset, using the first CSV column as the index.
# NOTE(review): no active cell visible in this notebook reads `df` any more
# (its consumers are commented out) - confirm whether this load is still needed.
df = pd.read_csv(
    filepath_or_buffer="../data/proccessed/globalterrordb_proccesed.csv",
    index_col=0,
)

In [11]:
# Read the train and test splits from the modeling directory (train first).
train, test = map(
    pd.read_csv,
    ("../data/modeling/train.csv", "../data/modeling/test.csv"),
)

In [12]:
# Separate the "cas_class" target column from the remaining feature columns
# for both the training and the test split.
X_train, y_train = train.drop(columns="cas_class"), train["cas_class"]
X_test, y_test = test.drop(columns="cas_class"), test["cas_class"]

In [4]:
# df.fillna(value=-9, inplace=True)

Podział danych

In [5]:
# df.drop(["countries_count", "group_count"], axis=1, inplace=True)

In [6]:
# y = df["cas_class"]
# X = df.drop("cas_class", axis=1)

In [7]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

In [45]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)
# X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=42, test_size=0.25, stratify=y_train)

Transformacje danych

In [8]:
# impute_value = -9

In [9]:
# pipeline = make_pipeline(SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=impute_value),
#                          StandardScaler())

In [10]:
# pipeline.fit_transform(X_train)

array([[-0.20454128,  0.96113541,  0.55877667, ...,  0.92544313,
        -0.14801657,  0.84629142],
       [-0.20454128, -0.45858041,  0.37822996, ..., -1.01618403,
        -0.14801657, -1.08281702],
       [-0.20454128, -1.52336728, -3.19162718, ..., -1.01618403,
        -0.14801657, -1.08281702],
       ...,
       [-0.20454128, -2.2332252 ,  0.96673904, ...,  1.14117949,
         1.97519523,  1.0606368 ],
       [-0.20454128, -1.87829624, -0.53769926, ...,  0.92544313,
        -0.14801657,  0.84629142],
       [-0.20454128,  0.60620645,  1.48783997, ..., -1.01618403,
        -0.14801657, -1.08281702]])

In [11]:
# pipeline.transform(X_val)
# pipeline.transform(X_test)

array([[-0.20454128,  0.2512775 ,  1.11346144, ...,  0.92544313,
        -0.14801657,  0.84629142],
       [-0.20454128, -1.52336728, -2.05252351, ..., -1.01618403,
        -0.14801657, -1.08281702],
       [-0.20454128, -0.45858041,  0.01885846, ..., -1.01618403,
        -0.14801657, -1.08281702],
       ...,
       [-0.20454128,  1.67099332, -3.10661319, ..., -1.01618403,
        -0.14801657, -1.08281702],
       [-0.20454128, -1.52336728, -1.18432961, ...,  0.92544313,
        -0.14801657,  0.84629142],
       [-0.20454128, -0.45858041,  0.45137931, ..., -1.01618403,
        -0.14801657, -1.08281702]])

Wyszukiwanie hiperparametrów w regresji logistycznej

In [38]:
def objective(trial):
    """Optuna objective for logistic regression, scored by cross-validation.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial from which the hyperparameters are sampled.

    Returns
    -------
    tuple of float
        Mean test accuracy, precision, recall and F1 over the 5 CV folds on
        ``X_train`` / ``y_train``, in that order.
    """
    params = {
        # suggest_float replaces the deprecated suggest_loguniform /
        # suggest_uniform; log=True keeps the log-scaled sampling of C.
        "C": trial.suggest_float("C", 1e-2, 1, log=True),
        "tol": trial.suggest_float("tol", 1e-6, 1e-3),
        # Single-choice categoricals: the value is fixed, but it is still
        # registered as a trial parameter.
        "class_weight": trial.suggest_categorical("class_weight", ["balanced"]),
        "max_iter": trial.suggest_categorical("max_iter", [500]),
    }

    model = LogisticRegression(**params)

    scoring = ["accuracy", "precision", "recall", "f1"]
    cv_scores = cross_validate(model, X_train, y_train, cv=5, n_jobs=-1, scoring=scoring)

    # cross_validate stores each metric's per-fold scores under "test_<metric>".
    return tuple(np.mean(cv_scores[f"test_{metric}"]) for metric in scoring)


In [39]:
# Callback that logs Optuna trials to the MLflow tracking URI currently set.
# The metric names are listed in the same order as the values returned by
# the objective functions in this notebook.
mlflow_callback = MLflowCallback(
    tracking_uri=mlflow.get_tracking_uri(),
    metric_name=["accuracy", "precision", "recall", "f1"],
)

  mlflow_callback = MLflowCallback(


In [40]:
# Multi-objective study: all four values returned by the objective are maximized.
study = optuna.create_study(
    study_name="lr",
    directions=["maximize"] * 4,
)
study.optimize(
    objective,
    n_trials=10,
    callbacks=[mlflow_callback],
)

[32m[I 2022-05-30 20:54:33,857][0m A new study created in memory with name: lr[0m
[32m[I 2022-05-30 20:54:37,120][0m Trial 0 finished with values: [0.7640465661943868, 0.8232097786958384, 0.7612834924333025, 0.7910334237926059] and parameters: {'C': 1.0510372293460541, 'class_weight': 'balanced', 'max_iter': 500}. [0m
[32m[I 2022-05-30 20:54:39,534][0m Trial 1 finished with values: [0.7640375417381103, 0.8231959627757112, 0.7612834924333025, 0.7910270764201621] and parameters: {'C': 3.00221789283994, 'class_weight': 'balanced', 'max_iter': 500}. [0m
[32m[I 2022-05-30 20:54:41,451][0m Trial 2 finished with values: [0.7640646151069398, 0.8232263934551382, 0.7612988758653461, 0.7910493664907183] and parameters: {'C': 4.931839361553541, 'class_weight': 'balanced', 'max_iter': 500}. [0m
[32m[I 2022-05-30 20:54:42,861][0m Trial 3 finished with values: [0.7640646151069398, 0.8232261893502141, 0.7612988758653461, 0.7910493455073438] and parameters: {'C': 5.331678115985939, 'class

Wyszukiwanie hiperparametrów w drzewie decyzyjnym

In [36]:
def objective(trial):
    """Optuna objective for a decision tree, scored by cross-validation.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial from which the hyperparameters are sampled.

    Returns
    -------
    tuple of float
        Mean test accuracy, precision, recall and F1 over the 5 CV folds on
        ``X_train`` / ``y_train``, in that order.
    """
    params = {
        "max_depth": trial.suggest_int("max_depth", 10, 40),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 2, 50),
        # Single-choice categorical: the value is fixed, but it is still
        # registered as a trial parameter.
        "max_features": trial.suggest_categorical("max_features", [None]),
        "class_weight": trial.suggest_categorical("class_weight", [None, "balanced"]),
    }

    # Fix random_state so repeated fits with the same hyperparameters give the
    # same tree; it is passed outside `params` so the set of trial parameters
    # is unchanged.
    model = DecisionTreeClassifier(random_state=42, **params)

    scoring = ["accuracy", "precision", "recall", "f1"]
    cv_scores = cross_validate(model, X_train, y_train, cv=5, n_jobs=-1, scoring=scoring)

    # cross_validate stores each metric's per-fold scores under "test_<metric>".
    return tuple(np.mean(cv_scores[f"test_{metric}"]) for metric in scoring)

In [37]:
# Multi-objective study: all four values returned by the objective are maximized.
study = optuna.create_study(
    study_name="dt",
    directions=["maximize"] * 4,
)
study.optimize(
    objective,
    n_trials=50,
    callbacks=[mlflow_callback],
)

[32m[I 2022-05-30 20:47:50,643][0m A new study created in memory with name: dt[0m
[32m[I 2022-05-30 20:47:53,456][0m Trial 0 finished with values: [0.8497969497337785, 0.8697183309725827, 0.8750423479192098, 0.8497969497337785] and parameters: {'max_depth': 31, 'min_samples_leaf': 15, 'max_features': None, 'class_weight': None}. [0m
[32m[I 2022-05-30 20:47:55,517][0m Trial 1 finished with values: [0.8470986373071021, 0.887302662736327, 0.8469371370282935, 0.8470986373071021] and parameters: {'max_depth': 24, 'min_samples_leaf': 30, 'max_features': None, 'class_weight': 'balanced'}. [0m
[32m[I 2022-05-30 20:47:57,142][0m Trial 2 finished with values: [0.8408627380200343, 0.8878618520335539, 0.8341536043853359, 0.8408627380200343] and parameters: {'max_depth': 13, 'min_samples_leaf': 36, 'max_features': None, 'class_weight': 'balanced'}. [0m
[32m[I 2022-05-30 20:47:58,172][0m Trial 3 finished with values: [0.8488313329121919, 0.8657314563942877, 0.8785804497354995, 0.848831

In [16]:
# Show the tracking URI that MLflow is currently configured to use.
print(mlflow.get_tracking_uri())

file:///C:\Users\Tuszyn\Desktop\JT_praca_magisterska\notebooks\experiments
