In [1]:
from pathlib import Path

import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.model_selection import cross_validate

from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
import mlflow
from optuna.integration import MLflowCallback

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# Set tracking URI
MODEL_REGISTRY = Path("experiments")
#Path(MODEL_REGISTRY).mkdir(exist_ok=True)  # create experiments dir
mlflow.set_tracking_uri("file:///" + str(MODEL_REGISTRY.absolute()))

In [3]:
df = pd.read_csv("../data/proccessed/globalterrordb_proccesed.csv", index_col=0)

In [4]:
df.fillna(value=-9, inplace=True)

Podział danych

In [5]:
df.drop(["countries_count", "group_count"], axis=1, inplace=True)

In [6]:
y = df["cas_class"]
X = df.drop("cas_class", axis=1)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

In [45]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)
# X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=42, test_size=0.25, stratify=y_train)

Transformacje danych

In [8]:
impute_value = -9

In [9]:
pipeline = make_pipeline(SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=impute_value),
                         StandardScaler())

In [10]:
pipeline.fit_transform(X_train)

array([[-0.20454128,  0.96113541,  0.55877667, ...,  0.92544313,
        -0.14801657,  0.84629142],
       [-0.20454128, -0.45858041,  0.37822996, ..., -1.01618403,
        -0.14801657, -1.08281702],
       [-0.20454128, -1.52336728, -3.19162718, ..., -1.01618403,
        -0.14801657, -1.08281702],
       ...,
       [-0.20454128, -2.2332252 ,  0.96673904, ...,  1.14117949,
         1.97519523,  1.0606368 ],
       [-0.20454128, -1.87829624, -0.53769926, ...,  0.92544313,
        -0.14801657,  0.84629142],
       [-0.20454128,  0.60620645,  1.48783997, ..., -1.01618403,
        -0.14801657, -1.08281702]])

In [11]:
# pipeline.transform(X_val)
pipeline.transform(X_test)

array([[-0.20454128,  0.2512775 ,  1.11346144, ...,  0.92544313,
        -0.14801657,  0.84629142],
       [-0.20454128, -1.52336728, -2.05252351, ..., -1.01618403,
        -0.14801657, -1.08281702],
       [-0.20454128, -0.45858041,  0.01885846, ..., -1.01618403,
        -0.14801657, -1.08281702],
       ...,
       [-0.20454128,  1.67099332, -3.10661319, ..., -1.01618403,
        -0.14801657, -1.08281702],
       [-0.20454128, -1.52336728, -1.18432961, ...,  0.92544313,
        -0.14801657,  0.84629142],
       [-0.20454128, -0.45858041,  0.45137931, ..., -1.01618403,
        -0.14801657, -1.08281702]])

Wyszukiwanie hiperparametrów w regresji logistycznej

In [78]:
def objective(trial):
    params = {
        "C": trial.suggest_float("C", 0.1, 15, log=True),
        "class_weight": trial.suggest_categorical("class_weight", ["balanced"]),
        "max_iter": trial.suggest_categorical("max_iter", [150])
    }

    model = LogisticRegression(**params)
    # model.fit(X_train, y_train)
    # preds = model.predict(X_val)
    #
    # accuracy = accuracy_score(y_val, preds)
    # precision = precision_score(y_val, preds)
    # recall = recall_score(y_val, preds)
    # f1 = f1_score(y_val, preds)
    scoring = ["accuracy", "precision", "recall", "f1"]

    preds = cross_validate(model, X_train, y_train, cv=5, n_jobs=-1, scoring=scoring)

    accuracy = np.mean(preds["test_accuracy"])
    precision = np.mean(preds["test_precision"])
    recall = np.mean(preds["test_recall"])
    f1 = np.mean(preds["test_accuracy"])

    return accuracy, precision, recall, f1


In [13]:
mlflow_callback = MLflowCallback(
    tracking_uri=mlflow.get_tracking_uri(), metric_name=["accuracy", "precision", "recall", "f1"])

  mlflow_callback = MLflowCallback(


In [79]:
study = optuna.create_study(study_name="lr", directions=["maximize", "maximize", "maximize", "maximize"])
study.optimize(objective, n_trials=30, callbacks=[mlflow_callback])

[32m[I 2022-05-26 20:38:19,856][0m A new study created in memory with name: lr[0m
[32m[I 2022-05-26 20:38:22,681][0m Trial 0 finished with values: [0.696063450333473, 0.8102013762212927, 0.6908995498983164, 0.696063450333473] and parameters: {'C': 1.3875906799756415, 'class_weight': 'balanced', 'n_jobs': -1, 'max_iter': 250}. [0m
[32m[I 2022-05-26 20:38:24,642][0m Trial 1 finished with values: [0.6623915283959233, 0.782608887216441, 0.6602267945913249, 0.6623915283959233] and parameters: {'C': 0.9907432952862475, 'class_weight': 'balanced', 'n_jobs': -1, 'max_iter': 150}. [0m
[32m[I 2022-05-26 20:38:26,303][0m Trial 2 finished with values: [0.6773889344151552, 0.7946412624502279, 0.6739769869337605, 0.6773889344151552] and parameters: {'C': 0.2654276307838985, 'class_weight': 'balanced', 'n_jobs': -1, 'max_iter': 150}. [0m
[32m[I 2022-05-26 20:38:29,093][0m Trial 3 finished with values: [0.7015917033258926, 0.8170815188033259, 0.6924969967788976, 0.7015917033258926] and p

Wyszukiwanie hiperparametrów drzewo decyzyjne

In [14]:
def objective(trial):
    params = {
        "max_depth": trial.suggest_int("max_depth", 10, 40),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 2, 50),
        "max_features": trial.suggest_categorical("max_features", [None]),
        "class_weight": trial.suggest_categorical("class_weight", ["balanced"]),
    }

    model = DecisionTreeClassifier(**params)
    # model.fit(X_train, y_train)
    # preds = model.predict(X_val)

    scoring = ["accuracy", "precision", "recall", "f1"]

    preds = cross_validate(model, X_train, y_train, cv=5, n_jobs=-1, scoring=scoring)

    accuracy = np.mean(preds["test_accuracy"])
    precision = np.mean(preds["test_precision"])
    recall = np.mean(preds["test_recall"])
    f1 = np.mean(preds["test_accuracy"])

    # accuracy = accuracy_score(y_val, preds)
    # precision = precision_score(y_val, preds)
    # recall = recall_score(y_val, preds)
    # f1 = f1_score(y_val, preds)

    return accuracy, precision, recall, f1

In [15]:
study = optuna.create_study(study_name="dt", directions=["maximize", "maximize", "maximize", "maximize"])
study.optimize(objective, n_trials=50, callbacks=[mlflow_callback])

[32m[I 2022-05-27 09:03:37,416][0m A new study created in memory with name: dt[0m
[32m[I 2022-05-27 09:03:40,194][0m Trial 0 finished with values: [0.8301496831157937, 0.9042296874665794, 0.8240113744234806, 0.8301496831157937] and parameters: {'max_depth': 23, 'min_samples_leaf': 25, 'max_features': None, 'class_weight': 'balanced'}. [0m
[32m[I 2022-05-27 09:03:42,301][0m Trial 1 finished with values: [0.8262540634002933, 0.8988739305422978, 0.8233314209112577, 0.8262540634002933] and parameters: {'max_depth': 15, 'min_samples_leaf': 49, 'max_features': None, 'class_weight': 'balanced'}. [0m
[32m[I 2022-05-27 09:03:43,909][0m Trial 2 finished with values: [0.8301288073861338, 0.9035376328146653, 0.824739734031161, 0.8301288073861338] and parameters: {'max_depth': 31, 'min_samples_leaf': 13, 'max_features': None, 'class_weight': 'balanced'}. [0m
[32m[I 2022-05-27 09:03:44,927][0m Trial 3 finished with values: [0.8304316761763971, 0.8952530853785969, 0.8348577781036124, 0.

In [16]:
print(mlflow.get_tracking_uri())

file:///C:\Users\Tuszyn\Desktop\JT_praca_magisterska\notebooks\experiments
