In [16]:
from pathlib import Path

import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.model_selection import cross_validate

from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
import mlflow
from optuna.integration import MLflowCallback

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# Set tracking URI
MODEL_REGISTRY = Path("experiments")
#Path(MODEL_REGISTRY).mkdir(exist_ok=True)  # create experiments dir
mlflow.set_tracking_uri("file:///" + str(MODEL_REGISTRY.absolute()))

In [3]:
df = pd.read_csv("../data/proccessed/globalterrordb_proccesed.csv", index_col=0)

In [4]:
df.cas_class.value_counts()

1    77214
0    42473
Name: cas_class, dtype: int64

In [3]:
df.describe(include="all").T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
extended,119687.0,0.040489,0.197104,0.0,0.0,0.0,0.0,1.0
region,119687.0,7.296239,2.81656,1.0,6.0,6.0,10.0,12.0
latitude,119687.0,23.462759,17.853289,-53.154613,11.649417,31.600629,34.437939,74.633553
longitude,119687.0,33.205837,55.14186,-157.818968,12.490069,44.371771,69.383108,179.366667
specificity,119686.0,1.37057,0.818686,1.0,1.0,1.0,1.0,4.0
vicinity,119687.0,0.066791,0.249661,0.0,0.0,0.0,0.0,1.0
multiple,119687.0,0.159917,0.366531,0.0,0.0,0.0,0.0,1.0
success,119687.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
suicide,119687.0,0.037824,0.19077,0.0,0.0,0.0,0.0,1.0
attacktype1,119687.0,3.280289,1.828469,1.0,2.0,3.0,3.0,9.0


In [5]:
df.fillna(value=-9, inplace=True)

In [52]:
df.cas_class.value_counts()

1    77214
0    42473
Name: cas_class, dtype: int64

In [7]:
y = df["cas_class"]
X = df.drop("cas_class", axis=1)

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=42, test_size=0.25, stratify=y_train)

In [7]:
impute_value = -9

In [8]:
pipeline = make_pipeline(SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=impute_value),
                         StandardScaler())

In [9]:
pipeline.fit_transform(X_train)

array([[-0.2035149 ,  0.60373644,  1.10873672, ..., -1.08014204,
        -0.85284415, -0.52435801],
       [-0.2035149 ,  0.95885428,  0.68576861, ...,  1.06308836,
         1.90854282,  1.78257866],
       [-0.2035149 , -0.4616171 , -0.13845736, ..., -1.08014204,
         0.2747222 ,  0.24575256],
       ...,
       [-0.2035149 , -0.81673495, -0.44779573, ..., -1.08014204,
        -0.34430395, -0.52435801],
       [-0.2035149 , -0.81673495, -0.94373996, ..., -1.08014204,
        -0.5943832 , -0.52435801],
       [-0.2035149 ,  0.95885428,  0.54991043, ..., -1.08014204,
         1.90854282, -0.52435801]])

In [10]:
pipeline.transform(X_val)
pipeline.transform(X_test)

array([[-0.2035149 ,  1.31397213, -1.293771  , ...,  1.06308836,
        -0.96759715, -0.40496541],
       [-0.2035149 ,  0.95885428,  0.53947394, ..., -1.08014204,
         1.90854282, -0.52435801],
       [-0.2035149 ,  0.95885428,  0.54991043, ..., -1.08014204,
         1.90854282, -0.52435801],
       ...,
       [-0.2035149 ,  0.95885428,  0.54991043, ..., -1.08014204,
         1.90854282, -0.52435801],
       [-0.2035149 , -0.81673495, -0.93188165, ...,  0.84876532,
        -0.34430395,  0.52810758],
       [-0.2035149 , -2.23720633,  1.02973381, ..., -1.08014204,
        -0.7749706 , -0.51417286]])

In [48]:
def objective(trial):
    params = {
        "C": trial.suggest_float("C", 0.5, 20, log=True),
        "class_weight": trial.suggest_categorical("class_weight", ["balanced"]),
        "n_jobs": trial.suggest_categorical("n_jobs", [-1]),
        "max_iter": trial.suggest_categorical("max_iter", [250, 500])
    }

    model = LogisticRegression(**params)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)

    accuracy = accuracy_score(y_val, preds)
    precision = precision_score(y_val, preds)
    recall = recall_score(y_val, preds)
    f1 = f1_score(y_val, preds)

    return accuracy, precision, recall, f1


In [29]:
mlflow_callback = MLflowCallback(
    tracking_uri=mlflow.get_tracking_uri(), metric_name=["accuracy", "precision", "recall", "f1"])

  mlflow_callback = MLflowCallback(


In [49]:
study = optuna.create_study(study_name="lr", directions=["maximize", "maximize", "maximize", "maximize"])
study.optimize(objective, n_trials=10, callbacks=[mlflow_callback])

[32m[I 2022-05-26 17:59:38,694][0m A new study created in memory with name: lr[0m
[32m[I 2022-05-26 17:59:47,584][0m Trial 0 finished with values: [0.7087058233770575, 0.8222002434570906, 0.6997992618014635, 0.7560779375240493] and parameters: {'C': 3.074532702811549, 'class_weight': 'balanced', 'n_jobs': -1, 'max_iter': 250}. [0m
[32m[I 2022-05-26 17:59:55,411][0m Trial 1 finished with values: [0.6945024647004763, 0.6913841807909604, 0.950916272744933, 0.8006433497805523] and parameters: {'C': 2.035577476255282, 'class_weight': {0: 0.35, 1: 0.65}, 'n_jobs': -1, 'max_iter': 250}. [0m
[32m[I 2022-05-26 18:00:10,473][0m Trial 2 finished with values: [0.7463447238699975, 0.776937171227614, 0.8511947160525805, 0.8123725356900068] and parameters: {'C': 14.502488026662416, 'class_weight': None, 'n_jobs': -1, 'max_iter': 500}. [0m
[32m[I 2022-05-26 18:00:19,272][0m Trial 3 finished with values: [0.7244966162586682, 0.7509074410163339, 0.857346370523862, 0.800604686318972] and pa

Testowo

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

In [13]:
pipeline = make_pipeline(SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=-9),
                         StandardScaler())

In [14]:
pipeline.fit_transform(X_train)
pipeline.transform(X_test)

array([[-0.20454128,  0.2512775 ,  1.11346144, ...,  0.84629142,
        -0.78830057, -0.50202743],
       [-0.20454128, -1.52336728, -2.05252351, ..., -1.08281702,
        -0.414869  , -0.52463359],
       [-0.20454128, -0.45858041,  0.01885846, ..., -1.08281702,
        -0.87121762, -0.44268628],
       ...,
       [-0.20454128,  1.67099332, -3.10661319, ..., -1.08281702,
        -1.05656162, -0.52463359],
       [-0.20454128, -1.52336728, -1.18432961, ...,  0.84629142,
        -0.24141385, -0.46585759],
       [-0.20454128, -0.45858041,  0.45137931, ..., -1.08281702,
         0.49813307, -0.44268628]])

In [37]:
def objective(trial):
    params = {
        "C": trial.suggest_float("C", 50, 200, log=True),
        "class_weight": trial.suggest_categorical("class_weight", ["balanced"]),
        "max_iter": trial.suggest_categorical("max_iter", [250])
    }

    model = LogisticRegression(**params)
    # model.fit(X_train, y_train)

    scoring = ["accuracy", "precision", "recall", "f1"]

    preds = cross_validate(model, X_train, y_train, cv=5, n_jobs=-1, scoring=scoring)
    # accuracy = accuracy_score(y_val, preds)
    # precision = precision_score(y_val, preds)
    # recall = recall_score(y_val, preds)
    # f1 = f1_score(y_val, preds)

    accuracy = np.mean(preds["test_accuracy"])
    precision = np.mean(preds["test_precision"])
    recall = np.mean(preds["test_recall"])
    f1 = np.mean(preds["test_accuracy"])

    return accuracy, precision, recall, f1

In [None]:
mlflow_callback = MLflowCallback(
    tracking_uri=mlflow.get_tracking_uri(), metric_name=["accuracy", "precision", "recall", "f1"])

In [38]:
study = optuna.create_study(study_name="test_lr", directions=["maximize", "maximize", "maximize", "maximize"])
study.optimize(objective, n_trials=10)

[32m[I 2022-05-26 19:02:16,922][0m A new study created in memory with name: test_lr[0m
[32m[I 2022-05-26 19:02:20,958][0m Trial 0 finished with values: [0.6905136617922907, 0.8106852254488347, 0.6788622916555699, 0.6905136617922907] and parameters: {'C': 108.73463114655324, 'class_weight': 'balanced', 'max_iter': 250}. [0m
[32m[I 2022-05-26 19:02:25,286][0m Trial 1 finished with values: [0.7039551506687899, 0.8168858042826542, 0.6975117036758666, 0.7039551506687899] and parameters: {'C': 171.54036997853433, 'class_weight': 'balanced', 'max_iter': 250}. [0m
[32m[I 2022-05-26 19:02:29,217][0m Trial 2 finished with values: [0.6968741779970103, 0.8100456699119688, 0.693221641070414, 0.6968741779970103] and parameters: {'C': 177.17063855981704, 'class_weight': 'balanced', 'max_iter': 250}. [0m
[32m[I 2022-05-26 19:02:32,877][0m Trial 3 finished with values: [0.6945974619539199, 0.8090807907742367, 0.6891094776139498, 0.6945974619539199] and parameters: {'C': 170.4886598205659,