In [17]:
from pathlib import Path

import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
import mlflow
from optuna.integration import MLflowCallback

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [14]:
# Set tracking URI
MODEL_REGISTRY = Path("experiments")
Path(MODEL_REGISTRY).mkdir(exist_ok=True)  # create experiments dir
mlflow.set_tracking_uri("file:///" + str(MODEL_REGISTRY.absolute()))

In [3]:
df = pd.read_csv("../data/proccessed/globalterrordb_proccesed.csv", index_col=0)

In [29]:
df.describe(include="all").T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
extended,119687.0,0.040489,0.197104,0.0,0.0,0.0,0.0,1.0
region,119687.0,7.296239,2.81656,1.0,6.0,6.0,10.0,12.0
latitude,119687.0,23.462759,17.853289,-53.154613,11.649417,31.600629,34.437939,74.633553
longitude,119687.0,33.205837,55.14186,-157.818968,12.490069,44.371771,69.383108,179.366667
specificity,119686.0,1.37057,0.818686,1.0,1.0,1.0,1.0,4.0
vicinity,119687.0,0.066791,0.249661,0.0,0.0,0.0,0.0,1.0
multiple,119687.0,0.159917,0.366531,0.0,0.0,0.0,0.0,1.0
success,119687.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
suicide,119687.0,0.037824,0.19077,0.0,0.0,0.0,0.0,1.0
attacktype1,119687.0,3.280289,1.828469,1.0,2.0,3.0,3.0,9.0


In [4]:
df.fillna(value=-9, inplace=True)

In [5]:
y = df["cas_class"]
X = df.drop("cas_class", axis=1)

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=42, test_size=0.25, stratify=y_train)

In [7]:
impute_value = -9

In [8]:
pipeline = make_pipeline(SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=impute_value),
                         StandardScaler())

In [9]:
pipeline.fit_transform(X_train)

array([[-0.2035149 ,  0.60373644,  1.10873672, ..., -1.08014204,
        -0.85284415, -0.52435801],
       [-0.2035149 ,  0.95885428,  0.68576861, ...,  1.06308836,
         1.90854282,  1.78257866],
       [-0.2035149 , -0.4616171 , -0.13845736, ..., -1.08014204,
         0.2747222 ,  0.24575256],
       ...,
       [-0.2035149 , -0.81673495, -0.44779573, ..., -1.08014204,
        -0.34430395, -0.52435801],
       [-0.2035149 , -0.81673495, -0.94373996, ..., -1.08014204,
        -0.5943832 , -0.52435801],
       [-0.2035149 ,  0.95885428,  0.54991043, ..., -1.08014204,
         1.90854282, -0.52435801]])

In [10]:
pipeline.transform(X_val)
pipeline.transform(X_test)

array([[-0.2035149 ,  1.31397213, -1.293771  , ...,  1.06308836,
        -0.96759715, -0.40496541],
       [-0.2035149 ,  0.95885428,  0.53947394, ..., -1.08014204,
         1.90854282, -0.52435801],
       [-0.2035149 ,  0.95885428,  0.54991043, ..., -1.08014204,
         1.90854282, -0.52435801],
       ...,
       [-0.2035149 ,  0.95885428,  0.54991043, ..., -1.08014204,
         1.90854282, -0.52435801],
       [-0.2035149 , -0.81673495, -0.93188165, ...,  0.84876532,
        -0.34430395,  0.52810758],
       [-0.2035149 , -2.23720633,  1.02973381, ..., -1.08014204,
        -0.7749706 , -0.51417286]])

In [11]:
def objective(trial):
    params = {
        "C": trial.suggest_float("C", 1, 50, log=True),

    }

    model = LogisticRegression(**params)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)

    accuracy = accuracy_score(y_val, preds)
    precision = precision_score(y_val, preds)

    return accuracy, precision


In [18]:
mlflow_callback = MLflowCallback(
    tracking_uri=mlflow.get_tracking_uri(), metric_name=["accuracy", "precision"])

  mlflow_callback = MLflowCallback(


In [20]:
print(mlflow.get_tracking_uri())

file:///C:\Users\Tuszyn\Desktop\JT_praca_magisterska\notebooks\experiments


In [19]:
study = optuna.create_study(study_name="lr", directions=["maximize", "maximize"])
study.optimize(objective, n_trials=10, callbacks=[mlflow_callback])

[32m[I 2022-05-23 22:55:53,579][0m A new study created in memory with name: lr[0m
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[32m[I 2022-05-23 22:55:53,964][0m Trial 0 finished with values: [0.718689949035007, 0.7454483963699904] and parameters: {'C': 2.1358355544248844}. [0m
2022/05/23 22:55:53 INFO mlflow.tracking.fluent: Experiment with name 'lr' does not exist. Creating a new experiment.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https:/