In [43]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, classification_report

from lightgbm import LGBMClassifier
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

import mlflow
import logging
import pandas as pd
import numpy as np

logging.getLogger('mlflow').setLevel(logging.ERROR)

In [190]:
def load_arff_file(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
        data_started = False
        header = []
        data = []
        for line in lines:
            if '@DATA' in line or '@data' in line:
                data_started = True
                continue
            if not data_started and ('@ATTRIBUTE' in line or '@attribute' in line):
                # Extract attribute names for header
                attr_name = line.split()[1].strip("'")
                header.append(attr_name)
            elif data_started:
                # Add data rows
                data.append(line.strip().split(','))
        df = pd.DataFrame(data, columns=header)
    return df

# Replace 'path_to_your_file.arff' with your ARFF file path
df = load_arff_file('archive (1)/KDDTrain+.arff')

In [191]:
# Assuming 'df' is your DataFrame and 'target' is the target column
df = df.dropna()
df = df.drop_duplicates()
X = df.drop('class', axis=1)
y = df['class']

# Using a dictionary for mapping
label_mapping = {'normal': 0, 'anomaly': 1}
y_encoded = y.map(label_mapping)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

num_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns

# Preprocessing Pipeline
preprocess_pipeline = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('scaler', StandardScaler())]), num_cols),
        ('cat', Pipeline(steps=[
            ('encoder', OneHotEncoder(handle_unknown='ignore'))]), cat_cols)
    ])

In [192]:
# Define the hyperparameter space for LightGBM
space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'num_leaves': hp.choice('num_leaves', range(20, 150)),
    'max_depth': hp.choice('max_depth', range(-1, 20)),  # -1 means no limit
    'min_child_samples': hp.choice('min_child_samples', range(20, 500)),
    'max_bin': hp.choice('max_bin', range(200, 300)),
    'subsample': hp.uniform('subsample', 0.6, 1.0),  # Fraction of the training data to use for learning
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),  # Subsample ratio of columns when constructing each tree
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),  # L1 regularization
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0)  # L2 regularization
}

def objective(params):
    params['verbosity'] = -1
    
    mlflow.lightgbm.autolog(log_input_examples=False, log_model_signatures=True, log_models=True, log_datasets=False, disable=False
                            , exclusive = True, silent = True)

    with mlflow.start_run():
        model = Pipeline(steps=[
            ('preprocessor', preprocess_pipeline),  # assuming preprocess_pipeline is defined
            ('classifier', LGBMClassifier(**params))
        ])
        
        stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        
        # Get predictions for each fold
        y_pred_cv = cross_val_predict(model, X_train, y_train, cv=stratified_kfold)
        
        # Metrics calculation
        accuracy_cv = accuracy_score(y_train, y_pred_cv)
        recall_pos_cv = recall_score(y_train, y_pred_cv, pos_label=1)
        recall_neg_cv = recall_score(y_train, y_pred_cv, pos_label=0)
        conf_matrix_cv = confusion_matrix(y_train, y_pred_cv)
        class_report_cv = classification_report(y_train, y_pred_cv, output_dict=True)

        # Log CV metrics
        mlflow.log_params(params)
        mlflow.log_metric("cv_accuracy", accuracy_cv)
        mlflow.log_metric("cv_recall_pos", recall_pos_cv)
        mlflow.log_metric("cv_recall_neg", recall_neg_cv)
        mlflow.log_dict({"cv_conf_matrix": conf_matrix_cv.tolist()}, "cv_confusion_matrix.json")
        mlflow.log_dict({"cv_class_report": class_report_cv}, "cv_classification_report.json")
        
        # Fit the model on the entire training set and evaluate on the test set
        model.fit(X_train, y_train)
        y_pred_test = model.predict(X_test)
        accuracy_test = accuracy_score(y_test, y_pred_test)
        recall_pos_test = recall_score(y_test, y_pred_test, pos_label=1)
        recall_neg_test = recall_score(y_test, y_pred_test, pos_label=0)
        conf_matrix_test = confusion_matrix(y_test, y_pred_test)
        class_report_test = classification_report(y_test, y_pred_test, output_dict=True)

        # Log test set metrics
        mlflow.log_metric("test_accuracy", accuracy_test)
        mlflow.log_metric("test_recall_pos", recall_pos_test)
        mlflow.log_metric("test_recall_neg", recall_neg_test)
        mlflow.log_dict({"test_conf_matrix": conf_matrix_test.tolist()}, "test_confusion_matrix.json")
        mlflow.log_dict({"test_class_report": class_report_test}, "test_classification_report.json")

        return {'loss': -accuracy_cv, 'status': STATUS_OK}


mlflow.set_experiment("LightGBM_Hyperopt")
trials = Trials()
best_params = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50, trials=trials)

print("Best parameters:", best_params)

100%|██████████| 50/50 [08:11<00:00,  9.82s/trial, best loss: -0.998670252354348]
Best parameters: {'colsample_bytree': 0.962444589187136, 'learning_rate': 0.1266355280786871, 'max_bin': 91, 'max_depth': 17, 'min_child_samples': 6, 'num_leaves': 73, 'reg_alpha': 0.01091363696449077, 'reg_lambda': 0.043220216138593504, 'subsample': 0.6722740266677719}


In [182]:
search_df = mlflow.search_runs()

# Initialize a boolean Series to keep track of matching rows
mask = pd.Series([True] * len(search_df))

for param, value in best_params.items():
    
    search_df[f'params.{param}'] = pd.to_numeric(search_df[f'params.{param}'], errors='coerce')
    
    # Update the mask
    for i in range(len(search_df)):
        mask[i] = (search_df[f'params.{param}'][i] == value)
        
# Filter search_df using the mask to find the row with matching parameters
best_run_row = search_df[mask]

In [189]:
best_run_row.T

Unnamed: 0,26
run_id,37cdc7e7cdcb47308b709c87137fdf4e
experiment_id,777459170516494404
status,FINISHED
artifact_uri,file:///Users/tyrellto/Documents/AzureProject1...
start_time,2023-12-29 05:19:26.935000+00:00
end_time,2023-12-29 05:19:30.174000+00:00
metrics.test_recall_neg,0.997756
metrics.cv_recall_neg,0.997773
metrics.cv_recall_pos,0.995415
metrics.cv_accuracy,0.996675


In [184]:
artifacts = [
    f.path for f in mlflow.MlflowClient().list_artifacts(best_run_row['run_id'].values[0])
]

In [193]:
# mlflow.MlflowClient().download_artifacts(best_run_row['run_id'].values[0], artifacts[0])

In [187]:
!mlflow ui

[2023-12-28 23:49:52 -0600] [69484] [INFO] Starting gunicorn 21.2.0
[2023-12-28 23:49:52 -0600] [69484] [INFO] Listening at: http://127.0.0.1:5000 (69484)
[2023-12-28 23:49:52 -0600] [69484] [INFO] Using worker: sync
[2023-12-28 23:49:52 -0600] [69485] [INFO] Booting worker with pid: 69485
[2023-12-28 23:49:52 -0600] [69486] [INFO] Booting worker with pid: 69486
[2023-12-28 23:49:52 -0600] [69487] [INFO] Booting worker with pid: 69487
[2023-12-28 23:49:52 -0600] [69488] [INFO] Booting worker with pid: 69488
^C
[2023-12-28 23:50:21 -0600] [69484] [INFO] Handling signal: int
[2023-12-28 23:50:21 -0600] [69485] [INFO] Worker exiting (pid: 69485)
[2023-12-28 23:50:21 -0600] [69488] [INFO] Worker exiting (pid: 69488)
[2023-12-28 23:50:21 -0600] [69486] [INFO] Worker exiting (pid: 69486)
[2023-12-28 23:50:21 -0600] [69487] [INFO] Worker exiting (pid: 69487)
