In [41]:
import mlflow
from mlflow.models import infer_signature


from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,roc_auc_score

from xgboost import XGBClassifier

import seaborn as sns
import matplotlib.pyplot as plt

import pandas as pd
import os

In [42]:
# Project root: the parent of the directory the notebook is executed from.
gen_dirname = os.path.split(os.path.abspath(os.curdir))[0]

## Load the dataset

In [43]:
# Name of the dataset variant; selects the sub-folder of "data" to read from.
type_of_dataset = "gentle"

# Pass each path component separately so os.path.join picks the separator of
# the current OS. A hard-coded backslash path only resolves on Windows and
# depends on invalid string escapes ("\{", "\l").
labeled_data = pd.read_csv(
    os.path.join(gen_dirname, "data", type_of_dataset, "labelled.csv")
)

# Target column vs. feature columns.
labels = labeled_data["Survived"]
inputs = labeled_data.drop("Survived", axis="columns")

# The split is seeded so every run is evaluated on the same 70/30 partition
# and results stay comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    inputs, labels, test_size=0.3, random_state=42
)

## Models

In [44]:
# Registry of trained models; each entry is
# [display name, hyperparameters, fitted model, MLflow log_model function].
list_models = list()

### XGBoost

In [45]:
# Hyperparameters of the gradient-boosted classifier
params_xgb = dict(
    n_estimators=20,
    max_depth=100,
    learning_rate=0.3,
    objective="binary:logistic",
)

# Build the classifier from those hyperparameters and train it on the training split
bst = XGBClassifier(**params_xgb)
bst.fit(X_train, y_train)

# Register: display name, hyperparameters, fitted model, MLflow logging function
list_models.append(
    [
        "XGBoost",
        params_xgb,
        bst,
        mlflow.xgboost.log_model,
    ]
)



### LogisticRegression

In [46]:
# Define the model hyperparameters.
# "multi_class" is deliberately not set: "auto" was already the default value,
# and scikit-learn 1.5 deprecated the parameter, so passing it explicitly only
# raises a FutureWarning (and will fail once the parameter is removed).
params_lr = {
    "solver": "lbfgs",
    "max_iter": 1000,
    "random_state": 8888,
}

# Create model instance
lr = LogisticRegression(**params_lr)

# Fit the model on the training split
lr.fit(X_train, y_train)

# Register: display name, hyperparameters, fitted model, MLflow logging function
list_models.append(["LogisticRegression", params_lr, lr, mlflow.sklearn.log_model])



## Evaluation metrics

We now define the metrics used to compare our models. Because this is a classification task, we focus on AUC, accuracy, recall, and the confusion matrix.

In [47]:
def eval_metrics(actual, preds, scores=None):
    """Compute the classification metrics used to compare models.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels.
    preds : array-like
        Predicted class labels.
    scores : array-like, optional
        Continuous scores for the positive class, e.g.
        ``model.predict_proba(X)[:, 1]``. When provided, the AUC is computed
        from these scores; otherwise it falls back to ``preds``.

    Returns
    -------
    tuple
        ``(accuracy, recall, auc, cnf_matr)`` where ``cnf_matr`` is the
        confusion matrix returned by ``confusion_matrix(actual, preds)``.
    """
    accuracy = accuracy_score(actual, preds)
    recall = recall_score(actual, preds)

    # ROC AUC measures how well samples are ranked by score. Hard class labels
    # describe a single operating point only, so prefer real scores when the
    # caller supplies them.
    auc = roc_auc_score(actual, preds if scores is None else scores)

    cnf_matr = confusion_matrix(actual, preds)

    return accuracy, recall, auc, cnf_matr

## MLFlow part

In [48]:
# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

# Create a new MLflow Experiment
mlflow.set_experiment("Titanic MLFlow demo")


# One MLflow run per registered model: hyperparameters, test-set metrics and
# the confusion-matrix figure are logged for each.
for name, params, model, log_model in list_models:

    # Start an MLflow run
    with mlflow.start_run():
        # Log the hyperparameters
        mlflow.log_params(params)

        preds = model.predict(X_test)

        # Compute and log the metrics
        accuracy, recall, auc, cnf_matr = eval_metrics(y_test, preds)

        # The AUC computed from hard labels only reflects a single threshold.
        # When the model exposes probabilities, rank by the positive-class
        # probability (column 1) instead.
        if hasattr(model, "predict_proba"):
            auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("auc", auc)

        fig, ax = plt.subplots()

        # fmt="d" prints the integer counts as-is; the default format would
        # render counts >= 100 in scientific notation. Drawing on `ax`
        # explicitly avoids relying on the implicit "current axes".
        sns.heatmap(cnf_matr, annot=True, fmt="d", ax=ax)
        ax.set_title("Feature confusion Matrix", fontsize=14)
        # confusion_matrix puts true labels on rows and predictions on columns
        ax.set_xlabel("Predicted label")
        ax.set_ylabel("True label")
        fig.tight_layout()

        # Log the figure first, then close it to free memory across iterations
        mlflow.log_figure(fig, "confusion_matrix.png")
        plt.close(fig)

        # Set a tag that we can use to remind ourselves what this run was for
        mlflow.set_tag("Training Info", f"{name} model training for {type_of_dataset} titanic dataset")

        mlflow.set_tag("mlflow.runName", f"{name}")

        # NOTE: model artifact logging is currently disabled; the draft below
        # is kept for reference.
        # model_info = log_model()
        # # Infer the model signature
        # signature = infer_signature(X_train, model.predict(X_train))

        # model_info =log_model(
        #     artifact_path=f"{type_of_dataset}_{name}",
        #     signature=signature,
        #     input_example=X_train,
        #     registered_model_name=f" {name}",
        # )