In [1]:
import mlflow
from mlflow.models import infer_signature
from mlflow.data.pandas_dataset import PandasDataset

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,roc_auc_score

from xgboost import XGBClassifier

import seaborn as sns
import matplotlib.pyplot as plt

import pandas as pd
import os

In [2]:
# Project root: the parent of the directory the notebook is executed from.
# The data paths in the cells below are built relative to it.
gen_dirname = os.path.split(os.path.abspath(""))[0]

## Load the dataset

In [3]:
# Dataset variant to use; it selects the sub-folder read under <gen_dirname>/data.
type_of_dataset = "gentle"

# Pass each path component separately so os.path.join inserts the right separator
# on every OS. Writing the path as a single string with backslashes relies on
# invalid escape sequences ("\{", "\l") and only resolves on Windows.
source_dataset = os.path.join(gen_dirname, "data", type_of_dataset, "labelled.csv")

labeled_data = pd.read_csv(source_dataset)

# "Survived" is the target column; every other column is used as a model input.
labels = labeled_data["Survived"]
inputs = labeled_data.drop("Survived", axis="columns")

# Hold out 30% of the rows for testing. The random_state is fixed so the split is
# identical on every run and results stay comparable.
X_train, X_test, y_train, y_test = train_test_split(
    inputs, labels, test_size=0.3, random_state=42
)

## Models

In [4]:
# Registry of candidate models. Each entry is a list of
# [display name, hyperparameters, estimator, MLflow autolog function].
list_models = list()

### LogisticRegression

In [5]:
# Define the model hyperparameters.
# "multi_class" is deliberately left unset: scikit-learn deprecated the parameter
# in 1.5 (passing it emits a FutureWarning and it is scheduled for removal), and
# the previously passed value "auto" was the default behaviour anyway.
params_lr = {
    "solver": "lbfgs",
    "max_iter": 1000,
    "random_state": 8888,
}

# Create model instance
lr = LogisticRegression(**params_lr)

# Register in list as [display name, hyperparameters, estimator, autolog function]
list_models.append(["LogisticRegression", params_lr, lr, mlflow.sklearn.autolog])

### XGBoost

In [6]:
# Hyperparameters for the gradient-boosted classifier
params_xgb = dict(
    n_estimators=20,
    max_depth=100,
    learning_rate=0.3,
    objective="binary:logistic",
)

# Build the estimator from those settings
bst = XGBClassifier(**params_xgb)

# Add it to the registry as [display name, hyperparameters, estimator, autolog function]
list_models += [["XGBoost", params_xgb, bst, mlflow.xgboost.autolog]]



## Evaluation metrics

We define the metrics used to compare the different models. Because this is a classification task, we focus on AUC, accuracy, recall, and the confusion matrix.

In [7]:
def eval_metrics(actual, preds, scores=None):
    """Compute the classification metrics used to compare models.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels.
    preds : array-like
        Predicted class labels (e.g. the output of ``model.predict``).
    scores : array-like, optional
        Continuous scores for the positive class, e.g.
        ``model.predict_proba(X)[:, 1]``. When provided, the ROC AUC is
        computed from these scores. When omitted, the AUC is computed from
        ``preds``, which keeps the previous behaviour for existing callers.

    Returns
    -------
    tuple
        ``(accuracy, recall, auc, cnf_matr)`` where ``cnf_matr`` is the
        confusion matrix of ``actual`` versus ``preds``.
    """
    accuracy = accuracy_score(actual, preds)
    recall = recall_score(actual, preds)

    # ROC AUC is a ranking metric: it is only informative across thresholds when
    # fed continuous scores. With hard labels there is a single operating point,
    # so prefer `scores` whenever the caller can supply them.
    auc = roc_auc_score(actual, preds if scores is None else scores)

    cnf_matr = confusion_matrix(actual, preds)

    return accuracy, recall, auc, cnf_matr

## MLFlow part

In [8]:
# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

# Create a new MLflow Experiment
mlflow.set_experiment("Titanic MLFlow demo")


def _log_split_evaluation(actual, preds, split):
    """Log metrics and the confusion-matrix figure for one data split.

    Must be called inside an active MLflow run.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels of the split.
    preds : array-like
        Class labels predicted by the model for the split.
    split : str
        Split identifier ("train" or "test"); used as the suffix of the metric
        names and of the artifact file name, and in the figure title.
    """
    # NOTE: `preds` are hard class labels, so the AUC returned here is computed
    # from labels rather than from probability scores.
    accuracy, recall, auc, cnf_matr = eval_metrics(actual, preds)
    mlflow.log_metric(f"accuracy_{split}", accuracy)
    mlflow.log_metric(f"recall_{split}", recall)
    mlflow.log_metric(f"auc_{split}", auc)

    fig, ax = plt.subplots()
    # Draw on the axes created above, and format the counts as integers so
    # three-digit cells are not shown in scientific notation.
    sns.heatmap(cnf_matr, annot=True, fmt="d", ax=ax)
    # The title names the split actually plotted (the train figure used to be
    # labelled "Test Set").
    ax.set_title(f"Feature confusion Matrix {split.capitalize()} Set", fontsize=14)
    fig.tight_layout()

    mlflow.log_figure(fig, f"confusion_matrix_{split}.png")
    # Close only after logging; this also keeps the figure from being rendered
    # inline and frees its memory.
    plt.close(fig)


for name,params,model,log_model in list_models:

    # Start an MLflow run
    with mlflow.start_run():

        # Enable the flavor-specific autologging for this model
        log_model()

        # Log the hyperparameters
        mlflow.log_params(params)

        # Fit the model on training data
        model.fit(X_train, y_train)

        # Autologging is switched off while the training split is evaluated
        # (presumably so these predictions are not auto-logged; confirm intent).
        log_model(disable=True)
        # Final evaluation on the training sample
        preds_train = model.predict(X_train)
        _log_split_evaluation(y_train, preds_train, "train")

        # Autologging is switched back on before the test split is evaluated
        log_model(disable=False)
        # Make some prediction on the test set
        preds_test = model.predict(X_test)
        _log_split_evaluation(y_test, preds_test, "test")

        # Set a tag that we can use to remind ourselves what this run was for
        mlflow.set_tag("Training Info", f"{name} model training for {type_of_dataset} titanic dataset")

        # mlflow.set_tag("mlflow.runName", f"{name}")



In [10]:
model_name = "XGBoost"
model_version = "1"
# Load the saved model from the MLflow Model Registry.
# NOTE(review): a "models:/<name>/<version>" URI only resolves if a model with
# this name and version has been registered. Confirm that registration happens
# somewhere (UI or code) before this cell runs.
model_uri = f"models:/{model_name}/{model_version}"
loaded_model = mlflow.pyfunc.load_model(model_uri)

# Build the path from components so it resolves on any OS, instead of
# hard-coding Windows "\\" separators inside the string.
inference_dataset = os.path.join(gen_dirname, "data", type_of_dataset, "unlabelled.csv")

# Unlabelled rows to run inference on (no prediction is made in this cell yet)
unllabeled_data = pd.read_csv(inference_dataset)

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 0-1: truncated \uXXXX escape (822299344.py, line 7)