In [1]:
import mlflow
from mlflow.models import infer_signature
from mlflow.data.pandas_dataset import PandasDataset

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

from xgboost import XGBClassifier

import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import os

In [2]:
# Project root: the parent of the directory the notebook is executed from
gen_dirname = os.path.dirname(os.getcwd())

## Load the dataset

In [3]:
# Name of the dataset variant (sub-folder of "data") to train on
type_of_dataset = "gentle"

# Pass each path component separately so os.path.join inserts the right
# separator for the current OS instead of hard-coding Windows backslashes
source_dataset = os.path.join(gen_dirname, "data", type_of_dataset, "labelled.csv")

labeled_data = pd.read_csv(source_dataset)

# "Survived" is the target; every remaining column is used as a feature
labels = labeled_data["Survived"]
inputs = labeled_data.drop("Survived", axis="columns")

# We are fixing the split (seed 42, 30% held out) so every run is comparable
X_train, X_test, y_train, y_test = train_test_split(
    inputs, labels, test_size=0.3, random_state=42
)

## Models

In [4]:
# Registry of candidate models; each entry appended below is a list of
# [display name, hyperparameter dict, estimator instance, MLflow autolog function]
list_models = []

### LogisticRegression

In [5]:
# Define the model hyperparameters
# NOTE: "multi_class" is deliberately not set. "auto" is already the default,
# and the parameter is deprecated in recent scikit-learn releases, so passing
# it explicitly only triggers a warning (and an error once it is removed).
params_lr = {
    "solver": "lbfgs",
    "max_iter": 1000,
    "random_state": 8888,
}

# Create model instance (unfitted)
lr = LogisticRegression(**params_lr)

# Register in list as [display name, hyperparameters, estimator, autolog function]
list_models.append(["LogisticRegression", params_lr, lr, mlflow.sklearn.autolog])

### XGBoost

In [6]:
# Hyperparameters handed to the XGBoost classifier
params_xgb = dict(
    n_estimators=20,
    max_depth=100,
    learning_rate=0.3,
    objective="binary:logistic",
)

# Build the (still unfitted) estimator from those hyperparameters
bst = XGBClassifier(**params_xgb)

# Queue it as [display name, hyperparameters, estimator, autolog function]
list_models.append(
    ["XGBoost", params_xgb, bst, mlflow.xgboost.autolog]
)



## Evaluation metrics

We define the metrics used to compare our different models. Because this is a classification task, we focus on accuracy, recall, precision, and the normalized confusion matrix (AUC is not computed here).

In [7]:
def eval_metrics(actual, preds):
    """Compute the classification metrics used to compare models.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels.
    preds : array-like
        Predicted labels, aligned element-wise with ``actual``.

    Returns
    -------
    tuple
        ``(accuracy, recall, precision, cnf_matr_nm)``, where ``cnf_matr_nm``
        is the confusion matrix normalized over the true labels: every row
        (true class) sums to 1.
    """
    accuracy = accuracy_score(actual, preds)
    recall = recall_score(actual, preds)
    precision = precision_score(actual, preds)

    # Let scikit-learn do the per-row normalization. Unlike a manual
    # `matrix / row_sums`, it does not produce NaN (or a divide-by-zero
    # warning) for a class that has no true samples in `actual`.
    cnf_matr_nm = confusion_matrix(actual, preds, normalize="true")

    return accuracy, recall, precision, cnf_matr_nm

## MLFlow part

In [8]:
# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

# Create a new MLflow Experiment
mlflow.set_experiment("Titanic MLFlow demo")


def _log_split_evaluation(model, features, targets, split):
    """Evaluate ``model`` on one data split and log the results to the active MLflow run.

    Logs ``accuracy_<split>``, ``recall_<split>`` and ``precision_<split>`` as
    metrics, and the normalized confusion matrix heatmap as the artifact
    ``normalized_confusion_matrix_<split>.png``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    features : feature table of the split.
    targets : true labels of the split.
    split : str
        Lowercase split name, ``"train"`` or ``"test"``; used in metric names,
        the artifact file name and (capitalized) the figure title.
    """
    preds = model.predict(features)
    accuracy, recall, precision, cnf_matr = eval_metrics(targets, preds)

    mlflow.log_metric(f"accuracy_{split}", accuracy)
    mlflow.log_metric(f"recall_{split}", recall)
    mlflow.log_metric(f"precision_{split}", precision)

    fig, ax = plt.subplots()
    try:
        # Draw on the explicit axes instead of relying on pyplot's "current" axes
        sns.heatmap(cnf_matr, annot=True, ax=ax)
        ax.set_title(f"Normalized confusion Matrix {split.capitalize()} Set", fontsize=14)
        ax.set_xlabel('Predicted Label')
        ax.set_ylabel('True label')
        fig.tight_layout()

        mlflow.log_figure(fig, f"normalized_confusion_matrix_{split}.png")
    finally:
        # Release the figure only after it has been logged (and even if
        # logging fails) so figures do not pile up across loop iterations
        plt.close(fig)


for name, params, model, log_model in list_models:

    # Start an MLflow run
    with mlflow.start_run():

        # Turn on the flavor-specific autologging for this model
        log_model()

        # Log the hyperparameters
        mlflow.log_params(params)

        # Fit the model on training data
        model.fit(X_train, y_train)

        # Autologging is switched off while the training-set evaluation is logged
        log_model(disable=True)
        _log_split_evaluation(model, X_train, y_train, "train")

        # ...and switched back on before the test-set evaluation
        log_model(disable=False)
        _log_split_evaluation(model, X_test, y_test, "test")

        # Set a tag that we can use to remind ourselves what this run was for
        mlflow.set_tag("Training Info", f"{name} model training for {type_of_dataset} titanic dataset")

        # Name the run after the model so it is easy to find in the MLflow UI
        mlflow.set_tag("mlflow.runName", f"{name}")



## Inference with the chosen model

### Loading the model

In [None]:
# Which registered model to serve: both values are placeholders to be
# determined once a model has been chosen
# NOTE(review): a "models:/<name>/<version>" URI presumably requires that model
# version to exist in the MLflow Model Registry — confirm it has been registered
model_name, model_version = "XGBoost", "1"

# Resolve the model URI and load the saved model for predictions
model_uri = f"models:/{model_name}/{model_version}"
loaded_model = mlflow.pyfunc.load_model(model_uri)

### Your survival test

In [33]:
# Passenger profile to score
pclass = 3
sex = 0 # Reminder 0 for Male 1 for Female
age = 10
sibsp = 2
parch = 2
fare = 25.5467

# Single-row frame with the feature columns in the order listed here
one_person_data = pd.DataFrame([[pclass,sex,float(age),sibsp,parch,fare]],columns=["Pclass","Sex","Age","SibSp","Parch","Fare"])
predict_one_person = loaded_model.predict(one_person_data)

# predict() hands back a container of predictions (one per input row, as far
# as this usage goes); extract the single value explicitly instead of taking
# the truth value of the whole container, which is ambiguous or raises for
# Series/DataFrame results
one_person_outcome = np.asarray(predict_one_person).ravel()[0]

if one_person_outcome == 0:
    print("You died")
else:
    print("Still standing")

You died


### Running inference on the whole unlabelled dataset

In [10]:
# Pass each path component separately so os.path.join inserts the right
# separator for the current OS instead of hard-coding Windows backslashes
inference_dataset = os.path.join(gen_dirname, "data", type_of_dataset, "unlabelled.csv")

# Score every row of the unlabelled file with the loaded model
unllabeled_data = pd.read_csv(inference_dataset)
predictions = loaded_model.predict(unllabeled_data)

In [14]:
# Build a NEW frame carrying the predictions. Plain assignment would only alias
# `unllabeled_data`, so adding the column would also mutate the model's input
# frame and make re-running the prediction cell feed it a "Survived" column.
predicted_data = unllabeled_data.assign(Survived=predictions)

{'Pclass': {0: 3,
  1: 3,
  2: 2,
  3: 3,
  4: 3,
  5: 3,
  6: 3,
  7: 2,
  8: 3,
  9: 3,
  10: 3,
  11: 1,
  12: 1,
  13: 2,
  14: 1,
  15: 2,
  16: 2,
  17: 3,
  18: 3,
  19: 3,
  20: 1,
  21: 3,
  22: 1,
  23: 1,
  24: 1,
  25: 3,
  26: 1,
  27: 3,
  28: 1,
  29: 3,
  30: 2,
  31: 2,
  32: 3,
  33: 3,
  34: 1,
  35: 3,
  36: 3,
  37: 3,
  38: 3,
  39: 3,
  40: 3,
  41: 1,
  42: 3,
  43: 2,
  44: 1,
  45: 3,
  46: 1,
  47: 3,
  48: 1,
  49: 3,
  50: 1,
  51: 2,
  52: 2,
  53: 1,
  54: 2,
  55: 3,
  56: 3,
  57: 3,
  58: 3,
  59: 1,
  60: 3,
  61: 2,
  62: 3,
  63: 3,
  64: 1,
  65: 2,
  66: 3,
  67: 1,
  68: 1,
  69: 1,
  70: 3,
  71: 3,
  72: 3,
  73: 1,
  74: 1,
  75: 1,
  76: 3,
  77: 1,
  78: 2,
  79: 3,
  80: 3,
  81: 1,
  82: 1,
  83: 3,
  84: 2,
  85: 3,
  86: 3,
  87: 3,
  88: 3,
  89: 2,
  90: 3,
  91: 3,
  92: 1,
  93: 3,
  94: 1,
  95: 3,
  96: 1,
  97: 3,
  98: 3,
  99: 3,
  100: 1,
  101: 2,
  102: 3,
  103: 3,
  104: 3,
  105: 3,
  106: 3,
  107: 3,
  108: 3,
  109: 2,
