In [1]:
!pip install mlflow



In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
from google.colab import files

In [3]:
# Function to preprocess the data
def preprocess_data(data):
    # Drop unnecessary columns
    data.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], inplace=True)

    # Handle missing values
    data["Age"].fillna(data["Age"].median(), inplace=True)  # Fill missing Age with median
    data["Embarked"].fillna(data["Embarked"].mode()[0], inplace=True)  # Fill missing Embarked with mode

    # Convert categorical variables to numerical
    data["Sex"] = data["Sex"].map({"male": 0, "female": 1})  # Encode Sex
    data = pd.get_dummies(data, columns=["Embarked"], drop_first=True)  # One-hot encode Embarked

    return data

In [4]:
# Function to train the model and log with MLflow
def train_model_with_mlflow(X_train, X_test, y_train, y_test, max_depth, criterion="gini"):
    # Start an MLflow experiment
    with mlflow.start_run():
        # Log parameters
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("criterion", criterion)

        # Train the Decision Tree Classifier
        clf = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth, random_state=42)
        clf.fit(X_train, y_train)

        # Evaluate the model
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)

        # Log metrics
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision_0", report["0"]["precision"])
        mlflow.log_metric("recall_0", report["0"]["recall"])
        mlflow.log_metric("f1_score_0", report["0"]["f1-score"])
        mlflow.log_metric("precision_1", report["1"]["precision"])
        mlflow.log_metric("recall_1", report["1"]["recall"])
        mlflow.log_metric("f1_score_1", report["1"]["f1-score"])

        # Visualize and save the decision tree
        plt.figure(figsize=(12, 8))
        plot_tree(clf, feature_names=X_train.columns, class_names=["Not Survived", "Survived"], filled=True)
        tree_image_path = "decision_tree.png"
        plt.savefig(tree_image_path)
        plt.close()

        # Log the decision tree image as an artifact
        mlflow.log_artifact(tree_image_path)

        # Log the model
        mlflow.sklearn.log_model(clf, "decision_tree_model")

        print(f"Model trained with max_depth={max_depth}, criterion={criterion}")
        print(f"Accuracy: {accuracy:.2f}")

        # Download the decision tree image for inspection
        files.download(tree_image_path)

In [5]:
# Main script
if __name__ == "__main__":
    # Upload the dataset using Google Colab's file upload functionality
    uploaded = files.upload()
    file_name = list(uploaded.keys())[0]
    train_data = pd.read_csv(file_name)

    # Preprocess the data
    train_data = preprocess_data(train_data)

    # Define features (X) and target (y)
    X = train_data.drop(columns=["Survived"])  # Features
    y = train_data["Survived"]  # Target variable

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the model with MLflow tracking
    train_model_with_mlflow(X_train, X_test, y_train, y_test, max_depth=3, criterion="gini")

Saving train.csv to train (2).csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Age"].fillna(data["Age"].median(), inplace=True)  # Fill missing Age with median
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Embarked"].fillna(data["Embarked"].mode()[0], inplace=True)  # Fill missing Embarked with mode


Model trained with max_depth=3, criterion=gini
Accuracy: 0.80


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!mlflow ui --port 5000

[2025-05-30 14:10:04 +0000] [1840] [INFO] Starting gunicorn 23.0.0
[2025-05-30 14:10:04 +0000] [1840] [INFO] Listening at: http://127.0.0.1:5000 (1840)
[2025-05-30 14:10:04 +0000] [1840] [INFO] Using worker: sync
[2025-05-30 14:10:04 +0000] [1841] [INFO] Booting worker with pid: 1841
[2025-05-30 14:10:04 +0000] [1842] [INFO] Booting worker with pid: 1842
[2025-05-30 14:10:04 +0000] [1847] [INFO] Booting worker with pid: 1847
[2025-05-30 14:10:04 +0000] [1848] [INFO] Booting worker with pid: 1848
