In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

### Load the Data

In [3]:
titanic = pd.read_csv("../../data/titanic/train_and_test2.csv")
titanic.head()

Unnamed: 0,Passengerid,Age,Fare,Sex,sibsp,zero,zero.1,zero.2,zero.3,zero.4,...,zero.12,zero.13,zero.14,Pclass,zero.15,zero.16,Embarked,zero.17,zero.18,2urvived
0,1,22.0,7.25,0,1,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,0
1,2,38.0,71.2833,1,1,0,0,0,0,0,...,0,0,0,1,0,0,0.0,0,0,1
2,3,26.0,7.925,1,0,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,1
3,4,35.0,53.1,1,1,0,0,0,0,0,...,0,0,0,1,0,0,2.0,0,0,1
4,5,35.0,8.05,0,0,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,0


## EDA

In [4]:
# Fix typo in column name

titanic = titanic.rename(columns={'2urvived': 'Survived'})
titanic.head()

Unnamed: 0,Passengerid,Age,Fare,Sex,sibsp,zero,zero.1,zero.2,zero.3,zero.4,...,zero.12,zero.13,zero.14,Pclass,zero.15,zero.16,Embarked,zero.17,zero.18,Survived
0,1,22.0,7.25,0,1,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,0
1,2,38.0,71.2833,1,1,0,0,0,0,0,...,0,0,0,1,0,0,0.0,0,0,1
2,3,26.0,7.925,1,0,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,1
3,4,35.0,53.1,1,1,0,0,0,0,0,...,0,0,0,1,0,0,2.0,0,0,1
4,5,35.0,8.05,0,0,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,0


In [5]:
# Drop unnecessary columns

titanic = titanic.drop(columns=[col for col in titanic.columns if 'zero' in col or col.lower() == 'passengerid'])

In [6]:
titanic.head()

Unnamed: 0,Age,Fare,Sex,sibsp,Parch,Pclass,Embarked,Survived
0,22.0,7.25,0,1,0,3,2.0,0
1,38.0,71.2833,1,1,0,1,0.0,1
2,26.0,7.925,1,0,0,3,2.0,1
3,35.0,53.1,1,1,0,1,2.0,1
4,35.0,8.05,0,0,0,3,2.0,0


In [7]:
# Check for missing values

print(titanic.isna().sum())

Age         0
Fare        0
Sex         0
sibsp       0
Parch       0
Pclass      0
Embarked    2
Survived    0
dtype: int64


In [8]:
# Impute missing values with the mode

titanic['Embarked'] = titanic['Embarked'].fillna(titanic['Embarked'].mode()[0])

In [9]:
# Create dummy variables for categorical columns

titanic = pd.get_dummies(titanic, columns=['Sex', 'Embarked'], drop_first=True)

In [10]:
titanic.head()

Unnamed: 0,Age,Fare,sibsp,Parch,Pclass,Survived,Sex_1,Embarked_1.0,Embarked_2.0
0,22.0,7.25,1,0,3,0,False,False,True
1,38.0,71.2833,1,0,1,1,True,False,False
2,26.0,7.925,0,0,3,1,True,False,True
3,35.0,53.1,1,0,1,1,True,False,True
4,35.0,8.05,0,0,3,0,False,False,True


In [11]:
# Make the categoricals show up as 0 or 1 instead of booleans

titanic[['Sex_1', 'Embarked_1.0', 'Embarked_2.0']] = titanic[['Sex_1', 'Embarked_1.0', 'Embarked_2.0']].astype(int)

In [12]:
titanic.head()

Unnamed: 0,Age,Fare,sibsp,Parch,Pclass,Survived,Sex_1,Embarked_1.0,Embarked_2.0
0,22.0,7.25,1,0,3,0,0,0,1
1,38.0,71.2833,1,0,1,1,1,0,0
2,26.0,7.925,0,0,3,1,1,0,1
3,35.0,53.1,1,0,1,1,1,0,1
4,35.0,8.05,0,0,3,0,0,0,1


Looks good.

## MLFlow Setup

In [13]:
X = titanic.drop(columns=["Survived"])
y = titanic["Survived"]

X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

In [21]:
mlflow.set_experiment("my_heart_will_go_on")

<Experiment: artifact_location='file:///Users/teato/Documents/msds_virtual_environment/msds/USF%20MSDS/Spring%20Mod2/603%20-%20ML%20Ops/labs/lab2/mlruns/304166878576808726', creation_time=1742884251718, experiment_id='304166878576808726', last_update_time=1742884251718, lifecycle_stage='active', name='my_heart_will_go_on', tags={}>

### First Model: Logistic Regression

In [22]:
for C in [0.01, 0.1, 1, 10]:
    with mlflow.start_run():
        model = LogisticRegression(C=C, max_iter=1000, random_state=42)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        acc = accuracy_score(y_val, preds)

        mlflow.log_param("model_type", "LogisticRegression")
        mlflow.log_param("C", C)
        mlflow.log_param("max_iter", 1000)
        mlflow.log_metric("val_accuracy", acc)
        mlflow.sklearn.log_model(model, "logistic_model")



### Second Model: Decision Trees

In [23]:
for depth in [3, 5, 7, 9]:
    with mlflow.start_run():
        model = DecisionTreeClassifier(max_depth=depth, random_state=42)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        acc = accuracy_score(y_val, preds)

        mlflow.log_param("model_type", "DecisionTree")
        mlflow.log_param("max_depth", depth)
        mlflow.log_metric("val_accuracy", acc)
        mlflow.sklearn.log_model(model, "decision_tree_model")



### Third Model: Random Forest

In [24]:
for depth in [5, 7, 10]:
    for n_estimators in [50, 100, 200]:
        with mlflow.start_run():
            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=depth, random_state=42)
            model.fit(X_train, y_train)
            preds = model.predict(X_val)
            acc = accuracy_score(y_val, preds)

            mlflow.log_param("model_type", "RandomForest")
            mlflow.log_param("max_depth", depth)
            mlflow.log_param("n_estimators", n_estimators)
            mlflow.log_metric("val_accuracy", acc)
            mlflow.sklearn.log_model(model, "random_forest_model")



### Finding the Best Model

All of the models aren't showing up on the MLFlow UI, so I'm going to find the best one here.

In [28]:
from mlflow.tracking import MlflowClient

client = MlflowClient()
experiment = client.get_experiment_by_name("my_heart_will_go_on")
runs = client.search_runs(experiment.experiment_id)

print(f"Found {len(runs)} runs!")
for run in runs:  # preview first 5
    print(run.info.run_id, run.data.metrics)

Found 38 runs!
ea92c5cb2c8642efa67c267a94eeda4d {'test_metric': 0.99}
8035a55d2ac2487bb258ff3a9f3cd56b {'val_accuracy': 0.7824427480916031}
c68ccadecb19436fb5623875f4d72051 {'val_accuracy': 0.7824427480916031}
4b10558a5e114e0d9bf4f01ba2397e36 {'val_accuracy': 0.7862595419847328}
c03fffdc0b69455d9fdd2762c14e0516 {'val_accuracy': 0.7900763358778626}
2c4fea560d0d490b95318b1ba9e857e1 {'val_accuracy': 0.7824427480916031}
ff2d66afed9742bc9ea90d854918bdcf {'val_accuracy': 0.7900763358778626}
aeb98e29394d46838ae9769297cb6fba {'val_accuracy': 0.8129770992366412}
3014563f3406421db12f5e415511cffb {'val_accuracy': 0.7862595419847328}
b9f942ed95114d589939d96757b403c2 {'val_accuracy': 0.7938931297709924}
fde5215f9d754ba5ae783ebfce20c301 {'val_accuracy': 0.7900763358778626}
c550c3dedf754dc1aa57932b25f81a01 {'val_accuracy': 0.7748091603053435}
55bde9e5d2744eeab73e89f8f1cac0db {'val_accuracy': 0.7938931297709924}
5dfea357935149d78dff8895faf844ce {'val_accuracy': 0.8091603053435115}
fc4c49e225cf4a20a3ce

In [29]:
run_id = "b9f942ed95114d589939d96757b403c2"
run = client.get_run(run_id)
print(run.data.params)

{'max_depth': '5', 'model_type': 'RandomForest', 'n_estimators': '50'}


In [None]:
X_full_train = pd.concat([X_train, X_val])
y_full_train = pd.concat([y_train, y_val])

with mlflow.start_run():
    final_model = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42)
    final_model.fit(X_full_train, y_full_train)

    test_preds = final_model.predict(X_test)
    test_acc = accuracy_score(y_test, test_preds)

    mlflow.log_param("model_type", "RandomForest_FINAL")
    mlflow.log_param("n_estimators", 50)
    mlflow.log_param("max_depth", 5)
    mlflow.log_metric("test_accuracy", test_acc)

    mlflow.sklearn.log_model(final_model, "final_model")

    mlflow.register_model(
        model_uri=f"runs:/{mlflow.active_run().info.run_id}/final_model",
        name="TitanicFinalModel"
    )

Successfully registered model 'TitanicFinalModel'.
Created version '1' of model 'TitanicFinalModel'.
