In [1]:
# Import libraries

import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score


In [2]:
# Set the active MLflow experiment by name
EXPERIMENT_NAME = "Titanic Survival Prediction"
mlflow.set_experiment(EXPERIMENT_NAME)


2025/12/18 08:56:15 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/18 08:56:15 INFO mlflow.store.db.utils: Updating database tables
2025/12/18 08:56:15 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/18 08:56:15 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/18 08:56:15 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/18 08:56:15 INFO alembic.runtime.migration: Will assume non-transactional DDL.


<Experiment: artifact_location='/Users/zahra.mohammadi/Documents/mlops_labs/mlruns/1', creation_time=1765981748090, experiment_id='1', last_update_time=1765981748090, lifecycle_stage='active', name='Titanic Survival Prediction', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [3]:
# Import the libraries used to load and preprocess the Titanic survival dataset
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

In [5]:
# Load the Titanic dataset
data = pd.read_csv("data/titanic.csv")

# Keep only the rows where every column used for modelling is present
required_columns = ['Age', 'Embarked', 'Fare', 'Pclass', 'Sex', 'Survived']
data = data.dropna(subset=required_columns)

# Feature engineering: encode Sex as 1 for 'male' and 0 for anything else
data['Sex'] = data['Sex'].eq('male').astype(int)

# One-hot encode Embarked, then cast the three indicator columns to integers
data = pd.get_dummies(data, columns=['Embarked'])
embarked_columns = ['Embarked_C', 'Embarked_Q', 'Embarked_S']
data = data.astype({column: int for column in embarked_columns})
print(data.head())

# Feature matrix and target vector
feature_columns = ['Pclass', 'Sex', 'Age', 'Fare'] + embarked_columns
X = data[feature_columns]
y = data['Survived']



   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    1  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    0  38.0      1      0   
2                             Heikkinen, Miss. Laina    0  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    0  35.0      1      0   
4                           Allen, Mr. William Henry    1  35.0      0      0   

             Ticket     Fare Cabin  Embarked_C  Embarked_Q  Embarked_S  
0         A/5 21171   7.2500   NaN           0           0           1  
1          PC 17599  71.2833   C85           1           0           0  
2  STON/O2. 3101282   7.9250   NaN           0           0           1  
3   

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)



In [7]:
def train_and_log_model(model, model_name, X_train, X_test, y_train, y_test):
    """Fit ``model``, score it on the test split, and record everything in MLflow.

    All logging happens inside a single MLflow run named ``model_name``: the
    estimator's parameters (taken from ``model.get_params()``), the accuracy,
    precision and F1 score of its test-set predictions, and the fitted model.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``get_params``.
        model_name: Used both as the MLflow run name and as the name the
            fitted model is logged under.
        X_train: Training features passed to ``model.fit``.
        X_test: Held-out features passed to ``model.predict``.
        y_train: Training labels passed to ``model.fit``.
        y_test: Held-out labels the predictions are scored against.

    Returns:
        None. A one-line summary of the metrics is printed.
    """
    with mlflow.start_run(run_name=model_name):
        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Log every parameter exactly once. For scikit-learn estimators
        # get_params() already includes n_estimators / max_depth / max_iter
        # when the estimator defines them, so no separate hasattr checks
        # (which logged those keys a second time) are needed.
        for param_name, param_value in model.get_params().items():
            mlflow.log_param(param_name, param_value)

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # Log metrics to MLflow
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("f1_score", f1)

        # Log the model
        mlflow.sklearn.log_model(model, model_name)

        print(f"Logged {model_name} with accuracy: {accuracy}, precision: {precision}, f1_score: {f1}")

In [8]:
# Train and log the logistic regression model
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(max_iter=200)
train_and_log_model(
    model=log_reg,
    model_name="Logistic_Regression",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)



Logged Logistic_Regression with accuracy: 0.7972027972027972, precision: 0.8695652173913043, f1_score: 0.7339449541284404


In [9]:
# Train and log a random forest classifier with fixed hyperparameters
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
)
train_and_log_model(
    model=rf_clf,
    model_name="Random_Forest_Classifier",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)



Logged Random_Forest_Classifier with accuracy: 0.7902097902097902, precision: 0.8666666666666667, f1_score: 0.7222222222222222


In [10]:
# Hyperparameter tuning for the Random Forest classifier
from sklearn.model_selection import GridSearchCV

# Candidate values to try for each hyperparameter
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 10],
    min_samples_split=[2, 5, 10],
)

# Grid search scored on accuracy with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search and log the best model to MLflow
with mlflow.start_run(run_name="Random Forest Hyperparameter Tuning using Grid Search") as run:
    # Fit the grid search, then predict on the test split with the best estimator
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    best_rf = grid_search.best_estimator_
    y_pred = best_rf.predict(X_test)

    # Log each selected hyperparameter under a "best_" prefix
    for hyperparameter in param_grid:
        mlflow.log_param(f"best_{hyperparameter}", best_params[hyperparameter])

    # Calculate the test-set metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Log the metrics
    for metric_name, metric_value in (
        ("accuracy", accuracy),
        ("precision", precision),
        ("f1_score", f1),
    ):
        mlflow.log_metric(metric_name, metric_value)

    # Log the best model
    mlflow.sklearn.log_model(best_rf, "Best_Random_Forest_Classifier")
    print(f"Logged Best Random Forest Classifier with accuracy: {accuracy}, precision: {precision}, f1_score: {f1}")

Fitting 5 folds for each of 27 candidates, totalling 135 fits




Logged Best Random Forest Classifier with accuracy: 0.7902097902097902, precision: 0.8235294117647058, f1_score: 0.7368421052631579
