In [1]:
import pandas as pd
import numpy as np
import mlflow

In [2]:
# data loader
def load_data(file_name: str):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_name: Path of the CSV file to read; passed straight to
            ``pd.read_csv`` with default options.

    Returns:
        The parsed contents of the file as a ``pd.DataFrame``.
    """
    return pd.read_csv(file_name)

In [3]:
# Read the training CSV into `df`, the raw frame that the preprocessing cell consumes.
df = load_data('data/train.csv')

In [4]:
# preprocessing
def preprocess_data(df: pd.DataFrame):
    """Turn the raw passenger frame into an all-float frame.

    The input frame is not modified: all work happens on a copy, so the cell
    that calls this function can be re-run safely and the raw data stays
    available for inspection.

    Steps performed on the copy:
      * ``Sex`` is mapped to ``Sex_Encoded`` (male -> 0, female -> 1).
      * Missing ``Age`` values are replaced by the column median.
      * ``Deck`` is taken from the first character of ``Cabin`` ('U' when the
        cabin is missing) and mapped to ``Deck_Encoded``.
      * Missing ``Embarked`` values are replaced by the most frequent value,
        then mapped to ``Embarked_Encoded`` (C -> 0, Q -> 1, S -> 2).
      * The identifier/text columns and the already-encoded source columns
        are dropped, and the remainder is cast to ``float``.

    Values that are not present in one of the mapping tables become NaN in
    the corresponding ``*_Encoded`` column.

    Args:
        df: Raw frame containing at least the columns ``PassengerId``,
            ``Name``, ``Ticket``, ``Cabin``, ``Embarked``, ``Sex`` and
            ``Age``. All remaining columns must be convertible to float.

    Returns:
        A new ``pd.DataFrame`` holding only float columns.
    """
    # Work on a copy: the column assignments below would otherwise write the
    # helper/encoded columns and the filled values into the caller's frame.
    df = df.copy()

    # encode gender
    df['Sex_Encoded'] = df['Sex'].map({'male': 0, 'female': 1})

    # fill missing values for 'Age' with the median
    df['Age'] = df['Age'].fillna(df['Age'].median())

    # create deck feature from 'Cabin'
    df['Deck'] = df['Cabin'].str[0].fillna('U')  # 'U' for unknown
    df['Deck_Encoded'] = df['Deck'].map({
        'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4,
        'F': 5, 'G': 6, 'T': 7, 'U': 8
    })

    # fill missing values for 'Embarked' with the mode
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    # encode 'Embarked'
    df['Embarked_Encoded'] = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

    # drop unnecessary columns
    df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Deck', 'Embarked', 'Sex'])

    # convert all columns to floats
    df = df.astype(float)

    return df


In [5]:
# Build the encoded, all-float frame from the raw data loaded earlier.
preprocessed_df = preprocess_data(df)

In [6]:
# Display the preprocessed frame to inspect the encoded columns.
preprocessed_df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_Encoded,Deck_Encoded,Embarked_Encoded
0,0.0,3.0,22.0,1.0,0.0,7.2500,0.0,8.0,2.0
1,1.0,1.0,38.0,1.0,0.0,71.2833,1.0,2.0,0.0
2,1.0,3.0,26.0,0.0,0.0,7.9250,1.0,8.0,2.0
3,1.0,1.0,35.0,1.0,0.0,53.1000,1.0,2.0,2.0
4,0.0,3.0,35.0,0.0,0.0,8.0500,0.0,8.0,2.0
...,...,...,...,...,...,...,...,...,...
886,0.0,2.0,27.0,0.0,0.0,13.0000,0.0,8.0,2.0
887,1.0,1.0,19.0,0.0,0.0,30.0000,1.0,1.0,2.0
888,0.0,3.0,28.0,1.0,2.0,23.4500,1.0,8.0,2.0
889,1.0,1.0,26.0,0.0,0.0,30.0000,0.0,2.0,0.0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
def train_model(preprocessed_df: pd.DataFrame):
    """Train a random forest on the preprocessed data and log it to MLflow.

    The frame is split 80/20 into train and test parts (fixed seed), a
    ``RandomForestClassifier`` is fitted on the training part, and its
    accuracy on the held-out part is computed. The model type, the accuracy
    and the fitted model are then logged to MLflow.

    Run handling: if an MLflow run is already active, everything is logged
    into it and the run is left open for the caller to manage. Otherwise a
    new run is started for this call and ended as soon as logging finishes
    (or fails). Without this, the first logging call would open an implicit
    run that is never closed, and every later call would log into that same
    run.

    Args:
        preprocessed_df: Frame that contains the ``'Survived'`` target
            column; every other column is used as a feature.

    Returns:
        None. Results are recorded in MLflow only.
    """
    from contextlib import nullcontext

    # Split data
    X = preprocessed_df.drop('Survived', axis=1)
    y = preprocessed_df['Survived']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train model
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    # Reuse the caller's run when there is one; otherwise own a run for the
    # duration of the logging so it is always closed again.
    if mlflow.active_run() is not None:
        run_context = nullcontext()
    else:
        run_context = mlflow.start_run()

    with run_context:
        # Log parameters and metrics
        mlflow.log_param("model_type", "RandomForestClassifier")
        mlflow.log_metric("accuracy", acc)

        # Log model
        mlflow.sklearn.log_model(clf, "model")

To browse the logged runs, start the MLflow UI from a terminal in the project directory: `mlflow ui --backend-store-uri sqlite:///mlflow.db`

In [8]:
# Point MLflow at the local SQLite tracking database, then select the
# experiment by name; the returned Experiment object is displayed as the
# cell output.
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('titanic_experiment')

<Experiment: artifact_location='file:///c:/Users/tman0/Documents/mlops-zoomcamp-project/mlruns/2', creation_time=1750548520758, experiment_id='2', last_update_time=1750548520758, lifecycle_stage='active', name='titanic_experiment', tags={}>

In [9]:
# Train the classifier on the preprocessed frame and log the results to MLflow.
train_model(preprocessed_df)



In [12]:
import mlflow.pyfunc
# Load version 1 of the model named "titanic_model" through the generic
# pyfunc interface.
# NOTE(review): no visible cell registers a model under this name
# (train_model logs the model without a registered name) — confirm it was
# registered elsewhere, otherwise this fails on a fresh Restart & Run All.
model = mlflow.pyfunc.load_model(model_uri="models:/titanic_model/1")
