In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import mlflow
from sklearn.ensemble import RandomForestClassifier
import sklearn

from sklearn.metrics import accuracy_score
from mlflow.data.pandas_dataset import PandasDataset
from mlflow.models import infer_signature


In [2]:
mlflow.set_tracking_uri("sqlite:///../mlflow.db")
      
client = mlflow.tracking.MlflowClient()

In [3]:
dataset_path = "../data/titanic/"
train_df = pd.read_csv(os.path.join(dataset_path, "train.csv"))
test_df = pd.read_csv(os.path.join(dataset_path, "test.csv"))


print(train_df.shape)
print(test_df.shape)


(891, 12)
(418, 11)


In [4]:
nullseries = train_df.isnull().sum()
print(nullseries[nullseries > 0])

Age         177
Cabin       687
Embarked      2
dtype: int64


In [5]:
train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
def preprocess_dataset(df, test=False):
    in_features = ['Pclass', 'Sex', 'SibSp', 'Parch']
    if test:
        return pd.get_dummies(df[in_features]) 
    
    out_features = ['Survived']
    return pd.get_dummies(df[in_features]), df[out_features]

def kDataSplit(k, i, data):
    val_ratio = 1.0 / k
    interval = len(data) * val_ratio
    interval = np.floor(interval).astype(np.int16)

    splits = []
    pool = np.array(range(len(data)))
    for j in range(int(1/val_ratio)):
        split = np.random.choice(pool, size=interval, replace=False)
        split = split.tolist()
        splits.append(split)
        pool = pool[np.isin(pool, split, invert=True)]
    #for j in range(len(splits)):
    #    print(len(splits[j]))

    #K-fold Cross-Validation    
    val_pool = splits[i]
    train_pool = []
    for j in range(len(splits)):
        if i == j:
            continue
        train_pool.append(splits[j])
    train_pool = np.hstack(train_pool).tolist()

    train_data = data.iloc(axis=0)[train_pool]
    val_data = data.iloc(axis=0)[val_pool]
    return train_data, val_data

def kCrossVal(k, model_class, model_params, data):
    model = model_class(**model_params)

    sum_score = 0.0

    for i in range(k):
        train_data, val_data = kDataSplit(k, i, data)
        
        X_train, y_train = preprocess_dataset(train_data)
        X_val, y_val = preprocess_dataset(val_data)

        model.fit(X_train, np.ravel(y_train))
        val_pred = model.predict(X_val)
        acc = accuracy_score(y_true=y_val, y_pred=val_pred)
        sum_score += acc
    return sum_score / k

In [5]:
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

def tune_model(df, n_trials, mlflow_exp_name):
    mlflow.set_experiment(mlflow_exp_name)
    experiment = mlflow.get_experiment_by_name(mlflow_exp_name)

    def objective(trial):  
        run = client.create_run(experiment.experiment_id)

        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 1000),
            "max_depth": trial.suggest_int("max_depth", 3, 5),
            "random_state": 1,
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 7),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 10)
        }
        for key in params:
            client.log_param(run.info.run_id, key, params[key])

        acc = kCrossVal(k=5,
                        model_class=RandomForestClassifier,
                        model_params=params,
                        data=df)

        client.log_metric(run.info.run_id, "accuracy", acc)
        return acc

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials, n_jobs=-1)
    return study.best_trial.params

In [21]:
import uuid
mlflow_exp_name = "titanic-hyp-" + str(uuid.uuid4()).split("-")[0]
best_params = tune_model(train_df, n_trials=100, mlflow_exp_name=mlflow_exp_name)

mlflow.set_experiment("titanic")
mlflow.sklearn.autolog(disable=True)
with mlflow.start_run(run_name='rf_baseline'):
    mlflow.set_tag("model_name", "RF")    

    model = RandomForestClassifier(**best_params)
    param = model.get_params()
    
    acc = kCrossVal(k = 5, 
                    model_class = RandomForestClassifier, 
                    model_params = param, 
                    data = train_df)

    X_train, y_train = preprocess_dataset(train_df)
    model.fit(X_train, np.ravel(y_train))

    joined_train = pd.concat((X_train,y_train),axis=1)
    mlflow_train_dataset: PandasDataset = mlflow.data.from_pandas(joined_train)
    mlflow.log_input(mlflow_train_dataset, context="training")
    mlflow.log_params(params=param)
    mlflow.log_metric("accuracy", acc)
    signature = infer_signature(model_input=X_train, model_output=y_train)
    mlflow.sklearn.log_model(model, "sk_models", signature=signature)

    print(acc)
        

2023/09/14 12:26:24 INFO mlflow.tracking.fluent: Experiment with name 'titanic-11d402cf' does not exist. Creating a new experiment.


  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]
  return _infer_schema(self._df)
  inputs = _infer_schema(model_input) if model_input is not None else None
  outputs = _infer_schema(model_output) if model_output is not None else None


0.8101123595505617


In [13]:
import filecmp
old_submission = '../kaggle/titanic/submission1.csv'
new_submission = '../kaggle/titanic/submission.csv'

if os.path.exists(new_submission):
    os.remove(new_submission)

logged_model = 'runs:/278ca5744d7a4c9fb6fc3194e4619dfc/sk_models'

ml_model = mlflow.sklearn.load_model(logged_model)

X_test = preprocess_dataset(test_df, test=True)

predictions = ml_model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_df.PassengerId,
                       'Survived': predictions})
output.to_csv(new_submission, index=False)

is_diff = filecmp.cmp(old_submission, new_submission, shallow=False)

print("New submission same as old? " + str(is_diff))
print("Your submission was successfully saved!")

New submission same as old? True
Your submission was successfully saved!


In [None]:



X_test = preprocess_dataset(test_df, test=True)

predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_df.PassengerId,
                       'Survived': predictions})
output.to_csv('../kaggle/titanic/submission.csv', index=False)
print("Your submission was successfully saved!")