# Try this Notebook in Google Colab

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truefoundry/mlfoundry-examples/blob/main/examples/xgboost/titanic_survival_classification.ipynb)

## Install dependencies

In [None]:
! pip install --quiet "numpy>=1.0.0,<2.0.0" "pandas>=1.0.0,<2.0.0" "matplotlib>=3.5.2,<3.6.0" "seaborn>=0.11.0,<0.12.0" "xgboost>=1.6.1,<1.7.0" scikit-learn shap==0.40.0
! pip install -U mlfoundry

## Initialize MLFoundry Client

In [None]:
import os
import getpass
import urllib.parse
import mlfoundry as mlf

In [None]:
# Take the API key from the environment when it is set to a non-empty value;
# otherwise ask for it interactively (getpass does not echo the typed key).
MLF_API_KEY = os.getenv('MLF_API_KEY', '')
if MLF_API_KEY == '':
    print("Please get your API key from https://app.truefoundry.com/settings")
    MLF_API_KEY = getpass.getpass("Paste your API key and hit enter:")

In [None]:
# Create the MLFoundry client, authenticating with the API key collected above.
client = mlf.get_client(api_key=MLF_API_KEY)

---

## Titanic Survival Classification

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import shap

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

### Load data and process it
- Impute null and missing values
- Map categorical values to numerical values

In [None]:
# Load the 'titanic' dataset through seaborn and print a summary of the result.
# NOTE(review): load_dataset presumably fetches the data online on first use — confirm
# network access is available where this notebook runs.
titanic = sns.load_dataset('titanic')
titanic.info()

In [None]:
# Preview the first rows of the freshly loaded data.
titanic.head()

In [None]:
# Remove columns that are not used as features.
# NOTE(review): presumably these duplicate information held in other columns
# ('alive' looks like a restatement of the 'survived' target) — confirm.
# Reassigning instead of mutating in place, together with errors='ignore',
# keeps this cell safe to re-run after the columns are already gone.
titanic = titanic.drop(columns=['who', 'embark_town', 'alive'], errors='ignore')

In [None]:
# Count missing values per column to see which columns need imputing.
titanic.isnull().sum()

In [None]:
# Frequency of each 'embarked' value (most frequent first). Uses the Series
# method because the top-level pd.value_counts helper is deprecated in newer pandas.
titanic['embarked'].value_counts()

In [None]:
# Frequency of each 'deck' value (most frequent first). Uses the Series
# method because the top-level pd.value_counts helper is deprecated in newer pandas.
titanic['deck'].value_counts()

In [None]:
# Impute missing values. Each result is assigned back to its column rather than
# calling fillna(inplace=True) on a column selection: that form is chained
# assignment, which warns in pandas 2.x and silently does nothing once
# Copy-on-Write is enabled.
# NOTE(review): 'S' and 'C' are hard-coded fill values, presumably the most
# frequent categories shown by the value-count cells — confirm.
titanic['embarked'] = titanic['embarked'].fillna('S')
titanic['deck'] = titanic['deck'].fillna('C')
# Numeric column: fill gaps with the median of the observed ages.
titanic['age'] = titanic['age'].fillna(titanic['age'].median())

In [None]:
# Check whether any missing value remains anywhere in the frame after the
# imputation step (True means at least one null is still present).
titanic.isnull().values.any()

In [None]:
# Preview the data again now that missing values have been filled.
titanic.head()

In [None]:
# Map categorical values to numerical values: one LabelEncoder per column.
# The fitted encoders are kept in `label_encoders` (keyed by column name) so the
# same mapping can be saved and reused later.
categorical_columns = ['sex', 'embarked', 'class', 'adult_male', 'deck', 'alone']
label_encoders = {column: LabelEncoder() for column in categorical_columns}
for column, encoder in label_encoders.items():
    # Overwrite the column with its integer codes.
    titanic[column] = encoder.fit_transform(titanic[column])

In [None]:
# Target vector: the 'survived' column.
y = titanic['survived']
# Feature matrix: every other column, derived from a copy of the frame.
X = titanic.copy().drop(columns=['survived'])

### Split Dataset into Training and Test Sets

In [None]:
# Hold out 30% of the rows for testing. The split is stratified on the target
# and seeded so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

### Start MLFoundry Run

In [None]:
# Open a new run under the example project; the logging calls in the following
# cells all go through this `run` object.
run = client.create_run(project_name='titanic-survival-example-project')
print('RUN 1 ID:', run.run_id)
# Plain string literal: the message has no placeholders, so no f-prefix is needed.
print('You can track your runs live at https://app.truefoundry.com/mlfoundry')

### Setting tags

In [None]:
# Attach descriptive tags (framework and task type) to the run.
run.set_tags({'framework': 'xgboost', 'task': 'classification'})

### Training model, logging parameters and logging model

In [None]:
# Train an XGBoost classifier configured with 20 estimators and a maximum depth of 5.
gbm = xgb.XGBClassifier(max_depth=5, n_estimators=20)
gbm.fit(X_train, y_train)
# Score the fitted model on the held-out split and report it.
test_accuracy = gbm.score(X_test, y_test)
print('Accuracy:', test_accuracy)

In [None]:
# Show the estimator's parameters and record them on the run.
model_params = gbm.get_params()
print(model_params)
run.log_params(model_params)
# Also record the class labels and the number of input features seen during fit.
# NOTE(review): gbm.classes_ is presumably an array rather than a scalar — confirm
# that log_params accepts it as a parameter value.
run.log_params({'classes': gbm.classes_, 'features': gbm.n_features_in_})

In [None]:
# Log the trained classifier to the run, declaring its framework as XGBoost.
run.log_model(gbm, framework=mlf.ModelFramework.XGBOOST)

In [None]:
import pickle

# Serialize the fitted label encoders to a local file so they can be uploaded
# alongside the model.
with open('label_encoders.pkl', mode='wb') as encoder_file:
    pickle.dump(label_encoders, encoder_file)

In [None]:
# Upload the pickled label encoders to the run, passing 'extra_files/' as the artifact path.
run.log_artifact('label_encoders.pkl', artifact_path='extra_files/')

### Computing predictions

In [None]:
# Predict labels for both splits (training first, then test) with the fitted model.
y_pred_train, y_pred_test = gbm.predict(X_train), gbm.predict(X_test)

### Logging metrics

In [None]:
# Score functions to apply, keyed by the metric name used in the logged key.
metric_fns = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}
# (actual labels, predicted labels) for each split, keyed by the split prefix.
eval_splits = {
    'train': (y_train, y_pred_train),
    'test': (y_test, y_pred_test),
}
# Build '<split>/<metric>' entries: all train metrics first, then all test metrics.
metrics = {
    f'{split}/{metric}': score(actual, predicted)
    for split, (actual, predicted) in eval_splits.items()
    for metric, score in metric_fns.items()
}
print(metrics)
run.log_metrics(metrics)

### Log the dataset

In [None]:
# Log the training split to the run under the name 'train': its features,
# the model's predictions for it, and the true labels.
run.log_dataset(
    dataset_name='train',
    features=X_train,
    predictions=y_pred_train,
    actuals=y_train,
)

In [None]:
# Log the held-out split to the run under the name 'test': its features,
# the model's predictions for it, and the true labels.
run.log_dataset(
    dataset_name='test',
    features=X_test,
    predictions=y_pred_test,
    actuals=y_test,
)

In [None]:
# End the MLFoundry run now that everything has been logged.
run.end()