## Try this notebook in Google Colab

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1JwZRGd-NBVdQCC8DHU_hAUcpj4-c2HWj?usp=sharing)

In [None]:
!pip install seaborn
!pip install xgboost
!pip install -U mlfoundry

In [None]:
import seaborn as sns 

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

import os
import getpass
import urllib.parse
import mlfoundry as mlf

In [None]:
# Resolve the TrueFoundry endpoint and credentials from the environment.
TFY_URL = os.getenv('TFY_URL', 'https://app.truefoundry.com/')
TFY_API_KEY = os.getenv('TFY_API_KEY')

# No (or an empty) key in the environment: ask for it interactively without echoing it.
if TFY_API_KEY in (None, ''):
    print(f'Paste your TrueFoundry API key\nYou can find it over at {urllib.parse.urljoin(TFY_URL, "settings")}')
    TFY_API_KEY = getpass.getpass()

In [None]:
# Load seaborn's bundled "titanic" example dataset as a DataFrame.
titanic = sns.load_dataset('titanic')

# Last expression of the cell: renders the frame as the cell output.
titanic

In [None]:
# Work on a copy so the feature engineering below does not modify `titanic` itself.
X_full = titanic.copy()

In [None]:
# Feature engineering on X_full.
#
# All fills are written back with explicit column assignment. The chained form
# `X_full[col].fillna(..., inplace=True)` is deprecated in recent pandas and does
# not modify X_full at all once Copy-on-Write is active.

# Number of missing values among `deck` and `age` for each passenger (0, 1 or 2).
X_full['nulls'] = X_full['deck'].isnull().astype('int') + X_full['age'].isnull().astype('int')

# Encode deck letters as 1..N in order of first appearance; 0 is reserved for
# "deck unknown". Building the mapping from the non-null values avoids having to
# pop a NaN key out of the dict (which only works if the NaN objects are identical).
deck_dict = {deck: code for code, deck in enumerate(X_full['deck'].dropna().unique(), start=1)}
# astype(object) so that map() returns plain numbers instead of a categorical.
X_full['deck_mapped'] = X_full['deck'].astype(object).map(deck_dict).fillna(0)

# Missing fares are replaced by the mean fare of third-class passengers.
fare_mean = X_full.loc[X_full['pclass'] == 3, 'fare'].mean()
X_full['fare'] = X_full['fare'].fillna(fare_mean)

# Missing embarkation ports default to 'S'.
# NOTE(review): presumably chosen from the embark_town counts of first-class
# passengers, which this cell previously computed and discarded - confirm.
X_full['embarked'] = X_full['embarked'].fillna('S')

# Drop the raw columns replaced above (`age`, `deck`) together with
# `embark_town` and `who`, which are not used as features.
X_full = X_full.drop(columns=['age', 'deck', 'embark_town', 'who'])

In [None]:
# One-hot encode the categorical features (first level of each dropped).
#
# `alive` is removed first: in seaborn's titanic dataset it is the yes/no spelling
# of the `survived` target, so encoding it into the feature matrix leaks the label
# and makes any model look perfect.
X_dummies = pd.get_dummies(
    X_full.drop(columns=['alive']),
    columns=['sex', 'nulls', 'deck_mapped', 'embarked', 'class', 'adult_male', 'alone'],
    drop_first=True,
)

In [None]:
# Show the one-hot encoded frame as the cell output.
X_dummies

In [None]:
# Separate the target column from the feature matrix.
y = X_dummies["survived"]
X = X_dummies.drop(columns=['survived'])

In [None]:
# Hold out 30% of the rows for testing; the split is stratified on the target
# and seeded so it is reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X,
    y,
    stratify=y,
    random_state=5,
    test_size=.3,
)

In [None]:
from xgboost import XGBClassifier

# Baseline classifier with default hyperparameters.
# Deliberately not named `xgb`: a later cell binds `xgb` to the xgboost module
# itself, and reusing one name for an estimator and a module is confusing.
baseline_model = XGBClassifier()

# Last expression of the cell: fits on the training split and displays the estimator.
baseline_model.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

# Candidate hyperparameter values.
# NOTE: nothing in this cell consumes this grid (or RandomizedSearchCV);
# the classifier below is built with fixed settings instead.
gbm_param_grid = dict(
    n_estimators=range(8, 20),
    max_depth=range(6, 10),
    learning_rate=[.4, .45, .5, .55, .6],
    colsample_bytree=[.6, .7, .8, .9, 1],
)

# Classifier limited to one boosting round (n_estimators=1) with max_depth=2.
gbm = XGBClassifier(max_depth=2, n_estimators=1)

# Fit on the training split.
gbm.fit(X_train, y_train)

# Report the score on the held-out test split.
test_accuracy = gbm.score(X_test, y_test)
print("accuracy found: ", test_accuracy)

In [None]:
# Authenticate against mlfoundry with the API key collected earlier.
mlf_api = mlf.get_client(api_key=TFY_API_KEY)

# Open the run that the metrics, parameters and dataset below are logged to.
mlf_run = mlf_api.create_run(
    run_name="XGBoost-Classification",
    project_name='Titanic-Survival-prediction',
)

In [None]:
# Predict on the held-out split and score the predictions against the true labels.
xgb_preds = gbm.predict(X_test)

metrics_dict = {
    label: scorer(y_test, xgb_preds)
    for label, scorer in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
    )
}

# Attach the metrics to the tracking run.
mlf_run.log_metrics(metrics_dict)

In [None]:
# Pick the subset of the estimator's parameters that is recorded with the run.
params = gbm.get_params()
params_dict = {
    key: params[key]
    for key in ("objective", "booster", "learning_rate", "max_depth")
}

mlf_run.log_params(params_dict)

In [None]:
# Test features plus the true labels and the model's predictions, in one frame.
# `assign` returns a new frame, so X_test itself is left unchanged.
X_test_df = X_test.assign(
    targets=y_test,
    predictions=gbm.predict(X_test),
)

# Log the test split: feature columns, predictions and actual labels.
mlf_run.log_dataset(
    dataset_name='test',
    features=X_test_df[X_test.columns.tolist()],
    predictions=X_test_df['predictions'],
    actuals=X_test_df['targets'],
    only_stats=False,
)

In [None]:
# End the tracking run opened with create_run.
# NOTE(review): presumably this finalizes the run on the TrueFoundry side - confirm against the mlfoundry docs.
mlf_run.end()