In [1]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/7e/c1/c1c4707013f9e2f8a96899dd3a87f66c9167d6d776a6dc8fe7ec8678d446/catboost-0.24.3-cp36-none-manylinux1_x86_64.whl (66.3MB)
[K     |████████████████████████████████| 66.3MB 99kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.3


In [2]:
!pip install ipywidgets



In [3]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


Data import

In [4]:
from catboost.datasets import titanic
import numpy as np

# titanic() returns two frames, unpacked here as the training and test sets.
train_df, test_df = titanic()

# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Feature processing

Check for null values and fill them with a sentinel value (-999)

In [5]:
# Per-column count of missing entries; display only the columns that have any.
null_value_stats = train_df.isna().sum(axis=0)
null_value_stats.loc[null_value_stats.ne(0)]

Age         177
Cabin       687
Embarked      2
dtype: int64

In [6]:
# Replace every missing value with the sentinel -999, modifying both frames in place.
train_df.fillna(-999, inplace=True)
test_df.fillna(-999, inplace=True)

Separate features and labels

In [7]:
# Target column first, then the feature matrix (everything except the target).
y = train_df['Survived']
X = train_df.drop(columns=['Survived'])

Different types of variables, like numeric and categorical

In [8]:
print(X.dtypes)

# Every column whose dtype is not float is treated as categorical.
# The builtin `float` is used instead of `np.float`: the latter was only an
# alias for the builtin and has been removed from recent NumPy releases, so
# the comparison result is unchanged while the cell keeps working.
categorical_features_indices = np.where(X.dtypes != float)[0]

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


Train/test split

In [9]:
from sklearn.model_selection import train_test_split

# 75% of the labelled rows go to training, the rest to validation (fixed seed).
split_parts = train_test_split(X, y, train_size=0.75, random_state=42)
X_train, X_validation, y_train, y_validation = split_parts

# The second frame returned by titanic() is used unchanged for final predictions.
X_test = test_df

Catboost specific libraries

In [10]:
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import accuracy_score

Training

In [11]:
# Base classifier settings: track Accuracy as an extra metric, fix the seed,
# and keep the training log silent by default.
base_settings = {
    'custom_loss': ['Accuracy'],
    'random_seed': 42,
    'logging_level': 'Silent',
}
model = CatBoostClassifier(**base_settings)

In [12]:
# Fit on the training split, evaluating on the validation split after each iteration.
model.fit(
    X_train, y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_validation, y_validation),
    logging_level='Verbose',  # prints the per-iteration log for this fit (the constructor was given 'Silent')
    plot=True
);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.028683
0:	learn: 0.6739988	test: 0.6742630	best: 0.6742630 (0)	total: 57.3ms	remaining: 57.3s
1:	learn: 0.6589013	test: 0.6592240	best: 0.6592240 (1)	total: 60.3ms	remaining: 30.1s
2:	learn: 0.6421502	test: 0.6426778	best: 0.6426778 (2)	total: 66.3ms	remaining: 22s
3:	learn: 0.6297276	test: 0.6302310	best: 0.6302310 (3)	total: 70.4ms	remaining: 17.5s
4:	learn: 0.6147184	test: 0.6198228	best: 0.6198228 (4)	total: 75.5ms	remaining: 15s
5:	learn: 0.6017730	test: 0.6073627	best: 0.6073627 (5)	total: 80.8ms	remaining: 13.4s
6:	learn: 0.5885309	test: 0.5956000	best: 0.5956000 (6)	total: 86.1ms	remaining: 12.2s
7:	learn: 0.5783200	test: 0.5858523	best: 0.5858523 (7)	total: 93ms	remaining: 11.5s
8:	learn: 0.5665895	test: 0.5743842	best: 0.5743842 (8)	total: 97.9ms	remaining: 10.8s
9:	learn: 0.5575381	test: 0.5662283	best: 0.5662283 (9)	total: 105ms	remaining: 10.4s
10:	learn: 0.5491045	test: 0.5575176	best: 0.5575176 (10)	total: 111ms	remaining: 9.96s
11:	learn: 0.542388

Cross validation

In [13]:
# Reuse the current model's parameters for cross-validation, with the loss
# function set explicitly to Logloss.
cv_params = model.get_params()
cv_params['loss_function'] = 'Logloss'

# Cross-validate on the whole labelled set.
full_pool = Pool(X, y, cat_features=categorical_features_indices)
cv_data = cv(full_pool, cv_params, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [14]:
# Locate the iteration with the highest mean accuracy once, then report the
# mean and the standard deviation at that iteration.
accuracy_mean = cv_data['test-Accuracy-mean']
best_step = np.argmax(accuracy_mean)
print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(accuracy_mean),
    cv_data['test-Accuracy-std'][best_step],
    best_step
))

Best validation accuracy score: 0.83±0.02 on step 543


In [15]:
# Highest mean cross-validated accuracy, printed without rounding.
print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))

Precise validation accuracy score: 0.8260381593714928


Application of the model

In [16]:
# Predict labels and per-class probabilities for the test frame,
# then show the first ten of each.
predictions = model.predict(X_test)
predictions_probs = model.predict_proba(X_test)
print(predictions[:10])
print(predictions_probs[:10])

[0 0 0 0 1 0 1 0 1 0]
[[0.85473931 0.14526069]
 [0.76313031 0.23686969]
 [0.88972889 0.11027111]
 [0.87876173 0.12123827]
 [0.3611047  0.6388953 ]
 [0.90513381 0.09486619]
 [0.33434185 0.66565815]
 [0.78468564 0.21531436]
 [0.39429048 0.60570952]
 [0.94047549 0.05952451]]


Catboost features

In [17]:
# No random_seed is passed to the constructor here; after fitting, the seed
# the model actually used is read back from `random_seed_`.
model_without_seed = CatBoostClassifier(iterations=10, logging_level='Silent')
model_without_seed.fit(X, y, cat_features=categorical_features_indices)

print('Random seed assigned for this model: {}'.format(model_without_seed.random_seed_))

Random seed assigned for this model: 0


In [18]:
# Shared settings reused (and selectively overridden) by the cells that follow.
params = dict(
    iterations=500,
    learning_rate=0.1,
    eval_metric='Accuracy',
    random_seed=42,
    logging_level='Silent',
    use_best_model=False,
)

# Pools wrapping the validation and training splits with their categorical columns.
validate_pool = Pool(X_validation, y_validation, cat_features=categorical_features_indices)
train_pool = Pool(X_train, y_train, cat_features=categorical_features_indices)

Using the best model

In [19]:
# Model fitted with the shared params (use_best_model=False).
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

# Same settings with use_best_model switched on.
best_model_params = dict(params, use_best_model=True)
best_model = CatBoostClassifier(**best_model_params)
best_model.fit(train_pool, eval_set=validate_pool);

# Score both models on the validation split before printing.
simple_val_accuracy = accuracy_score(y_validation, model.predict(X_validation))
best_val_accuracy = accuracy_score(y_validation, best_model.predict(X_validation))

print('Simple model validation accuracy: {:.4}'.format(simple_val_accuracy))
print('')

print('Best model validation accuracy: {:.4}'.format(best_val_accuracy))

Simple model validation accuracy: 0.8027

Best model validation accuracy: 0.8251


Early stopping

In [20]:
%%time
# Timed fit using the shared params unchanged.
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

CPU times: user 6.84 s, sys: 994 ms, total: 7.83 s
Wall time: 2.3 s


In [21]:
%%time
earlystop_params = params.copy()
earlystop_params.update({
    'od_type': 'Iter',
    'od_wait': 40
})
earlystop_model = CatBoostClassifier(**earlystop_params)
earlystop_model.fit(train_pool, eval_set=validate_pool);

CPU times: user 995 ms, sys: 153 ms, total: 1.15 s
Wall time: 367 ms


In [22]:
# Validation accuracy of each model, computed up front.
simple_model_accuracy = accuracy_score(y_validation, model.predict(X_validation))
earlystop_model_accuracy = accuracy_score(y_validation, earlystop_model.predict(X_validation))

print('Simple model tree count: {}'.format(model.tree_count_))
print('Simple model validation accuracy: {:.4}'.format(simple_model_accuracy))
print('')

print('Early-stopped model tree count: {}'.format(earlystop_model.tree_count_))
print('Early-stopped model validation accuracy: {:.4}'.format(earlystop_model_accuracy))

Simple model tree count: 500
Simple model validation accuracy: 0.8027

Early-stopped model tree count: 82
Early-stopped model validation accuracy: 0.8072


Using pre-training results for training

In [23]:
# Shared params, shortened to 10 iterations.
current_params = dict(params, iterations=10)

# First pass: fit a short model on the training split.
model = CatBoostClassifier(**current_params)
model.fit(X_train, y_train, cat_features=categorical_features_indices)

# Get baseline (only with prediction_type='RawFormulaVal')
baseline = model.predict(X_train, prediction_type='RawFormulaVal')

# Fit new model, passing the raw scores of the first pass as the baseline
model.fit(X_train, y_train, cat_features=categorical_features_indices, baseline=baseline);

Snapshot support

In [24]:
# First run: 5 verbose iterations at learning rate 0.5, with snapshot saving on.
params_with_snapshot = dict(
    params,
    iterations=5,
    learning_rate=0.5,
    logging_level='Verbose',
)
model = CatBoostClassifier(**params_with_snapshot).fit(train_pool, eval_set=validate_pool, save_snapshot=True)

# Second run: same settings except 10 iterations at learning rate 0.1,
# again with snapshot saving on.
params_with_snapshot.update(iterations=10, learning_rate=0.1)
model = CatBoostClassifier(**params_with_snapshot).fit(train_pool, eval_set=validate_pool, save_snapshot=True)

0:	learn: 0.8053892	test: 0.7937220	best: 0.7937220 (0)	total: 1.98ms	remaining: 7.94ms
1:	learn: 0.8008982	test: 0.7982063	best: 0.7982063 (1)	total: 3.82ms	remaining: 5.73ms
2:	learn: 0.8008982	test: 0.7937220	best: 0.7982063 (1)	total: 5.65ms	remaining: 3.77ms
3:	learn: 0.8113772	test: 0.7892377	best: 0.7982063 (1)	total: 7.4ms	remaining: 1.85ms
4:	learn: 0.8173653	test: 0.8026906	best: 0.8026906 (4)	total: 9.02ms	remaining: 0us

bestTest = 0.802690583
bestIteration = 4

5:	learn: 0.8173653	test: 0.8026906	best: 0.8026906 (4)	total: 11ms	remaining: 8.03ms
6:	learn: 0.8248503	test: 0.8026906	best: 0.8026906 (4)	total: 12.7ms	remaining: 5.52ms
7:	learn: 0.8233533	test: 0.8026906	best: 0.8026906 (4)	total: 14.7ms	remaining: 3.76ms
8:	learn: 0.8233533	test: 0.8026906	best: 0.8026906 (4)	total: 16.3ms	remaining: 1.82ms
9:	learn: 0.8233533	test: 0.8026906	best: 0.8026906 (4)	total: 19.6ms	remaining: 0us

bestTest = 0.802690583
bestIteration = 4



User defined objective function

In [25]:
class LoglossObjective(object):
    """User-defined logloss objective that supplies per-object derivatives."""

    def calc_ders_range(self, approxes, targets, weights):
        """Return a list of (der1, der2) pairs, one per object.

        approxes, targets, weights are indexed containers of floats
        (containers which have only __len__ and __getitem__ defined).
        weights can be None.

        approxes holds the current raw predictions for the subset of the
        dataset being processed, targets holds the matching target values.

        der1 is the first derivative with respect to the predicted value and
        der2 is the second derivative of the expression

            target * log(sigmoid(approx)) + (1 - target) * log(1 - sigmoid(approx))

        where sigmoid(x) = 1 / (1 + e^(-x)). When weights are given, both
        derivatives are multiplied by the object's weight.
        """
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)

        result = []
        for index in range(len(targets)):
            approx = approxes[index]
            # Evaluate the sigmoid by exponentiating only non-positive values.
            # exp(approx) overflows to inf for large positive approx, which
            # made the old `e / (1 + e)` form evaluate to nan.
            if approx >= 0:
                p = 1.0 / (1.0 + np.exp(-approx))
            else:
                e = np.exp(approx)
                p = e / (1.0 + e)

            # d/d(approx): (1 - p) for a positive target, -p otherwise.
            der1 = (1 - p) if targets[index] > 0.0 else -p
            # The second derivative is the same for both target values.
            der2 = -p * (1 - p)

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))
        return result

In [26]:
# Train with the user-defined objective while still reporting the built-in
# "Logloss" as the evaluation metric.
custom_objective = LoglossObjective()
model = CatBoostClassifier(
    eval_metric="Logloss",
    loss_function=custom_objective,
    random_seed=42,
    iterations=10,
)
# Fit model
model.fit(train_pool)
# Only prediction_type='RawFormulaVal' is allowed with custom `loss_function`
preds_raw = model.predict(X_test, prediction_type='RawFormulaVal')

0:	learn: 0.6827074	total: 12.4ms	remaining: 112ms
1:	learn: 0.6723302	total: 22.7ms	remaining: 90.6ms
2:	learn: 0.6619449	total: 33.6ms	remaining: 78.3ms
3:	learn: 0.6521466	total: 49.1ms	remaining: 73.6ms
4:	learn: 0.6435227	total: 60ms	remaining: 60ms
5:	learn: 0.6353848	total: 72.6ms	remaining: 48.4ms
6:	learn: 0.6277210	total: 85.8ms	remaining: 36.8ms
7:	learn: 0.6210282	total: 97.3ms	remaining: 24.3ms
8:	learn: 0.6141958	total: 109ms	remaining: 12.1ms
9:	learn: 0.6073236	total: 122ms	remaining: 0us


User defined metric function

In [27]:
class LoglossMetric(object):
    """User-defined logloss metric computed from raw (pre-sigmoid) approximations."""

    def get_final_error(self, error, weight):
        """Combine the accumulated error and weight sum into the metric value.

        The tiny constant keeps the division defined when the weight sum is 0.
        """
        return error / (weight + 1e-38)

    def is_max_optimal(self):
        """Return False, i.e. smaller values of this metric are better."""
        return False

    def evaluate(self, approxes, target, weight):
        """Return the pair (weighted error sum, weights sum).

        approxes is a list of indexed containers (containers with only
        __len__ and __getitem__ defined), one container per approx dimension;
        exactly one dimension is expected. Each container holds floats.
        target is a one dimensional indexed container with one value per
        object. weight is a one dimensional indexed container or None, in
        which case every object gets weight 1.0.
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]

        error_sum = 0.0
        weight_sum = 0.0

        for i in range(len(approx)):
            w = 1.0 if weight is None else weight[i]
            weight_sum += w
            # log(1 + exp(a)) is evaluated as logaddexp(0, a): the direct form
            # overflows to inf for large positive a and corrupts the whole sum.
            error_sum += -w * (target[i] * approx[i] - np.logaddexp(0.0, approx[i]))

        return error_sum, weight_sum

In [28]:
# Train with the built-in "Logloss" objective and report the user-defined
# LoglossMetric as the evaluation metric.
model = CatBoostClassifier(
    iterations=10,
    random_seed=42, 
    loss_function="Logloss",
    eval_metric=LoglossMetric()
)
# Fit model
model.fit(train_pool)
# Raw scores are requested explicitly here. Note that loss_function in this
# cell is the built-in "Logloss"; only eval_metric is user-defined.
preds_raw = model.predict(X_test, prediction_type='RawFormulaVal')

Learning rate set to 0.5
0:	learn: 0.5521578	total: 4.62ms	remaining: 41.6ms
1:	learn: 0.4885686	total: 8.61ms	remaining: 34.4ms
2:	learn: 0.4607664	total: 12.8ms	remaining: 29.8ms
3:	learn: 0.4418819	total: 16.9ms	remaining: 25.4ms
4:	learn: 0.4278162	total: 21.6ms	remaining: 21.6ms
5:	learn: 0.4151036	total: 26ms	remaining: 17.4ms
6:	learn: 0.4099336	total: 30.3ms	remaining: 13ms
7:	learn: 0.4095363	total: 33.8ms	remaining: 8.46ms
8:	learn: 0.4032867	total: 40.1ms	remaining: 4.46ms
9:	learn: 0.3929586	total: 44.4ms	remaining: 0us


Staged predict

In [29]:
model = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent')
model.fit(train_pool)

# Tree counts at which intermediate predictions are requested.
ntree_start, ntree_end, eval_period = 3, 9, 2
tree_counts = range(ntree_start, ntree_end, eval_period)

predictions_iterator = model.staged_predict(
    validate_pool,
    prediction_type='Probability',
    ntree_start=ntree_start,
    ntree_end=ntree_end,
    eval_period=eval_period,
)
# Pair each staged prediction with its tree count and show the first five rows.
for preds, tree_count in zip(predictions_iterator, tree_counts):
    print('First class probabilities using the first {} trees: {}'.format(tree_count, preds[:5, 1]))

First class probabilities using the first 3 trees: [0.53597869 0.41039128 0.42057479 0.64281031 0.46576685]
First class probabilities using the first 5 trees: [0.63722688 0.42492029 0.46209302 0.70926021 0.44280772]
First class probabilities using the first 7 trees: [0.66964764 0.42409144 0.46124982 0.76101033 0.47205986]


Feature importances

In [30]:
model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent')
model.fit(train_pool)

feature_importances = model.get_feature_importance(train_pool)
feature_names = X_train.columns

# Highest score first; ties fall back to the feature name.
ranked_features = sorted(zip(feature_importances, feature_names), reverse=True)
for score, name in ranked_features:
    print('{}: {}'.format(name, score))

Sex: 59.004092014268586
Pclass: 16.340887169747035
Ticket: 6.028107169932204
Cabin: 3.8347242202560192
Fare: 3.712969667934384
Age: 3.484451204182482
Parch: 3.378089740355865
Embarked: 2.3139994072899555
SibSp: 1.9026794060334504
PassengerId: 0.0
Name: 0.0


Evaluation metrics

In [31]:
# Fit a 50-iteration model, then compute AUC on the validation pool.
model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)
eval_metrics = model.eval_metrics(validate_pool, ['AUC'], plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [32]:
# Show the first six AUC values.
print(eval_metrics['AUC'][:6])

[0.8627368774106994, 0.8623176253563642, 0.8602213650846889, 0.8514170719436525, 0.8495723629045783, 0.8569092738554419]


Learning process comparison

In [33]:
# Two models that differ only in tree depth, each given its own train_dir.
model1 = CatBoostClassifier(iterations=10, depth=1, train_dir='model_depth_1/', logging_level='Silent')
model1.fit(train_pool, eval_set=validate_pool)
model2 = CatBoostClassifier(iterations=10, depth=5, train_dir='model_depth_5/', logging_level='Silent')
model2.fit(train_pool, eval_set=validate_pool);

In [34]:
from catboost import MetricVisualizer
# Point the visualizer at the two train_dir folders used by the depth-1 and depth-5 models.
widget = MetricVisualizer(['model_depth_1', 'model_depth_5'])
widget.start()

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Saving the model

In [35]:
# Round trip: fit a small model, save it to disk, then load it into a fresh classifier.
model = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent').fit(train_pool)
model.save_model('catboost_model.dump')
model = CatBoostClassifier()
model.load_model('catboost_model.dump');

Tuning the parameters

In [36]:
!pip install hyperopt



In [37]:
import hyperopt

def hyperopt_objective(params):
    """Objective for hyperopt: 1 minus the best mean cross-validated accuracy.

    params is a mapping with the keys 'l2_leaf_reg' (truncated to int) and
    'learning_rate'. The remaining classifier settings are fixed. Because
    hyperopt minimises, the best accuracy is turned into a loss.
    """
    classifier = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=500,
        eval_metric='Accuracy',
        random_seed=42,
        verbose=False,
        loss_function='Logloss',
    )

    # Cross-validate on the whole labelled set with this candidate's settings.
    full_pool = Pool(X, y, cat_features=categorical_features_indices)
    cv_results = cv(full_pool, classifier.get_params())

    top_accuracy = np.max(cv_results['test-Accuracy-mean'])
    return 1 - top_accuracy # as hyperopt minimises

In [38]:
from numpy.random import RandomState

# Search space: one entry per key read by hyperopt_objective.
params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
}

# Records every evaluation made during the search.
trials = hyperopt.Trials()

# NOTE(review): newer hyperopt releases reportedly expect a numpy Generator
# (np.random.default_rng) for `rstate` rather than RandomState - confirm
# against the installed hyperopt version.
best = hyperopt.fmin(
    hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=RandomState(123)
)

print(best)

100%|██████████| 50/50 [35:37<00:00, 42.76s/it, best loss: 0.1661054994388328]
{'l2_leaf_reg': 1.0, 'learning_rate': 0.030276027601240763}


Get cv data with best parameters

In [39]:
# Rebuild the classifier with the tuned values; the other settings match
# those used inside hyperopt_objective.
tuned_l2_leaf_reg = int(best['l2_leaf_reg'])
tuned_learning_rate = best['learning_rate']
model = CatBoostClassifier(
    l2_leaf_reg=tuned_l2_leaf_reg,
    learning_rate=tuned_learning_rate,
    iterations=500,
    eval_metric='Accuracy',
    random_seed=42,
    verbose=False,
    loss_function='Logloss',
)

# Cross-validate the tuned settings on the whole labelled set.
tuned_pool = Pool(X, y, cat_features=categorical_features_indices)
cv_data = cv(tuned_pool, model.get_params())

In [40]:
# Highest mean cross-validated accuracy obtained with the tuned parameters.
print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))

Precise validation accuracy score: 0.8338945005611672


Retrain on the full training data

In [41]:
# Fit the tuned model on all labelled rows (X, y), not just the training split.
model.fit(X, y, cat_features=categorical_features_indices)

<catboost.core.CatBoostClassifier at 0x7fa3ddad65c0>

Preparing csv file

In [43]:
import pandas as pd
# Two-column frame: the passenger id from the test frame and the predicted label.
final_csv = pd.DataFrame()
final_csv['PassengerId'] = X_test['PassengerId']
final_csv['Survived'] = model.predict(X_test)

In [45]:
# Write the predictions to disk without the index column.
final_csv.to_csv('final_csv.csv', index=False)