# TDT05 Machine Learning in practice challenge 2
***Victor Jørgensen and Hans Kristian Sande***

This notebook is heavily inspired by this tutorial: https://github.com/catboost/tutorials/blob/master/python_tutorial.ipynb

## Import modules

In [1]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import catboost
from catboost import CatBoostClassifier, Pool, metrics, cv
import pandas as pd
import numpy as np
print("No errors, all good!")

No errors, all good!


## 1.1 Data loading
Load data from csv files using pandas.

In [2]:
df_test = pd.read_csv("data/challenge2_test.csv")
# Uncomment line below to preview five first rows
#df_test.head()

df_train = pd.read_csv("data/challenge2_train.csv")
# Uncomment line below to preview five first rows
df_train.head()

Unnamed: 0,id,target,f0,f1,f2,f3,f4,f5,f6,f7,...,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28
0,0,0,1.0,gL,e,3.0,A,,0.0,6.0,...,0.5,0.0,3.0,R,328b0cf4e,0.834041,T,N,1.0,14.2364
1,1,0,0.0,Rj,c,1.0,A,7.0,1.0,4.0,...,0.4,0.0,1.0,,328b0cf4e,0.686021,T,N,1.0,
2,2,0,,In,a,1.0,A,10.0,1.0,6.0,...,0.5,1.0,3.0,G,0c67fcbbd,1.141271,T,N,3.0,
3,3,1,1.0,rA,c,3.0,A,7.0,1.0,1.0,...,0.6,1.0,1.0,G,fee4e3007,0.662382,T,N,3.0,
4,4,0,1.0,pE,c,3.0,A,7.0,0.0,6.0,...,0.5,0.0,1.0,B,587e040bd,-1.0,T,N,1.0,13.9537


## 1.2 Feature preparation
First of all let's check how many absent values do we have:

In [3]:
null_value_stats = df_train.isnull().sum(axis=0)
null_value_stats[null_value_stats != 0]

f0      1459
f1      1487
f2      1439
f3      1488
f4      1498
f5     11617
f6      1490
f7      1525
f8      1490
f9      1489
f10     1501
f11      145
f12     1541
f13     1447
f14     1451
f15     1477
f16     1460
f17     9762
f18     1556
f19     1437
f20     1464
f21     1510
f22     1513
f23     1516
f24     2696
f25     1537
f26     1474
f27     1464
f28    13112
dtype: int64

In [203]:
outlier_indices = []
for feature in df_train.columns:
    if (df_train[feature].dtype == float):
        upper = df_train[feature].mean() + df_train[feature].std()*3
        lower = df_train[feature].mean() - df_train[feature].std()*3
        outlier_indices += list(df_train.index[df_train[feature] < lower])
        outlier_indices += list(df_train.index[df_train[feature] > upper])
df_train.drop(outlier_indices)

        
        
        

Unnamed: 0,id,target,f0,f1,f2,f3,f4,f5,f6,f7,...,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28
0,0,0,1.0,gL,e,3.0,A,-999.0,0.0,6.0,...,0.5,0.0,3.0,R,328b0cf4e,0.834041,T,N,1.0,14.2364
3,3,1,1.0,rA,c,3.0,A,7.0,1.0,1.0,...,0.6,1.0,1.0,G,fee4e3007,0.662382,T,N,3.0,-999.0000
4,4,0,1.0,pE,c,3.0,A,7.0,0.0,6.0,...,0.5,0.0,1.0,B,587e040bd,-1.000000,T,N,1.0,13.9537
6,6,0,0.0,fV,d,2.0,A,-999.0,1.0,1.0,...,0.6,1.0,3.0,R,50adfe104,0.601041,F,N,3.0,-999.0000
7,7,1,1.0,jf,a,3.0,A,11.0,0.0,5.0,...,0.4,0.0,5.0,R,ee60a6711,0.946705,T,N,1.0,-999.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49992,49992,0,1.0,sf,e,1.0,A,11.0,0.0,5.0,...,0.4,1.0,1.0,R,21b92d1a4,0.933073,-999,N,3.0,13.8648
49993,49993,0,1.0,OZ,d,2.0,B,7.0,1.0,6.0,...,0.4,0.0,5.0,R,81a17716c,0.902081,T,S,1.0,13.9539
49994,49994,0,0.0,aE,c,1.0,A,-999.0,0.0,4.0,...,0.5,1.0,6.0,-999,56133d013,0.563471,T,N,3.0,14.0088
49995,49995,1,0.0,tT,d,3.0,-999,11.0,0.0,1.0,...,0.6,1.0,1.0,R,72ccba7a4,0.890576,T,N,3.0,14.0466


As we se, null values range from ~3% to 15%. Almost all features have a substantial amount of null values, so let's fill them with some number way out of their distributions - so the model would be able to easily distinguish between them and take it into account:

In [195]:
df_train.fillna(-999, inplace=True)
df_test.fillna(-999, inplace=True)

Next we want to separate **feature** and **label** into **X** and **y**.

In [196]:
X = df_train.drop('target', axis=1)
y = df_train.target

Our features are of different data types. Some are numeric, others categorical and other strings. CatBoost allows us to treat these string features just as categorical ones.

In [197]:
X.dtypes

categorical_features_indices = np.where(X.dtypes != float)[0]
categorical_features_indices

array([ 0,  2,  3,  5,  9, 10, 11, 13, 14, 15, 16, 19, 23, 24, 26, 27])

## 1.3 Data splitting
Let's split the train data into training and validation sets

In [210]:
X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.89, random_state=42)

X_test = df_test

## 2.1 Model training
Create model. Use default parameters, because they provide a good baseline almost all the time. The only thing we would like to specify here is custom_loss parameter, as this would give us an ability to see what's going on in terms of this competition metric - accuracy, as well as to be able to watch for logloss, as it would be more smooth on dataset of such size.

In [211]:
model = CatBoostClassifier(
    custom_loss=[metrics.AUC()],
    depth=3,
    iterations=5000,
    logging_level='Silent',
    use_best_model=True,
)
model_params = model.get_params()
print(model_params)

{'iterations': 5000, 'depth': 3, 'use_best_model': True, 'logging_level': 'Silent', 'custom_loss': ['AUC']}


In [212]:
model.fit(
    X_train, y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_validation, y_validation),
    #logging_level='Verbose',  # you can uncomment this for text output
    plot=True
);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

## 2.2 Model cross validation
Cross validation is even better than validation. We use CV to evaluate how well our model performs, by splitting our data set into *k* split (k-fold split validation).

In [12]:
cv_params = model.get_params()
cv_params.update({
    'loss_function': metrics.Logloss()
})
cv_data = cv(
    Pool(X, y, cat_features=categorical_features_indices),
    cv_params,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Now we have values of our loss functions at each boosting step averaged by 3 folds, which should provide us with a more accurate estimation of our model performance:

In [13]:
print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(cv_data['test-Accuracy-mean']),
    cv_data['test-Accuracy-std'][np.argmax(cv_data['test-Accuracy-mean'])],
    np.argmax(cv_data['test-Accuracy-mean'])
))

Best validation accuracy score: 0.82±0.00 on step 4578


In [14]:
print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))

Precise validation accuracy score: 0.8216399546310625


## 2.3 Model applying
Apply our model to the test set. 

In [213]:
predictions = model.predict(X_test)
predictions_probs = model.predict_proba(X_test)
print(predictions[:10])
print(predictions_probs[:10])

[0 1 0 0 0 0 0 0 0 0]
[[0.70709618 0.29290382]
 [0.29189203 0.70810797]
 [0.81513173 0.18486827]
 [0.61203359 0.38796641]
 [0.79943292 0.20056708]
 [0.98595432 0.01404568]
 [0.71398094 0.28601906]
 [0.77214617 0.22785383]
 [0.53811961 0.46188039]
 [0.86471115 0.13528885]]


When we created the model we specified random_seed=42 parameter. By default CatBoost chooses a random value for seed.

In [20]:
model_without_seed = CatBoostClassifier(iterations=10, logging_level='Silent')
model_without_seed.fit(X, y, cat_features=categorical_features_indices)

print('Random seed assigned for this model: {}'.format(model_without_seed.random_seed_))

Random seed assigned for this model: 0


We define parameters and Pool. The Pool stores information about the data set such as features, labels, categorical feature indices, weights and more.

In [119]:
params = {
    'iterations': 1000,
    'eval_metric': metrics.AUC(),
    'logging_level': 'Silent',
    'use_best_model': False,
}
train_pool = Pool(X_train, y_train, cat_features=categorical_features_indices)
validate_pool = Pool(X_validation, y_validation, cat_features=categorical_features_indices)

## 3.1 Using the best model

In [120]:
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool, plot=True)

best_model_params = params.copy()
best_model_params.update({
    'use_best_model': True
})
best_model = CatBoostClassifier(**best_model_params)
best_model.fit(train_pool, eval_set=validate_pool, plot=True);

print('Simple model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, model.predict(X_validation))
))
print('')

print('Best model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, best_model.predict(X_validation))
))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Simple model validation accuracy: 0.8186

Best model validation accuracy: 0.8171


## 3.2 Early stopping
Early stopping saves time and improves the quality. If you have a validation set it is always better to use early stopping.

In [121]:
%%time
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

CPU times: user 10min 9s, sys: 2min 40s, total: 12min 50s
Wall time: 1min 41s


<catboost.core.CatBoostClassifier at 0x7fdabe381070>

In [122]:
%%time
earlystop_params = params.copy()
earlystop_params.update({
    'od_type': 'Iter',
    'od_wait': 40
})
earlystop_model = CatBoostClassifier(**earlystop_params)
earlystop_model.fit(train_pool, eval_set=validate_pool);

CPU times: user 3min 27s, sys: 56.2 s, total: 4min 24s
Wall time: 35.7 s


<catboost.core.CatBoostClassifier at 0x7fdaebf4d4c0>

In [123]:
print('Simple model tree count: {}'.format(model.tree_count_))
print('Simple model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, model.predict(X_validation))
))
print('')

print('Early-stopped model tree count: {}'.format(earlystop_model.tree_count_))
print('Early-stopped model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, earlystop_model.predict(X_validation))
))

Simple model tree count: 1000
Simple model validation accuracy: 0.8186

Early-stopped model tree count: 366
Early-stopped model validation accuracy: 0.8174


## 3.3 Feature importance
Can be useful to understand which features that make the greatest contribution to the result. CatBoost have a get_feature_importance method.

In [126]:
model = CatBoostClassifier(iterations=1000, depth=3, logging_level='Silent').fit(train_pool)
feature_importances = model.get_feature_importance(train_pool)
feature_names = X_train.columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

f13: 19.702077010546226
f2: 10.213826235824111
f16: 9.834959321873816
f3: 9.383652431760291
f1: 4.80140200049706
f10: 4.419148527063817
f25: 4.308352659626008
f21: 4.232181331040457
f19: 3.8946503565226984
f4: 3.6300916159068692
f9: 3.5220781108771244
f15: 3.2654248178058674
f18: 3.2168403258482647
f12: 3.076976966354835
f7: 2.5717001337610372
f8: 1.82452001451719
f26: 1.3445533787686919
f22: 1.0535828108025616
f27: 0.9432234955744927
f6: 0.8653204415351367
f28: 0.7901266389760926
f11: 0.6411747902429134
f17: 0.6347882759541463
f14: 0.5107386473714294
f20: 0.3892372967794393
f24: 0.3130349194278956
f23: 0.30807339863751665
f5: 0.2563407649350473
f0: 0.051923281168939794
id: 0.0


## 3.4 Eval metrics

In [127]:
model = CatBoostClassifier(iterations=100, random_seed=42, logging_level='Silent').fit(train_pool)
eval_metrics = model.eval_metrics(validate_pool, [metrics.AUC()], plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [128]:
print(eval_metrics['AUC'][:6])

[0.6178702980761996, 0.6899841170610492, 0.7113417042872108, 0.7251012574848756, 0.7334566237017517, 0.7359463828411371]


## 3.5 Save model

In [None]:
model.save_model('catboost_model.dump')

## 3.6 Load model

In [None]:
model = CatBoostClassifier()
model.load_model('catboost_model.dump')

## 4.1 Write predictions to file
Write the predictions made by the model to a csv file on the format (id, target). 

In [214]:
stacked_columns = np.column_stack((df_test["id"], predictions_probs[:, 1]))
df = pd.DataFrame(stacked_columns, columns=["id", "target"])
convert_dict={"id": int, "target" : float}
df = df.astype(convert_dict)
df.to_csv("predictions.csv", index = False)