# TDT05 Machine Learning in practice challenge 2
***Victor Jørgensen and Hans Kristian Sande***



## Import modules

In [2]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import catboost
from catboost import CatBoostClassifier, Pool, metrics, cv
import lightgbm
#import imblearn
#import lime
#import seaborn
#import mlxtend
#import tsfresh
#import rpy2
import graphviz
#import pydot
import pandas as pd
import numpy as np
print("No errors, all good!")

No errors, all good!


## 1.1 Data loading
Load data from csv files using pandas.

In [3]:
df_test = pd.read_csv("data/challenge2_test.csv")
# Uncomment line below to preview five first rows
#df_test.head()

df_train = pd.read_csv("data/challenge2_train.csv")
# Uncomment line below to preview five first rows
df_train.head()

Unnamed: 0,id,target,f0,f1,f2,f3,f4,f5,f6,f7,...,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28
0,0,0,1.0,gL,e,3.0,A,,0.0,6.0,...,0.5,0.0,3.0,R,328b0cf4e,0.834041,T,N,1.0,14.2364
1,1,0,0.0,Rj,c,1.0,A,7.0,1.0,4.0,...,0.4,0.0,1.0,,328b0cf4e,0.686021,T,N,1.0,
2,2,0,,In,a,1.0,A,10.0,1.0,6.0,...,0.5,1.0,3.0,G,0c67fcbbd,1.141271,T,N,3.0,
3,3,1,1.0,rA,c,3.0,A,7.0,1.0,1.0,...,0.6,1.0,1.0,G,fee4e3007,0.662382,T,N,3.0,
4,4,0,1.0,pE,c,3.0,A,7.0,0.0,6.0,...,0.5,0.0,1.0,B,587e040bd,-1.0,T,N,1.0,13.9537


## 1.2 Feature preparation
First of all let's check how many absent values do we have:

In [4]:
null_value_stats = df_train.isnull().sum(axis=0)
null_value_stats[null_value_stats != 0]

f0      1459
f1      1487
f2      1439
f3      1488
f4      1498
f5     11617
f6      1490
f7      1525
f8      1490
f9      1489
f10     1501
f11      145
f12     1541
f13     1447
f14     1451
f15     1477
f16     1460
f17     9762
f18     1556
f19     1437
f20     1464
f21     1510
f22     1513
f23     1516
f24     2696
f25     1537
f26     1474
f27     1464
f28    13112
dtype: int64

As we se, null values range from ~3% to 15%. Almost all features have a substantial amount of null values, so let's fill them with some number way out of their distributions - so the model would be able to easily distinguish between them and take it into account:

In [5]:
df_train.fillna(-999, inplace=True)
df_test.fillna(-999, inplace=True)

Next we want to separate **feature** and **label** into **X** and **y**.

In [6]:
X = df_train.drop('target', axis=1)
y = df_train.target

Our features are of different data types. Some are numeric, others categorical and other strings. CatBoost allows us to treat these string features just as categorical ones.

In [7]:
X.dtypes

categorical_features_indices = np.where(X.dtypes != float)[0]
categorical_features_indices

array([ 0,  2,  3,  5,  9, 10, 11, 13, 14, 15, 16, 19, 23, 24, 26, 27])

## 1.3 Data splitting
Let's split the train data into training and validation sets

In [17]:
X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.75, random_state=42)

X_test = df_test

## 2.1 Model training
Create model. Use default parameters, because they provide a good baseline almost all the time. The only thing we would like to specify here is custom_loss parameter, as this would give us an ability to see what's going on in terms of this competition metric - accuracy, as well as to be able to watch for logloss, as it would be more smooth on dataset of such size.

In [9]:
model = CatBoostClassifier(
    custom_loss=[metrics.Accuracy()],
    random_seed=42,
    logging_level='Silent'
)

In [10]:
model.fit(
    X_train, y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_validation, y_validation),
    logging_level='Verbose',  # you can uncomment this for text output
    plot=True
);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.07757
0:	learn: 0.6528843	test: 0.6530412	best: 0.6530412 (0)	total: 130ms	remaining: 2m 9s
1:	learn: 0.6192287	test: 0.6197202	best: 0.6197202 (1)	total: 216ms	remaining: 1m 47s
2:	learn: 0.5911788	test: 0.5909496	best: 0.5909496 (2)	total: 284ms	remaining: 1m 34s
3:	learn: 0.5689674	test: 0.5687590	best: 0.5687590 (3)	total: 357ms	remaining: 1m 28s
4:	learn: 0.5473569	test: 0.5469507	best: 0.5469507 (4)	total: 449ms	remaining: 1m 29s
5:	learn: 0.5316976	test: 0.5316004	best: 0.5316004 (5)	total: 527ms	remaining: 1m 27s
6:	learn: 0.5174634	test: 0.5175024	best: 0.5175024 (6)	total: 602ms	remaining: 1m 25s
7:	learn: 0.5059621	test: 0.5063024	best: 0.5063024 (7)	total: 692ms	remaining: 1m 25s
8:	learn: 0.4950414	test: 0.4952938	best: 0.4952938 (8)	total: 787ms	remaining: 1m 26s
9:	learn: 0.4858034	test: 0.4864824	best: 0.4864824 (9)	total: 877ms	remaining: 1m 26s
10:	learn: 0.4806487	test: 0.4814626	best: 0.4814626 (10)	total: 911ms	remaining: 1m 21s
11:	learn: 0.

95:	learn: 0.4099047	test: 0.4186578	best: 0.4186578 (95)	total: 9.1s	remaining: 1m 25s
96:	learn: 0.4097566	test: 0.4185728	best: 0.4185728 (96)	total: 9.18s	remaining: 1m 25s
97:	learn: 0.4096507	test: 0.4185531	best: 0.4185531 (97)	total: 9.29s	remaining: 1m 25s
98:	learn: 0.4094960	test: 0.4185038	best: 0.4185038 (98)	total: 9.38s	remaining: 1m 25s
99:	learn: 0.4093467	test: 0.4184410	best: 0.4184410 (99)	total: 9.49s	remaining: 1m 25s
100:	learn: 0.4091661	test: 0.4183378	best: 0.4183378 (100)	total: 9.6s	remaining: 1m 25s
101:	learn: 0.4090262	test: 0.4182449	best: 0.4182449 (101)	total: 9.69s	remaining: 1m 25s
102:	learn: 0.4089050	test: 0.4181440	best: 0.4181440 (102)	total: 9.8s	remaining: 1m 25s
103:	learn: 0.4087671	test: 0.4180294	best: 0.4180294 (103)	total: 9.92s	remaining: 1m 25s
104:	learn: 0.4085997	test: 0.4179879	best: 0.4179879 (104)	total: 10s	remaining: 1m 25s
105:	learn: 0.4083509	test: 0.4178504	best: 0.4178504 (105)	total: 10.1s	remaining: 1m 25s
106:	learn: 0.

187:	learn: 0.3977213	test: 0.4141288	best: 0.4141288 (187)	total: 18.2s	remaining: 1m 18s
188:	learn: 0.3975851	test: 0.4140615	best: 0.4140615 (188)	total: 18.3s	remaining: 1m 18s
189:	learn: 0.3974553	test: 0.4141235	best: 0.4140615 (188)	total: 18.4s	remaining: 1m 18s
190:	learn: 0.3974152	test: 0.4141138	best: 0.4140615 (188)	total: 18.5s	remaining: 1m 18s
191:	learn: 0.3973207	test: 0.4140852	best: 0.4140615 (188)	total: 18.6s	remaining: 1m 18s
192:	learn: 0.3971851	test: 0.4140989	best: 0.4140615 (188)	total: 18.7s	remaining: 1m 18s
193:	learn: 0.3970848	test: 0.4140370	best: 0.4140370 (193)	total: 18.8s	remaining: 1m 18s
194:	learn: 0.3969717	test: 0.4140366	best: 0.4140366 (194)	total: 18.9s	remaining: 1m 18s
195:	learn: 0.3968454	test: 0.4140479	best: 0.4140366 (194)	total: 19s	remaining: 1m 17s
196:	learn: 0.3967113	test: 0.4139547	best: 0.4139547 (196)	total: 19.1s	remaining: 1m 17s
197:	learn: 0.3965775	test: 0.4139252	best: 0.4139252 (197)	total: 19.2s	remaining: 1m 17s
1

279:	learn: 0.3886016	test: 0.4130659	best: 0.4129144 (259)	total: 27.8s	remaining: 1m 11s
280:	learn: 0.3885594	test: 0.4130551	best: 0.4129144 (259)	total: 27.9s	remaining: 1m 11s
281:	learn: 0.3884283	test: 0.4130350	best: 0.4129144 (259)	total: 28s	remaining: 1m 11s
282:	learn: 0.3883366	test: 0.4129739	best: 0.4129144 (259)	total: 28.1s	remaining: 1m 11s
283:	learn: 0.3882875	test: 0.4129834	best: 0.4129144 (259)	total: 28.2s	remaining: 1m 11s
284:	learn: 0.3881463	test: 0.4129680	best: 0.4129144 (259)	total: 28.3s	remaining: 1m 10s
285:	learn: 0.3880964	test: 0.4129604	best: 0.4129144 (259)	total: 28.4s	remaining: 1m 10s
286:	learn: 0.3880078	test: 0.4129354	best: 0.4129144 (259)	total: 28.4s	remaining: 1m 10s
287:	learn: 0.3879078	test: 0.4129664	best: 0.4129144 (259)	total: 28.5s	remaining: 1m 10s
288:	learn: 0.3878618	test: 0.4129686	best: 0.4129144 (259)	total: 28.6s	remaining: 1m 10s
289:	learn: 0.3877678	test: 0.4129749	best: 0.4129144 (259)	total: 28.7s	remaining: 1m 10s
2

371:	learn: 0.3803204	test: 0.4126764	best: 0.4126764 (371)	total: 37.6s	remaining: 1m 3s
372:	learn: 0.3802487	test: 0.4126743	best: 0.4126743 (372)	total: 37.7s	remaining: 1m 3s
373:	learn: 0.3801591	test: 0.4126763	best: 0.4126743 (372)	total: 37.8s	remaining: 1m 3s
374:	learn: 0.3800758	test: 0.4126284	best: 0.4126284 (374)	total: 37.9s	remaining: 1m 3s
375:	learn: 0.3800049	test: 0.4126288	best: 0.4126284 (374)	total: 38s	remaining: 1m 3s
376:	learn: 0.3799467	test: 0.4126535	best: 0.4126284 (374)	total: 38.1s	remaining: 1m 3s
377:	learn: 0.3798807	test: 0.4126619	best: 0.4126284 (374)	total: 38.2s	remaining: 1m 2s
378:	learn: 0.3798367	test: 0.4126518	best: 0.4126284 (374)	total: 38.4s	remaining: 1m 2s
379:	learn: 0.3797441	test: 0.4126390	best: 0.4126284 (374)	total: 38.4s	remaining: 1m 2s
380:	learn: 0.3796483	test: 0.4126779	best: 0.4126284 (374)	total: 38.5s	remaining: 1m 2s
381:	learn: 0.3795180	test: 0.4126993	best: 0.4126284 (374)	total: 38.6s	remaining: 1m 2s
382:	learn: 

464:	learn: 0.3731880	test: 0.4129258	best: 0.4126080 (399)	total: 47.3s	remaining: 54.4s
465:	learn: 0.3730321	test: 0.4129157	best: 0.4126080 (399)	total: 47.4s	remaining: 54.4s
466:	learn: 0.3728978	test: 0.4129953	best: 0.4126080 (399)	total: 47.6s	remaining: 54.3s
467:	learn: 0.3727734	test: 0.4130102	best: 0.4126080 (399)	total: 47.7s	remaining: 54.2s
468:	learn: 0.3727021	test: 0.4130118	best: 0.4126080 (399)	total: 47.8s	remaining: 54.1s
469:	learn: 0.3726692	test: 0.4130172	best: 0.4126080 (399)	total: 47.9s	remaining: 54s
470:	learn: 0.3726441	test: 0.4130135	best: 0.4126080 (399)	total: 48s	remaining: 53.9s
471:	learn: 0.3725538	test: 0.4129993	best: 0.4126080 (399)	total: 48.1s	remaining: 53.8s
472:	learn: 0.3724595	test: 0.4129645	best: 0.4126080 (399)	total: 48.2s	remaining: 53.7s
473:	learn: 0.3723410	test: 0.4129694	best: 0.4126080 (399)	total: 48.3s	remaining: 53.6s
474:	learn: 0.3722603	test: 0.4129557	best: 0.4126080 (399)	total: 48.4s	remaining: 53.5s
475:	learn: 0.

556:	learn: 0.3663219	test: 0.4129905	best: 0.4126080 (399)	total: 57.7s	remaining: 45.9s
557:	learn: 0.3662569	test: 0.4129732	best: 0.4126080 (399)	total: 57.8s	remaining: 45.8s
558:	learn: 0.3661341	test: 0.4130038	best: 0.4126080 (399)	total: 58s	remaining: 45.7s
559:	learn: 0.3660171	test: 0.4130398	best: 0.4126080 (399)	total: 58.1s	remaining: 45.6s
560:	learn: 0.3659359	test: 0.4130478	best: 0.4126080 (399)	total: 58.2s	remaining: 45.5s
561:	learn: 0.3658079	test: 0.4130277	best: 0.4126080 (399)	total: 58.3s	remaining: 45.4s
562:	learn: 0.3657613	test: 0.4130248	best: 0.4126080 (399)	total: 58.4s	remaining: 45.3s
563:	learn: 0.3656884	test: 0.4130061	best: 0.4126080 (399)	total: 58.5s	remaining: 45.2s
564:	learn: 0.3655938	test: 0.4130301	best: 0.4126080 (399)	total: 58.6s	remaining: 45.1s
565:	learn: 0.3654905	test: 0.4130130	best: 0.4126080 (399)	total: 58.7s	remaining: 45s
566:	learn: 0.3654267	test: 0.4130059	best: 0.4126080 (399)	total: 58.8s	remaining: 44.9s
567:	learn: 0.

649:	learn: 0.3591934	test: 0.4131074	best: 0.4126080 (399)	total: 1m 8s	remaining: 36.6s
650:	learn: 0.3591560	test: 0.4131084	best: 0.4126080 (399)	total: 1m 8s	remaining: 36.6s
651:	learn: 0.3590899	test: 0.4131103	best: 0.4126080 (399)	total: 1m 8s	remaining: 36.4s
652:	learn: 0.3590326	test: 0.4131020	best: 0.4126080 (399)	total: 1m 8s	remaining: 36.3s
653:	learn: 0.3589550	test: 0.4130912	best: 0.4126080 (399)	total: 1m 8s	remaining: 36.2s
654:	learn: 0.3589042	test: 0.4130949	best: 0.4126080 (399)	total: 1m 8s	remaining: 36.1s
655:	learn: 0.3588826	test: 0.4130894	best: 0.4126080 (399)	total: 1m 8s	remaining: 36s
656:	learn: 0.3588682	test: 0.4130993	best: 0.4126080 (399)	total: 1m 8s	remaining: 35.9s
657:	learn: 0.3588120	test: 0.4130791	best: 0.4126080 (399)	total: 1m 8s	remaining: 35.8s
658:	learn: 0.3587450	test: 0.4130485	best: 0.4126080 (399)	total: 1m 8s	remaining: 35.7s
659:	learn: 0.3587130	test: 0.4130362	best: 0.4126080 (399)	total: 1m 9s	remaining: 35.6s
660:	learn: 

741:	learn: 0.3532570	test: 0.4133472	best: 0.4126080 (399)	total: 1m 18s	remaining: 27.2s
742:	learn: 0.3531786	test: 0.4133434	best: 0.4126080 (399)	total: 1m 18s	remaining: 27.1s
743:	learn: 0.3530818	test: 0.4133476	best: 0.4126080 (399)	total: 1m 18s	remaining: 27s
744:	learn: 0.3530058	test: 0.4133574	best: 0.4126080 (399)	total: 1m 18s	remaining: 26.9s
745:	learn: 0.3529714	test: 0.4133680	best: 0.4126080 (399)	total: 1m 18s	remaining: 26.8s
746:	learn: 0.3529096	test: 0.4133561	best: 0.4126080 (399)	total: 1m 18s	remaining: 26.7s
747:	learn: 0.3528679	test: 0.4133333	best: 0.4126080 (399)	total: 1m 18s	remaining: 26.6s
748:	learn: 0.3528288	test: 0.4133422	best: 0.4126080 (399)	total: 1m 19s	remaining: 26.5s
749:	learn: 0.3527521	test: 0.4133128	best: 0.4126080 (399)	total: 1m 19s	remaining: 26.4s
750:	learn: 0.3527386	test: 0.4133275	best: 0.4126080 (399)	total: 1m 19s	remaining: 26.3s
751:	learn: 0.3526445	test: 0.4133081	best: 0.4126080 (399)	total: 1m 19s	remaining: 26.2s
7

833:	learn: 0.3468701	test: 0.4135492	best: 0.4126080 (399)	total: 1m 29s	remaining: 17.8s
834:	learn: 0.3468128	test: 0.4135498	best: 0.4126080 (399)	total: 1m 29s	remaining: 17.6s
835:	learn: 0.3468045	test: 0.4135685	best: 0.4126080 (399)	total: 1m 29s	remaining: 17.5s
836:	learn: 0.3467119	test: 0.4135756	best: 0.4126080 (399)	total: 1m 29s	remaining: 17.4s
837:	learn: 0.3466561	test: 0.4135667	best: 0.4126080 (399)	total: 1m 29s	remaining: 17.3s
838:	learn: 0.3466323	test: 0.4135647	best: 0.4126080 (399)	total: 1m 29s	remaining: 17.2s
839:	learn: 0.3465609	test: 0.4136246	best: 0.4126080 (399)	total: 1m 29s	remaining: 17.1s
840:	learn: 0.3465442	test: 0.4136176	best: 0.4126080 (399)	total: 1m 30s	remaining: 17s
841:	learn: 0.3464954	test: 0.4136285	best: 0.4126080 (399)	total: 1m 30s	remaining: 16.9s
842:	learn: 0.3464200	test: 0.4136339	best: 0.4126080 (399)	total: 1m 30s	remaining: 16.8s
843:	learn: 0.3463746	test: 0.4136367	best: 0.4126080 (399)	total: 1m 30s	remaining: 16.7s
8

924:	learn: 0.3408351	test: 0.4140406	best: 0.4126080 (399)	total: 1m 39s	remaining: 8.08s
925:	learn: 0.3408095	test: 0.4140441	best: 0.4126080 (399)	total: 1m 39s	remaining: 7.98s
926:	learn: 0.3407721	test: 0.4140336	best: 0.4126080 (399)	total: 1m 39s	remaining: 7.87s
927:	learn: 0.3406685	test: 0.4140372	best: 0.4126080 (399)	total: 1m 40s	remaining: 7.77s
928:	learn: 0.3406060	test: 0.4140507	best: 0.4126080 (399)	total: 1m 40s	remaining: 7.66s
929:	learn: 0.3404705	test: 0.4140657	best: 0.4126080 (399)	total: 1m 40s	remaining: 7.55s
930:	learn: 0.3404415	test: 0.4140729	best: 0.4126080 (399)	total: 1m 40s	remaining: 7.45s
931:	learn: 0.3403718	test: 0.4140631	best: 0.4126080 (399)	total: 1m 40s	remaining: 7.34s
932:	learn: 0.3403090	test: 0.4140740	best: 0.4126080 (399)	total: 1m 40s	remaining: 7.23s
933:	learn: 0.3402611	test: 0.4140815	best: 0.4126080 (399)	total: 1m 40s	remaining: 7.12s
934:	learn: 0.3402021	test: 0.4140858	best: 0.4126080 (399)	total: 1m 40s	remaining: 7.02s

## 2.2 Model cross validation
Cross validation is even better than validation.

In [11]:
cv_params = model.get_params()
cv_params.update({
    'loss_function': metrics.Logloss()
})
cv_data = cv(
    Pool(X, y, cat_features=categorical_features_indices),
    cv_params,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Now we have values of our loss functions at each boosting step averaged by 3 folds, which should provide us with a more accurate estimation of our model performance:

In [12]:
print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(cv_data['test-Accuracy-mean']),
    cv_data['test-Accuracy-std'][np.argmax(cv_data['test-Accuracy-mean'])],
    np.argmax(cv_data['test-Accuracy-mean'])
))

Best validation accuracy score: 0.82±0.00 on step 606


In [13]:
print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))

Precise validation accuracy score: 0.8211799722288867


## 3.1 Model applying
Apply our model to the test set. 

In [14]:
predictions = model.predict(X_test)
predictions_probs = model.predict_proba(X_test)
print(predictions[:10])
print(predictions_probs[:10])

[0 1 0 0 0 0 0 0 0 0]
[[0.71340542 0.28659458]
 [0.32531212 0.67468788]
 [0.83905522 0.16094478]
 [0.61398547 0.38601453]
 [0.84462914 0.15537086]
 [0.98325453 0.01674547]
 [0.6831721  0.3168279 ]
 [0.78903592 0.21096408]
 [0.52455967 0.47544033]
 [0.84789819 0.15210181]]


When we created the model we specified random_seed=42 parameter. By default CatBoost chooses a random value for seed.

In [18]:
model_without_seed = CatBoostClassifier(iterations=10, logging_level='Silent')
model_without_seed.fit(X, y, cat_features=categorical_features_indices)

print('Random seed assigned for this model: {}'.format(model_without_seed.random_seed_))

Random seed assigned for this model: 0


We define parameters and Pool. The Pool stores information about the data set such as features, labels, categorical feature indices, weights and more.

In [19]:
params = {
    'iterations': 500,
    'learning_rate': 0.1,
    'eval_metric': metrics.Accuracy(),
    'random_seed': 42,
    'logging_level': 'Silent',
    'use_best_model': False
}
train_pool = Pool(X_train, y_train, cat_features=categorical_features_indices)
validate_pool = Pool(X_validation, y_validation, cat_features=categorical_features_indices)

## 3.1 Using the best model

In [20]:
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

best_model_params = params.copy()
best_model_params.update({
    'use_best_model': True
})
best_model = CatBoostClassifier(**best_model_params)
best_model.fit(train_pool, eval_set=validate_pool);

print('Simple model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, model.predict(X_validation))
))
print('')

print('Best model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, best_model.predict(X_validation))
))

Simple model validation accuracy: 0.8211

Best model validation accuracy: 0.8219


# OLD CODE

### Data processing
Convert from strings to best possible data types. 
Swap columns so that target is the final column.

In [4]:
def df_column_switch(df, column1, column2):
    i = list(df.columns)
    a, b = i.index(column1), i.index(column2)
    i[b], i[a] = i[a], i[b]
    df = df[i]
    return df

def get_sets(data):
    nums = data[numerical_columns].fillna(-999).convert_dtypes(False, True, False, False, False)
    cats = data[categorical_columns].fillna("-999")
    return pd.concat([nums,cats], axis=1)

numerical_columns = ["f0", "f3", "f5", "f6", "f7", "f11", "f16","f17","f19","f20","f21","f24", "f27", "f28"]
categorical_columns = ["f1", "f2", "f4", "f8", "f9", "f10", "f12", "f13", "f14", "f15", "f18", "f22", "f23", "f25", "f26"]
labels = numerical_columns + categorical_columns

df_train_2 = get_sets(df_train)
# df_train = df_column_switch(df_train, "target", "f28")
df_test_2 = get_sets(df_test)

print(df_train_2.head())

      f0   f3     f5   f6   f7      f11    f16         f17  f19  f20  ...  \
0    1.0  3.0 -999.0  0.0  6.0  13.9681    2.0    0.858315  0.5  0.0  ...   
1    0.0  1.0    7.0  1.0  4.0  14.0242 -999.0    0.616743  0.4  0.0  ...   
2 -999.0  1.0   10.0  1.0  6.0  14.2174    3.0    0.711389  0.5  1.0  ...   
3    1.0  3.0    7.0  1.0  1.0  13.8536    5.0 -999.000000  0.6  1.0  ...   
4    1.0  3.0    7.0  0.0  6.0  14.2347    2.0    0.572781  0.5  0.0  ...   

   f10        f12  f13        f14        f15 f18   f22        f23 f25 f26  
0    C  1c756c04a    h  168e51823  7861df0a8   F     R  328b0cf4e   T   N  
1    A  5d1ac7760    g  558613041  1d88b0a79   F  -999  328b0cf4e   T   N  
2    H  f14f5e4a5    c  ae14d280b  b72c0bbc2   C     G  0c67fcbbd   T   N  
3    K  56d35c774    c  86ccc2ee4  a5eef5d8b   D     G  fee4e3007   T   N  
4    Y  5622ee17f    a  3162d6e8b  7861df0a8   D     B  587e040bd   T   N  

[5 rows x 29 columns]


In [5]:
array = df_train.values
Y = df_train["target"]
X = array[:, 0:-1]

In [6]:
import numpy as np

from catboost import CatBoostClassifier, Pool

model = CatBoostClassifier(iterations=1000,
                           learning_rate=1,
                           loss_function='Logloss',
                           cat_features=categorical_columns,
                           verbose=True,
                           plot=True)
# train the model
model.fit(df_train_2, Y, cat_features=categorical_columns)
# make the prediction using the resulting model
preds_class = model.predict(df_test_2)
preds_proba = model.predict_proba(df_test_2)
print("class = ", preds_class)
print("proba = ", preds_proba)



0:	learn: 0.4443426	total: 7.31s	remaining: 2h 1m 45s


KeyboardInterrupt: 

## Write predictions to file
Write the predictions made by the model to a csv file on the format (id, target). 

In [16]:
stacked_columns = np.column_stack((df_test["id"], predictions_probs[:, 1]))
df = pd.DataFrame(stacked_columns, columns=["id", "target"])
convert_dict={"id": int, "target" : float}
df = df.astype(convert_dict)
df.to_csv("predictions.csv", index = False)