In [53]:
import os
from glob import glob
import pandas as pd
import numpy as np
import itertools
from collections import Counter


import pickle
import gc

from tqdm import tqdm
tqdm.pandas()

  from pandas import Panel


In [2]:
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score, make_scorer

In [3]:
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

from catboost import CatBoostClassifier

In [4]:
DATA = './data'
DATA_OWN = './data_own'
CLICKSTREAM = 'alfabattle2_abattle_clickstream'
MODELS = './models'
UTILS = './utils'

## prepare data, test / training sets

In [5]:
data = pd.read_csv(os.path.join(DATA, 'alfabattle2_abattle_train_target.csv'), parse_dates=['timestamp'])
data.head()

Unnamed: 0,session_id,client_pin,timestamp,multi_class_target
0,0000029e72e5fcde6a9f29c3a3ed198f,7cf9221322a0e2fdefb1b998b8f2ab29,2020-06-15 14:01:12,main_screen
1,00063dffa47b0fe5556b2b9e8beddb6a,5f16c0ab27a806fd08db3122921adf3a,2020-03-21 12:59:34,invest
2,0007857f36d268ec46fcb7305538a1c7,ec868fc2b388293cf10e18ee9518d72f,2020-01-24 18:18:55,statement
3,000a7f25fc3609cdfda54c5f059aab00,91f55a33d7502c1a1fa5da7ff2f7b648,2020-03-15 19:50:23,main_screen
4,000b746d6616669663feaa1474ac97f1,3ef1020bda95ce7836d2680fa553ecb7,2020-02-20 07:56:58,main_screen


In [6]:
def get_time_of_day(inp_hour):
    if (inp_hour >= 12) and (inp_hour < 18):
        return 'day'
    elif (inp_hour >= 6) and (inp_hour < 12):
        return 'morning'
    elif (inp_hour >= 18) and (inp_hour <= 23):
        return 'evening'
    else:
        return 'night'

In [7]:
%%time
data['dom']  = data.timestamp.apply(lambda x: x.day)
data['dow']  = data.timestamp.apply(lambda x: x.weekday())
data['hour'] = data.timestamp.apply(lambda x: x.hour)
data['tod']  = data.hour.apply(get_time_of_day)

data.head()

Wall time: 49.7 s


Unnamed: 0,session_id,client_pin,timestamp,multi_class_target,dom,dow,hour,tod
0,0000029e72e5fcde6a9f29c3a3ed198f,7cf9221322a0e2fdefb1b998b8f2ab29,2020-06-15 14:01:12,main_screen,15,0,14,day
1,00063dffa47b0fe5556b2b9e8beddb6a,5f16c0ab27a806fd08db3122921adf3a,2020-03-21 12:59:34,invest,21,5,12,day
2,0007857f36d268ec46fcb7305538a1c7,ec868fc2b388293cf10e18ee9518d72f,2020-01-24 18:18:55,statement,24,4,18,evening
3,000a7f25fc3609cdfda54c5f059aab00,91f55a33d7502c1a1fa5da7ff2f7b648,2020-03-15 19:50:23,main_screen,15,6,19,evening
4,000b746d6616669663feaa1474ac97f1,3ef1020bda95ce7836d2680fa553ecb7,2020-02-20 07:56:58,main_screen,20,3,7,morning


In [8]:
data.keys()[4:]

Index(['dom', 'dow', 'hour', 'tod'], dtype='object')

In [9]:
target = data['multi_class_target']

In [10]:
lb_dom  = LabelBinarizer().fit(data['dom'])
lb_dow  = LabelBinarizer().fit(data['dow'])
lb_hour = LabelBinarizer().fit(data['hour'])
lb_tod  = LabelBinarizer().fit(data['tod'])

In [11]:
dom_features  = ['dom_'  + str(el) for el in lb_dom.classes_]
dow_features  = ['dow_'  + str(el) for el in lb_dow.classes_]
hour_features = ['hour_' + str(el) for el in lb_hour.classes_]
tod_features  = ['tod_'  + str(el) for el in lb_tod.classes_]

In [12]:
%%time
dom_prep  = lb_dom.transform(data['dom'])
dow_prep  = lb_dow.transform(data['dow'])
hour_prep = lb_hour.transform(data['hour'])
tod_prep =  lb_tod.transform(data['tod'])

dom_prep.shape, dow_prep.shape, hour_prep.shape, tod_prep.shape

Wall time: 9.37 s


((5065350, 31), (5065350, 7), (5065350, 24), (5065350, 4))

saving LabelBinarizer for using in prepare to submit part

In [13]:
pickle.dump(lb_dom,  open((os.path.join(UTILS, 'lb_dom.pkl')),  'wb'))
pickle.dump(lb_dow,  open((os.path.join(UTILS, 'lb_dow.pkl')),  'wb'))
pickle.dump(lb_hour, open((os.path.join(UTILS, 'lb_hour.pkl')), 'wb'))
pickle.dump(lb_tod,  open((os.path.join(UTILS, 'lb_tod.pkl')),  'wb'))

merge current data with statistics

In [16]:
client_freq_targ = pd.read_csv(os.path.join(DATA_OWN, 'client_freq.csv'))

In [17]:
client_freq_targ.keys()

Index(['client_pin', 'client_freq_main_screen', 'client_freq_statement',
       'client_freq_credit_info', 'client_freq_own_transfer',
       'client_freq_mobile_recharge', 'client_freq_phone_money_transfer',
       'client_freq_card2card_transfer', 'client_freq_chat',
       'client_freq_card_recharge', 'client_freq_invest'],
      dtype='object')

In [18]:
data = data.merge(client_freq_targ, how= 'left', on='client_pin', validate='many_to_one')

In [19]:
client_freq_features = ['client_freq_main_screen', 'client_freq_statement',
       'client_freq_credit_info', 'client_freq_own_transfer',
       'client_freq_mobile_recharge', 'client_freq_phone_money_transfer',
       'client_freq_card2card_transfer', 'client_freq_chat',
       'client_freq_card_recharge', 'client_freq_invest']

In [20]:
glob_freq_dom  = pd.read_csv(os.path.join(DATA_OWN, 'glob_freq_dom.csv'))
glob_freq_dow  = pd.read_csv(os.path.join(DATA_OWN, 'glob_freq_dow.csv'))
glob_freq_hour = pd.read_csv(os.path.join(DATA_OWN, 'glob_freq_hour.csv'))
glob_freq_tod  = pd.read_csv(os.path.join(DATA_OWN, 'glob_freq_tod.csv'))

In [21]:
glob_freq_dom.columns  = ['dom_'  + el for el in glob_freq_dom.keys()]
glob_freq_dow.columns  = ['dow_'  + el for el in glob_freq_dow.keys()]
glob_freq_hour.columns = ['hour_' + el for el in glob_freq_hour.keys()]
glob_freq_tod.columns  = ['tod_'  + el for el in glob_freq_tod.keys()]

In [22]:
glob_freq_dom = glob_freq_dom.rename(  columns={'dom_dom': 'dom'})
glob_freq_dow = glob_freq_dow.rename(  columns={'dow_dow': 'dow'})
glob_freq_hour = glob_freq_hour.rename(columns={'hour_hour': 'hour'})
glob_freq_tod = glob_freq_tod.rename(  columns={'tod_tod': 'tod'})

#glob_freq_dom.keys(), glob_freq_dow.keys(), glob_freq_hour.keys(), glob_freq_tod.keys(), 

In [23]:
dom_freq_features  = [el for el in glob_freq_dom.keys()[1:]]
dow_freq_features  = [el for el in glob_freq_dow.keys()[1:]]
hour_freq_features = [el for el in glob_freq_hour.keys()[1:]]
tod_freq_features  = [el for el in glob_freq_tod.keys()[1:]]

In [24]:
data.shape

(5065350, 18)

In [25]:
data = data.merge(glob_freq_dom, how= 'left', on='dom', validate='many_to_one')
data.shape

(5065350, 28)

In [26]:
data = data.merge(glob_freq_dow, how= 'left', on='dow', validate='many_to_one')
data.shape

(5065350, 38)

In [27]:
data = data.merge(glob_freq_hour, how= 'left', on='hour', validate='many_to_one')
data.shape

(5065350, 48)

In [28]:
data = data.merge(glob_freq_tod, how= 'left', on='tod', validate='many_to_one')
data.shape

(5065350, 58)

In [29]:
data.sample(10)

Unnamed: 0,session_id,client_pin,timestamp,multi_class_target,dom,dow,hour,tod,client_freq_main_screen,client_freq_statement,...,tod_glob_freq_main_screen,tod_glob_freq_statement,tod_glob_freq_credit_info,tod_glob_freq_own_transfer,tod_glob_freq_mobile_recharge,tod_glob_freq_phone_money_transfer,tod_glob_freq_card2card_transfer,tod_glob_freq_chat,tod_glob_freq_card_recharge,tod_glob_freq_invest
4639884,022a5636b99364b6b82010db16884d92,2d1789bdcb273a1b0e19c75457f5f5dc,2020-04-29 14:06:58,statement,29,2,14,day,0.338983,0.491525,...,0.456649,0.185174,0.08976,0.060708,0.042505,0.04904,0.040268,0.037288,0.028409,0.010199
1855757,969947985d789f9ff49415f14ce7495d,b25e92c3807272256095a21eadf90684,2020-06-10 10:01:27,statement,10,2,10,morning,0.627586,0.241379,...,0.45732,0.166099,0.103246,0.054353,0.066289,0.0451,0.034454,0.035082,0.026971,0.011084
365120,09001d08bce55fe5d004b51aff71f5d1,5f313cde19c6b6d6feaf9c6d2e34f7cb,2020-08-28 19:01:25,statement,28,4,19,evening,0.409091,0.454545,...,0.431842,0.19867,0.10001,0.0587,0.046417,0.045566,0.041307,0.037583,0.027151,0.012754
1998252,97e872c18e1e54517cca5ddf1c4f81b2,485f922edf0024c5bec8e93c334fa448,2020-04-30 22:25:10,statement,30,3,22,evening,0.573529,0.264706,...,0.431842,0.19867,0.10001,0.0587,0.046417,0.045566,0.041307,0.037583,0.027151,0.012754
3833994,384e677440241e071a82fb2e22b7d225,261a8d0f727b3053cb389a12768ef81f,2020-07-15 17:25:12,statement,15,2,17,day,0.016393,0.918033,...,0.456649,0.185174,0.08976,0.060708,0.042505,0.04904,0.040268,0.037288,0.028409,0.010199
3251594,8188aa2d7fa233f0b2c0516d8dea98fb,e6e59ab3e9d1d56550ab70868ce6dbdc,2020-06-20 13:01:53,card2card_transfer,20,5,13,day,0.564103,0.196581,...,0.456649,0.185174,0.08976,0.060708,0.042505,0.04904,0.040268,0.037288,0.028409,0.010199
4587189,679355584cbc5418c46d8fa1fb2f589a,b798c3856103839531151ca8e75b911f,2020-07-27 14:15:46,own_transfer,27,0,14,day,0.066667,0.023529,...,0.456649,0.185174,0.08976,0.060708,0.042505,0.04904,0.040268,0.037288,0.028409,0.010199
3092965,ab33aceb6b541260bac6b8e4dae156cf,26e753203b620f92cba8f7018a3ad7c6,2020-05-11 14:43:21,statement,11,0,14,day,0.619469,0.265487,...,0.456649,0.185174,0.08976,0.060708,0.042505,0.04904,0.040268,0.037288,0.028409,0.010199
2236967,6294235b73a8bd35c4081d1dc538faba,1bea56794952efd000a707d212c0d230,2020-05-20 03:59:18,main_screen,20,2,3,night,0.796296,0.046296,...,0.447566,0.177681,0.120468,0.045952,0.070826,0.034378,0.03195,0.034236,0.024046,0.012897
4148994,c5109868cf192f424af686c2c93875e9,c2a588aeb447d5dce476373d83d8416e,2020-01-31 18:21:18,phone_money_transfer,31,4,18,evening,0.274336,0.221239,...,0.431842,0.19867,0.10001,0.0587,0.046417,0.045566,0.041307,0.037583,0.027151,0.012754


In [30]:
%%time
X_train, X_val, y_train, y_val = train_test_split(data, target, test_size=0.33, random_state=42)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

Wall time: 8.89 s


((3393784, 58), (1671566, 58), (3393784,), (1671566,))

In [48]:
using_features = \
                 client_freq_features + \
                 dom_freq_features + \
                 dow_freq_features + \
                 hour_freq_features + \
                 tod_freq_features 


                 # dates as a features \
                 #dom_features + \
                 #dow_features + \
                 #hour_features + \
                 #tod_features 
                 #
print(len(using_features), using_features)

50 ['client_freq_main_screen', 'client_freq_statement', 'client_freq_credit_info', 'client_freq_own_transfer', 'client_freq_mobile_recharge', 'client_freq_phone_money_transfer', 'client_freq_card2card_transfer', 'client_freq_chat', 'client_freq_card_recharge', 'client_freq_invest', 'dom_glob_freq_main_screen', 'dom_glob_freq_statement', 'dom_glob_freq_credit_info', 'dom_glob_freq_own_transfer', 'dom_glob_freq_mobile_recharge', 'dom_glob_freq_phone_money_transfer', 'dom_glob_freq_card2card_transfer', 'dom_glob_freq_chat', 'dom_glob_freq_card_recharge', 'dom_glob_freq_invest', 'dow_glob_freq_main_screen', 'dow_glob_freq_statement', 'dow_glob_freq_credit_info', 'dow_glob_freq_own_transfer', 'dow_glob_freq_mobile_recharge', 'dow_glob_freq_phone_money_transfer', 'dow_glob_freq_card2card_transfer', 'dow_glob_freq_chat', 'dow_glob_freq_card_recharge', 'dow_glob_freq_invest', 'hour_glob_freq_main_screen', 'hour_glob_freq_statement', 'hour_glob_freq_credit_info', 'hour_glob_freq_own_transfer', 

In [32]:
#f1 = make_scorer(f1_score , average='macro')

## check classifiers with check time of work

In [None]:
#clf_sgd_log   = SGDClassifier(loss = 'log', class_weight='balanced', n_jobs=-1)

#clf_knn  = KNeighborsClassifier()
#clf_svc  = SVC()
clf_gaus = GaussianNB()

In [33]:
%%time
#clf_sgd_hinge = SGDClassifier(loss = 'hinge', class_weight='balanced', n_jobs=-1)
clf_sgd_hinge = SGDClassifier(loss = 'hinge', n_jobs=-1)
clf_sgd_hinge.fit(X_train[using_features], y_train)
pred_sgd_hinge = clf_sgd_hinge.predict(X_val[using_features])
print(len(set(pred_sgd_hinge)), set(pred_sgd_hinge))
print(f1_score(y_val, pred_sgd_hinge, average  = 'micro'))

{'phone_money_transfer', 'statement', 'card2card_transfer', 'mobile_recharge', 'own_transfer', 'main_screen', 'credit_info', 'chat', 'invest', 'card_recharge'}
0.5865266462706229
Wall time: 45.1 s


In [34]:
%%time
#clf_sgd_perc  = SGDClassifier(loss = 'perceptron', class_weight='balanced', n_jobs=-1)
clf_sgd_perc  = SGDClassifier(loss = 'perceptron', n_jobs=-1)
clf_sgd_perc.fit(X_train[using_features], y_train)
pred_sgd_perc = clf_sgd_perc.predict(X_val[using_features])
print(set(pred_sgd_perc))
print(f1_score(y_val, pred_sgd_perc, average  = 'micro'))

{'phone_money_transfer', 'statement', 'card2card_transfer', 'mobile_recharge', 'own_transfer', 'credit_info', 'chat', 'invest'}
0.25540122256614456
Wall time: 45 s


In [36]:
%%time
# 9 min

clf_mlp  = MLPClassifier((200, 50), learning_rate = 'adaptive', activation='logistic', verbose = True)
clf_mlp.fit(X_train[using_features], y_train)
pred_mlp = clf_mlp.predict(X_val[using_features])
print(len(set(pred_mlp)), set(pred_mlp))
print(f1_score(y_val, pred_mlp, average  = 'micro'))

{'phone_money_transfer', 'statement', 'card2card_transfer', 'mobile_recharge', 'own_transfer', 'main_screen', 'credit_info', 'chat', 'invest', 'card_recharge'}
0.5953267774051398
Wall time: 45min 7s


In [None]:
%%time
# TOOO LONG

#clf_knn.fit(X_train[using_features], y_train)
#pred_knn = clf_knn.predict(X_val[using_features])
#print(set(pred_knn))
#print(f1_score(y_val, pred_knn, average  = 'micro'))

In [None]:
%%time
# TOOO LONG

#clf_svc.fit(X_train[using_features], y_train)
#pred_svc = clf_svc.predict(X_val[using_features])
#print(set(pred_svc))
#print(f1_score(y_val, pred_svc, average  = 'micro'))

In [39]:
%%time
# 24 min
clf_rf = RandomForestClassifier(n_jobs = -1, verbose = 1) # bootstrap = False? oob_score=True
clf_rf.fit(X_train[using_features], y_train)
pred_rf = clf_rf.predict(X_val[using_features])
print(set(pred_rf))
print(f1_score(y_val, pred_rf, average  = 'micro'))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 15.6min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed: 18.3min
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed: 124.5min finished


{'phone_money_transfer', 'statement', 'card2card_transfer', 'mobile_recharge', 'own_transfer', 'main_screen', 'credit_info', 'chat', 'invest', 'card_recharge'}
0.5894017944849321
Wall time: 2h 21min 5s


In [51]:
print(len(set(pred_rf)), set(pred_rf))

10 {'phone_money_transfer', 'statement', 'card2card_transfer', 'mobile_recharge', 'own_transfer', 'main_screen', 'credit_info', 'chat', 'invest', 'card_recharge'}


In [None]:
%%time

clf_ab   = AdaBoostClassifier()
clf_ab.fit(X_train[using_features], y_train)
pred_ab = clf_ab.predict(X_val[using_features])
print(set(pred_ab))
print(f1_score(y_val, pred_ab, average  = 'micro'))

In [None]:
%%time
clf_gaus.fit(X_train[using_features], y_train)
pred_gaus = clf_gaus.predict(X_val[using_features])
print(set(pred_gaus))
print(f1_score(y_val, pred_gaus, average  = 'micro'))

In [38]:
%%time
clf_cb = CatBoostClassifier()
clf_cb.fit(X_train[using_features], y_train)
pred_cb = clf_cb.predict(X_val[using_features])
#print(set(pred_cb))
#print(f1_score(y_val, pred_cb, average  = 'micro'))

Learning rate set to 0.119781
0:	learn: 1.9588574	total: 11.9s	remaining: 3h 18m 51s
1:	learn: 1.7971999	total: 25.1s	remaining: 3h 28m 47s
2:	learn: 1.6874413	total: 36.6s	remaining: 3h 22m 27s
3:	learn: 1.6056276	total: 48.1s	remaining: 3h 19m 34s
4:	learn: 1.5371976	total: 59.3s	remaining: 3h 16m 37s
5:	learn: 1.4891415	total: 1m 11s	remaining: 3h 16m 6s
6:	learn: 1.4510192	total: 1m 23s	remaining: 3h 16m 38s
7:	learn: 1.4127310	total: 1m 33s	remaining: 3h 13m 15s
8:	learn: 1.3826320	total: 1m 44s	remaining: 3h 12m 20s
9:	learn: 1.3573720	total: 1m 57s	remaining: 3h 14m 30s
10:	learn: 1.3362337	total: 2m 9s	remaining: 3h 13m 38s
11:	learn: 1.3146916	total: 2m 19s	remaining: 3h 12m 3s
12:	learn: 1.2972188	total: 2m 30s	remaining: 3h 10m 44s
13:	learn: 1.2825936	total: 2m 43s	remaining: 3h 11m 53s
14:	learn: 1.2700693	total: 2m 55s	remaining: 3h 11m 34s
15:	learn: 1.2551843	total: 3m 5s	remaining: 3h 9m 51s
16:	learn: 1.2440631	total: 3m 16s	remaining: 3h 9m 3s
17:	learn: 1.2344522	to

In [85]:
print(len(set([el[0] for el in pred_cb])), set([el[0] for el in pred_cb]))

10 {'phone_money_transfer', 'statement', 'card2card_transfer', 'mobile_recharge', 'own_transfer', 'main_screen', 'credit_info', 'chat', 'invest', 'card_recharge'}


In [46]:
print(f1_score(y_val, pred_cb, average  = 'micro'))

0.5975994965200297


In [83]:
%%time
pred_all = ['']*len(pred_sgd_hinge)
for idx in range(pred_sgd_hinge.shape[0]):
#for idx in range(10):
    ctr = Counter([
                   pred_sgd_hinge[idx],
                   pred_cb[idx][0],
                   pred_rf[idx], 
                   #pred_mlp[idx], 
                   ]
                   ).most_common()[0][0]
    pred_all[idx] = ctr
    
print(f1_score(y_val, pred_all, average  = 'micro'))

0.5964598466348322
Wall time: 20.5 s


In [None]:
no sgd: 0.5976880362486435
no cb:  0.5952585778844509
no rf:  0.5956223086614588
no mpl: 0.5964598466348322

## classifiers GridSearch

## study best models on full data

In [40]:
clf_sgd = SGDClassifier(loss = 'hinge', n_jobs=-1)
#clf_sgd_log   = SGDClassifier(loss = 'log', n_jobs=-1)
#clf_sgd  = SGDClassifier(loss = 'perceptron', n_jobs=-1)

clf_mlp  = MLPClassifier((200, 50), learning_rate = 'adaptive', activation='logistic', verbose = True)
#clf_knn  = KNeighborsClassifier()
#clf_svc  = SVC()
clf_rf   = RandomForestClassifier(n_jobs = -1, verbose = 1) 
#clf_ab   = AdaBoostClassifier()
#clf_gaus = GaussianNB()

clf_cb = CatBoostClassifier()

In [41]:
%%time
clf_sgd.fit(data[using_features], target)

Wall time: 2min 17s


SGDClassifier(n_jobs=-1)

In [42]:
%%time
clf_mlp.fit(data[using_features], target)

Iteration 1, loss = 1.16184195
Iteration 2, loss = 1.11170062
Iteration 3, loss = 1.10663095
Iteration 4, loss = 1.10417430
Iteration 5, loss = 1.10272439
Iteration 6, loss = 1.10172334
Iteration 7, loss = 1.10083137
Iteration 8, loss = 1.10001260
Iteration 9, loss = 1.09933318
Iteration 10, loss = 1.09880025
Iteration 11, loss = 1.09841038
Iteration 12, loss = 1.09814063
Iteration 13, loss = 1.09786373
Iteration 14, loss = 1.09767787
Iteration 15, loss = 1.09747618
Iteration 16, loss = 1.09729755
Iteration 17, loss = 1.09712434
Iteration 18, loss = 1.09697235
Iteration 19, loss = 1.09683272
Iteration 20, loss = 1.09669854
Iteration 21, loss = 1.09654951
Iteration 22, loss = 1.09647473
Iteration 23, loss = 1.09629435
Iteration 24, loss = 1.09613207
Iteration 25, loss = 1.09605382
Iteration 26, loss = 1.09596864
Iteration 27, loss = 1.09579813
Iteration 28, loss = 1.09568076
Iteration 29, loss = 1.09557315
Iteration 30, loss = 1.09544483
Iteration 31, loss = 1.09535310
Iteration 32, los

MLPClassifier(activation='logistic', hidden_layer_sizes=(200, 50),
              learning_rate='adaptive', verbose=True)

In [43]:
%%time
clf_rf.fit(data[using_features], target)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 34.2min finished


Wall time: 34min 47s


RandomForestClassifier(n_jobs=-1, verbose=1)

In [44]:
%%time
clf_cb.fit(data[using_features], target)

Learning rate set to 0.122253
0:	learn: 1.9512374	total: 1m 21s	remaining: 22h 31m 21s
1:	learn: 1.7863806	total: 1m 41s	remaining: 14h 6m 20s
2:	learn: 1.6778257	total: 1m 56s	remaining: 10h 46m 34s
3:	learn: 1.5976393	total: 2m 10s	remaining: 9h 1m 42s
4:	learn: 1.5300364	total: 2m 23s	remaining: 7h 56m 29s
5:	learn: 1.4840324	total: 2m 36s	remaining: 7h 11m 59s
6:	learn: 1.4414385	total: 2m 49s	remaining: 6h 41m 39s
7:	learn: 1.4068552	total: 3m 2s	remaining: 6h 17m 58s
8:	learn: 1.3771998	total: 3m 15s	remaining: 5h 59m 34s
9:	learn: 1.3491504	total: 3m 29s	remaining: 5h 45m 43s
10:	learn: 1.3281938	total: 3m 42s	remaining: 5h 32m 55s
11:	learn: 1.3062368	total: 3m 55s	remaining: 5h 22m 34s
12:	learn: 1.2942012	total: 4m 7s	remaining: 5h 13m 20s
13:	learn: 1.2772428	total: 4m 21s	remaining: 5h 7m 11s
14:	learn: 1.2620914	total: 4m 34s	remaining: 5h 54s
15:	learn: 1.2482517	total: 4m 49s	remaining: 4h 56m 37s
16:	learn: 1.2366767	total: 5m 2s	remaining: 4h 51m 56s
17:	learn: 1.22688

<catboost.core.CatBoostClassifier at 0x2c760097608>

## save models

In [47]:
%%time

pickle.dump(clf_sgd, open(os.path.join(MODELS, 'clf_sgd.pkl'), 'wb'))
pickle.dump(clf_mlp, open(os.path.join(MODELS, 'clf_mlp.pkl'), 'wb'))
#pickle.dump(clf_knn, open(os.path.join(MODELS, 'clf_knn.pkl'), 'wb'))
#pickle.dump(clf_svc, open(os.path.join(MODELS, 'clf_svc.pkl'), 'wb'))
#pickle.dump(clf_rf,  open(os.path.join(MODELS, 'clf_rf.pkl'),  'wb'))
#pickle.dump(clf_ab,  open(os.path.join(MODELS, 'clf_ab.pkl'),  'wb'))
#pickle.dump(clf_gaus,open(os.path.join(MODELS, 'clf_gaus.pkl'),'wb'))
            
clf_cb.save_model(os.path.join(MODELS, 'clf_cb.cbm'), format='cbm')

Wall time: 5.66 s
Parser   : 102 ms
