In [20]:
import os
import json
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from catboost import CatBoostRegressor

# Pandas display options: raise the column / width limits so that wide
# DataFrames and long cell values are rendered without truncation.
for option_name, option_value in (
    ('display.max_columns', 500),
    ('display.width', 1000),
    ('display.max_colwidth', 1000),
):
    pd.set_option(option_name, option_value)

# Load the data from pickle files

In [21]:
%%time
# Load the training frame and derive calendar features from the visit date.
train_df = pd.read_pickle('train_df.pickle')
# `date` is parsed with the YYYYMMDD format so the `.dt` accessor can be used.
train_df['date'] = pd.to_datetime(train_df['date'], format='%Y%m%d')
# `Series.dt.weekday_name` was removed in pandas 1.0; `day_name()` yields the
# same English day names ('Monday', 'Tuesday', ...).
train_df['weekday'] = train_df['date'].dt.day_name()
# dayofweek is Monday=0 ... Sunday=6, so values above 4 mean Saturday/Sunday.
train_df['day_off'] = (train_df['date'].dt.dayofweek > 4).astype(int)

CPU times: user 3.05 s, sys: 1.02 s, total: 4.07 s
Wall time: 4.13 s


In [22]:
%%time
# Load the test frame and derive the same calendar features as for train_df.
test_df = pd.read_pickle('test_df.pickle')
# `date` is parsed with the YYYYMMDD format so the `.dt` accessor can be used.
test_df['date'] = pd.to_datetime(test_df['date'], format='%Y%m%d')
# `Series.dt.weekday_name` was removed in pandas 1.0; `day_name()` yields the
# same English day names ('Monday', 'Tuesday', ...).
test_df['weekday'] = test_df['date'].dt.day_name()
# dayofweek is Monday=0 ... Sunday=6, so values above 4 mean Saturday/Sunday.
test_df['day_off'] = (test_df['date'].dt.dayofweek > 4).astype(int)

CPU times: user 2.88 s, sys: 1.94 s, total: 4.82 s
Wall time: 5.61 s


In [23]:
def check_diff_in_dfs(left=None, right=None):
    """Print the column names that exist in only one of two DataFrames.

    Parameters
    ----------
    left, right : pandas.DataFrame, optional
        Frames to compare. When omitted, the notebook-level ``train_df`` and
        ``test_df`` are used, which is the original zero-argument behaviour.

    Returns
    -------
    None
        The two set differences are printed, labelled ``train_df \\ test_df``
        and ``test_df \\ train_df`` respectively.
    """
    left = train_df if left is None else left
    right = test_df if right is None else right
    left_cols, right_cols = set(left.columns), set(right.columns)
    # Raw strings: '\ ' is not a valid escape sequence and triggers a
    # SyntaxWarning on recent Python versions; the printed text is unchanged.
    print(r'train_df \ test_df', left_cols.difference(right_cols))
    print(r'test_df \ train_df', right_cols.difference(left_cols))

In [24]:
# Data revision: print the columns that are present in only one of
# train_df / test_df.
check_diff_in_dfs()

train_df \ test_df {'transactionRevenue'}
test_df \ train_df set()


In [25]:
# Columns passed to the model as categorical features. The order matters:
# it determines the column order of the feature frames built from this list.
cat_features = [
    'channelGrouping',
    'isMobile',
    'browser',
    'deviceCategory',
    'operatingSystem',
    'country',
    'region',
    'subContinent',
    'networkDomain',
    'continent',
    'newVisits',
    'medium',
    'isTrueDirect',
    'source',
    'isVideoAd',
    'slot',
    'page',
    # The last two are derived from `date` when the frames are loaded.
    'day_off',
    'weekday',
]

# Columns passed to the model as numeric features.
num_features = [
    'visitNumber',
    'pageviews',
    'hits',
]

In [26]:
# Positional 70/30 split of train_df: the first 70% of rows are used for
# fitting, the remaining 30% are held out. There is no shuffling, so the split
# follows whatever row order train_df already has.
shape = train_df.shape[0]  # number of rows in train_df
split_at = int(shape * 0.7)

y_train = train_df['transactionRevenue'].iloc[:split_at]
y_test = train_df['transactionRevenue'].iloc[split_at:]

# .copy() makes X_train / X_test independent frames, so the column assignments
# below modify them unambiguously instead of writing into a slice of train_df
# (which raises SettingWithCopyWarning).
X_train = train_df.iloc[:split_at][cat_features + num_features].copy()
X_test = train_df.iloc[split_at:][cat_features + num_features].copy()

# 'no_key' is replaced by 0 in the numeric columns.
# NOTE(review): presumably a placeholder for a missing value - confirm upstream.
X_train[num_features] = X_train[num_features].replace('no_key', 0)
X_test[num_features] = X_test[num_features].replace('no_key', 0)

In [27]:
# Why do we enumerate the categorical features?
# CatBoost has to be given the indices of the categorical columns.
categorical_features = [
    position
    for position, column in enumerate(X_train.columns)
    if column in cat_features
]

In [28]:
# Keep only the visitor id plus the model features, in the same column order
# as the training features (cat_features followed by num_features).
# .copy() detaches the selection from the original frame so the assignments
# below do not write into a slice (avoids SettingWithCopyWarning).
test_df = test_df[['fullVisitorId'] + cat_features + num_features].copy()
# Numeric columns: 'no_key' and missing values both become 0.
test_df[num_features] = test_df[num_features].replace('no_key', 0).fillna(0)
# Categorical columns: missing values become the literal category 'other'.
test_df[cat_features] = test_df[cat_features].fillna('other')

In [29]:
# Configure the regressor only. The model is trained by the
# `model_Cat.fit(...)` call in the following cell; fitting here as well would
# run the same training twice and double the run time for no benefit.
model_Cat = CatBoostRegressor(
    iterations=100,
    thread_count=4,
    learning_rate=0.4,
    depth=10,
    border_count=100,
    # NOTE(review): has_time / counter_calc_method are kept exactly as in the
    # original configuration; see the CatBoost parameter reference for details.
    has_time=True,
    counter_calc_method='SkipTest',
)

In [30]:
# Train on the first 70% of train_df (X_train / y_train). The categorical
# columns are identified by their positions in X_train. The held-out
# X_test / y_test are not passed to fit, so only the learn error is reported.
model_Cat.fit(X_train, y_train, cat_features=categorical_features)

0:	learn: 1.7550766	total: 2.42s	remaining: 3m 59s
1:	learn: 1.6758252	total: 5.51s	remaining: 4m 29s
2:	learn: 1.6390744	total: 8.46s	remaining: 4m 33s
3:	learn: 1.6178052	total: 11.1s	remaining: 4m 26s
4:	learn: 1.6068442	total: 13.5s	remaining: 4m 16s
5:	learn: 1.5987508	total: 16.1s	remaining: 4m 12s
6:	learn: 1.5885581	total: 18.7s	remaining: 4m 7s
7:	learn: 1.5842453	total: 21.7s	remaining: 4m 9s
8:	learn: 1.5794651	total: 23.8s	remaining: 4m 1s
9:	learn: 1.5750630	total: 26.1s	remaining: 3m 55s
10:	learn: 1.5717692	total: 28.6s	remaining: 3m 51s
11:	learn: 1.5672126	total: 31.4s	remaining: 3m 50s
12:	learn: 1.5637648	total: 34.2s	remaining: 3m 48s
13:	learn: 1.5591357	total: 36.6s	remaining: 3m 44s
14:	learn: 1.5566387	total: 38.8s	remaining: 3m 39s
15:	learn: 1.5545410	total: 41.4s	remaining: 3m 37s
16:	learn: 1.5523081	total: 43.9s	remaining: 3m 34s
17:	learn: 1.5481094	total: 46.5s	remaining: 3m 32s
18:	learn: 1.5468068	total: 48.8s	remaining: 3m 28s
19:	learn: 1.5437951	tota

<catboost.core.CatBoostRegressor at 0x220b3ffd0>

In [31]:
# Select the features by name rather than with `iloc[:, 1:]`: once the
# 'prediction' column has been added, a positional slice would feed it back to
# the model when this cell is re-run.
raw_prediction = model_Cat.predict(test_df[cat_features + num_features])
# Negative predictions are floored at 0, then exp(x) - 1 is applied
# (np.expm1 is the numerically stable form of that expression).
# NOTE(review): this presumes the target was trained on a log1p scale - confirm.
test_df['prediction'] = np.expm1(np.clip(raw_prediction, 0, None))
# Total predicted value per visitor, keyed by fullVisitorId.
pred_dict = test_df.groupby('fullVisitorId')['prediction'].sum().to_dict()

In [32]:
# Read the visitor ids as strings: without an explicit dtype pandas may parse
# them as numbers, which drops leading zeros and can stop them matching the
# keys of pred_dict (every unmatched visitor silently gets a prediction of 0).
subm = pd.read_csv('sample_submission.csv', dtype={'fullVisitorId': str})
# NOTE(review): assumes the ids in pred_dict are (or stringify to) the full id
# text used in the sample submission - verify against how test_df was built.
visitor_revenue = {str(visitor_id): revenue for visitor_id, revenue in pred_dict.items()}
# Visitors without a prediction get 0; log1p(x) == log(x + 1).
subm['PredictedLogRevenue'] = np.log1p(
    subm['fullVisitorId'].map(visitor_revenue).fillna(0)
)
subm.to_csv('fourth_subm.csv', sep=',', index=False)

In [33]:
# Checking features importances
for val, name in zip(model_Cat.feature_importances_, model_Cat.feature_names_):
    print(val, name)

2.1863840348256285 b'channelGrouping'
0.11044686275041643 b'isMobile'
0.7877889046499208 b'browser'
1.4167833286179237 b'deviceCategory'
4.958714505768322 b'operatingSystem'
5.591826169950613 b'country'
6.697803184218588 b'region'
1.3031546579175248 b'subContinent'
0.13718291842470923 b'networkDomain'
0.6490001278806667 b'continent'
1.800898556705775 b'newVisits'
1.1379134157147657 b'medium'
3.2548968293983327 b'isTrueDirect'
6.061298698767857 b'source'
0.2135298218706007 b'isVideoAd'
0.3059332830865738 b'slot'
0.0783520551638093 b'page'
0.6199501674109053 b'day_off'
5.07492212475394 b'weekday'
8.783459345592847 b'visitNumber'
27.65681841756791 b'pageviews'
21.172942588962368 b'hits'
