In [1]:
%load_ext watermark

In [2]:
%watermark

Last updated: 2022-06-28T15:22:58.898659+03:00

Python implementation: CPython
Python version       : 3.7.10
IPython version      : 7.22.0

Compiler    : MSC v.1916 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : Intel64 Family 6 Model 158 Stepping 9, GenuineIntel
CPU cores   : 8
Architecture: 64bit



In [3]:
import os
import pickle as pkl

import numpy as np
import pandas as pd

from sklearn.metrics import r2_score
from catboost import CatBoostRegressor, Pool


In [4]:
# Report the versions of the modelling libraries used by this notebook.
from catboost import __version__ as cb_version
from sklearn import __version__ as sklearn_version

print(f'cb_version: {cb_version}',
      f'sklearn_version: {sklearn_version}',
      sep='\n')

cb_version: 0.24.3
sklearn_version: 0.24.2


In [5]:
%watermark --iversions

pandas: 0.25.3
numpy : 1.20.3



## Reproducibility block

In [6]:
# Reproducibility: pin every random number generator this notebook relies on.
import random

PYTHON_RANDOMSEED = 5986721   # stdlib `random` (used by custom operators)
NUMPY_RANDOMSEED  = 62185     # NumPy global RNG
CB_RANDOMSEED     = 309487    # passed explicitly as random_seed to every CatBoostRegressor

random.seed(PYTHON_RANDOMSEED)

# scikit-learn falls back to NumPy's global RNG when no random_state is given,
# so the seed below covers it as well.
np.random.seed(NUMPY_RANDOMSEED)

# PyTorch seeding is kept disabled: torch is not imported in the cells shown here.
#torch.manual_seed(1984)                 # would seed the RNG for all devices (CPU and CUDA)
#torch.backends.cudnn.benchmark = False  # deterministic cuDNN algorithm choice, possibly slower

In [7]:
# Project directories, all resolved relative to the current working directory.
DIR_DATA      = os.path.join(os.getcwd(), 'data')    # input CSVs and pickled column lists
DIR_MODELS    = os.path.join(os.getcwd(), 'models')  # saved CatBoost models
DIR_SUBM      = os.path.join(os.getcwd(), 'subm')    # submission CSVs
DIR_SUBM_PART = os.path.join(DIR_SUBM, 'partial')

# The notebook writes into DIR_MODELS and DIR_SUBM but never creates them, so a fresh
# checkout would fail at save time. Create them up front; exist_ok keeps re-runs harmless.
os.makedirs(DIR_MODELS, exist_ok=True)
os.makedirs(DIR_SUBM, exist_ok=True)

## Load data

In [8]:
def _read_frame(file_name):
    """Read a CSV from DIR_DATA, using its first column as the index."""
    return pd.read_csv(os.path.join(DIR_DATA, file_name), index_col=0)


def _read_pickle(file_name):
    """Unpickle an object stored in DIR_DATA (only use with trusted, locally produced files)."""
    with open(os.path.join(DIR_DATA, file_name), 'rb') as pickle_file:
        return pkl.load(pickle_file)


# Train / validation splits (still carrying the target columns) and the test set.
x_train = _read_frame('x_train.csv')
x_val   = _read_frame('x_val.csv')
df_test = _read_frame('test_upd.csv')

# Lists of categorical and numeric feature column names.
cat_cols = _read_pickle('cat_columns.pkl')
num_cols = _read_pickle('num_columns.pkl')

In [9]:
# Sanity check: shapes of the three frames and the number of categorical / numeric columns
x_train.shape, x_val.shape, df_test.shape, len(cat_cols), len(num_cols)

((5592, 14), (1399, 14), (3000, 11), 5, 1)

Separate the labels (targets) from the data

In [10]:
# Target columns that are split off from the feature frames.
TARGET_COLS = ['views', 'depth', 'full_reads_percent']

# Split without mutating the loaded frames in place, and only while the targets are
# still attached: re-running this cell is then a no-op instead of a KeyError.
if set(TARGET_COLS).issubset(x_train.columns):
    y_train = x_train[TARGET_COLS]
    x_train = x_train.drop(columns=TARGET_COLS)

if set(TARGET_COLS).issubset(x_val.columns):
    y_val = x_val[TARGET_COLS]
    x_val = x_val.drop(columns=TARGET_COLS)

# Features should have lost exactly the three target columns; targets should have three.
x_train.shape, x_val.shape, y_train.shape, y_val.shape

((5592, 11), (1399, 11), (5592, 3), (1399, 3))

In [11]:
# Full ordered feature list used for every model: categorical columns first, then numeric
cat_cols + num_cols

['hour', 'dow', 'weekend', 'day', 'mounth', 'ctr']

In [12]:
def _make_pool(features, targets, target_name):
    """Build a CatBoost Pool for a single regression target.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame holding at least the columns listed in ``cat_cols`` and ``num_cols``.
    targets : pandas.DataFrame
        Frame holding the target columns.
    target_name : str
        Name of the column in ``targets`` to use as the label.

    Returns
    -------
    catboost.Pool
        Pool over ``cat_cols + num_cols`` with ``cat_cols`` declared as categorical.
    """
    feature_cols = cat_cols + num_cols
    return Pool(features[feature_cols],
                targets[[target_name]],
                cat_features=cat_cols,
                feature_names=feature_cols)


# One train / validation Pool pair per target; all pairs share the same features.

# views
train_ds_views = _make_pool(x_train, y_train, 'views')
val_ds_views   = _make_pool(x_val,   y_val,   'views')

# depth
train_ds_depth = _make_pool(x_train, y_train, 'depth')
val_ds_depth   = _make_pool(x_val,   y_val,   'depth')

# full_reads_percent
train_ds_frp = _make_pool(x_train, y_train, 'full_reads_percent')
val_ds_frp   = _make_pool(x_val,   y_val,   'full_reads_percent')

## views

In [13]:
# Baseline CatBoost regressor for `views`: 20 iterations, depth-4 trees, fixed seed.
views_params = dict(
    iterations=20,
    learning_rate=1,
    depth=4,
    random_seed=CB_RANDOMSEED,
)
cb_model_views = CatBoostRegressor(**views_params)

# Fit on the training pool; the validation pool is passed as eval_set, which yields the
# per-iteration `test` / `best` columns in the training log.
# (plot=True can be added to fit() to draw the learning curves; it is left off here.)
cb_model_views.fit(train_ds_views, eval_set=val_ds_views)

0:	learn: 75360.1195395	test: 117870.0504397	best: 117870.0504397 (0)	total: 169ms	remaining: 3.21s
1:	learn: 75355.2302093	test: 117841.6199129	best: 117841.6199129 (1)	total: 185ms	remaining: 1.66s
2:	learn: 75113.6961385	test: 117629.6379271	best: 117629.6379271 (2)	total: 196ms	remaining: 1.11s
3:	learn: 74102.7144209	test: 118039.1228816	best: 117629.6379271 (2)	total: 207ms	remaining: 827ms
4:	learn: 73462.8981654	test: 117505.9048526	best: 117505.9048526 (4)	total: 216ms	remaining: 648ms
5:	learn: 72846.4225476	test: 117375.3612864	best: 117375.3612864 (5)	total: 226ms	remaining: 527ms
6:	learn: 72833.7801154	test: 117361.9729046	best: 117361.9729046 (6)	total: 234ms	remaining: 435ms
7:	learn: 71824.6538451	test: 117036.6227086	best: 117036.6227086 (7)	total: 244ms	remaining: 365ms
8:	learn: 68957.5419793	test: 115216.3151507	best: 115216.3151507 (8)	total: 252ms	remaining: 309ms
9:	learn: 64445.6197501	test: 95115.2262314	best: 95115.2262314 (9)	total: 261ms	remaining: 261ms
10

<catboost.core.CatBoostRegressor at 0x228c9aea508>

In [14]:
# Get predictions and metrics for `views`.
# Both splits are predicted from the prebuilt Pools, so train and validation go through
# the same feature / categorical-column setup instead of re-slicing the raw frame for train only.
preds_train_views = cb_model_views.predict(train_ds_views)
preds_val_views   = cb_model_views.predict(val_ds_views)

# R^2 on each split
train_score_views = r2_score(y_train["views"], preds_train_views)
val_score_views   = r2_score(y_val["views"],   preds_val_views)

train_score_views, val_score_views

(0.5100195781796532, 0.5065575808145328)

## depth

In [15]:
# Baseline CatBoost regressor for `depth`: 20 iterations, depth-4 trees, fixed seed.
depth_params = dict(
    iterations=20,
    learning_rate=1,
    depth=4,
    random_seed=CB_RANDOMSEED,
)
cb_model_depth = CatBoostRegressor(**depth_params)

# Fit on the training pool; the validation pool is passed as eval_set, which yields the
# per-iteration `test` / `best` columns in the training log.
# (plot=True can be added to fit() to draw the learning curves; it is left off here.)
cb_model_depth.fit(train_ds_depth, eval_set=val_ds_depth)

0:	learn: 0.0488160	test: 0.0547476	best: 0.0547476 (0)	total: 25.1ms	remaining: 477ms
1:	learn: 0.0452337	test: 0.0498291	best: 0.0498291 (1)	total: 39.5ms	remaining: 355ms
2:	learn: 0.0441018	test: 0.0494809	best: 0.0494809 (2)	total: 52ms	remaining: 294ms
3:	learn: 0.0434015	test: 0.0489231	best: 0.0489231 (3)	total: 60.7ms	remaining: 243ms
4:	learn: 0.0430400	test: 0.0486279	best: 0.0486279 (4)	total: 69.3ms	remaining: 208ms
5:	learn: 0.0426768	test: 0.0487880	best: 0.0486279 (4)	total: 77.5ms	remaining: 181ms
6:	learn: 0.0426442	test: 0.0487599	best: 0.0486279 (4)	total: 86.2ms	remaining: 160ms
7:	learn: 0.0426415	test: 0.0487663	best: 0.0486279 (4)	total: 96.1ms	remaining: 144ms
8:	learn: 0.0423437	test: 0.0485966	best: 0.0485966 (8)	total: 105ms	remaining: 128ms
9:	learn: 0.0411557	test: 0.0485407	best: 0.0485407 (9)	total: 114ms	remaining: 114ms
10:	learn: 0.0401989	test: 0.0484133	best: 0.0484133 (10)	total: 123ms	remaining: 101ms
11:	learn: 0.0395377	test: 0.0476966	best: 0.0

<catboost.core.CatBoostRegressor at 0x228c9b1be08>

In [16]:
# Get predictions and metrics for `depth`.
# Both splits are predicted from the prebuilt Pools, so train and validation go through
# the same feature / categorical-column setup instead of re-slicing the raw frame for train only.
preds_train_depth = cb_model_depth.predict(train_ds_depth)
preds_val_depth   = cb_model_depth.predict(val_ds_depth)

# R^2 on each split
train_score_depth = r2_score(y_train["depth"], preds_train_depth)
val_score_depth   = r2_score(y_val["depth"],   preds_val_depth)

train_score_depth, val_score_depth

(0.6426108706670193, 0.5285762915493839)

## full_reads_percent

In [17]:
# Baseline CatBoost regressor for `full_reads_percent`: 20 iterations, depth-4 trees, fixed seed.
frp_params = dict(
    iterations=20,
    learning_rate=1,
    depth=4,
    random_seed=CB_RANDOMSEED,
)
cb_model_frp = CatBoostRegressor(**frp_params)

# Fit on the training pool; the validation pool is passed as eval_set, which yields the
# per-iteration `test` / `best` columns in the training log.
# (plot=True can be added to fit() to draw the learning curves; it is left off here.)
cb_model_frp.fit(train_ds_frp, eval_set=val_ds_frp)

0:	learn: 9.9603569	test: 9.2653195	best: 9.2653195 (0)	total: 26.2ms	remaining: 499ms
1:	learn: 9.8567182	test: 9.1838367	best: 9.1838367 (1)	total: 50.2ms	remaining: 452ms
2:	learn: 9.7876705	test: 9.1427622	best: 9.1427622 (2)	total: 74ms	remaining: 419ms
3:	learn: 9.7453748	test: 9.1228918	best: 9.1228918 (3)	total: 86.9ms	remaining: 348ms
4:	learn: 9.7086656	test: 9.1299693	best: 9.1228918 (3)	total: 98.1ms	remaining: 294ms
5:	learn: 9.6881707	test: 9.1287665	best: 9.1228918 (3)	total: 106ms	remaining: 246ms
6:	learn: 9.6847868	test: 9.1285411	best: 9.1228918 (3)	total: 114ms	remaining: 212ms
7:	learn: 9.6772275	test: 9.1294206	best: 9.1228918 (3)	total: 123ms	remaining: 185ms
8:	learn: 9.6621382	test: 9.1170918	best: 9.1170918 (8)	total: 132ms	remaining: 162ms
9:	learn: 9.6417475	test: 9.1322980	best: 9.1170918 (8)	total: 142ms	remaining: 142ms
10:	learn: 9.6317605	test: 9.1334745	best: 9.1170918 (8)	total: 150ms	remaining: 123ms
11:	learn: 9.5803715	test: 9.1405149	best: 9.11709

<catboost.core.CatBoostRegressor at 0x228c9b20688>

In [18]:
# Get predictions and metrics for `full_reads_percent`.
# Both splits are predicted from the prebuilt Pools, so train and validation go through
# the same feature / categorical-column setup instead of re-slicing the raw frame for train only.
preds_train_frp = cb_model_frp.predict(train_ds_frp)
preds_val_frp   = cb_model_frp.predict(val_ds_frp)

# R^2 on each split
train_score_frp = r2_score(y_train["full_reads_percent"], preds_train_frp)
val_score_frp   = r2_score(y_val["full_reads_percent"],   preds_val_frp)

train_score_frp, val_score_frp

(0.23313112177548145, 0.2305089887333156)

In [19]:
def _weighted_score(views_r2, depth_r2, frp_r2):
    """Combine the three per-target R^2 values with weights 0.4 / 0.3 / 0.3."""
    return 0.4 * views_r2 + 0.3 * depth_r2 + 0.3 * frp_r2


# Overall score per split (presumably the competition metric; confirm the weights against its rules).
score_train = _weighted_score(train_score_views, train_score_depth, train_score_frp)
score_val   = _weighted_score(val_score_views, val_score_depth, val_score_frp)

score_train, score_val

(0.46673042900461154, 0.43034861641062294)

## save models

In [20]:
# Persist each fitted model into DIR_MODELS in CatBoost's "cbm" format.
for model_file, fitted_model in (
    ('cb_views.cbm', cb_model_views),
    ('cb_depth.cbm', cb_model_depth),
    ('cb_frp.cbm',   cb_model_frp),
):
    fitted_model.save_model(
        os.path.join(DIR_MODELS, model_file),
        format="cbm",
        export_parameters=None,
        pool=None,
    )

## Make predictions

In [21]:
# Predict all three targets for the test set from one shared feature slice.
test_features = df_test[cat_cols + num_cols]

pred_views = cb_model_views.predict(test_features)
pred_depth = cb_model_depth.predict(test_features)
pred_frp   = cb_model_frp.predict(test_features)

In [22]:
# Assemble the submission: one row per test document (ids come from the df_test index)
# followed by the three predicted targets.
subm = pd.DataFrame({
    'document_id': df_test.index,
    'views': pred_views,
    'depth': pred_depth,
    'full_reads_percent': pred_frp,
})

In [23]:
# Preview the first rows of the submission frame
subm.head()

Unnamed: 0,document_id,views,depth,full_reads_percent
0,61f9569a9a794794245a82abJ0AvX96vTAaQCiWVbzoMdw,16926.122021,1.176089,36.264629
1,628c22b89a79470e553f594bQS5CqzXYRnmDdR2LaSreEw,7331.857768,1.063776,32.743696
2,627cb3249a7947ebdd752865XVsoyrUOT8OJJg2_finJhw,8209.656081,1.06623,30.633026
3,628618629a7947d4927eb812upfii3whSSuMXCqcqF8VbQ,7348.966911,1.057085,37.624757
4,620e76109a7947235623695b5hzCiIHdSYKQIr8WAM18bw,160489.198657,1.164103,31.792748


In [24]:
# Write the submission to DIR_SUBM, leaving out the positional row index.
subm_path = os.path.join(DIR_SUBM, '1_cb_baseline_test.csv')
subm.to_csv(subm_path, index=False)