In [1]:
import time
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm
import datetime as dt
from sklearn.preprocessing import OrdinalEncoder
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.utils.class_weight import compute_class_weight
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence library warnings for the whole session. Note that this also hides
# deprecation warnings coming from pandas / scikit-learn / catboost.
warnings.filterwarnings('ignore')

# Show every column when a wide frame is displayed. The option name is spelled
# out in full: abbreviated names are resolved by pattern matching, and on pandas
# versions that also define 'styler.render.max_columns' the pattern 'max_column'
# matches more than one option and raises OptionError.
pd.set_option('display.max_columns', None)
# Register tqdm's pandas integration (adds the `progress_apply` family of methods).
tqdm.pandas()

In [5]:
# Columns passed to CatBoost as categorical features (see the Pool construction).
cat_features = [
    'emploee_position',
    'SOURCE_SYSTEM',
    'OPERATION',
    'CHANNEL_GR',
    'urf_code_uni',
    'PRODUCT_NAME',
    'day_of_week',
]

# Remaining (numeric) feature columns, one per line to keep diffs readable.
num_features = [
    'date_report',
    'emp_age',
    'report-start_job',
    'finish_job-report',
    'QUANTITY',
    'vl_up',
    'insert-report',
    'update-report',
    'client_age',
    'client_is_emploee',
    'emploee_in_vacation',
    'emp_oper_per_day',
    'emp_oper_per_month',
    'client_product_per_day',
    'client_product_at_moment',
    'empcl_amt',
    'empcl_diff_dates_before',
    'empcl_diff_dates_after',
    'clemp_amt',
    'clemp_diff_dates_before',
    'clemp_diff_dates_after',
]

# Загрузка и добавление новых признаков

In [None]:
# Main operations table. thousands=',' makes pandas parse numbers written with
# a comma as the thousands separator; the first CSV column becomes the index.
data = pd.read_csv(
    'dataset_1.csv',
    index_col=0,
    thousands=',',
)

# Auxiliary tables that are joined onto `data` by 'ID_Operation_History' further down.
clemp = pd.read_csv(
    'clemp_data.csv',
    index_col=0,
)
empcl = pd.read_csv(
    'empcl_data.csv',
    index_col=0,
)

# Read without index_col; the frame carries an 'Unnamed: 0' column that the
# next cell drops.
prod = pd.read_csv(
    'product_at_moment.csv',
)

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index saved together with the
# CSV) and collapse exact duplicate rows.
# errors='ignore' keeps the cell re-runnable: with the default errors='raise' a
# second execution fails with KeyError because the column is already gone.
prod = prod.drop(columns='Unnamed: 0', errors='ignore').drop_duplicates()

In [None]:
# Preview the `prod` table (rich display of the cell's last expression).
prod

In [None]:
# Preview the `empcl` table (rich display of the cell's last expression).
empcl

In [None]:
# Preview the `clemp` table (rich display of the cell's last expression).
clemp

In [None]:
# Drop the 'DESCRIPTION' and 'IN_MOTIV' columns, then remove exact duplicate rows.
# errors='ignore' keeps the cell re-runnable: with the default errors='raise' a
# second execution fails with KeyError because the columns are already gone.
data = (
    data
    .drop(columns=['DESCRIPTION', 'IN_MOTIV'], errors='ignore')
    .drop_duplicates()
)

In [None]:
# Attach the extra features to every operation. All three joins are left joins
# on 'ID_Operation_History', so operations without a match keep NaN in the new
# columns (filled in a later cell).
data = (
    data
    .merge(
        prod[['ID_Operation_History', 'client_product_at_moment']],
        on='ID_Operation_History',
        how='left',
    )
    .merge(empcl, on='ID_Operation_History', how='left')
    .merge(clemp, on='ID_Operation_History', how='left')
)

Придётся убрать 'urf_code_uni' из категориальных признаков, так как по всем банкам значений этого признака будет больше.

# Подготовка новых признаков и исправление старых

In [None]:
# Print dtypes and non-null counts for every column.
# `show_counts` (pandas >= 1.2) replaces the deprecated `null_counts` keyword,
# which was removed in pandas 2.0.
data.info(show_counts=True)

In [None]:
# Reference values used for imputation in the next cell.
# Employee age: mean over strictly positive ages only (rows that fail the
# `> 0` test, including NaN, do not contribute).
mean_emp_age = data.loc[data['emp_age'] > 0, 'emp_age'].mean()

# Client age: mean over the non-missing values.
mean_client_age = data['client_age'].dropna().mean()

In [None]:
# Replace zero or negative employee ages with the mean of the valid ones.
# NOTE(review): NaN ages do not satisfy `<= 0` and are left untouched — confirm
# that this is intended.
invalid_emp_age = data['emp_age'] <= 0
data.loc[invalid_emp_age, 'emp_age'] = mean_emp_age

# Missing client ages get the mean client age.
data['client_age'] = data['client_age'].fillna(mean_client_age)

# Binarise the target: any non-zero value becomes 1, zero stays 0.
data['target'] = (data['target'] != 0).astype(int)

In [None]:
# Columns whose missing values are replaced by 0.
zero_filled_columns = [
    'emp_oper_per_day',
    'emp_oper_per_month',
    'client_product_at_moment',
    'empcl_amt',
    'clemp_amt',
]
for column in zero_filled_columns:
    data[column] = data[column].fillna(0)

# Date-difference columns: a missing value becomes a zero timedelta, then the
# column is converted to timedeltas and reduced to its whole-day component.
day_difference_columns = [
    'empcl_diff_dates_before',
    'empcl_diff_dates_after',
    'clemp_diff_dates_before',
    'clemp_diff_dates_after',
]
for column in day_difference_columns:
    as_timedelta = pd.to_timedelta(data[column].fillna(dt.timedelta(0)))
    data[column] = as_timedelta.dt.days

In [None]:
# List the columns of `data` after the feature preparation above.
data.columns

In [54]:
# Persist the prepared frame. The index is written as the first CSV column
# (the default, spelled out here) and is read back with index_col=0 in the
# tuning section.
data.to_csv('dataset_2.csv', index=True)

# Tuning hyperparameters

In [2]:
# Reload the prepared dataset written by the preparation section; the first CSV
# column (the saved index) is used as the index again.
data = pd.read_csv('dataset_2.csv', index_col=0)

In [3]:
# Feature matrix: everything except the label and the operation identifier.
# 'urf_code_uni' is a further candidate for removal (left in for now).
X = data.drop(columns=['target', 'ID_Operation_History'])

# Binary label.
y = data['target']

In [4]:
# Show only the header (zero rows) to check which feature columns are in X.
X.head(0)

Unnamed: 0,date_report,emploee_position,emp_age,report-start_job,finish_job-report,SOURCE_SYSTEM,OPERATION,CHANNEL_GR,urf_code_uni,QUANTITY,PRODUCT_NAME,vl_up,insert-report,update-report,client_age,client_is_emploee,emploee_in_vacation,emp_oper_per_day,emp_oper_per_month,day_of_week,client_product_per_day,client_product_at_moment,empcl_amt,empcl_diff_dates_before,empcl_diff_dates_after,clemp_amt,clemp_diff_dates_before,clemp_diff_dates_after


In [9]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=17,
)

In [10]:
# Per-class weights that compensate for the class imbalance in the training labels.
classes = np.unique(y_train)

# `classes` and `y` are passed by keyword: current scikit-learn releases accept
# them only as keyword arguments, and the positional call raises TypeError.
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)

# Mapping {class label -> weight}, handed to CatBoostClassifier(class_weights=...).
class_weights = dict(zip(classes, weights))

In [11]:
# Class balance of the training labels.
y_train.value_counts()

0    10536364
1     1113798
Name: target, dtype: int64

In [12]:
# Class balance of the held-out test labels.
y_test.value_counts()

0    4514617
1     478310
Name: target, dtype: int64

In [13]:
# Wrap both splits in CatBoost Pools so the categorical columns are declared
# once, next to the data they describe.
train_dataset = Pool(
    data=X_train,
    label=y_train,
    cat_features=cat_features,
)
test_dataset = Pool(
    data=X_test,
    label=y_test,
    cat_features=cat_features,
)

In [14]:
# Base estimator for the hyperparameter search: 300 boosting iterations on the
# GPU, with the class weights computed from the training labels.
clf = CatBoostClassifier(
    iterations=300,
    task_type='GPU',
    class_weights=class_weights,
)

In [13]:
# Plain fit of the base estimator, currently disabled; the recorded output below
# comes from a run that was stopped manually (KeyboardInterrupt).
#clf.fit(train_dataset, verbose=10)

Learning rate set to 0.020143
0:	learn: 0.6552272	total: 571ms	remaining: 9m 30s
10:	learn: 0.4213749	total: 6.03s	remaining: 9m 1s
20:	learn: 0.2986361	total: 12.3s	remaining: 9m 31s


KeyboardInterrupt: 

In [15]:
# Search space handed to the CatBoost grid / randomized search cells.
# Alternatives that are currently switched off:
#   depth:         range(1, 17, 5)
#   iterations:    [250, 500, 750]
#   learning_rate: [0.001, 0.01, 0.1]
#   border_count:  range(1, 201, 50)
params = dict(
    depth=[3, 6, 10],
    l2_leaf_reg=[1, 3, 5],
)

In [16]:
# Exhaustive search over `params` (3 x 3 = 9 combinations) with cv=5.
# NOTE(review): the recorded output of this cell ends with
# "CatBoostError: ... ClassLabels in dataprocessing options and in training data
# must match"; the cause is not visible in this notebook — TODO investigate
# (possibly related to how `class_weights` is passed).
grid_search_result = clf.grid_search(
    param_grid=params,
    X=train_dataset,
    cv=5,
)

0:	learn: 0.6523945	test: 0.6523119	best: 0.6523119 (0)	total: 320ms	remaining: 1m 35s
1:	learn: 0.6151605	test: 0.6150038	best: 0.6150038 (1)	total: 635ms	remaining: 1m 34s
2:	learn: 0.5812612	test: 0.5810349	best: 0.5810349 (2)	total: 950ms	remaining: 1m 34s
3:	learn: 0.5502780	test: 0.5499852	best: 0.5499852 (3)	total: 1.27s	remaining: 1m 33s
4:	learn: 0.5236544	test: 0.5233008	best: 0.5233008 (4)	total: 1.46s	remaining: 1m 26s
5:	learn: 0.4995327	test: 0.4991288	best: 0.4991288 (5)	total: 1.77s	remaining: 1m 26s
6:	learn: 0.4729799	test: 0.4724978	best: 0.4724978 (6)	total: 2.09s	remaining: 1m 27s
7:	learn: 0.4479265	test: 0.4473809	best: 0.4473809 (7)	total: 2.42s	remaining: 1m 28s
8:	learn: 0.4251681	test: 0.4245591	best: 0.4245591 (8)	total: 2.75s	remaining: 1m 28s
9:	learn: 0.4045938	test: 0.4039309	best: 0.4039309 (9)	total: 3.08s	remaining: 1m 29s
10:	learn: 0.3861052	test: 0.3853949	best: 0.3853949 (10)	total: 3.4s	remaining: 1m 29s
11:	learn: 0.3693826	test: 0.3686282	best:

CatBoostError: catboost/libs/train_lib/cross_validation.cpp:92: ClassLabels in dataprocessing options and in training data must match

In [None]:
# Randomized search over the same `params` space with cv=5.
# NOTE(review): `params` holds only 9 combinations, so sampling may not save
# anything over the grid search — confirm this cell is still needed.
rand_search_catboost = clf.randomized_search(
    param_distributions=params,
    X=train_dataset,
    cv=5,
)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 0.6392052	test: 0.6391197	best: 0.6391197 (0)	total: 443ms	remaining: 2m 12s
1:	learn: 0.5941003	test: 0.5939402	best: 0.5939402 (1)	total: 920ms	remaining: 2m 17s
2:	learn: 0.5503883	test: 0.5501508	best: 0.5501508 (2)	total: 1.35s	remaining: 2m 13s
3:	learn: 0.5189335	test: 0.5186241	best: 0.5186241 (3)	total: 1.86s	remaining: 2m 17s
4:	learn: 0.4893632	test: 0.4889842	best: 0.4889842 (4)	total: 2.37s	remaining: 2m 19s
5:	learn: 0.4615907	test: 0.4611457	best: 0.4611457 (5)	total: 2.84s	remaining: 2m 19s
6:	learn: 0.4386208	test: 0.4381186	best: 0.4381186 (6)	total: 3.3s	remaining: 2m 18s
7:	learn: 0.4177598	test: 0.4172071	best: 0.4172071 (7)	total: 3.76s	remaining: 2m 17s
8:	learn: 0.3963515	test: 0.3957575	best: 0.3957575 (8)	total: 4.23s	remaining: 2m 16s
9:	learn: 0.3736755	test: 0.3730298	best: 0.3730298 (9)	total: 4.77s	remaining: 2m 18s
10:	learn: 0.3553287	test: 0.3546305	best: 0.3546305 (10)	total: 5.21s	remaining: 2m 17s
11:	learn: 0.3374976	test: 0.3367590	best: