In [1]:
from datetime import timedelta

import pandas as pd
import numpy as np

from catboost import CatBoostRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold

In [2]:
data = pd.read_csv('train_dataset_train.csv', sep=';', dtype={'VISIT_MONTH_YEAR': str})

In [3]:
data['MONTH'] = data['VISIT_MONTH_YEAR'].str.split('.').str.get(0).astype(int)
data['YEAR'] = data['VISIT_MONTH_YEAR'].str.split('.').str.get(1).astype(int)
data.drop('VISIT_MONTH_YEAR', axis=1, inplace=True)
data['CODE_TYPE'] = data['MKB_CODE'].str[0]
data['CODE_CATEGORY'] = data['MKB_CODE'].str[:3]

In [4]:
data.sort_values(['YEAR', 'MONTH'], inplace=True)

In [5]:
adres_stat = data.groupby('ADRES')['PATIENT_ID_COUNT'].sum().reset_index()

In [6]:
adres_stat

Unnamed: 0,ADRES,PATIENT_ID_COUNT
0,Багратионовск,119629
1,Балтийск,227475
2,Березовка,4254
3,Большаково,9069
4,Большое Исаково,40875
...,...,...
113,Шоссейное,11847
114,Южный,7784
115,Янтарный,31022
116,Ясная Поляна,3432


In [7]:
adres_stat['size'] = 0
adres_stat.loc[adres_stat['PATIENT_ID_COUNT'] > 1000, 'size'] = 1
adres_stat.loc[adres_stat['PATIENT_ID_COUNT'] > 10000, 'size'] = 2
adres_stat.loc[adres_stat['PATIENT_ID_COUNT'] > 100000, 'size'] = 3
adres_stat.loc[adres_stat['PATIENT_ID_COUNT'] > 1000000, 'size'] = 4

In [8]:
adres_stat.drop('PATIENT_ID_COUNT', axis=1, inplace=True)

In [9]:
data = data.merge(adres_stat, how='left')

In [10]:
test = pd.read_csv('test_dataset_test.csv', sep=';', dtype={'VISIT_MONTH_YEAR': str})
test['MONTH'] = test['VISIT_MONTH_YEAR'].str.split('.').str.get(0).astype(int)
test['YEAR'] = test['VISIT_MONTH_YEAR'].str.split('.').str.get(1).astype(int)
test.drop('VISIT_MONTH_YEAR', axis=1, inplace=True)
test['CODE_TYPE'] = test['MKB_CODE'].str[0]
test['CODE_CATEGORY'] = test['MKB_CODE'].str[:3]
test = test.merge(adres_stat, how='left')

In [11]:
data[['YEAR', 'MONTH']].iloc[0], data[['YEAR', 'MONTH']].iloc[-1]

(YEAR     18
 MONTH     1
 Name: 0, dtype: int64,
 YEAR     22
 MONTH     3
 Name: 2212392, dtype: int64)

In [12]:
test[['YEAR', 'MONTH']].iloc[0], test[['YEAR', 'MONTH']].iloc[-1]

(YEAR     22
 MONTH     4
 Name: 0, dtype: int64,
 YEAR     22
 MONTH     4
 Name: 39372, dtype: int64)

In [13]:
data['DAY'] = 1
test['DAY'] = 1

In [14]:
data['YEAR'] = 2000+data['YEAR']
test['YEAR'] = 2000+test['YEAR']

In [15]:
data['DATE'] = pd.to_datetime(data[['YEAR', 'MONTH', 'DAY']])
test['DATE'] = pd.to_datetime(test[['YEAR', 'MONTH', 'DAY']])

In [16]:
data['PREV_MONTH'] = (data['DATE'] - timedelta(days=28)).dt.to_period('M')
test['PREV_MONTH'] = (test['DATE'] - timedelta(days=28)).dt.to_period('M')

In [17]:
data['PREV_2_MONTH'] = (data['DATE'] - timedelta(days=58)).dt.to_period('M')
test['PREV_2_MONTH'] = (test['DATE'] - timedelta(days=58)).dt.to_period('M')

In [18]:
data['PREV_YEAR'] = (data['DATE'] - timedelta(days=365)).dt.to_period('M')
test['PREV_YEAR'] = (test['DATE'] - timedelta(days=365)).dt.to_period('M')

In [19]:
data['PREV_11_MONTH'] = (data['DATE'] - timedelta(days=365-28)).dt.to_period('M')
test['PREV_11_MONTH'] = (test['DATE'] - timedelta(days=365-28)).dt.to_period('M')

In [20]:
data['PREV_13_MONTH'] = (data['DATE'] - timedelta(days=365+28)).dt.to_period('M')
test['PREV_13_MONTH'] = (test['DATE'] - timedelta(days=365+28)).dt.to_period('M')

In [21]:
data['DATE'] = data['DATE'].dt.to_period('M')
test['DATE'] = test['DATE'].dt.to_period('M')

In [22]:
data

Unnamed: 0,PATIENT_SEX,MKB_CODE,ADRES,AGE_CATEGORY,PATIENT_ID_COUNT,MONTH,YEAR,CODE_TYPE,CODE_CATEGORY,size,DAY,DATE,PREV_MONTH,PREV_2_MONTH,PREV_YEAR,PREV_11_MONTH,PREV_13_MONTH
0,0,A02.0,Калининград,children,3,1,2018,A,A02,4,1,2018-01,2017-12,2017-11,2017-01,2017-01,2016-12
1,0,A02.0,Калининград,elderly,1,1,2018,A,A02,4,1,2018-01,2017-12,2017-11,2017-01,2017-01,2016-12
2,0,A02.0,Калининград,young,2,1,2018,A,A02,4,1,2018-01,2017-12,2017-11,2017-01,2017-01,2016-12
3,0,A02.0,СТ Искра ул. Тюльпановая,children,1,1,2018,A,A02,1,1,2018-01,2017-12,2017-11,2017-01,2017-01,2016-12
4,0,A02,Калининград,children,1,1,2018,A,A02,4,1,2018-01,2017-12,2017-11,2017-01,2017-01,2016-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2212388,1,Z96.6,Калининград,middleage,1,3,2022,Z,Z96,4,1,2022-03,2022-02,2022-01,2021-03,2021-03,2021-02
2212389,1,Z96.6,Пионерский,middleage,1,3,2022,Z,Z96,3,1,2022-03,2022-02,2022-01,2021-03,2021-03,2021-02
2212390,1,Z98.8,Калининград,children,1,3,2022,Z,Z98,4,1,2022-03,2022-02,2022-01,2021-03,2021-03,2021-02
2212391,1,Z98.8,Озерск,elderly,1,3,2022,Z,Z98,2,1,2022-03,2022-02,2022-01,2021-03,2021-03,2021-02


In [23]:
data = data.merge(data[['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'PATIENT_ID_COUNT', 'DATE']], 
                  left_on=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'PREV_MONTH'], 
                  right_on=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_month'), how='left')
data = data.merge(data[['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'PATIENT_ID_COUNT', 'DATE']], 
                  left_on=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'PREV_YEAR'], 
                  right_on=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_year'), how='left')
data = data.merge(data[['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'PATIENT_ID_COUNT', 'DATE']], 
                  left_on=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'PREV_2_MONTH'], 
                  right_on=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_2_month'), how='left')
data = data.merge(data[['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'PATIENT_ID_COUNT', 'DATE']], 
                  left_on=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'PREV_11_MONTH'], 
                  right_on=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_11_month'), how='left')
data = data.merge(data[['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'PATIENT_ID_COUNT', 'DATE']], 
                  left_on=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'PREV_13_MONTH'], 
                  right_on=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_13_month'), how='left')

In [24]:
agg_data = data.groupby(['PATIENT_SEX', 'MKB_CODE', 'size', 'AGE_CATEGORY', 'DATE'])['PATIENT_ID_COUNT'].mean().reset_index()
data = data.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'MKB_CODE', 'size', 'AGE_CATEGORY', 'PREV_MONTH'], 
                  right_on=['PATIENT_SEX', 'MKB_CODE', 'size', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_month_size'), how='left')
data = data.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'MKB_CODE', 'size', 'AGE_CATEGORY', 'PREV_YEAR'], 
                  right_on=['PATIENT_SEX', 'MKB_CODE', 'size', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_year_size'), how='left')
data = data.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'MKB_CODE', 'size', 'AGE_CATEGORY', 'PREV_2_MONTH'], 
                  right_on=['PATIENT_SEX', 'MKB_CODE', 'size', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_2_month_size'), how='left')
data = data.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'MKB_CODE', 'size', 'AGE_CATEGORY', 'PREV_11_MONTH'], 
                  right_on=['PATIENT_SEX', 'MKB_CODE', 'size', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_11_month_size'), how='left')
data = data.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'MKB_CODE', 'size', 'AGE_CATEGORY', 'PREV_13_MONTH'], 
                  right_on=['PATIENT_SEX', 'MKB_CODE', 'size', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_13_month_size'), how='left')

In [25]:
agg_data = data.groupby(['PATIENT_SEX', 'CODE_TYPE', 'ADRES', 'AGE_CATEGORY', 'DATE'])['PATIENT_ID_COUNT'].mean().reset_index()
data = data.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'CODE_TYPE', 'ADRES', 'AGE_CATEGORY', 'PREV_MONTH'], 
                  right_on=['PATIENT_SEX', 'CODE_TYPE', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_month_code_type'), how='left')
data = data.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'CODE_TYPE', 'ADRES', 'AGE_CATEGORY', 'PREV_YEAR'], 
                  right_on=['PATIENT_SEX', 'CODE_TYPE', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_year_code_type'), how='left')
data = data.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'CODE_TYPE', 'ADRES', 'AGE_CATEGORY', 'PREV_2_MONTH'], 
                  right_on=['PATIENT_SEX', 'CODE_TYPE', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_2_month_code_type'), how='left')
data = data.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'CODE_TYPE', 'ADRES', 'AGE_CATEGORY', 'PREV_11_MONTH'], 
                  right_on=['PATIENT_SEX', 'CODE_TYPE', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_11_month_code_type'), how='left')
data = data.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'CODE_TYPE', 'ADRES', 'AGE_CATEGORY', 'PREV_13_MONTH'], 
                  right_on=['PATIENT_SEX', 'CODE_TYPE', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_13_month_code_type'), how='left')

In [26]:
agg_data = data.groupby(['PATIENT_SEX', 'CODE_CATEGORY', 'ADRES', 'AGE_CATEGORY', 'DATE'])['PATIENT_ID_COUNT'].mean().reset_index()
data = data.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'CODE_CATEGORY', 'ADRES', 'AGE_CATEGORY', 'PREV_MONTH'], 
                  right_on=['PATIENT_SEX', 'CODE_CATEGORY', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_month_code_category'), how='left')
data = data.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'CODE_CATEGORY', 'ADRES', 'AGE_CATEGORY', 'PREV_YEAR'], 
                  right_on=['PATIENT_SEX', 'CODE_CATEGORY', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_year_code_category'), how='left')
data = data.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'CODE_CATEGORY', 'ADRES', 'AGE_CATEGORY', 'PREV_2_MONTH'], 
                  right_on=['PATIENT_SEX', 'CODE_CATEGORY', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_2_month_code_category'), how='left')
data = data.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'CODE_CATEGORY', 'ADRES', 'AGE_CATEGORY', 'PREV_11_MONTH'], 
                  right_on=['PATIENT_SEX', 'CODE_CATEGORY', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_11_month_code_category'), how='left')
data = data.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'CODE_CATEGORY', 'ADRES', 'AGE_CATEGORY', 'PREV_13_MONTH'], 
                  right_on=['PATIENT_SEX', 'CODE_CATEGORY', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_13_month_code_category'), how='left')

In [27]:
test = test.merge(data[['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'PATIENT_ID_COUNT', 'DATE']], 
                  left_on=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'PREV_MONTH'], 
                  right_on=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_month'), how='left')
test = test.merge(data[['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'PATIENT_ID_COUNT', 'DATE']], 
                  left_on=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'PREV_YEAR'], 
                  right_on=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_year'), how='left')
test = test.merge(data[['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'PATIENT_ID_COUNT', 'DATE']], 
                  left_on=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'PREV_2_MONTH'], 
                  right_on=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_2_month'), how='left')
test = test.merge(data[['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'PATIENT_ID_COUNT', 'DATE']], 
                  left_on=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'PREV_11_MONTH'], 
                  right_on=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_11_month'), how='left')
test = test.merge(data[['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'PATIENT_ID_COUNT', 'DATE']], 
                  left_on=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'PREV_13_MONTH'], 
                  right_on=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_13_month'), how='left')

In [28]:
agg_data = data.groupby(['PATIENT_SEX', 'MKB_CODE', 'size', 'AGE_CATEGORY', 'DATE'])['PATIENT_ID_COUNT'].mean().reset_index()
test = test.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'MKB_CODE', 'size', 'AGE_CATEGORY', 'PREV_MONTH'], 
                  right_on=['PATIENT_SEX', 'MKB_CODE', 'size', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_month_size'), how='left')
test = test.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'MKB_CODE', 'size', 'AGE_CATEGORY', 'PREV_YEAR'], 
                  right_on=['PATIENT_SEX', 'MKB_CODE', 'size', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_year_size'), how='left')
test = test.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'MKB_CODE', 'size', 'AGE_CATEGORY', 'PREV_2_MONTH'], 
                  right_on=['PATIENT_SEX', 'MKB_CODE', 'size', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_2_month_size'), how='left')
test = test.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'MKB_CODE', 'size', 'AGE_CATEGORY', 'PREV_11_MONTH'], 
                  right_on=['PATIENT_SEX', 'MKB_CODE', 'size', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_11_month_size'), how='left')
test = test.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'MKB_CODE', 'size', 'AGE_CATEGORY', 'PREV_13_MONTH'], 
                  right_on=['PATIENT_SEX', 'MKB_CODE', 'size', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_13_month_size'), how='left')

In [29]:
agg_data = data.groupby(['PATIENT_SEX', 'CODE_TYPE', 'ADRES', 'AGE_CATEGORY', 'DATE'])['PATIENT_ID_COUNT'].mean().reset_index()
test = test.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'CODE_TYPE', 'ADRES', 'AGE_CATEGORY', 'PREV_MONTH'], 
                  right_on=['PATIENT_SEX', 'CODE_TYPE', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_month_code_type'), how='left')
test = test.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'CODE_TYPE', 'ADRES', 'AGE_CATEGORY', 'PREV_YEAR'], 
                  right_on=['PATIENT_SEX', 'CODE_TYPE', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_year_code_type'), how='left')
test = test.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'CODE_TYPE', 'ADRES', 'AGE_CATEGORY', 'PREV_2_MONTH'], 
                  right_on=['PATIENT_SEX', 'CODE_TYPE', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_2_month_code_type'), how='left')
test = test.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'CODE_TYPE', 'ADRES', 'AGE_CATEGORY', 'PREV_11_MONTH'], 
                  right_on=['PATIENT_SEX', 'CODE_TYPE', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_11_month_code_type'), how='left')
test = test.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'CODE_TYPE', 'ADRES', 'AGE_CATEGORY', 'PREV_13_MONTH'], 
                  right_on=['PATIENT_SEX', 'CODE_TYPE', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_13_month_code_type'), how='left')

In [30]:
agg_data = data.groupby(['PATIENT_SEX', 'CODE_CATEGORY', 'ADRES', 'AGE_CATEGORY', 'DATE'])['PATIENT_ID_COUNT'].mean().reset_index()
test = test.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'CODE_CATEGORY', 'ADRES', 'AGE_CATEGORY', 'PREV_MONTH'], 
                  right_on=['PATIENT_SEX', 'CODE_CATEGORY', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_month_code_category'), how='left')
test = test.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'CODE_CATEGORY', 'ADRES', 'AGE_CATEGORY', 'PREV_YEAR'], 
                  right_on=['PATIENT_SEX', 'CODE_CATEGORY', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_year_code_category'), how='left')
test = test.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'CODE_CATEGORY', 'ADRES', 'AGE_CATEGORY', 'PREV_2_MONTH'], 
                  right_on=['PATIENT_SEX', 'CODE_CATEGORY', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_2_month_code_category'), how='left')
test = test.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'CODE_CATEGORY', 'ADRES', 'AGE_CATEGORY', 'PREV_11_MONTH'], 
                  right_on=['PATIENT_SEX', 'CODE_CATEGORY', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_11_month_code_category'), how='left')
test = test.merge(agg_data, 
                  left_on=['PATIENT_SEX', 'CODE_CATEGORY', 'ADRES', 'AGE_CATEGORY', 'PREV_13_MONTH'], 
                  right_on=['PATIENT_SEX', 'CODE_CATEGORY', 'ADRES', 'AGE_CATEGORY', 'DATE'], suffixes=('', '_prev_13_month_code_category'), how='left')

In [31]:
test.rename({'PATIENT_ID_COUNT': 'PATIENT_ID_COUNT_prev_month'}, axis=1, inplace=True)

In [32]:
train = data[(data['YEAR'] <= 2020) | ((data['YEAR'] == 2021) & (data['MONTH'] < 4))]
val = data[(data['YEAR'] == 2021) & (data['MONTH'] == 4)]

In [33]:
train.columns

Index(['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'PATIENT_ID_COUNT',
       'MONTH', 'YEAR', 'CODE_TYPE', 'CODE_CATEGORY', 'size', 'DAY', 'DATE',
       'PREV_MONTH', 'PREV_2_MONTH', 'PREV_YEAR', 'PREV_11_MONTH',
       'PREV_13_MONTH', 'PATIENT_ID_COUNT_prev_month', 'DATE_prev_month',
       'PATIENT_ID_COUNT_prev_year', 'DATE_prev_year',
       'PATIENT_ID_COUNT_prev_2_month', 'DATE_prev_2_month',
       'PATIENT_ID_COUNT_prev_11_month', 'DATE_prev_11_month',
       'PATIENT_ID_COUNT_prev_13_month', 'DATE_prev_13_month',
       'DATE_prev_month_size', 'PATIENT_ID_COUNT_prev_month_size',
       'DATE_prev_year_size', 'PATIENT_ID_COUNT_prev_year_size',
       'DATE_prev_2_month_size', 'PATIENT_ID_COUNT_prev_2_month_size',
       'DATE_prev_11_month_size', 'PATIENT_ID_COUNT_prev_11_month_size',
       'DATE_prev_13_month_size', 'PATIENT_ID_COUNT_prev_13_month_size',
       'DATE_prev_month_code_type', 'PATIENT_ID_COUNT_prev_month_code_type',
       'DATE_prev_year_code_type', 

In [34]:
drop_fields = [
    'YEAR', 'DATE', 'PREV_MONTH',  'PREV_YEAR', 'PREV_2_MONTH', 'PREV_11_MONTH', 'PREV_13_MONTH', 
    'DATE_prev_month', 'DATE_prev_year', 'DATE_prev_2_month', 'DATE_prev_11_month', 'DATE_prev_13_month',
    'DATE_prev_month_size', 'DATE_prev_year_size', 'DATE_prev_2_month_size', 'DATE_prev_11_month_size', 'DATE_prev_13_month_size',
    'DATE_prev_month_code_type', 'DATE_prev_year_code_type', 'DATE_prev_2_month_code_type', 'DATE_prev_11_month_code_type', 'DATE_prev_13_month_code_type',
    'DATE_prev_month_code_category', 'DATE_prev_year_code_category', 'DATE_prev_2_month_code_category', 'DATE_prev_11_month_code_category', 'DATE_prev_13_month_code_category',
]
data.drop(drop_fields, axis=1, inplace=True)
train.drop(drop_fields, axis=1, inplace=True)
val.drop(drop_fields, axis=1, inplace=True)
test.drop(drop_fields, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [35]:
train

Unnamed: 0,PATIENT_SEX,MKB_CODE,ADRES,AGE_CATEGORY,PATIENT_ID_COUNT,MONTH,CODE_TYPE,CODE_CATEGORY,size,DAY,...,PATIENT_ID_COUNT_prev_month_code_type,PATIENT_ID_COUNT_prev_year_code_type,PATIENT_ID_COUNT_prev_2_month_code_type,PATIENT_ID_COUNT_prev_11_month_code_type,PATIENT_ID_COUNT_prev_13_month_code_type,PATIENT_ID_COUNT_prev_month_code_category,PATIENT_ID_COUNT_prev_year_code_category,PATIENT_ID_COUNT_prev_2_month_code_category,PATIENT_ID_COUNT_prev_11_month_code_category,PATIENT_ID_COUNT_prev_13_month_code_category
0,0,A02.0,Калининград,children,3,1,A,A02,4,1,...,,,,,,,,,,
1,0,A02.0,Калининград,elderly,1,1,A,A02,4,1,...,,,,,,,,,,
2,0,A02.0,Калининград,young,2,1,A,A02,4,1,...,,,,,,,,,,
3,0,A02.0,СТ Искра ул. Тюльпановая,children,1,1,A,A02,1,1,...,,,,,,,,,,
4,0,A02,Калининград,children,1,1,A,A02,4,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1820827,1,Z96.6,Калининград,middleage,3,3,Z,Z96,4,1,...,77.294118,17.050847,41.000000,17.050847,17.019608,1.0,,,,
1820828,1,Z96.6,Калининград,young,1,3,Z,Z96,4,1,...,68.187500,39.273973,40.397059,39.273973,35.388889,1.0,,,,
1820829,1,Z96.6,Озерск,middleage,1,3,Z,Z96,2,1,...,19.333333,1.200000,5.666667,1.200000,3.000000,,,,,
1820830,1,Z96.6,Пионерский,young,1,3,Z,Z96,3,1,...,6.450000,5.571429,5.933333,5.571429,5.538462,,,,,


In [36]:
val_preds = []
for train_index, test_index in KFold(n_splits=5).split(train):

    model = CatBoostRegressor(
        eval_metric='R2',
        early_stopping_rounds=20,
        cat_features=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'CODE_TYPE', 'MONTH', 'size', 'CODE_CATEGORY']
    )
    model.fit(
        train.iloc[train_index].drop('PATIENT_ID_COUNT', axis=1), train.iloc[train_index]['PATIENT_ID_COUNT'], 
        eval_set=(val.drop('PATIENT_ID_COUNT', axis=1), val['PATIENT_ID_COUNT'])
    )
    val_preds.append(model.predict(val.drop('PATIENT_ID_COUNT', axis=1)))

Learning rate set to 0.159788
0:	learn: 0.2410143	test: 0.2035919	best: 0.2035919 (0)	total: 672ms	remaining: 11m 11s
1:	learn: 0.4191100	test: 0.3323445	best: 0.3323445 (1)	total: 1.28s	remaining: 10m 39s
2:	learn: 0.5497697	test: 0.4614591	best: 0.4614591 (2)	total: 1.77s	remaining: 9m 47s
3:	learn: 0.6443299	test: 0.5449829	best: 0.5449829 (3)	total: 2.24s	remaining: 9m 17s
4:	learn: 0.7131086	test: 0.5896533	best: 0.5896533 (4)	total: 2.75s	remaining: 9m 6s
5:	learn: 0.7642505	test: 0.6237408	best: 0.6237408 (5)	total: 3.22s	remaining: 8m 53s
6:	learn: 0.8013997	test: 0.6635546	best: 0.6635546 (6)	total: 3.61s	remaining: 8m 32s
7:	learn: 0.8282829	test: 0.6930999	best: 0.6930999 (7)	total: 3.99s	remaining: 8m 14s
8:	learn: 0.8494397	test: 0.7026951	best: 0.7026951 (8)	total: 4.35s	remaining: 7m 59s
9:	learn: 0.8658444	test: 0.7152265	best: 0.7152265 (9)	total: 4.75s	remaining: 7m 50s
10:	learn: 0.8778080	test: 0.7319890	best: 0.7319890 (10)	total: 5.19s	remaining: 7m 47s
11:	learn:

In [37]:
pred = np.array(val_preds).mean(axis=0) * 1.2

r2_score(val['PATIENT_ID_COUNT'], pred.clip(min=1).round())

0.9138139857592371

In [38]:
test_preds = []
for train_index, test_index in KFold(n_splits=5).split(data):

    model = CatBoostRegressor(
        iterations=150,
        learning_rate=0.159788,
        cat_features=['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY', 'CODE_TYPE', 'MONTH', 'size', 'CODE_CATEGORY']
    )
    model.fit(
        data.iloc[train_index].drop('PATIENT_ID_COUNT', axis=1), data.iloc[train_index]['PATIENT_ID_COUNT']
    )
    test_preds.append(model.predict(test))

0:	learn: 56.1064867	total: 307ms	remaining: 45.8s
1:	learn: 49.5173269	total: 603ms	remaining: 44.6s
2:	learn: 44.0270673	total: 799ms	remaining: 39.1s
3:	learn: 39.5478109	total: 1.06s	remaining: 38.8s
4:	learn: 35.7918175	total: 1.27s	remaining: 36.9s
5:	learn: 32.7613312	total: 1.5s	remaining: 35.9s
6:	learn: 30.2970353	total: 1.69s	remaining: 34.5s
7:	learn: 28.3712741	total: 1.89s	remaining: 33.6s
8:	learn: 26.8520624	total: 2.07s	remaining: 32.4s
9:	learn: 25.5582257	total: 2.24s	remaining: 31.4s
10:	learn: 24.5334912	total: 2.44s	remaining: 30.9s
11:	learn: 23.6348023	total: 2.64s	remaining: 30.4s
12:	learn: 22.9150676	total: 2.83s	remaining: 29.8s
13:	learn: 22.1726549	total: 3.04s	remaining: 29.5s
14:	learn: 21.6774762	total: 3.23s	remaining: 29.1s
15:	learn: 21.1855968	total: 3.44s	remaining: 28.8s
16:	learn: 20.7193219	total: 3.62s	remaining: 28.3s
17:	learn: 20.2458540	total: 3.79s	remaining: 27.8s
18:	learn: 19.9222161	total: 3.98s	remaining: 27.5s
19:	learn: 19.6617948	t

In [39]:
pred = np.array(test_preds).mean(axis=0) * 1.2

In [40]:
test = pd.read_csv('test_dataset_test.csv', sep=';', dtype={'VISIT_MONTH_YEAR': str})
test['PATIENT_ID_COUNT'] = pred.clip(min=1).round().astype(int)

In [41]:
test['PATIENT_ID_COUNT']

0        3
1        2
2        2
3        2
4        2
        ..
39368    2
39369    2
39370    2
39371    2
39372    2
Name: PATIENT_ID_COUNT, Length: 39373, dtype: int64

In [42]:
test.to_csv('submission.csv', sep=';', index=False)

In [43]:
list(zip(model.feature_names_, model.feature_importances_))

[('PATIENT_SEX', 0.07991665126733129),
 ('MKB_CODE', 2.7520095169641596),
 ('ADRES', 0.4728776938850512),
 ('AGE_CATEGORY', 1.3407099381705967),
 ('MONTH', 2.1077550641560188),
 ('CODE_TYPE', 0.4908465108969259),
 ('CODE_CATEGORY', 0.6805861988469861),
 ('size', 1.4808584728274818),
 ('DAY', 0.0),
 ('PATIENT_ID_COUNT_prev_month', 60.162200147288615),
 ('PATIENT_ID_COUNT_prev_year', 6.706338347920101),
 ('PATIENT_ID_COUNT_prev_2_month', 2.9130260575293105),
 ('PATIENT_ID_COUNT_prev_11_month', 2.187346639544919),
 ('PATIENT_ID_COUNT_prev_13_month', 3.505136624768918),
 ('PATIENT_ID_COUNT_prev_month_size', 5.773624448408061),
 ('PATIENT_ID_COUNT_prev_year_size', 0.1206114168695087),
 ('PATIENT_ID_COUNT_prev_2_month_size', 0.14970380058718077),
 ('PATIENT_ID_COUNT_prev_11_month_size', 0.3600057535370258),
 ('PATIENT_ID_COUNT_prev_13_month_size', 0.10354525155020981),
 ('PATIENT_ID_COUNT_prev_month_code_type', 2.6611391828622),
 ('PATIENT_ID_COUNT_prev_year_code_type', 0.6527224923785729),
