# Импорт библиотек

In [1]:
import os
from tqdm import tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.model_selection import *

from dateutil.relativedelta import relativedelta

# Метрика

In [2]:
np.random.seed(17)
gt = np.random.randint(0, 5, size=100,)
pr = np.random.randint(0, 5, size=100)

In [3]:
def compete_metric(gt, pr):

    res = 0
    norm_coef = [4, 3, 2, 3, 4]
    
    for j in range(5):
        
        diff = 0
        N = 0
    
        for i in range(gt.shape[0]):
            
            gt_val = gt[i]
            pr_val = pr[i]
            
            if gt_val == j:
            
                diff += abs(pr_val - gt_val)
                N += 1
                
        res += diff / N / norm_coef[j]
        
    return 1 - res / 5

In [4]:
compete_metric(gt, pr)

0.5213859456826209

# Создание датасета для обучения

In [73]:
# 2. Файл с результатами КТ-исследований (train) и файл аналогичной структуры для валидации результатов (evaluate).
# Необходимо проставить классы в таком же формате (КТ-0-4), в каком они указаны в исходном файле.
# IDX - анонимизированный идентификатор пациента. по нему файлы связаны между собой.
# DATE - Дата исследования
# KT_Result - Результат
kt_train = pd.read_csv('data/data_for_science_KT_train.csv', sep=',')

kt_test = pd.read_csv('data/data_for_science_KT_evaluate.csv', sep=',')
kt_train.head(1)

Unnamed: 0,idx,DATE,KT_RESULT
0,301478,2020-04-17,КТ-1


In [376]:
data_train = kt_train.copy()
data_test = kt_test.copy()

## Работа с kt

### Обработка kt_train и удаление грязи

In [377]:
# перевожу кт_резалт в инт
data_train.KT_RESULT = data_train.KT_RESULT.apply(lambda x: int(x.split('-')[-1]))

In [378]:
# убираю пользователей у которых в одну дату был не один диагноз
temp = data_train.groupby(['idx', 'DATE']).KT_RESULT.nunique().reset_index().rename(columns={'KT_RESULT': 'val'})
data_train = pd.merge(data_train, temp, on=['idx', 'DATE'], how='left')
data_train = data_train[data_train.val == 1].reset_index(drop=True).drop('val', axis=1)

In [379]:
data_train.groupby(['idx', 'DATE']).KT_RESULT.nunique().sort_values().reset_index()

Unnamed: 0,idx,DATE,KT_RESULT
0,6,2020-05-16,1
1,589147,2020-05-06,1
2,589147,2020-04-20,1
3,589146,2020-05-05,1
4,589141,2020-04-22,1
...,...,...,...
131984,172690,2020-05-26,1
131985,172682,2020-05-26,1
131986,172673,2020-05-26,1
131987,172483,2020-05-01,1


In [380]:
kt_train.groupby('KT_RESULT').idx.count()

KT_RESULT
КТ-0    48092
КТ-1    55685
КТ-2    21231
КТ-3     6678
КТ-4      603
Name: idx, dtype: int64

In [381]:
data_train.groupby('KT_RESULT').idx.count()

KT_RESULT
0    47984
1    55564
2    21171
3     6667
4      603
Name: idx, dtype: int64

In [382]:
data_test[data_test.idx.isin(data_train.idx.unique())]

Unnamed: 0,idx,DATE,KT_RESULT


### Добавление признака предыдущих КТ

In [383]:
data_train = data_train.sort_values(['idx', 'DATE']).reset_index(drop=True)
data_test = data_test.sort_values(['idx', 'DATE']).reset_index(drop=True)

In [384]:
# последний КТ

In [385]:
shift_train = data_train[~data_train[['idx', 'DATE']].duplicated()].copy()
shift_test = data_test[~data_test[['idx', 'DATE']].duplicated()].copy()

In [386]:
shift_train['shift_DATE'] = shift_train.groupby(['idx']).DATE.shift(1)
shift_train['days_btw_last_kt'] = (pd.to_datetime(shift_train.DATE) 
                                   - pd.to_datetime(shift_train.shift_DATE)).apply(lambda x: x.days).fillna(0)

shift_test['shift_DATE'] = shift_test.groupby(['idx']).DATE.shift(1)
shift_test['days_btw_last_kt'] = (pd.to_datetime(shift_test.DATE) 
                                  - pd.to_datetime(shift_test.shift_DATE)).apply(lambda x: x.days).fillna(0)

In [387]:
shift_train[shift_train.shift_DATE.notnull()]

Unnamed: 0,idx,DATE,KT_RESULT,shift_DATE,days_btw_last_kt
7,60,2020-05-17,1,2020-05-16,1.0
14,92,2020-06-29,0,2020-05-17,43.0
22,157,2020-05-16,0,2020-05-05,11.0
27,173,2020-05-17,1,2020-05-07,10.0
29,177,2020-06-06,0,2020-05-17,20.0
...,...,...,...,...,...
131686,1277418,2020-07-04,2,2020-06-29,5.0
131720,1280015,2020-05-07,1,2020-04-26,11.0
131755,1283676,2020-07-11,1,2020-07-02,9.0
131816,1290593,2020-07-07,1,2020-06-14,23.0


In [388]:
shift_test.days_btw_last_kt.describe()

count    33037.000000
mean         3.225384
std          9.480197
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         81.000000
Name: days_btw_last_kt, dtype: float64

In [389]:
# количество КТ

In [390]:
# максимальное количество КТ на человека
print(max(shift_train.groupby(['idx']).DATE.nunique().max(), shift_test.groupby(['idx']).DATE.nunique().max()))

7


In [391]:
# делаем шифт до 7, послее считаем ненулевые

In [392]:
for i in range(1, 8):
    shift_train['shift{}'.format(i)] = shift_train.groupby(['idx']).DATE.shift(i).notnull()
    
shift_train['KT_number'] = shift_train[shift_train.columns[5:]].apply(lambda x: x.sum(), axis=1)

for i in range(1, 8):
    shift_test['shift{}'.format(i)] = shift_test.groupby(['idx']).DATE.shift(i).notnull()
    
shift_test['KT_number'] = shift_test[shift_test.columns[5:]].apply(lambda x: x.sum(), axis=1)

In [393]:
shift_train[shift_train['KT_number'] > 5]

Unnamed: 0,idx,DATE,KT_RESULT,shift_DATE,days_btw_last_kt,shift1,shift2,shift3,shift4,shift5,shift6,shift7,KT_number
5199,24062,2020-06-17,0,2020-05-31,17.0,True,True,True,True,True,True,False,6
41440,158709,2020-06-27,1,2020-06-07,20.0,True,True,True,True,True,True,False,6


In [394]:
shift_test[shift_test['KT_number'] > 5]

Unnamed: 0,idx,DATE,KT_RESULT,shift_DATE,days_btw_last_kt,shift1,shift2,shift3,shift4,shift5,shift6,shift7,KT_number
7747,121260,2020-06-27,,2020-06-23,4.0,True,True,True,True,True,True,False,6


In [395]:
# объединение результатов с data

In [396]:
data_train = pd.merge(data_train, shift_train[['idx', 'DATE', 'days_btw_last_kt', 'KT_number']],
                      on=['idx', 'DATE'], how='left')

data_test = pd.merge(data_test, shift_test[['idx', 'DATE', 'days_btw_last_kt', 'KT_number']],
                     on=['idx', 'DATE'], how='left')

### Добавление признака будущих КТ

In [397]:
data_train = data_train.sort_values(['idx', 'DATE']).reset_index(drop=True)
data_test = data_test.sort_values(['idx', 'DATE']).reset_index(drop=True)

In [398]:
# последний КТ

In [399]:
shift_train = data_train[~data_train[['idx', 'DATE']].duplicated()][['idx', 'DATE']].copy()
shift_test = data_test[~data_test[['idx', 'DATE']].duplicated()][['idx', 'DATE']].copy()

In [400]:
shift_train['shift_DATE'] = shift_train.groupby(['idx']).DATE.shift(-1)
shift_train['days_btw_next_kt'] = (pd.to_datetime(shift_train.shift_DATE) 
                                   - pd.to_datetime(shift_train.DATE)).apply(lambda x: x.days).fillna(0)

shift_test['shift_DATE'] = shift_test.groupby(['idx']).DATE.shift(-1)
shift_test['days_btw_next_kt'] = (pd.to_datetime(shift_test.shift_DATE) 
                                  - pd.to_datetime(shift_test.DATE)).apply(lambda x: x.days).fillna(0)

In [401]:
shift_train[shift_train.shift_DATE.notnull()]

Unnamed: 0,idx,DATE,shift_DATE,days_btw_next_kt
6,60,2020-05-16,2020-05-17,1.0
13,92,2020-05-17,2020-06-29,43.0
21,157,2020-05-05,2020-05-16,11.0
26,173,2020-05-07,2020-05-17,10.0
28,177,2020-05-17,2020-06-06,20.0
...,...,...,...,...
131685,1277418,2020-06-29,2020-07-04,5.0
131719,1280015,2020-04-26,2020-05-07,11.0
131754,1283676,2020-07-02,2020-07-11,9.0
131815,1290593,2020-06-14,2020-07-07,23.0


In [402]:
shift_test.days_btw_next_kt.describe()

count    33037.000000
mean         3.225384
std          9.480197
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         81.000000
Name: days_btw_next_kt, dtype: float64

In [403]:
# количество КТ

In [404]:
# максимальное количество КТ на человека
print(max(shift_train.groupby(['idx']).DATE.nunique().max(), shift_test.groupby(['idx']).DATE.nunique().max()))

7


In [405]:
# делаем шифт до 7, послее считаем ненулевые

In [406]:
for i in range(1, 8):
    shift_train['shift{}'.format(-i)] = shift_train.groupby(['idx']).DATE.shift(-i).notnull()
    
shift_train['next_KT_number'] = shift_train[shift_train.columns[5:]].apply(lambda x: x.sum(), axis=1)

for i in range(1, 8):
    shift_test['shift{}'.format(-i)] = shift_test.groupby(['idx']).DATE.shift(-i).notnull()
    
shift_test['next_KT_number'] = shift_test[shift_test.columns[5:]].apply(lambda x: x.sum(), axis=1)

In [407]:
shift_train[shift_train['next_KT_number'] > 4]

Unnamed: 0,idx,DATE,shift_DATE,days_btw_next_kt,shift-1,shift-2,shift-3,shift-4,shift-5,shift-6,shift-7,next_KT_number
5193,24062,2020-04-28,2020-05-07,9.0,True,True,True,True,True,True,False,5
41434,158709,2020-04-25,2020-05-02,7.0,True,True,True,True,True,True,False,5


In [408]:
shift_test[shift_test['next_KT_number'] > 4]

Unnamed: 0,idx,DATE,shift_DATE,days_btw_next_kt,shift-1,shift-2,shift-3,shift-4,shift-5,shift-6,shift-7,next_KT_number
7741,121260,2020-05-07,2020-05-08,1.0,True,True,True,True,True,True,False,5


In [409]:
# объединение результатов с data

In [410]:
data_train = pd.merge(data_train, shift_train[['idx', 'DATE', 'days_btw_next_kt', 'next_KT_number']],
                      on=['idx', 'DATE'], how='left')

data_test = pd.merge(data_test, shift_test[['idx', 'DATE', 'days_btw_next_kt', 'next_KT_number']],
                     on=['idx', 'DATE'], how='left')

In [411]:
data_train

Unnamed: 0,idx,DATE,KT_RESULT,days_btw_last_kt,KT_number,days_btw_next_kt,next_KT_number
0,6,2020-05-16,0,0.0,0,0.0,0
1,11,2020-05-16,0,0.0,0,0.0,0
2,21,2020-05-16,0,0.0,0,0.0,0
3,28,2020-05-16,0,0.0,0,0.0,0
4,51,2020-05-16,0,0.0,0,0.0,0
...,...,...,...,...,...,...,...
131984,1297419,2020-07-01,1,0.0,0,0.0,0
131985,1297453,2020-07-02,2,0.0,0,0.0,0
131986,1297466,2020-07-05,1,0.0,0,0.0,0
131987,1297498,2020-07-02,0,0.0,0,0.0,0


### Количество КТ в этот день

In [412]:
in_shift_train = data_train.groupby(['idx', 'DATE']).KT_number.count().sort_values().reset_index() \
                           .rename(columns={'KT_number': 'in_KT_number'})
in_shift_test = data_test.groupby(['idx', 'DATE']).KT_number.count().sort_values().reset_index() \
                         .rename(columns={'KT_number': 'in_KT_number'})

In [413]:
data_train = pd.merge(data_train, in_shift_train,
                      on=['idx', 'DATE'], how='left')

data_test = pd.merge(data_test, in_shift_test,
                     on=['idx', 'DATE'], how='left')

In [414]:
print(data_train.shape)
print(data_test.shape)

(131989, 8)
(33080, 8)


## Работа с death

In [415]:
# 3. Файл с датами смерти
# IDX - анонимизированный идентификатор пациента. по нему файлы связаны между собой.
# DATE - Дата смерти
death = pd.read_csv('data/data_for_science_death_2020-07-27.csv', sep=';')

death = death[death.idx.isin(pd.concat([data_train[['idx', 'DATE']], 
                                        data_test[['idx', 'DATE']]]).idx.unique())].reset_index(drop=True)

death = pd.merge(death, 
                 pd.concat([data_train[['idx', 'DATE']], data_test[['idx', 'DATE']]]), 
                 on='idx', how='left')

death.head(1)

Unnamed: 0,idx,DEATH_DATE,DATE
0,566901,2020-04-22,2020-04-18


In [416]:
# days b4 death
death['days_b4_death'] = (pd.to_datetime(death.DEATH_DATE) - pd.to_datetime(death.DATE)).apply(lambda x: x.days)

In [417]:
# fact of death
death['is_death'] = 1

In [418]:
death.days_b4_death.describe()

count    1775.000000
mean       18.102535
std        13.764269
min         0.000000
25%         9.000000
50%        14.000000
75%        23.000000
max        84.000000
Name: days_b4_death, dtype: float64

In [419]:
data_train = pd.merge(data_train, death[['idx', 'DATE', 'days_b4_death', 'is_death']],
                      on=['idx', 'DATE'], how='left').drop_duplicates()
data_train['is_death'] = data_train['is_death'].fillna(0)

data_test = pd.merge(data_test, death[['idx', 'DATE', 'days_b4_death', 'is_death']],
                     on=['idx', 'DATE'], how='left')
data_test['is_death'] = data_test['is_death'].fillna(0)

In [420]:
print(data_train.shape)
print(data_test.shape)

(131989, 10)
(33080, 10)


## Работа с lis

Для начала сколько и каких анализов делалось до КТ

In [421]:
# 4. Файл с результатами лабораторных исследований
# idx - анонимизированный идентификатор пациента. по нему файлы связаны между собой.
# ISSL_NAME - Наименование исследования (в одном исследовании может быть несколько тестов)
# TEST_NAME - Наименование теста (показателя)
# TEST_VAL - Результат теста (в формате текста для качественных результатов 
# или в виде ЗНАЧЕНИЕ[_]ЕДИНИЦА ИЗМЕРЕНИЯ для количественных результатов)
# TEST_TIME - Дата результата
lis = pd.read_csv('data/data_for_science_LIs_2020-07-27.csv', sep=';')

lis = lis[lis.idx.isin(pd.concat([data_train[['idx', 'DATE']], 
                                  data_test[['idx', 'DATE']]]).idx.unique())].reset_index(drop=True)

lis.head(1)

Unnamed: 0,idx,ISSL_NAME,TEST_NAME,TEST_VAL,TEST_TIME
0,97068,Клинический анализ крови,Абсолютное количество моноцитов,0_10^9/л,2020-04-20 00:10:54


In [422]:
lis.ISSL_NAME.nunique()

35

In [423]:
lis.TEST_NAME.nunique()

278

In [424]:
lis.TEST_VAL.nunique()

54242

In [425]:
lis.groupby(['ISSL_NAME']).TEST_NAME.nunique().reset_index().TEST_NAME.sum()

694

In [426]:
test_name_unique = lis.TEST_NAME.unique()

In [427]:
len(test_name_unique)

278

In [428]:
def dice_coefficient(a, b):
    a_bigrams = set(a)
    b_bigrams = set(b)
    overlap = len(a_bigrams & b_bigrams)
    return overlap * 2.0/(len(a_bigrams) + len(b_bigrams))

dupl_dict = {}

for i in range(len(test_name_unique)):
    for j in range(len(test_name_unique)):
        
        if i != j:
            
            if dice_coefficient(test_name_unique[i], test_name_unique[j]) > 0.99 \
            and test_name_unique[j] not in dupl_dict:
                
                dupl_dict[test_name_unique[i]] = test_name_unique[j]

In [429]:
dupl_dict

{'Среднее содержание гемоглобина в эритроците (MCH)': 'Среднее содержание  гемоглобина в эритроците (MCH)',
 'Абсолютное количество нейтрофилов': 'Абсолютное  количество нейтрофилов',
 'Эритроциты неизмененные': 'Эритроциты измененные',
 'Кристалы Шарко-Лейдена': 'Кристаллы Шарко-Лейдена',
 'Кристаллы трипельфосфаты': 'Кристаллы трипельфосфата',
 'Растительная клетчатка перевариваемая': 'Растительная неперевариваемая клечатка'}

In [430]:
dupl_dict_TEST_NAME = {'Среднее содержание  гемоглобина в эритроците (MCH)': 'Среднее содержание гемоглобина в эритроците (MCH)',
                       'Абсолютное  количество нейтрофилов': 'Абсолютное количество нейтрофилов',
                       'Кристаллы трипельфосфаты': 'Кристаллы трипельфосфата',
                       'Кристалы Шарко-Лейдена': 'Кристаллы Шарко-Лейдена'}

In [431]:
lis = lis.replace({'TEST_NAME': dupl_dict_TEST_NAME})

In [432]:
lis['name'] = lis.apply(lambda x: x[1] + ' ' + x[2], axis=1)
# lis = lis.drop(['ISSL_NAME', 'TEST_NAME'], axis=1)

In [433]:
lis.shape

(2459914, 6)

In [434]:
lis = pd.merge(lis, 
               pd.concat([data_train[['idx', 'DATE']], data_test[['idx', 'DATE']]]), 
               on='idx', how='left')

In [435]:
lis.TEST_TIME = pd.to_datetime(lis.TEST_TIME, utc=True)
lis.TEST_TIME = pd.to_datetime(lis.TEST_TIME.apply(lambda x: x.date))

lis.DATE = pd.to_datetime(lis.DATE)

### all time lis

In [436]:
t = lis.groupby(['idx', 'DATE', 'ISSL_NAME']).TEST_VAL.count().reset_index()
print('ISSL_NAME columns number:', t.ISSL_NAME.nunique())

# t_name = lis.groupby(['idx', 'DATE', 'name']).TEST_VAL.count().reset_index()
# print('name columns number:', t_name.name.nunique())

lis_res = pd.get_dummies(t.ISSL_NAME)
lis_res.values[lis_res != 0] = t.TEST_VAL

lis_res_rename_dict = {}
for i, col in enumerate(lis_res.columns):
    lis_res_rename_dict[col] = 'all_lis_feature_{}'.format(i)
lis_res = lis_res.rename(columns=lis_res_rename_dict)
    
# lis_res[['all_lis_feature_{}'.format(i+j) for j in range(1, t_name.name.nunique()+1)]] = pd.get_dummies(t_name.name)
    
lis_res['idx'] = t['idx']
lis_res['DATE'] = t['DATE']

lis_res = lis_res.groupby(['idx', 'DATE']).sum().reset_index()

del t

ISSL_NAME columns number: 35


In [437]:
data_train['DATE'] = pd.to_datetime(data_train['DATE'])
data_test['DATE'] = pd.to_datetime(data_test['DATE'])

In [438]:
data_train = pd.merge(data_train, lis_res, on=['idx', 'DATE'], how='left')
# data_train[lis_res.columns] = data_train[lis_res.columns].fillna(0)

data_test = pd.merge(data_test, lis_res, on=['idx', 'DATE'], how='left')
# data_test[lis_res.columns] = data_test[lis_res.columns].fillna(0)

In [439]:
print(data_train.shape)
print(data_test.shape)

(131989, 45)
(33080, 45)


### До КТ

In [None]:
lis_before = lis[(lis.DATE.notnull())&(lis.TEST_TIME < lis.DATE)].reset_index(drop=True).copy()

In [None]:
lis_before.groupby(['idx', 'DATE', 'name']).TEST_VAL.count().reset_index()

In [None]:
t = lis_before.groupby(['idx', 'DATE', 'ISSL_NAME']).TEST_VAL.count().reset_index()
print('ISSL_NAME columns number:', t.ISSL_NAME.nunique())

t_name = lis_before.groupby(['idx', 'DATE', 'name']).TEST_VAL.count().reset_index()
print('name columns number:', t_name.name.nunique())

lis_res = pd.get_dummies(t.ISSL_NAME)
lis_res.values[lis_res != 0] = t.TEST_VAL

lis_res_rename_dict = {}
for i, col in enumerate(lis_res.columns):
    lis_res_rename_dict[col] = 'lis_feature_{}'.format(i)
lis_res = lis_res.rename(columns=lis_res_rename_dict)
    
lis_res[['lis_feature_{}'.format(i+j) for j in range(1, t_name.name.nunique()+1)]] = pd.get_dummies(t_name.name)
    
lis_res['idx'] = t['idx']
lis_res['DATE'] = t['DATE']

lis_res = lis_res.groupby(['idx', 'DATE']).sum().reset_index()

del t

In [None]:
data_train['DATE'] = pd.to_datetime(data_train['DATE'])
data_test['DATE'] = pd.to_datetime(data_test['DATE'])

In [None]:
data_train = pd.merge(data_train, lis_res, on=['idx', 'DATE'], how='left')
# data_train[lis_res.columns] = data_train[lis_res.columns].fillna(0)

data_test = pd.merge(data_test, lis_res, on=['idx', 'DATE'], how='left')
# data_test[lis_res.columns] = data_test[lis_res.columns].fillna(0)

In [None]:
print(data_train.shape)
print(data_test.shape)

### После КТ

In [273]:
lis_after = lis[(lis.DATE.notnull())&(lis.TEST_TIME > lis.DATE)].reset_index(drop=True).copy()

In [274]:
lis_after.groupby(['idx', 'DATE', 'name']).TEST_VAL.count().reset_index()

Unnamed: 0,idx,DATE,name,TEST_VAL
0,6,2020-05-16,Общий клинический анализ крови (общий анализ +...,1
1,6,2020-05-16,Общий клинический анализ крови (общий анализ +...,1
2,6,2020-05-16,Общий клинический анализ крови (общий анализ +...,1
3,6,2020-05-16,Определение белков острой фазы С-реактивный бе...,1
4,11,2020-05-16,Общий клинический анализ крови (общий анализ +...,1
...,...,...,...,...
1593982,1297421,2020-07-01,Общий клинический анализ крови (общий анализ +...,1
1593983,1297421,2020-07-01,Общий клинический анализ крови (общий анализ +...,1
1593984,1297421,2020-07-01,Общий клинический анализ крови (общий анализ +...,1
1593985,1297421,2020-07-01,Определение белков острой фазы С-реактивный бе...,2


In [275]:
t = lis_after.groupby(['idx', 'DATE', 'ISSL_NAME']).TEST_VAL.count().reset_index()
print('ISSL_NAME columns number:', t.ISSL_NAME.nunique())

t_name = lis_after.groupby(['idx', 'DATE', 'name']).TEST_VAL.count().reset_index()
print('ISSL_NAME columns number:', t_name.name.nunique())

lis_res = pd.get_dummies(t.ISSL_NAME)
lis_res.values[lis_res != 0] = t.TEST_VAL

lis_res_rename_dict = {}
for i, col in enumerate(lis_res.columns):
    lis_res_rename_dict[col] = 'lis_feature_after_{}'.format(i)
lis_res = lis_res.rename(columns=lis_res_rename_dict)
    
lis_res[['lis_feature_after_{}'.format(i+j) for j in range(1, t_name.name.nunique()+1)]] = pd.get_dummies(t_name.name)
    
lis_res['idx'] = t['idx']
lis_res['DATE'] = t['DATE']

lis_res = lis_res.groupby(['idx', 'DATE']).sum().reset_index()

del t

ISSL_NAME columns number: 34
ISSL_NAME columns number: 682


  self[k1] = value[k2]
  lis_res['idx'] = t['idx']
  lis_res['DATE'] = t['DATE']


In [276]:
data_train = pd.merge(data_train, lis_res, on=['idx', 'DATE'], how='left')
# data_train[lis_res.columns] = data_train[lis_res.columns].fillna(0)

data_test = pd.merge(data_test, lis_res, on=['idx', 'DATE'], how='left')
# data_test[lis_res.columns] = data_test[lis_res.columns].fillna(0)

In [277]:
print(data_train.shape)
print(data_test.shape)

(131989, 3463)
(33080, 3463)


### В день КТ

In [278]:
lis_in = lis[(lis.DATE.notnull())&(lis.TEST_TIME == lis.DATE)].reset_index(drop=True).copy()

In [279]:
lis_in.groupby(['idx', 'DATE', 'name']).TEST_VAL.count().reset_index()

Unnamed: 0,idx,DATE,name,TEST_VAL
0,60,2020-05-17,Общий клинический анализ крови (общий анализ +...,1
1,60,2020-05-17,Общий клинический анализ крови (общий анализ +...,1
2,60,2020-05-17,Общий клинический анализ крови (общий анализ +...,1
3,60,2020-05-17,Определение белков острой фазы С-реактивный бе...,1
4,94,2020-05-18,Общий клинический анализ крови (общий анализ +...,1
...,...,...,...,...
845948,1297345,2020-07-03,Общий клинический анализ крови (общий анализ +...,1
845949,1297345,2020-07-03,Общий клинический анализ крови (общий анализ +...,1
845950,1297345,2020-07-03,Общий клинический анализ крови (общий анализ +...,1
845951,1297345,2020-07-03,Общий клинический анализ крови (общий анализ +...,1


In [280]:
t = lis_in.groupby(['idx', 'DATE', 'ISSL_NAME']).TEST_VAL.count().reset_index()
print('ISSL_NAME columns number:', t.ISSL_NAME.nunique())

t_name = lis_in.groupby(['idx', 'DATE', 'name']).TEST_VAL.count().reset_index()
print('ISSL_NAME columns number:', t_name.name.nunique())

lis_res = pd.get_dummies(t.ISSL_NAME)
lis_res.values[lis_res != 0] = t.TEST_VAL

lis_res_rename_dict = {}
for i, col in enumerate(lis_res.columns):
    lis_res_rename_dict[col] = 'lis_feature_in_{}'.format(i)
lis_res = lis_res.rename(columns=lis_res_rename_dict)
    
lis_res[['lis_feature_in_{}'.format(i+j) for j in range(1, t_name.name.nunique()+1)]] = pd.get_dummies(t_name.name)
    
lis_res['idx'] = t['idx']
lis_res['DATE'] = t['DATE']

lis_res = lis_res.groupby(['idx', 'DATE']).sum().reset_index()

del t

ISSL_NAME columns number: 24
ISSL_NAME columns number: 431


  self[k1] = value[k2]
  lis_res['idx'] = t['idx']
  lis_res['DATE'] = t['DATE']


In [281]:
data_train = pd.merge(data_train, lis_res, on=['idx', 'DATE'], how='left')
# data_train[lis_res.columns] = data_train[lis_res.columns].fillna(0)

data_test = pd.merge(data_test, lis_res, on=['idx', 'DATE'], how='left')
# data_test[lis_res.columns] = data_test[lis_res.columns].fillna(0)

In [282]:
print(data_train.shape)
print(data_test.shape)

(131989, 3918)
(33080, 3918)


## Работа с diagnoses

In [440]:
# 6. Файл с диагнозами (не использовался)
# IDX - анонимизированный идентификатор пациента. по нему файлы связаны между собой.
# DATE - Дата диагноза
# DIAGNOSIS - Код диагноза
diagnoses = pd.read_csv('data/data_for_science_diagnoses_2020-07-27.csv', sep=';')
diagnoses.head(1)

Unnamed: 0,idx,DATE,DIAGNOSIS
0,870886,2019-04-20,J01.9


In [441]:
diagnoses = pd.merge(diagnoses.rename(columns={'DATE': 'DIAG_DATE'}), 
                     pd.concat([data_train[['idx', 'DATE']], data_test[['idx', 'DATE']]]), 
                     on='idx', how='left')
diagnoses = diagnoses[diagnoses.DATE.notnull()].reset_index(drop=True)

diagnoses.DIAG_DATE = pd.to_datetime(diagnoses.DIAG_DATE)
diagnoses.DATE = pd.to_datetime(diagnoses.DATE)

### all diag

In [443]:
diagnoses.groupby(['idx', 'DATE', 'DIAGNOSIS']).DIAG_DATE.count()

idx      DATE        DIAGNOSIS
6        2020-05-16  J06.9        1
11       2020-05-16  J06.9        1
21       2020-05-16  J06.9        1
                     J11.0        1
28       2020-05-16  J06.9        1
                                 ..
1297498  2020-07-02  J11.0        1
1297531  2020-04-30  J04.0        1
                     J04.1        1
                     J04.2        1
                     J11.0        1
Name: DIAG_DATE, Length: 466440, dtype: int64

In [444]:
diagnoses_temp = diagnoses.groupby(['idx', 'DATE', 'DIAGNOSIS']).DIAG_DATE.count().reset_index()

In [450]:
diagnoses_res = pd.get_dummies(diagnoses_temp.DIAGNOSIS)
diagnoses_res.values[diagnoses_res != 0] = diagnoses_temp.DIAG_DATE
diagnoses_res[['idx', 'DATE']] = diagnoses_temp[['idx', 'DATE']]

In [451]:
diagnoses_res = diagnoses_res.groupby(['idx', 'DATE']).sum().reset_index()

In [453]:
data_train.shape[0]

131989

In [454]:
diagnoses_res.DATE = pd.to_datetime(diagnoses_res.DATE)
data_train.DATE = pd.to_datetime(data_train.DATE)
data_test.DATE = pd.to_datetime(data_test.DATE)

In [456]:
data_train = pd.merge(data_train, diagnoses_res, on=['idx', 'DATE'], how='left')
# data_train[diagnoses_res.columns[2:]] = data_train[diagnoses_res.columns[2:]].fillna(0)

data_test = pd.merge(data_test, diagnoses_res, on=['idx', 'DATE'], how='left')
# data_test[diagnoses_res.columns[2:]] = data_test[diagnoses_res.columns[2:]].fillna(0)

In [457]:
print(data_train.shape)
print(data_test.shape)

(131989, 338)
(33080, 338)


### diag before KT

In [458]:
diagnoses_before = diagnoses.copy()
diagnoses_before['days_btw_diag_kt'] = (diagnoses_before.DATE - diagnoses_before.DIAG_DATE).apply(lambda x: x.days)
diagnoses_before = diagnoses_before[diagnoses_before.days_btw_diag_kt > 0].reset_index(drop=True)

In [459]:
diagnoses_before.DIAGNOSIS.nunique()

281

In [460]:
diagnoses_before.shape

(211892, 5)

In [461]:
diagnoses_before.groupby(['idx', 'DATE', 'DIAGNOSIS']).days_btw_diag_kt.min()

idx      DATE        DIAGNOSIS
53       2020-05-05  J06.9         43
                     J35.0         39
85       2020-05-16  I25.1        172
                     J42          226
89       2020-05-17  J04.1         80
                                 ... 
1297345  2020-07-03  J06.9          1
1297357  2020-07-05  J06.9          3
                     J11.0          2
1297411  2020-07-01  J21.9         62
1297531  2020-04-30  J04.2         77
Name: days_btw_diag_kt, Length: 211848, dtype: int64

In [462]:
diagnoses_before = diagnoses_before.groupby(['idx', 'DATE', 'DIAGNOSIS']).days_btw_diag_kt.min().reset_index()
# diagnoses_before_['diag_number'] = diagnoses_before.groupby(['idx', 'DATE', 'DIAGNOSIS']).days_btw_diag_kt.count().values
# diagnoses_before = diagnoses_before_

In [463]:
diagnoses_res = pd.get_dummies(diagnoses_before.DIAGNOSIS)
diagnoses_res.values[diagnoses_res != 0] = diagnoses_before.days_btw_diag_kt
diagnoses_res[['idx', 'DATE']] = diagnoses_before[['idx', 'DATE']]

In [464]:
diagnoses_res = diagnoses_res.groupby(['idx', 'DATE']).sum().reset_index()

In [465]:
data_train.shape[0]

131989

In [466]:
diagnoses_res.DATE = pd.to_datetime(diagnoses_res.DATE)
data_train.DATE = pd.to_datetime(data_train.DATE)
data_test.DATE = pd.to_datetime(data_test.DATE)

In [467]:
data_train = pd.merge(data_train, diagnoses_res, on=['idx', 'DATE'], how='left')
# data_train[diagnoses_res.columns[2:]] = data_train[diagnoses_res.columns[2:]].fillna(0)

data_test = pd.merge(data_test, diagnoses_res, on=['idx', 'DATE'], how='left')
# data_test[diagnoses_res.columns[2:]] = data_test[diagnoses_res.columns[2:]].fillna(0)

In [468]:
print(data_train.shape)
print(data_test.shape)

(131989, 619)
(33080, 619)


### diag after KT

In [469]:
diagnoses_after = diagnoses.copy()
diagnoses_after['days_btw_diag_kt'] = (diagnoses_after.DIAG_DATE - diagnoses_after.DATE).apply(lambda x: x.days)
diagnoses_after = diagnoses_after[diagnoses_after.days_btw_diag_kt > 0].reset_index(drop=True)

In [470]:
diagnoses_after.DIAGNOSIS.nunique()

248

In [471]:
diagnoses_after.shape

(192665, 5)

In [472]:
diagnoses_after = diagnoses_after.groupby(['idx', 'DATE', 'DIAGNOSIS']).days_btw_diag_kt.min().reset_index()

In [473]:
diagnoses_res = pd.get_dummies(diagnoses_after.DIAGNOSIS)
diagnoses_res.columns = [x+'_after' for x in diagnoses_res.columns]
diagnoses_res.values[diagnoses_res != 0] = diagnoses_after.days_btw_diag_kt
diagnoses_res[['idx', 'DATE']] = diagnoses_after[['idx', 'DATE']]

In [474]:
diagnoses_res = diagnoses_res.groupby(['idx', 'DATE']).sum().reset_index()

In [475]:
data_train.shape[0]

131989

In [476]:
# diagnoses_res.DATE = diagnoses_res.DATE.astype(str)

In [477]:
data_train = pd.merge(data_train, diagnoses_res, on=['idx', 'DATE'], how='left')
# data_train[diagnoses_res.columns[2:]] = data_train[diagnoses_res.columns[2:]].fillna(0)

data_test = pd.merge(data_test, diagnoses_res, on=['idx', 'DATE'], how='left')
# data_test[diagnoses_res.columns[2:]] = data_test[diagnoses_res.columns[2:]].fillna(0)

In [478]:
print(data_train.shape)
print(data_test.shape)

(131989, 867)
(33080, 867)


### diag in KT

In [479]:
diagnoses_in = diagnoses.copy()
diagnoses_in['days_btw_diag_kt'] = (diagnoses_in.DIAG_DATE - diagnoses_in.DATE).apply(lambda x: x.days)
diagnoses_in = diagnoses_in[diagnoses_in.days_btw_diag_kt == 0].reset_index(drop=True)

In [480]:
diagnoses_in.DIAGNOSIS.nunique()

171

In [481]:
diagnoses_in.shape

(62011, 5)

In [482]:
diagnoses_in = diagnoses_in.groupby(['idx', 'DATE', 'DIAGNOSIS']).days_btw_diag_kt.count().reset_index()

In [483]:
diagnoses_res = pd.get_dummies(diagnoses_in.DIAGNOSIS)
diagnoses_res.columns = [x+'_in' for x in diagnoses_res.columns]
diagnoses_res.values[diagnoses_res != 0] = diagnoses_in.days_btw_diag_kt
diagnoses_res[['idx', 'DATE']] = diagnoses_in[['idx', 'DATE']]

In [484]:
diagnoses_res = diagnoses_res.groupby(['idx', 'DATE']).sum().reset_index()

In [485]:
data_train.shape[0]

131989

In [486]:
# diagnoses_res.DATE = diagnoses_res.DATE.astype(str)

In [487]:
data_train = pd.merge(data_train, diagnoses_res, on=['idx', 'DATE'], how='left')
# data_train[diagnoses_res.columns[2:]] = data_train[diagnoses_res.columns[2:]].fillna(0)

data_test = pd.merge(data_test, diagnoses_res, on=['idx', 'DATE'], how='left')
# data_test[diagnoses_res.columns[2:]] = data_test[diagnoses_res.columns[2:]].fillna(0)

In [488]:
print(data_train.shape)
print(data_test.shape)

(131989, 1038)
(33080, 1038)


### diag in week before/after KT

In [489]:
diagnoses_week = diagnoses.copy()
diagnoses_week['days_btw_diag_kt'] = np.abs((diagnoses_week.DIAG_DATE - diagnoses_week.DATE).apply(lambda x: x.days))
diagnoses_week = diagnoses_week[diagnoses_week.days_btw_diag_kt <= 7].reset_index(drop=True)

In [490]:
diagnoses_week.DIAGNOSIS.nunique()

223

In [491]:
diagnoses_week.shape

(170143, 5)

In [492]:
diagnoses_week = diagnoses_week.groupby(['idx', 'DATE', 'DIAGNOSIS']).days_btw_diag_kt.count().reset_index()

In [493]:
diagnoses_res = pd.get_dummies(diagnoses_week.DIAGNOSIS)
diagnoses_res.columns = [x+'_week' for x in diagnoses_res.columns]
diagnoses_res.values[diagnoses_res != 0] = diagnoses_week.days_btw_diag_kt
diagnoses_res[['idx', 'DATE']] = diagnoses_week[['idx', 'DATE']]

In [494]:
diagnoses_res = diagnoses_res.groupby(['idx', 'DATE']).sum().reset_index()

In [495]:
data_train.shape[0]

131989

In [496]:
# diagnoses_res.DATE = diagnoses_res.DATE.astype(str)

In [497]:
data_train = pd.merge(data_train, diagnoses_res, on=['idx', 'DATE'], how='left')
# data_train[diagnoses_res.columns[2:]] = data_train[diagnoses_res.columns[2:]].fillna(0)

data_test = pd.merge(data_test, diagnoses_res, on=['idx', 'DATE'], how='left')
# data_test[diagnoses_res.columns[2:]] = data_test[diagnoses_res.columns[2:]].fillna(0)

In [498]:
print(data_train.shape)
print(data_test.shape)

(131989, 1261)
(33080, 1261)


### diag in month before/after KT

In [499]:
diagnoses_month = diagnoses.copy()
diagnoses_month['days_btw_diag_kt'] = np.abs((diagnoses_month.DIAG_DATE - diagnoses_month.DATE).apply(lambda x: x.days))
diagnoses_month = diagnoses_month[diagnoses_month.days_btw_diag_kt <= 30].reset_index(drop=True)

In [500]:
diagnoses_month.DIAGNOSIS.nunique()

254

In [501]:
diagnoses_month.shape

(314100, 5)

In [502]:
diagnoses_month = diagnoses_month.groupby(['idx', 'DATE', 'DIAGNOSIS']).days_btw_diag_kt.count().reset_index()

In [503]:
diagnoses_res = pd.get_dummies(diagnoses_month.DIAGNOSIS)
diagnoses_res.columns = [x+'_month' for x in diagnoses_res.columns]
diagnoses_res.values[diagnoses_res != 0] = diagnoses_month.days_btw_diag_kt
diagnoses_res[['idx', 'DATE']] = diagnoses_month[['idx', 'DATE']]

In [504]:
diagnoses_res = diagnoses_res.groupby(['idx', 'DATE']).sum().reset_index()

In [505]:
data_train.shape[0]

131989

In [506]:
# diagnoses_res.DATE = diagnoses_res.DATE.astype(str)

In [507]:
data_train = pd.merge(data_train, diagnoses_res, on=['idx', 'DATE'], how='left')
# data_train[diagnoses_res.columns[2:]] = data_train[diagnoses_res.columns[2:]].fillna(0)

data_test = pd.merge(data_test, diagnoses_res, on=['idx', 'DATE'], how='left')
# data_test[diagnoses_res.columns[2:]] = data_test[diagnoses_res.columns[2:]].fillna(0)

In [508]:
print(data_train.shape)
print(data_test.shape)

(131989, 1515)
(33080, 1515)


## Работа с drugs

In [510]:
# 7. Файл с назначенными препаратами (не использовался)
# IDX - анонимизированный идентификатор пациента. по нему файлы связаны между собой.
# DATE - Дата диагноза
# RECIPE - Назначенные препараты
drugs = pd.read_csv('data/data_for_science_drugs_2020-07-27.csv', sep=';')
drugs.head(1)

Unnamed: 0,idx,DATE,RECIPE
0,598671,2020-04-20,Рецепт 00Д4510052223: Azithromycinum - Применя...


In [511]:
# https://portal.eaeunion.org/sites/odata/redesign/Pages/InternationalNonProprietaryCodeClassifier.aspx
drugs_names = pd.read_csv('data/drugs_names.csv', sep=';')
drugs_names = drugs_names[drugs_names.columns[1:4]]
drugs_names.columns = ['rus', 'eng', 'latin']
drugs_names.head(1)

Unnamed: 0,rus,eng,latin
0,абаговомаб,abagovomab,abagovomabum


In [512]:
eng_rus_map = drugs_names[['rus', 'eng']].set_index('eng').to_dict()['rus']
latin_rus_map = drugs_names[['rus', 'latin']].set_index('latin').to_dict()['rus']

In [513]:
all_idx = list(pd.concat([data_train['idx'], data_test['idx']]).unique())

In [514]:
drugs = drugs[drugs.idx.isin(all_idx)].reset_index(drop=True)
drugs.RECIPE = drugs.RECIPE.apply(lambda x: x.lower())

In [515]:
drugs = pd.merge(drugs.rename(columns={'DATE': 'drugs_date'}), 
                 pd.concat([data_train[['idx', 'DATE']], data_test[['idx', 'DATE']]]),
                 on='idx', how='left')
drugs = drugs[drugs.DATE.notnull()].reset_index(drop=True)

drugs.drugs_date = pd.to_datetime(drugs.drugs_date)
drugs.DATE = pd.to_datetime(drugs.DATE)

drugs['days_btw_drug_kt'] = (drugs.DATE - drugs.drugs_date).apply(lambda x: x.days)

In [516]:
def get_drugs(x):
    res = []
    
    if 'рецепт' in x:
        x = x.split(': ')[1].split(' - ')[0]
        for word in x.split('+'):
            if word in latin_rus_map.keys():
                res.append(latin_rus_map[word])
            elif word in eng_rus_map.keys():
                res.append(eng_rus_map[word])
            elif word in latin_rus_map.values():
                res.append(word)
                
    else:
        
        x = x.split('; ')
        
        for x_ in x:
            if '+' in x_:
                x_ = x_.split('/')[0].split('+')
                for word in x_:
                    if word in latin_rus_map.values():
                        res.append(word)
            else:
                x_ = x_.split(' ')[0].split('/')
                for word in x_:
                    if word in latin_rus_map.values():
                        res.append(word)
    return list(set(res))

drugs['drug_list'] = drugs.RECIPE.apply(lambda x: get_drugs(x))

In [517]:
drugs_columns = []
for dl in drugs.drug_list:
    for dl_ in dl:
        if dl_ not in drugs_columns:
            drugs_columns.append(dl_)

In [518]:
len(drugs_columns)

152

### all drugs

In [519]:
drugs

Unnamed: 0,idx,drugs_date,RECIPE,DATE,days_btw_drug_kt,drug_list
0,598671,2020-04-20,рецепт 00д4510052223: azithromycinum - применя...,2020-04-20,0,[азитромицин]
1,596802,2020-04-20,рецепт 00д4510051283: azithromycinum - применя...,2020-04-20,0,[азитромицин]
2,528107,2020-04-20,рецепт 00д4510052318: levofloxacinum - применя...,2020-04-20,0,[левофлоксацин]
3,528107,2020-04-20,рецепт 00д4510052318: levofloxacinum - применя...,2020-04-24,4,[левофлоксацин]
4,528107,2020-04-20,рецепт 00д4510052318: levofloxacinum - применя...,2020-06-30,71,[левофлоксацин]
...,...,...,...,...,...,...
155867,760060,2020-07-13,рецепт 00д4510824134: азитромицин - применять ...,2020-05-18,-56,[азитромицин]
155868,1238902,2020-07-13,левофлоксацин/левофлокс табл. п/пл/об. 500 мг ...,2020-06-24,-19,[левофлоксацин]
155869,1238902,2020-07-13,рецепт 00д4510819971: levofloxacinum - применя...,2020-06-24,-19,[левофлоксацин]
155870,480393,2020-07-13,рецепт 00д4510824943: levofloxacinum - применя...,2020-06-24,-19,[левофлоксацин]


In [529]:
drugs_res = drugs[['idx', 'DATE']].copy()
drugs_res.loc[:, ['before_'+x for x in drugs_columns]] = 0

for i, (dl, db) in enumerate(zip(drugs.drug_list, drugs.days_btw_drug_kt)):
    for dl_ in dl:
        drugs_res.loc[i, 'before_'+dl_] += 1

In [530]:
drugs_res = drugs_res.groupby(['idx', 'DATE']).sum().reset_index().replace(0, np.nan)

In [532]:
drugs_res.DATE = pd.to_datetime(drugs_res.DATE)
data_train.DATE = pd.to_datetime(data_train.DATE)
data_test.DATE = pd.to_datetime(data_test.DATE)

In [534]:
data_train = pd.merge(data_train, drugs_res, on=['idx', 'DATE'], how='left')
# data_train[drugs_res.columns[2:]] = data_train[drugs_res.columns[2:]].fillna(0)

data_test = pd.merge(data_test, drugs_res, on=['idx', 'DATE'], how='left')
# data_test[drugs_res.columns[2:]] = data_test[drugs_res.columns[2:]].fillna(0)

In [535]:
print(data_train.shape)
print(data_test.shape)

(131989, 1667)
(33080, 1667)


### drugs before KT

In [536]:
drugs_before = drugs[drugs.days_btw_drug_kt > 0].reset_index(drop=True)

In [537]:
drugs_res = drugs_before[['idx', 'DATE']].copy()
drugs_res.loc[:, ['before_'+x for x in drugs_columns]] = 999

for i, (dl, db) in enumerate(zip(drugs_before.drug_list, drugs_before.days_btw_drug_kt)):
    for dl_ in dl:
        drugs_res.loc[i, 'before_'+dl_] = db

In [538]:
drugs_res = drugs_res.groupby(['idx', 'DATE']).min().reset_index().replace(999, np.nan)

In [539]:
drugs_res.DATE = pd.to_datetime(drugs_res.DATE)
data_train.DATE = pd.to_datetime(data_train.DATE)
data_test.DATE = pd.to_datetime(data_test.DATE)

In [540]:
data_train = pd.merge(data_train, drugs_res, on=['idx', 'DATE'], how='left')
# data_train[drugs_res.columns[2:]] = data_train[drugs_res.columns[2:]].fillna(0)

data_test = pd.merge(data_test, drugs_res, on=['idx', 'DATE'], how='left')
# data_test[drugs_res.columns[2:]] = data_test[drugs_res.columns[2:]].fillna(0)

In [541]:
print(data_train.shape)
print(data_test.shape)

(131989, 1819)
(33080, 1819)


### drugs after KT

In [542]:
drugs_after = drugs[drugs.days_btw_drug_kt < 0].reset_index(drop=True)

In [543]:
drugs_res = drugs_after[['idx', 'DATE']].copy()
drugs_res.loc[:, ['after_'+x for x in drugs_columns]] = 999

for i, (dl, db) in enumerate(zip(drugs_after.drug_list, drugs_after.days_btw_drug_kt)):
    for dl_ in dl:
        drugs_res.loc[i, 'after_'+dl_] = abs(db)

In [544]:
drugs_res = drugs_res.groupby(['idx', 'DATE']).min().reset_index().replace(999, np.nan)

In [545]:
drugs_res.DATE = pd.to_datetime(drugs_res.DATE)
# data_train.DATE = pd.to_datetime(data_train.DATE)
# data_test.DATE = pd.to_datetime(data_test.DATE)

In [546]:
data_train = pd.merge(data_train, drugs_res, on=['idx', 'DATE'], how='left')
# data_train[drugs_res.columns[2:]] = data_train[drugs_res.columns[2:]].fillna(0)

data_test = pd.merge(data_test, drugs_res, on=['idx', 'DATE'], how='left')
# data_test[drugs_res.columns[2:]] = data_test[drugs_res.columns[2:]].fillna(0)

In [547]:
print(data_train.shape)
print(data_test.shape)

(131989, 1971)
(33080, 1971)


### drugs in KT

In [548]:
drugs_in = drugs[drugs.days_btw_drug_kt == 0].reset_index(drop=True)

In [549]:
drugs_res = drugs_in[['idx', 'DATE']].copy()
drugs_res.loc[:, ['in_'+x for x in drugs_columns]] = 0

for i, dl in enumerate(drugs_in.drug_list):
    for dl_ in dl:
        drugs_res.loc[i, 'in_'+dl_] += 1

In [550]:
drugs_res = drugs_res.groupby(['idx', 'DATE']).sum().reset_index()

In [551]:
drugs_res.DATE = pd.to_datetime(drugs_res.DATE)
# data_train.DATE = pd.to_datetime(data_train.DATE)
# data_test.DATE = pd.to_datetime(data_test.DATE)

In [552]:
data_train = pd.merge(data_train, drugs_res, on=['idx', 'DATE'], how='left')
# data_train[drugs_res.columns[2:]] = data_train[drugs_res.columns[2:]].fillna(0)

data_test = pd.merge(data_test, drugs_res, on=['idx', 'DATE'], how='left')
# data_test[drugs_res.columns[2:]] = data_test[drugs_res.columns[2:]].fillna(0)

In [553]:
print(data_train.shape)
print(data_test.shape)

(131989, 2123)
(33080, 2123)


In [554]:
# data_train.to_csv('temp_data/data_train.csv', index=False)
# data_test.to_csv('temp_data/data_test.csv', index=False)

### drugs in week KT

In [555]:
drugs_week = drugs[np.abs(drugs.days_btw_drug_kt) <= 7].reset_index(drop=True)

In [556]:
drugs_res = drugs_week[['idx', 'DATE']].copy()
drugs_res.loc[:, ['week_'+x for x in drugs_columns]] = 999

for i, (dl, db) in enumerate(zip(drugs_week.drug_list, drugs_week.days_btw_drug_kt)):
    for dl_ in dl:
        drugs_res.loc[i, 'week_'+dl_] = abs(db)

In [557]:
drugs_res = drugs_res.groupby(['idx', 'DATE']).min().reset_index().replace(999, np.nan)

In [558]:
drugs_res.DATE = pd.to_datetime(drugs_res.DATE)
# data_train.DATE = pd.to_datetime(data_train.DATE)
# data_test.DATE = pd.to_datetime(data_test.DATE)

In [559]:
data_train = pd.merge(data_train, drugs_res, on=['idx', 'DATE'], how='left')
# data_train[drugs_res.columns[2:]] = data_train[drugs_res.columns[2:]].fillna(0)

data_test = pd.merge(data_test, drugs_res, on=['idx', 'DATE'], how='left')
# data_test[drugs_res.columns[2:]] = data_test[drugs_res.columns[2:]].fillna(0)

In [560]:
print(data_train.shape)
print(data_test.shape)

(131989, 2275)
(33080, 2275)


### drugs in month KT

In [561]:
drugs_month = drugs[np.abs(drugs.days_btw_drug_kt) <= 30].reset_index(drop=True)

In [562]:
drugs_res = drugs_month[['idx', 'DATE']].copy()
drugs_res.loc[:, ['month_'+x for x in drugs_columns]] = 999

for i, (dl, db) in enumerate(zip(drugs_month.drug_list, drugs_month.days_btw_drug_kt)):
    for dl_ in dl:
        drugs_res.loc[i, 'month_'+dl_] = abs(db)

In [563]:
drugs_res = drugs_res.groupby(['idx', 'DATE']).min().reset_index().replace(999, np.nan)

In [564]:
drugs_res.DATE = pd.to_datetime(drugs_res.DATE)
# data_train.DATE = pd.to_datetime(data_train.DATE)
# data_test.DATE = pd.to_datetime(data_test.DATE)

In [565]:
data_train = pd.merge(data_train, drugs_res, on=['idx', 'DATE'], how='left')
# data_train[drugs_res.columns[2:]] = data_train[drugs_res.columns[2:]].fillna(0)

data_test = pd.merge(data_test, drugs_res, on=['idx', 'DATE'], how='left')
# data_test[drugs_res.columns[2:]] = data_test[drugs_res.columns[2:]].fillna(0)

In [566]:
print(data_train.shape)
print(data_test.shape)

(131989, 2427)
(33080, 2427)


## Работа с covid

- dis_date добавлю позже

In [567]:
# 1.Файл с данными результатов обследований на Ковид (описаны значимые поля):
# IDX - анонимизированный идентификатор пациента. по нему файлы связаны между собой.
# gender - пол пациента
# birth_dt - Дата рождения
# dis_date - Дата начала заболевания (со слов пациента или дата обращения за анализом)
# samples_type_id - вид биоматериала
# get_date - дата взятия биоматериала
# IgG, IgM - значения уровня иммуноглобулинов (для биоматериала Кровь)
# Samples_result_id - значение Обнаружено/не обнаружено (для биоматериала Мазок)
# Mu_type - тип медучреждения, где выполнялось взятие биоматериала
covid = pd.read_csv('data/data_for_science_2020-07-27_w_mu_type.csv', sep=';')
covid.head(1)

Unnamed: 0,idx,pd_id,person_id,ds_id,gender,birth_dt,mkb10_code,mkb10_name,dis_date,samples_type_id,get_date_at,receive_date_at,result_date_at,IgG,IgM,samples_result_id,mu_type
0,0,1562717,1460618,1652919,М,1995-11-26,Z11.5,Специальное скрининговое обследование с целью ...,2020-05-17,"Кровь, цельная (сыворотка)",2020-05-17,2020-05-17,2020-05-17,0.18,0.55,,Городская поликлиника


### Обрабатываю информацию о возрасте и поле

In [568]:
personal_info = covid[['idx', 'gender', 'birth_dt']].copy().drop_duplicates('idx')
personal_info = pd.concat([personal_info, pd.get_dummies(personal_info.gender)], axis=1).drop(['gender'], axis=1)

In [569]:
data_train = pd.merge(data_train, personal_info,
                      on='idx', how='left')
data_train['DATE'] = pd.to_datetime(data_train.DATE)
data_train['birth_dt'] = pd.to_datetime(data_train.birth_dt)

data_train['age'] = data_train[['DATE', 'birth_dt']].apply(lambda x: relativedelta(x[0], x[1]).years, axis=1)
data_train = data_train.drop(['birth_dt'], axis=1)

In [570]:
data_test = pd.merge(data_test, personal_info,
                     on='idx', how='left')
data_test['DATE'] = pd.to_datetime(data_test.DATE)
data_test['birth_dt'] = pd.to_datetime(data_test.birth_dt)

data_test['age'] = data_test[['DATE', 'birth_dt']].apply(lambda x: relativedelta(x[0], x[1]).years, axis=1)
data_test = data_test.drop(['birth_dt'], axis=1)

In [571]:
print(data_train.shape)
print(data_test.shape)

(131989, 2430)
(33080, 2430)


### Добавляю длительность болезни до КТ

In [572]:
dis_date_data = pd.merge(covid[['idx', 'dis_date']],
                         pd.concat([data_train[['idx', 'DATE']], data_test[['idx', 'DATE']]]), 
                         on='idx', how='left')

def to_datetime(x):
    try:
        return pd.to_datetime(x)
    except:
        return np.nan
dis_date_data.dis_date = dis_date_data.dis_date.apply(lambda x: to_datetime(x))

In [573]:
dis_date_data_before = dis_date_data[(dis_date_data.DATE.notnull())
                                     &(dis_date_data.dis_date < dis_date_data.DATE)].reset_index(drop=True)

In [574]:
dis_date_data_before['dis_duration'] = (dis_date_data_before.DATE 
                                        - dis_date_data_before.dis_date).apply(lambda x: x.days)

In [575]:
dis_date_res = dis_date_data_before.groupby(['idx', 'DATE']).dis_duration.mean().reset_index() \
                            .rename(columns={'dis_duration': 'dis_duration_mean'})

dis_date_res['dis_duration_min'] = dis_date_data_before.groupby(['idx', 'DATE']).dis_duration.min().values
dis_date_res['dis_duration_max'] = dis_date_data_before.groupby(['idx', 'DATE']).dis_duration.max().values

In [576]:
dis_date_res

Unnamed: 0,idx,DATE,dis_duration_mean,dis_duration_min,dis_duration_max
0,6,2020-05-16,9.500000,9,10
1,11,2020-05-16,9.333333,9,10
2,21,2020-05-16,14.000000,14,14
3,28,2020-05-16,17.666667,17,19
4,51,2020-05-16,6.000000,6,6
...,...,...,...,...,...
131494,1297335,2020-07-09,8.000000,8,8
131495,1297345,2020-07-03,1.000000,1,1
131496,1297357,2020-07-05,3.000000,3,3
131497,1297453,2020-07-02,1.000000,1,1


In [577]:
data_train = pd.merge(data_train, dis_date_res, on=['idx', 'DATE'], how='left')

data_test.DATE = pd.to_datetime(data_test.DATE)
data_test = pd.merge(data_test, dis_date_res, on=['idx', 'DATE'], how='left')

In [578]:
print(data_train.shape)
print(data_test.shape)

(131989, 2433)
(33080, 2433)


### Добавляю время до болезни после КТ

In [579]:
dis_date_data_after = dis_date_data[(dis_date_data.DATE.notnull())
                                     &(dis_date_data.dis_date >= dis_date_data.DATE)].reset_index(drop=True)

In [580]:
dis_date_data_after['dis_duration'] = (dis_date_data_after.DATE 
                                        - dis_date_data_after.dis_date).apply(lambda x: x.days)

In [581]:
dis_date_res = dis_date_data_after.groupby(['idx', 'DATE']).dis_duration.mean().reset_index() \
                            .rename(columns={'dis_duration': 'after_dis_duration_mean'})

dis_date_res['after_dis_duration_min'] = dis_date_data_after.groupby(['idx', 'DATE']).dis_duration.min().values
dis_date_res['after_dis_duration_max'] = dis_date_data_after.groupby(['idx', 'DATE']).dis_duration.max().values

In [582]:
dis_date_res

Unnamed: 0,idx,DATE,after_dis_duration_mean,after_dis_duration_min,after_dis_duration_max
0,6,2020-05-16,0.00,0,0
1,11,2020-05-16,0.00,0,0
2,28,2020-05-16,-21.00,-21,-21
3,53,2020-05-05,-16.00,-43,-8
4,60,2020-05-16,-9.25,-18,-3
...,...,...,...,...,...
108910,1297419,2020-07-01,0.00,0,0
108911,1297421,2020-07-01,0.00,0,0
108912,1297424,2020-07-01,0.00,0,0
108913,1297498,2020-07-02,0.00,0,0


In [583]:
data_train = pd.merge(data_train, dis_date_res, on=['idx', 'DATE'], how='left')

data_test.DATE = pd.to_datetime(data_test.DATE)
data_test = pd.merge(data_test, dis_date_res, on=['idx', 'DATE'], how='left')

In [584]:
print(data_train.shape)
print(data_test.shape)

(131989, 2436)
(33080, 2436)


### Разделение covid

In [585]:
# 1.Файл с данными результатов обследований на Ковид (описаны значимые поля):
# IDX - анонимизированный идентификатор пациента. по нему файлы связаны между собой.
# gender - пол пациента
# birth_dt - Дата рождения
# dis_date - Дата начала заболевания (со слов пациента или дата обращения за анализом)
# samples_type_id - вид биоматериала
# get_date - дата взятия биоматериала
# IgG, IgM - значения уровня иммуноглобулинов (для биоматериала Кровь)
# Samples_result_id - значение Обнаружено/не обнаружено (для биоматериала Мазок)
# Mu_type - тип медучреждения, где выполнялось взятие биоматериала
# covid = pd.read_csv('data/data_for_science_2020-07-27_w_mu_type.csv', sep=';')
covid_data = covid[['idx', 'samples_type_id', 
                    'get_date_at', 'IgG', 'IgM', 
                    'samples_result_id', 'mu_type']].copy()

covid_data = covid_data[covid_data.idx.isin(pd.concat([data_train, data_test]).idx.unique())].reset_index(drop=True)

covid_data['get_date_at'] = pd.to_datetime(covid_data['get_date_at'])

In [586]:
kt_train[kt_train.idx == kt_train.idx.unique()[4]]

Unnamed: 0,idx,DATE,KT_RESULT
4,689401,2020-04-18,КТ-2
23156,689401,2020-05-20,КТ-0


In [587]:
covid_data[covid_data.idx == 689401].sort_values('get_date_at')

Unnamed: 0,idx,samples_type_id,get_date_at,IgG,IgM,samples_result_id,mu_type
424911,689401,Мазок/отделяемое из носоглотки и ротоглотки,2020-04-29,,,Не обнаружено,Городская поликлиника


In [588]:
covid_data.samples_type_id.unique()

array(['Кровь, цельная (сыворотка)',
       'Мазок/отделяемое из носоглотки и ротоглотки',
       'Аутопсийный материал', 'Аспират из трахеи',
       'Бронхоальвеолярный лаваж', 'Мокрота'], dtype=object)

In [589]:
covid_data.mu_type.unique()

array(['Городская поликлиника', 'Городская больница',
       'Станция скорой помощи', 'Иное', 'Поликлиника', 'Больница',
       'Федеральная медицинская организация', 'Центр',
       'Детская городская поликлиника', 'КВД', 'Диспансер'], dtype=object)

In [590]:
# 'Кровь, цельная (сыворотка)' - IgG и IgM только для этого анализа
# При этом samples_result_id всегда нулевой

# А для остальных анализов samples_result_id всегда одно из трех значений 
# ['Не обнаружено', 'ОБНАРУЖЕНО', 'Подозрительный']

covid_data_IgGM = covid_data[covid_data.samples_type_id == 'Кровь, цельная (сыворотка)'].reset_index(drop=True)
covid_data_IgGM = covid_data_IgGM.drop(['samples_type_id', 'samples_result_id'], axis=1)

covid_data_other = covid_data[covid_data.samples_type_id != 'Кровь, цельная (сыворотка)'].reset_index(drop=True)
covid_data_other = covid_data_other.drop(['IgG', 'IgM'], axis=1)

In [591]:
covid_data_IgGM.head()

Unnamed: 0,idx,get_date_at,IgG,IgM,mu_type
0,6,2020-05-16,8.38,0.93,Городская поликлиника
1,11,2020-05-16,3.85,0.74,Городская поликлиника
2,21,2020-05-16,0.63,0.36,Городская поликлиника
3,28,2020-06-18,0.1,0.21,Городская поликлиника
4,28,2020-05-16,0.09,0.25,Городская поликлиника


In [592]:
covid_data_other.head()

Unnamed: 0,idx,samples_type_id,get_date_at,samples_result_id,mu_type
0,6,Мазок/отделяемое из носоглотки и ротоглотки,2020-05-07,ОБНАРУЖЕНО,Городская поликлиника
1,6,Мазок/отделяемое из носоглотки и ротоглотки,2020-05-16,Не обнаружено,Городская поликлиника
2,6,Мазок/отделяемое из носоглотки и ротоглотки,2020-05-19,Не обнаружено,Городская поликлиника
3,11,Мазок/отделяемое из носоглотки и ротоглотки,2020-05-07,ОБНАРУЖЕНО,Городская поликлиника
4,11,Мазок/отделяемое из носоглотки и ротоглотки,2020-05-16,Не обнаружено,Городская поликлиника


### covid_data_IgGM

In [593]:
covid_data_IgGM.shape

(119213, 5)

In [594]:
covid_data_IgGM = pd.merge(covid_data_IgGM, 
                           pd.concat([data_train[['idx', 'DATE']], data_test[['idx', 'DATE']]]), 
                           on='idx', how='left')

#### IgGM before

In [595]:
covid_data_IgGM_before = covid_data_IgGM[covid_data_IgGM.get_date_at < pd.to_datetime(covid_data_IgGM.DATE)].reset_index(drop=True)

In [596]:
covid_data_IgGM_before.shape

(19237, 6)

In [597]:
# covid_data_IgGM_good = covid_data_IgGM_before.copy()
# covid_data_IgGM_good['get_date_at'] = pd.to_datetime(covid_data_IgGM_good['get_date_at'])
# covid_data_IgGM_good['DATE'] = pd.to_datetime(covid_data_IgGM_good['DATE'])
# covid_data_IgGM_good['date_diff'] = (covid_data_IgGM_good.DATE - covid_data_IgGM_good.get_date_at).apply(lambda x: x.days)

In [598]:
# covid_data_IgGM_good_ = covid_data_IgGM_good.groupby(['idx', 'DATE']).date_diff.min().reset_index()
# covid_data_IgGM_good_.shape

In [599]:
# covid_data_IgGM_good = pd.merge(covid_data_IgGM_good_, covid_data_IgGM_good, 
#                                 on=['idx', 'DATE', 'date_diff'], how='left')

In [600]:
# covid_data_IgGM_good.shape

In [601]:
covid_data_IgGM_before.groupby(['idx', 'DATE', 'mu_type']).agg({'IgG': 'mean', 'IgM': 'mean'}).reset_index()

Unnamed: 0,idx,DATE,mu_type,IgG,IgM
0,60,2020-05-17,Городская поликлиника,1.23,0.43
1,92,2020-06-29,Городская поликлиника,0.19,2.38
2,160,2020-05-19,Городская поликлиника,0.27,1.43
3,173,2020-05-17,Городская поликлиника,136.41,6.48
4,177,2020-06-06,Городская поликлиника,122.39,3.39
...,...,...,...,...,...
16082,547862,2020-07-07,Городская поликлиника,2.64,3.79
16083,547865,2020-07-10,Городская поликлиника,26.66,6.21
16084,548482,2020-07-04,Городская поликлиника,1.11,0.35
16085,548541,2020-07-05,Городская поликлиника,6.33,10.54


In [602]:
covid_data_IgGM_good_ = covid_data_IgGM_before.groupby(['idx', 'DATE', 'mu_type']) \
                                            .agg({'IgG': 'mean', 'IgM': 'mean'}).reset_index() \
                                            .rename(columns={'IgG': 'IgG_nearest_mean', 'IgM': 'IgM_nearest_mean'})

covid_data_IgGM_good_['IgG_min'] = covid_data_IgGM_before.groupby(['idx', 'DATE', 'mu_type']).IgG.min().values
covid_data_IgGM_good_['IgG_max'] = covid_data_IgGM_before.groupby(['idx', 'DATE', 'mu_type']).IgG.max().values

covid_data_IgGM_good_['IgM_min'] = covid_data_IgGM_before.groupby(['idx', 'DATE', 'mu_type']).IgM.min().values
covid_data_IgGM_good_['IgM_min'] = covid_data_IgGM_before.groupby(['idx', 'DATE', 'mu_type']).IgM.max().values

covid_data_IgGM_good_['IgG_mean'] = covid_data_IgGM_before.groupby(['idx', 'DATE', 'mu_type']).IgG.mean().values
covid_data_IgGM_good_['IgM_mean'] = covid_data_IgGM_before.groupby(['idx', 'DATE', 'mu_type']).IgM.mean().values

In [603]:
covid_data_IgGM_good_

Unnamed: 0,idx,DATE,mu_type,IgG_nearest_mean,IgM_nearest_mean,IgG_min,IgG_max,IgM_min,IgG_mean,IgM_mean
0,60,2020-05-17,Городская поликлиника,1.23,0.43,1.23,1.23,0.43,1.23,0.43
1,92,2020-06-29,Городская поликлиника,0.19,2.38,0.19,0.19,2.38,0.19,2.38
2,160,2020-05-19,Городская поликлиника,0.27,1.43,0.27,0.27,1.43,0.27,1.43
3,173,2020-05-17,Городская поликлиника,136.41,6.48,136.41,136.41,6.48,136.41,6.48
4,177,2020-06-06,Городская поликлиника,122.39,3.39,122.39,122.39,3.39,122.39,3.39
...,...,...,...,...,...,...,...,...,...,...
16082,547862,2020-07-07,Городская поликлиника,2.64,3.79,2.64,2.64,3.79,2.64,3.79
16083,547865,2020-07-10,Городская поликлиника,26.66,6.21,26.66,26.66,6.21,26.66,6.21
16084,548482,2020-07-04,Городская поликлиника,1.11,0.35,1.11,1.11,0.35,1.11,0.35
16085,548541,2020-07-05,Городская поликлиника,6.33,10.54,6.33,6.33,10.54,6.33,10.54


In [604]:
temp_reses = []

for col in covid_data_IgGM_good_.columns[3:]:

    temp_res = pd.get_dummies(covid_data_IgGM_good_.mu_type)
    temp_res.columns = [col+'_'+x for x in temp_res.columns]
    temp_res.values[temp_res != 0] = covid_data_IgGM_good_[col]
    temp_reses.append(temp_res)
    
covid_data_IgGM_res = pd.concat(temp_reses, axis=1)
covid_data_IgGM_res[['idx', 'DATE']] = covid_data_IgGM_good_[['idx', 'DATE']]

In [605]:
covid_data_IgGM_res = covid_data_IgGM_res.groupby(['idx', 'DATE']).max()

In [606]:
data_train = pd.merge(data_train, covid_data_IgGM_res, on=['idx', 'DATE'], how='left')

data_test.DATE = pd.to_datetime(data_test.DATE)
data_test = pd.merge(data_test, covid_data_IgGM_res, on=['idx', 'DATE'], how='left')

In [607]:
print(data_train.shape)
print(data_test.shape)

(131989, 2506)
(33080, 2506)


#### IgGM after

In [608]:
covid_data_IgGM_after = covid_data_IgGM[covid_data_IgGM.get_date_at > pd.to_datetime(covid_data_IgGM.DATE)] \
                                        .reset_index(drop=True)

In [609]:
covid_data_IgGM_after.shape

(78696, 6)

In [610]:
# covid_data_IgGM_good = covid_data_IgGM_after.copy()
# covid_data_IgGM_good['get_date_at'] = pd.to_datetime(covid_data_IgGM_good['get_date_at'])
# covid_data_IgGM_good['DATE'] = pd.to_datetime(covid_data_IgGM_good['DATE'])
# covid_data_IgGM_good['date_diff'] = (covid_data_IgGM_good.DATE - covid_data_IgGM_good.get_date_at).apply(lambda x: x.days)

In [611]:
# covid_data_IgGM_good_ = covid_data_IgGM_good.groupby(['idx', 'DATE']).date_diff.min().reset_index()
# covid_data_IgGM_good_.shape

In [612]:
# covid_data_IgGM_good = pd.merge(covid_data_IgGM_good_, covid_data_IgGM_good, 
#                                 on=['idx', 'DATE', 'date_diff'], how='left')

In [613]:
# covid_data_IgGM_good.shape

In [614]:
# covid_data_IgGM_good_ = covid_data_IgGM_good.groupby(['idx', 'DATE']) \
#                                             .agg({'IgG': 'mean', 'IgM': 'mean'}).reset_index() \
#                                             .rename(columns={'IgG': 'after_IgG_nearest_mean', 'IgM': 'after_IgM_nearest_mean'})

# covid_data_IgGM_good_['after_IgG_min'] = covid_data_IgGM_after.groupby(['idx', 'DATE']).IgG.min().values
# covid_data_IgGM_good_['after_IgG_max'] = covid_data_IgGM_after.groupby(['idx', 'DATE']).IgG.max().values

# covid_data_IgGM_good_['after_IgM_min'] = covid_data_IgGM_after.groupby(['idx', 'DATE']).IgM.min().values
# covid_data_IgGM_good_['after_IgM_min'] = covid_data_IgGM_after.groupby(['idx', 'DATE']).IgM.max().values

# covid_data_IgGM_good_['after_IgG_mean'] = covid_data_IgGM_after.groupby(['idx', 'DATE']).IgG.mean().values
# covid_data_IgGM_good_['after_IgM_mean'] = covid_data_IgGM_after.groupby(['idx', 'DATE']).IgM.mean().values

In [615]:
covid_data_IgGM_good_ = covid_data_IgGM_after.groupby(['idx', 'DATE', 'mu_type']) \
                                            .agg({'IgG': 'mean', 'IgM': 'mean'}).reset_index() \
                                            .rename(columns={'IgG': 'after_IgG_nearest_mean', 'IgM': 'after_IgM_nearest_mean'})

covid_data_IgGM_good_['after_IgG_min'] = covid_data_IgGM_after.groupby(['idx', 'DATE', 'mu_type']).IgG.min().values
covid_data_IgGM_good_['after_IgG_max'] = covid_data_IgGM_after.groupby(['idx', 'DATE', 'mu_type']).IgG.max().values

covid_data_IgGM_good_['after_IgM_min'] = covid_data_IgGM_after.groupby(['idx', 'DATE', 'mu_type']).IgM.min().values
covid_data_IgGM_good_['after_IgM_min'] = covid_data_IgGM_after.groupby(['idx', 'DATE', 'mu_type']).IgM.max().values

covid_data_IgGM_good_['after_IgG_mean'] = covid_data_IgGM_after.groupby(['idx', 'DATE', 'mu_type']).IgG.mean().values
covid_data_IgGM_good_['after_IgM_mean'] = covid_data_IgGM_after.groupby(['idx', 'DATE', 'mu_type']).IgM.mean().values

In [616]:
covid_data_IgGM_good_

Unnamed: 0,idx,DATE,mu_type,after_IgG_nearest_mean,after_IgM_nearest_mean,after_IgG_min,after_IgG_max,after_IgM_min,after_IgG_mean,after_IgM_mean
0,28,2020-05-16,Городская поликлиника,0.095,0.33,0.09,0.10,0.45,0.095,0.33
1,53,2020-05-05,Городская поликлиника,76.445,1.65,68.04,84.85,2.40,76.445,1.65
2,83,2020-05-15,Городская поликлиника,0.580,0.23,0.58,0.58,0.23,0.580,0.23
3,89,2020-05-17,Городская поликлиника,90.940,5.98,90.94,90.94,5.98,90.940,5.98
4,91,2020-05-17,Городская поликлиника,59.060,0.91,59.06,59.06,0.91,59.060,0.91
...,...,...,...,...,...,...,...,...,...,...
56911,548868,2020-06-18,Городская поликлиника,5.760,0.36,5.76,5.76,0.36,5.760,0.36
56912,548949,2020-06-30,Городская поликлиника,114.910,2.02,114.91,114.91,2.02,114.910,2.02
56913,548963,2020-05-08,Городская поликлиника,7.900,0.34,7.90,7.90,0.34,7.900,0.34
56914,548991,2020-05-04,Городская поликлиника,1.810,0.36,1.81,1.81,0.36,1.810,0.36


In [617]:
temp_reses = []

for col in covid_data_IgGM_good_.columns[3:]:

    temp_res = pd.get_dummies(covid_data_IgGM_good_.mu_type)
    temp_res.columns = [col+'_'+x for x in temp_res.columns]
    temp_res.values[temp_res != 0] = covid_data_IgGM_good_[col]
    temp_reses.append(temp_res)
    
covid_data_IgGM_res = pd.concat(temp_reses, axis=1)
covid_data_IgGM_res[['idx', 'DATE']] = covid_data_IgGM_good_[['idx', 'DATE']]

In [618]:
covid_data_IgGM_res = covid_data_IgGM_res.groupby(['idx', 'DATE']).max()

In [619]:
data_train = pd.merge(data_train, covid_data_IgGM_res, on=['idx', 'DATE'], how='left')

data_test.DATE = pd.to_datetime(data_test.DATE)
data_test = pd.merge(data_test, covid_data_IgGM_res, on=['idx', 'DATE'], how='left')

In [620]:
print(data_train.shape)
print(data_test.shape)

(131989, 2576)
(33080, 2576)


#### IgGM in

In [621]:
covid_data_IgGM_in = covid_data_IgGM[covid_data_IgGM.get_date_at == pd.to_datetime(covid_data_IgGM.DATE)] \
                                        .reset_index(drop=True)

In [622]:
covid_data_IgGM_good_ = covid_data_IgGM_in.groupby(['idx', 'DATE', 'mu_type']) \
                                            .agg({'IgG': 'mean', 'IgM': 'mean'}).reset_index() \
                                            .rename(columns={'IgG': 'in_IgG_nearest_mean', 'IgM': 'in_IgM_nearest_mean'})

covid_data_IgGM_good_['in_IgG_min'] = covid_data_IgGM_in.groupby(['idx', 'DATE', 'mu_type']).IgG.min().values
covid_data_IgGM_good_['in_IgG_max'] = covid_data_IgGM_in.groupby(['idx', 'DATE', 'mu_type']).IgG.max().values

covid_data_IgGM_good_['in_IgM_min'] = covid_data_IgGM_in.groupby(['idx', 'DATE', 'mu_type']).IgM.min().values
covid_data_IgGM_good_['in_IgM_min'] = covid_data_IgGM_in.groupby(['idx', 'DATE', 'mu_type']).IgM.max().values

covid_data_IgGM_good_['in_IgG_mean'] = covid_data_IgGM_in.groupby(['idx', 'DATE', 'mu_type']).IgG.mean().values
covid_data_IgGM_good_['in_IgM_mean'] = covid_data_IgGM_in.groupby(['idx', 'DATE', 'mu_type']).IgM.mean().values

In [623]:
covid_data_IgGM_good_

Unnamed: 0,idx,DATE,mu_type,in_IgG_nearest_mean,in_IgM_nearest_mean,in_IgG_min,in_IgG_max,in_IgM_min,in_IgG_mean,in_IgM_mean
0,6,2020-05-16,Городская поликлиника,8.38,0.93,8.38,8.38,0.93,8.38,0.93
1,11,2020-05-16,Городская поликлиника,3.85,0.74,3.85,3.85,0.74,3.85,0.74
2,21,2020-05-16,Городская поликлиника,0.63,0.36,0.63,0.63,0.36,0.63,0.36
3,28,2020-05-16,Городская поликлиника,0.09,0.25,0.09,0.09,0.25,0.09,0.25
4,51,2020-05-16,Городская поликлиника,1.38,0.21,1.38,1.38,0.21,1.38,0.21
...,...,...,...,...,...,...,...,...,...,...
54958,548476,2020-07-02,Городская поликлиника,2.69,0.54,2.69,2.69,0.54,2.69,0.54
54959,548566,2020-07-02,Городская поликлиника,0.93,1.71,0.93,0.93,1.71,0.93,1.71
54960,548710,2020-07-02,Городская поликлиника,158.88,0.77,158.88,158.88,0.77,158.88,0.77
54961,548712,2020-07-02,Городская поликлиника,89.47,1.81,89.47,89.47,1.81,89.47,1.81


In [624]:
temp_reses = []

for col in covid_data_IgGM_good_.columns[3:]:

    temp_res = pd.get_dummies(covid_data_IgGM_good_.mu_type)
    temp_res.columns = [col+'_'+x for x in temp_res.columns]
    temp_res.values[temp_res != 0] = covid_data_IgGM_good_[col]
    temp_reses.append(temp_res)
    
covid_data_IgGM_res = pd.concat(temp_reses, axis=1)
covid_data_IgGM_res[['idx', 'DATE']] = covid_data_IgGM_good_[['idx', 'DATE']]

In [625]:
covid_data_IgGM_res = covid_data_IgGM_res.groupby(['idx', 'DATE']).max()

In [626]:
data_train = pd.merge(data_train, covid_data_IgGM_res, on=['idx', 'DATE'], how='left')

data_test.DATE = pd.to_datetime(data_test.DATE)
data_test = pd.merge(data_test, covid_data_IgGM_res, on=['idx', 'DATE'], how='left')

In [627]:
print(data_train.shape)
print(data_test.shape)

(131989, 2625)
(33080, 2625)


### covid_data_other

In [628]:
samples_result_id_replace = {'ОБНАРУЖЕНО': 3, 'Не обнаружено': 0, 'Подозрительный': 1}
covid_data_other = covid_data_other.replace({'samples_result_id': samples_result_id_replace})

In [629]:
covid_data_other = pd.merge(covid_data_other, 
                            pd.concat([data_train[['idx', 'DATE']], data_test[['idx', 'DATE']]]), 
                            on='idx', how='left')

covid_data_other.get_date_at = pd.to_datetime(covid_data_other.get_date_at)
covid_data_other.DATE = pd.to_datetime(covid_data_other.DATE)
covid_data_other['date_diff'] = (covid_data_other.DATE - covid_data_other.get_date_at).apply(lambda x: x.days)

In [630]:
covid_data_other['date_diff'].describe()

count    486060.000000
mean         -4.913465
std          17.750540
min         -75.000000
25%         -15.000000
50%          -3.000000
75%           1.000000
max          82.000000
Name: date_diff, dtype: float64

#### other all

In [639]:
all_covid_data_other = covid_data_other.copy()

In [640]:
all_covid_data_other['name'] = all_covid_data_other.apply(lambda x: str(x[3])+'_'+str(x[1]), axis=1)

In [644]:
all_covid_data_other = all_covid_data_other.groupby(['idx', 'DATE', 'name']).samples_result_id.count().reset_index()

In [645]:
all_covid_data_other.head()

Unnamed: 0,idx,DATE,name,samples_result_id
0,6,2020-05-16,0_Мазок/отделяемое из носоглотки и ротоглотки,2
1,6,2020-05-16,3_Мазок/отделяемое из носоглотки и ротоглотки,1
2,11,2020-05-16,0_Мазок/отделяемое из носоглотки и ротоглотки,2
3,11,2020-05-16,3_Мазок/отделяемое из носоглотки и ротоглотки,1
4,21,2020-05-16,0_Мазок/отделяемое из носоглотки и ротоглотки,1


In [649]:
temp_res = pd.get_dummies(all_covid_data_other.name)
temp_res.values[temp_res != 0] = all_covid_data_other.samples_result_id

temp_res[['idx', 'DATE']] = all_covid_data_other[['idx', 'DATE']]

In [652]:
temp_res = temp_res.groupby(['idx', 'DATE']).sum().reset_index()

In [654]:
data_train = pd.merge(data_train, temp_res, on=['idx', 'DATE'], how='left')

data_test = pd.merge(data_test, temp_res, on=['idx', 'DATE'], how='left')

In [655]:
print(data_train.shape)
print(data_test.shape)

(131989, 2635)
(33080, 2635)


#### other before

In [656]:
covid_data_other_before = covid_data_other[covid_data_other.date_diff > 0].reset_index(drop=True)

In [657]:
# covid_data_other_good = covid_data_other_before.groupby(['idx', 'DATE', 'samples_type_id']).date_diff.min().reset_index()
# covid_data_other_good = pd.merge(covid_data_other_good, covid_data_other_before,
#                                  on=['idx', 'DATE', 'samples_type_id', 'date_diff'], how='left')

In [658]:
covid_data_other_good = covid_data_other_before.groupby(['idx', 'DATE', 'samples_type_id']) \
                        .samples_result_id.max().reset_index().rename(columns={'samples_result_id': 'samples_result_id_max'})

covid_data_other_good['samples_result_id_min'] = covid_data_other_before.groupby(['idx', 'DATE', 'samples_type_id']) \
                                                                        .samples_result_id.min().values

covid_data_other_good['samples_result_id_count'] = covid_data_other_before.groupby(['idx', 'DATE', 'samples_type_id']) \
                                                                        .samples_result_id.count().values

In [659]:
temp_reses = []

for col in covid_data_other_good.columns[3:]:
    temp_res = pd.get_dummies(covid_data_other_good.samples_type_id)
    temp_res.values[temp_res != 0] = covid_data_other_good[col]
    temp_res.columns = ['before_covid_{}_'.format(col)+x for x in temp_res.columns]
    temp_reses.append(temp_res)
    
covid_data_other_res = pd.concat(temp_reses, axis=1)

covid_data_other_res[['idx', 'DATE']] = covid_data_other_good[['idx', 'DATE']]

In [660]:
covid_data_other_res.shape

(76512, 11)

In [661]:
covid_data_other_res = covid_data_other_res.groupby(['idx', 'DATE']).max().reset_index()

In [662]:
data_train = pd.merge(data_train, covid_data_other_res, on=['idx', 'DATE'], how='left')

data_test.DATE = pd.to_datetime(data_test.DATE)
data_test = pd.merge(data_test, covid_data_other_res, on=['idx', 'DATE'], how='left')

In [663]:
print(data_train.shape)
print(data_test.shape)

(131989, 2644)
(33080, 2644)


#### other after

In [664]:
covid_data_other_after = covid_data_other[covid_data_other.date_diff < 0].reset_index(drop=True)

In [665]:
covid_data_other_good = covid_data_other_after.groupby(['idx', 'DATE', 'samples_type_id']) \
                        .samples_result_id.max().reset_index().rename(columns={'samples_result_id': 'samples_result_id_max'})

covid_data_other_good['samples_result_id_min'] = covid_data_other_after.groupby(['idx', 'DATE', 'samples_type_id']) \
                                                                        .samples_result_id.min().values

covid_data_other_good['samples_result_id_count'] = covid_data_other_after.groupby(['idx', 'DATE', 'samples_type_id']) \
                                                                        .samples_result_id.count().values

In [666]:
temp_reses = []

for col in covid_data_other_good.columns[3:]:
    temp_res = pd.get_dummies(covid_data_other_good.samples_type_id)
    temp_res.values[temp_res != 0] = covid_data_other_good[col]
    temp_res.columns = ['before_covid_{}_'.format(col)+x for x in temp_res.columns]
    temp_reses.append(temp_res)
    
covid_data_other_res = pd.concat(temp_reses, axis=1)

covid_data_other_res[['idx', 'DATE']] = covid_data_other_good[['idx', 'DATE']]

In [667]:
covid_data_other_res.shape

(121564, 17)

In [668]:
covid_data_other_res = covid_data_other_res.groupby(['idx', 'DATE']).max().reset_index()

In [669]:
data_train = pd.merge(data_train, covid_data_other_res, on=['idx', 'DATE'], how='left')

data_test.DATE = pd.to_datetime(data_test.DATE)
data_test = pd.merge(data_test, covid_data_other_res, on=['idx', 'DATE'], how='left')

In [670]:
print(data_train.shape)
print(data_test.shape)

(131989, 2659)
(33080, 2659)


#### other in

In [671]:
covid_data_other_in = covid_data_other[covid_data_other.date_diff == 0].reset_index(drop=True)

In [672]:
covid_data_other_good = covid_data_other_in.groupby(['idx', 'DATE', 'samples_type_id']) \
                        .samples_result_id.max().reset_index().rename(columns={'samples_result_id': 'samples_result_id_max'})

covid_data_other_good['samples_result_id_min'] = covid_data_other_in.groupby(['idx', 'DATE', 'samples_type_id']) \
                                                                        .samples_result_id.min().values

covid_data_other_good['samples_result_id_count'] = covid_data_other_in.groupby(['idx', 'DATE', 'samples_type_id']) \
                                                                        .samples_result_id.count().values

In [673]:
temp_reses = []

for col in covid_data_other_good.columns[3:]:
    temp_res = pd.get_dummies(covid_data_other_good.samples_type_id)
    temp_res.values[temp_res != 0] = covid_data_other_good[col]
    temp_res.columns = ['before_covid_{}_'.format(col)+x for x in temp_res.columns]
    temp_reses.append(temp_res)
    
covid_data_other_res = pd.concat(temp_reses, axis=1)

covid_data_other_res[['idx', 'DATE']] = covid_data_other_good[['idx', 'DATE']]

In [674]:
covid_data_other_res.shape

(53155, 5)

In [675]:
covid_data_other_res = covid_data_other_res.groupby(['idx', 'DATE']).max().reset_index()

In [676]:
data_train = pd.merge(data_train, covid_data_other_res, on=['idx', 'DATE'], how='left')

data_test.DATE = pd.to_datetime(data_test.DATE)
data_test = pd.merge(data_test, covid_data_other_res, on=['idx', 'DATE'], how='left')

In [677]:
print(data_train.shape)
print(data_test.shape)

(131989, 2662)
(33080, 2662)


## Работа с questions

In [233]:
# 5. Файл-опросник
# idx - анонимизированный идентификатор пациента. по нему файлы связаны между собой.
# Остальные колонки - Название колонки - вопрос, значения в колонке - ответ
questions = pd.read_csv('data/data_for_science_questions_2020-07-27.csv', sep=';')

questions = pd.merge(questions, covid[['idx', 'pd_id', 'get_date_at']],
                     on=['idx', 'pd_id'], how='inner')
questions = questions.drop('pd_id', axis=1)
questions = questions[questions.idx.isin(pd.concat([data_train['idx'], data_test['idx']]).unique())]

questions = questions.fillna(-1)

questions_replace_dict = {'Да': 1, 'Нет': 0}

for col in questions.columns[1:-1]:
    questions = questions.replace({col: questions_replace_dict})
    questions[col] = questions[col].astype(int)
    
questions = questions.drop_duplicates().reset_index(drop=True)

questions.head(1)

Unnamed: 0,idx,Был ли контакт с пациентом больным Covid 19,Был ли подтверждённый случай заболевания Covid 19,Были ли симптомы ОРВИ на протяжении последних 5 месяцев,Были ли симптомы ковид в последние 5 месяцев?,Одышка,Повышение температуры,Потеря чувства обоняния,Слабость,Сухой кашель,get_date_at
0,206910,1,1,1,0,-1,-1,-1,-1,-1,2020-05-30


In [234]:
questions = pd.merge(questions, 
                     pd.concat([data_train[['idx', 'DATE']], data_test[['idx', 'DATE']]]),
                     on='idx', how='left')
questions = questions[questions.DATE.notnull()].reset_index(drop=True)

questions.get_date_at = pd.to_datetime(questions.get_date_at)
questions.DATE = pd.to_datetime(questions.DATE)

questions['days_btw_quest_kt'] = (questions.DATE - questions.get_date_at).apply(lambda x: x.days)

In [235]:
questions_columns = questions.columns[1:10]

### questions before KT

In [505]:
questions_before = questions[questions.days_btw_quest_kt > 0].reset_index(drop=True).copy()

In [506]:
questions_before = questions_before.groupby(['idx', 'DATE'])[questions_columns].max().reset_index()

In [507]:
data_train = pd.merge(data_train, questions_before, on=['idx', 'DATE'], how='left')
# data_train[questions_columns] = data_train[questions_columns].fillna(-1).astype(int)

data_test = pd.merge(data_test, questions_before, on=['idx', 'DATE'], how='left')
# data_test[questions_columns] = data_test[questions_columns].fillna(-1).astype(int)

In [508]:
print(data_train.shape)
print(data_test.shape)

(132289, 3144)
(33080, 3144)


### questions after KT

In [509]:
questions_after = questions[questions.days_btw_quest_kt < 0].reset_index(drop=True).copy()

In [510]:
questions_after = questions_after.groupby(['idx', 'DATE'])[questions_columns].max().reset_index()

In [511]:
questions_after_rename = {}
for col in questions_columns:
    questions_after_rename[col] = 'after_'+col

In [512]:
questions_after = questions_after.rename(columns=questions_after_rename)

In [513]:
data_train = pd.merge(data_train, questions_after, on=['idx', 'DATE'], how='left')
# data_train[list(questions_after_rename.values())] = data_train[list(questions_after_rename.values())].fillna(-1).astype(int)

data_test = pd.merge(data_test, questions_after, on=['idx', 'DATE'], how='left')
# data_test[list(questions_after_rename.values())] = data_test[list(questions_after_rename.values())].fillna(-1).astype(int)

In [514]:
print(data_train.shape)
print(data_test.shape)

(132289, 3153)
(33080, 3153)


### questions in KT

In [515]:
questions_in = questions[questions.days_btw_quest_kt == 0].reset_index(drop=True).copy()

In [516]:
questions_in = questions_in.groupby(['idx', 'DATE'])[questions_columns].max().reset_index()

In [517]:
questions_in_rename = {}
for col in questions_columns:
    questions_in_rename[col] = 'in_'+col

In [518]:
questions_in = questions_in.rename(columns=questions_in_rename)

In [519]:
data_train = pd.merge(data_train, questions_in, on=['idx', 'DATE'], how='left')
# data_train[list(questions_in_rename.values())] = data_train[list(questions_in_rename.values())].fillna(-1).astype(int)

data_test = pd.merge(data_test, questions_in, on=['idx', 'DATE'], how='left')
# data_test[list(questions_in_rename.values())] = data_test[list(questions_in_rename.values())].fillna(-1).astype(int)

In [520]:
print(data_train.shape)
print(data_test.shape)

(132289, 3162)
(33080, 3162)


# Построение модели Кэтбуст

In [678]:
data_train['month'] = data_train.DATE.apply(lambda x: x.month)
data_train['day'] = data_train.DATE.apply(lambda x: x.day)

data_test['month'] = data_test.DATE.apply(lambda x: x.month)
data_test['day'] = data_test.DATE.apply(lambda x: x.day)

In [679]:
# data_train.to_feather('temp_data/data_train.ftr')
# data_test.to_feather('temp_data/data_test.ftr')

In [5]:
data_train = pd.read_feather('temp_data/data_train.ftr')
data_test = pd.read_feather('temp_data/data_test.ftr')

In [6]:
# fip = pd.read_csv('fip.csv')
# cut_features = list(fip[fip.value >= 0.01].feature.values)
# print(fip.shape)
# len(cut_features)

#### 0

In [7]:
X_train, X_val, y_train, y_val = train_test_split(data_train[data_train.columns[3:]], data_train['KT_RESULT'],
                                                  stratify=data_train['KT_RESULT'], test_size=0.15,
                                                  random_state=17)

In [8]:
models = []

for train_index, test_index in StratifiedKFold(n_splits=8, random_state=17, shuffle=True).split(X_train, y_train):
    
    X_train_skf, y_train_skf = X_train.iloc[train_index], y_train.iloc[train_index]
    X_val_skf, y_val_skf = X_train.iloc[test_index], y_train.iloc[test_index]
    
    # X_train_skf = X_train_skf[cut_features]
    # X_val_skf = X_val_skf[cut_features]
    
    model = CatBoostClassifier(random_seed=17, verbose=200, 
                               auto_class_weights='Balanced',
                               loss_function='MultiClass',
                               class_names=[0, 1, 2, 3, 4], 
                               early_stopping_rounds=250,
                              )
    model.fit(X_train_skf, y_train_skf, 
              eval_set=(X_val_skf, y_val_skf))
    
    models.append(model)
    
    print('\n{}\n'.format(compete_metric(y_val_skf.values, model.predict(X_val_skf))[0]))

Learning rate set to 0.11835
0:	learn: 1.5478178	test: 1.5470620	best: 1.5470620 (0)	total: 1.35s	remaining: 22m 26s
200:	learn: 0.9659036	test: 1.1734941	best: 1.1729128 (189)	total: 2m 49s	remaining: 11m 13s
400:	learn: 0.8228372	test: 1.2153705	best: 1.1695491 (221)	total: 5m 30s	remaining: 8m 13s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 1.169549067
bestIteration = 221

Shrink model to first 222 iterations.

0.7572679640433846

Learning rate set to 0.11835
0:	learn: 1.5589127	test: 1.5597948	best: 1.5597948 (0)	total: 943ms	remaining: 15m 42s
200:	learn: 0.9633578	test: 1.1443196	best: 1.1441264 (199)	total: 2m 43s	remaining: 10m 51s
400:	learn: 0.8236621	test: 1.1518001	best: 1.1375109 (230)	total: 5m 23s	remaining: 8m 3s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 1.137510881
bestIteration = 230

Shrink model to first 231 iterations.

0.7563366322786989

Learning rate set to 0.11835
0:	learn: 1.5550655	test: 1.5547030	best: 1.554703

In [9]:
preds = []
for model in models:
    preds.append(model.predict_proba(X_val))

preds_default = []
for pred in np.mean(preds, axis=0):
    preds_default.append(np.where(pred == pred.max())[0][0])
print(compete_metric(y_val.values, preds_default))

preds_default = []
for pred in np.median(preds, axis=0):
    preds_default.append(np.where(pred == pred.max())[0][0])
print(compete_metric(y_val.values, preds_default))

0.7567697705371672
0.7545373836902505


In [10]:
preds = np.mean(preds, axis=0)

In [12]:
best_score = 0
best_weights = []

for j in range(1000):
    
    a1 = np.random.randint(0, 15)
    a2 = np.random.randint(0, 25)
    a3 = np.random.randint(15, 40)
    a4 = np.random.randint(15, 30)
    a5 = np.random.randint(0, 20)
                    
    preds_ = []
    for pred in preds:
        pred_ = pred + np.array([a1/100, a2/100, a3/100, a4/100, a5/100])
        max_pred_indx = np.where(pred_ == pred_.max())[0][0]
        preds_.append(max_pred_indx)
    metr = compete_metric(y_val.values, preds_)
    
    if metr > best_score:
        best_score = metr
        best_weights = [a1, a2, a3, a4, a5]
        print(best_score, best_weights)
                        
print(best_score, best_weights)

0.7700821364806602 [12, 20, 25, 25, 6]
0.7731374549817401 [6, 15, 25, 23, 14]
0.7755902208539744 [9, 10, 29, 19, 7]
0.7769990016489766 [6, 5, 37, 23, 2]
0.778743769660503 [2, 1, 25, 19, 1]
0.7794724701401214 [11, 18, 34, 27, 1]
0.7799308406083572 [6, 12, 28, 21, 0]
0.7813638937517231 [4, 14, 30, 21, 2]
0.7817412520430584 [8, 9, 36, 28, 1]
0.7837016749400872 [5, 15, 36, 29, 10]
0.7838810811496257 [3, 20, 32, 24, 3]
0.7838810811496257 [3, 20, 32, 24, 3]


In [15]:
fip = pd.DataFrame()
fip['feature'] = X_train.columns
fip['value'] = np.mean([model.get_feature_importance() for model in models], axis=0)
fip = fip.sort_values('value', ascending=False)

In [18]:
# 0.7838810811496257 [3, 20, 32, 24, 3]
for i, model in enumerate(models):
    model.save_model('models/0_{}_model'.format(i))
fip.to_csv('models/fip_0.csv', index=False)

#### 1

In [24]:
cut_features = list(fip[fip.value >= 0.001].feature.values)
print(fip.shape)
print(len(cut_features))

(2661, 2)
1311


In [25]:
X_train, X_val, y_train, y_val = train_test_split(data_train[cut_features], data_train['KT_RESULT'],
                                                  stratify=data_train['KT_RESULT'], test_size=0.15,
                                                  random_state=18)

In [27]:
models = []

for train_index, test_index in StratifiedKFold(n_splits=8, random_state=18, shuffle=True).split(X_train, y_train):
    
    X_train_skf, y_train_skf = X_train.iloc[train_index], y_train.iloc[train_index]
    X_val_skf, y_val_skf = X_train.iloc[test_index], y_train.iloc[test_index]
    
    model = CatBoostClassifier(random_seed=17, verbose=200, 
                               auto_class_weights='Balanced',
                               loss_function='MultiClass',
                               class_names=[0, 1, 2, 3, 4], 
                               early_stopping_rounds=250,
                              )
    model.fit(X_train_skf, y_train_skf, 
              eval_set=(X_val_skf, y_val_skf))
    
    models.append(model)
    
    print('\n{}\n'.format(compete_metric(y_val_skf.values, model.predict(X_val_skf))[0]))

Learning rate set to 0.11835
0:	learn: 1.5543473	test: 1.5544324	best: 1.5544324 (0)	total: 778ms	remaining: 12m 56s
200:	learn: 0.9651684	test: 1.1574301	best: 1.1563409 (188)	total: 1m 51s	remaining: 7m 21s
400:	learn: 0.8214927	test: 1.1679156	best: 1.1499475 (287)	total: 3m 35s	remaining: 5m 21s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 1.149947539
bestIteration = 287

Shrink model to first 288 iterations.

0.7642296675528724

Learning rate set to 0.11835
0:	learn: 1.5591872	test: 1.5585032	best: 1.5585032 (0)	total: 631ms	remaining: 10m 29s
200:	learn: 0.9610292	test: 1.1403715	best: 1.1397791 (199)	total: 1m 50s	remaining: 7m 18s
400:	learn: 0.8228664	test: 1.1435326	best: 1.1318511 (265)	total: 3m 34s	remaining: 5m 20s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 1.131851122
bestIteration = 265

Shrink model to first 266 iterations.

0.7685504446920254

Learning rate set to 0.11835
0:	learn: 1.5547678	test: 1.5574566	best: 1.5574566

In [28]:
preds = []
for model in models:
    preds.append(model.predict_proba(X_val))

preds_default = []
for pred in np.mean(preds, axis=0):
    preds_default.append(np.where(pred == pred.max())[0][0])
print(compete_metric(y_val.values, preds_default))

preds_default = []
for pred in np.median(preds, axis=0):
    preds_default.append(np.where(pred == pred.max())[0][0])
print(compete_metric(y_val.values, preds_default))

0.7675148984923775
0.7652489310102035


In [29]:
preds = np.mean(preds, axis=0)

In [30]:
best_score = 0
best_weights = []

for j in range(1000):
    
    a1 = np.random.randint(0, 15)
    a2 = np.random.randint(0, 25)
    a3 = np.random.randint(15, 40)
    a4 = np.random.randint(15, 30)
    a5 = np.random.randint(0, 20)
                    
    preds_ = []
    for pred in preds:
        pred_ = pred + np.array([a1/100, a2/100, a3/100, a4/100, a5/100])
        max_pred_indx = np.where(pred_ == pred_.max())[0][0]
        preds_.append(max_pred_indx)
    metr = compete_metric(y_val.values, preds_)
    
    if metr > best_score:
        best_score = metr
        best_weights = [a1, a2, a3, a4, a5]
        print(best_score, best_weights)
                        
print(best_score, best_weights)

0.7825736044746088 [5, 3, 29, 26, 14]
0.7848953404199024 [10, 24, 35, 29, 15]
0.7869872059077383 [7, 19, 33, 29, 0]
0.7872001340948386 [2, 14, 31, 24, 5]
0.7873261944081609 [3, 11, 30, 28, 2]
0.7876685967900964 [6, 23, 34, 25, 0]
0.787683369751736 [6, 24, 35, 28, 0]
0.787683369751736 [6, 24, 35, 28, 0]


In [31]:
fip = pd.DataFrame()
fip['feature'] = X_train.columns
fip['value'] = np.mean([model.get_feature_importance() for model in models], axis=0)
fip = fip.sort_values('value', ascending=False)

In [32]:
# 0.787683369751736 [6, 24, 35, 28, 0]
for i, model in enumerate(models):
    model.save_model('models/1_{}_model'.format(i))
fip.to_csv('models/fip_1.csv', index=False)

#### 2

In [34]:
cut_features = list(fip[fip.value >= 0.01].feature.values)
print(fip.shape)
print(len(cut_features))

(1311, 2)
779


In [35]:
X_train, X_val, y_train, y_val = train_test_split(data_train[cut_features], data_train['KT_RESULT'],
                                                  stratify=data_train['KT_RESULT'], test_size=0.15,
                                                  random_state=17)

In [36]:
models = []

for train_index, test_index in StratifiedKFold(n_splits=8, random_state=17, shuffle=True).split(X_train, y_train):
    
    X_train_skf, y_train_skf = X_train.iloc[train_index], y_train.iloc[train_index]
    X_val_skf, y_val_skf = X_train.iloc[test_index], y_train.iloc[test_index]
    
    model = CatBoostClassifier(random_seed=17, verbose=200, 
                               auto_class_weights='Balanced',
                               loss_function='MultiClass',
                               class_names=[0, 1, 2, 3, 4], 
                               early_stopping_rounds=250,
                              )
    model.fit(X_train_skf, y_train_skf, 
              eval_set=(X_val_skf, y_val_skf))
    
    models.append(model)
    
    print('\n{}\n'.format(compete_metric(y_val_skf.values, model.predict(X_val_skf))[0]))

Learning rate set to 0.11835
0:	learn: 1.5582499	test: 1.5578102	best: 1.5578102 (0)	total: 418ms	remaining: 6m 57s
200:	learn: 0.9467015	test: 1.1484611	best: 1.1434725 (178)	total: 1m 8s	remaining: 4m 34s
400:	learn: 0.8113910	test: 1.1749236	best: 1.1434725 (178)	total: 2m 12s	remaining: 3m 17s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 1.143472497
bestIteration = 178

Shrink model to first 179 iterations.

0.7625354755535498

Learning rate set to 0.11835
0:	learn: 1.5595206	test: 1.5604162	best: 1.5604162 (0)	total: 399ms	remaining: 6m 38s
200:	learn: 0.9394925	test: 1.1525290	best: 1.1515599 (182)	total: 1m 7s	remaining: 4m 26s
400:	learn: 0.8050012	test: 1.1774588	best: 1.1515599 (182)	total: 2m 13s	remaining: 3m 19s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 1.151559895
bestIteration = 182

Shrink model to first 183 iterations.

0.747716832884718

Learning rate set to 0.11835
0:	learn: 1.5571170	test: 1.5585508	best: 1.5585508 (0)	

In [37]:
preds = []
for model in models:
    preds.append(model.predict_proba(X_val))

preds_default = []
for pred in np.mean(preds, axis=0):
    preds_default.append(np.where(pred == pred.max())[0][0])
print(compete_metric(y_val.values, preds_default))

preds_default = []
for pred in np.median(preds, axis=0):
    preds_default.append(np.where(pred == pred.max())[0][0])
print(compete_metric(y_val.values, preds_default))

0.7551553087976053
0.7537784808175056


In [38]:
preds = np.mean(preds, axis=0)

In [39]:
best_score = 0
best_weights = []

for j in range(1000):
    
    a1 = np.random.randint(0, 15)
    a2 = np.random.randint(0, 25)
    a3 = np.random.randint(15, 40)
    a4 = np.random.randint(15, 30)
    a5 = np.random.randint(0, 20)
                    
    preds_ = []
    for pred in preds:
        pred_ = pred + np.array([a1/100, a2/100, a3/100, a4/100, a5/100])
        max_pred_indx = np.where(pred_ == pred_.max())[0][0]
        preds_.append(max_pred_indx)
    metr = compete_metric(y_val.values, preds_)
    
    if metr > best_score:
        best_score = metr
        best_weights = [a1, a2, a3, a4, a5]
        print(best_score, best_weights)
                        
print(best_score, best_weights)

0.7717876684586715 [0, 9, 16, 22, 9]
0.7781528359236938 [13, 11, 37, 26, 4]
0.7804066152877511 [9, 6, 31, 27, 3]
0.7837619544912868 [3, 18, 32, 24, 8]
0.7838399411846162 [4, 15, 32, 29, 4]
0.7844541023734319 [2, 13, 32, 28, 2]
0.784549943061603 [5, 17, 33, 27, 1]
0.784549943061603 [5, 17, 33, 27, 1]


In [40]:
fip = pd.DataFrame()
fip['feature'] = X_train.columns
fip['value'] = np.mean([model.get_feature_importance() for model in models], axis=0)
fip = fip.sort_values('value', ascending=False)

In [41]:
# 0.784549943061603 [5, 17, 33, 27, 1]
for i, model in enumerate(models):
    model.save_model('models/2_{}_model'.format(i))
fip.to_csv('models/fip_2.csv', index=False)

#### 3

In [49]:
cut_features = list(fip[fip.value >= 0.035].feature.values)
print(fip.shape)
print(len(cut_features))

(779, 2)
368


In [50]:
X_train, X_val, y_train, y_val = train_test_split(data_train[cut_features], data_train['KT_RESULT'],
                                                  stratify=data_train['KT_RESULT'], test_size=0.15,
                                                  random_state=17)

In [51]:
models = []

for train_index, test_index in StratifiedKFold(n_splits=8, random_state=17, shuffle=True).split(X_train, y_train):
    
    X_train_skf, y_train_skf = X_train.iloc[train_index], y_train.iloc[train_index]
    X_val_skf, y_val_skf = X_train.iloc[test_index], y_train.iloc[test_index]
    
    model = CatBoostClassifier(random_seed=17, verbose=200, 
                               auto_class_weights='Balanced',
                               loss_function='MultiClass',
                               class_names=[0, 1, 2, 3, 4], 
                               early_stopping_rounds=250,
                              )
    model.fit(X_train_skf, y_train_skf, 
              eval_set=(X_val_skf, y_val_skf))
    
    models.append(model)
    
    print('\n{}\n'.format(compete_metric(y_val_skf.values, model.predict(X_val_skf))[0]))

Learning rate set to 0.11835
0:	learn: 1.5470971	test: 1.5467406	best: 1.5467406 (0)	total: 236ms	remaining: 3m 55s
200:	learn: 0.9349732	test: 1.1584812	best: 1.1544237 (163)	total: 35.4s	remaining: 2m 20s
400:	learn: 0.7928296	test: 1.2170522	best: 1.1544237 (163)	total: 1m 8s	remaining: 1m 41s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 1.154423693
bestIteration = 163

Shrink model to first 164 iterations.

0.765474395803148

Learning rate set to 0.11835
0:	learn: 1.5619510	test: 1.5633574	best: 1.5633574 (0)	total: 181ms	remaining: 3m
200:	learn: 0.9250479	test: 1.1492628	best: 1.1486028 (195)	total: 33.7s	remaining: 2m 13s
400:	learn: 0.7846361	test: 1.1718221	best: 1.1434046 (220)	total: 1m 6s	remaining: 1m 39s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 1.143404552
bestIteration = 220

Shrink model to first 221 iterations.

0.7593598326629577

Learning rate set to 0.11835
0:	learn: 1.5591537	test: 1.5613759	best: 1.5613759 (0)	total:

In [52]:
preds = []
for model in models:
    preds.append(model.predict_proba(X_val))

preds_default = []
for pred in np.mean(preds, axis=0):
    preds_default.append(np.where(pred == pred.max())[0][0])
print(compete_metric(y_val.values, preds_default))

preds_default = []
for pred in np.median(preds, axis=0):
    preds_default.append(np.where(pred == pred.max())[0][0])
print(compete_metric(y_val.values, preds_default))

0.7559151500106946
0.7531836518278191


In [53]:
preds = np.mean(preds, axis=0)

In [54]:
best_score = 0
best_weights = []

for j in range(1000):
    
    a1 = np.random.randint(0, 15)
    a2 = np.random.randint(0, 25)
    a3 = np.random.randint(15, 40)
    a4 = np.random.randint(15, 30)
    a5 = np.random.randint(0, 20)
                    
    preds_ = []
    for pred in preds:
        pred_ = pred + np.array([a1/100, a2/100, a3/100, a4/100, a5/100])
        max_pred_indx = np.where(pred_ == pred_.max())[0][0]
        preds_.append(max_pred_indx)
    metr = compete_metric(y_val.values, preds_)
    
    if metr > best_score:
        best_score = metr
        best_weights = [a1, a2, a3, a4, a5]
        print(best_score, best_weights)
                        
print(best_score, best_weights)

0.7601372987347622 [11, 21, 20, 19, 18]
0.7794374496292431 [1, 13, 29, 23, 18]
0.780262940565704 [5, 13, 36, 26, 19]
0.7834386624134133 [4, 20, 34, 29, 12]
0.7836977265896262 [0, 16, 29, 22, 1]
0.784154250317114 [2, 9, 29, 26, 8]
0.784154250317114 [2, 9, 29, 26, 8]


In [55]:
fip = pd.DataFrame()
fip['feature'] = X_train.columns
fip['value'] = np.mean([model.get_feature_importance() for model in models], axis=0)
fip = fip.sort_values('value', ascending=False)

In [56]:
# 0.784154250317114 [2, 9, 29, 26, 8]
for i, model in enumerate(models):
    model.save_model('models/3_{}_model'.format(i))
fip.to_csv('models/fip_3.csv', index=False)

#### 4

In [78]:
cut_features = list(fip[fip.value >= 0.075].feature.values)
print(fip.shape)
print(len(cut_features))

(368, 2)
246


In [79]:
X_train, X_val, y_train, y_val = train_test_split(data_train[cut_features], data_train['KT_RESULT'],
                                                  stratify=data_train['KT_RESULT'], test_size=0.15,
                                                  random_state=17)

In [80]:
models = []

for train_index, test_index in StratifiedKFold(n_splits=8, random_state=17, shuffle=True).split(X_train, y_train):
    
    X_train_skf, y_train_skf = X_train.iloc[train_index], y_train.iloc[train_index]
    X_val_skf, y_val_skf = X_train.iloc[test_index], y_train.iloc[test_index]
    
    model = CatBoostClassifier(random_seed=17, verbose=200, 
                               auto_class_weights='Balanced',
                               loss_function='MultiClass',
                               class_names=[0, 1, 2, 3, 4], 
                               early_stopping_rounds=250,
                              )
    model.fit(X_train_skf, y_train_skf, 
              eval_set=(X_val_skf, y_val_skf))
    
    models.append(model)
    
    print('\n{}\n'.format(compete_metric(y_val_skf.values, model.predict(X_val_skf))[0]))

Learning rate set to 0.11835
0:	learn: 1.5606694	test: 1.5604614	best: 1.5604614 (0)	total: 131ms	remaining: 2m 10s
200:	learn: 0.9234737	test: 1.1713547	best: 1.1653584 (141)	total: 24.6s	remaining: 1m 37s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 1.165358429
bestIteration = 141

Shrink model to first 142 iterations.

0.7497240639160523

Learning rate set to 0.11835
0:	learn: 1.5527486	test: 1.5553457	best: 1.5553457 (0)	total: 126ms	remaining: 2m 5s
200:	learn: 0.9170061	test: 1.1550362	best: 1.1510445 (175)	total: 23.6s	remaining: 1m 34s
400:	learn: 0.7800513	test: 1.2004128	best: 1.1510445 (175)	total: 46.9s	remaining: 1m 10s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 1.151044466
bestIteration = 175

Shrink model to first 176 iterations.

0.7489064002148855

Learning rate set to 0.11835
0:	learn: 1.5582762	test: 1.5592330	best: 1.5592330 (0)	total: 133ms	remaining: 2m 13s
200:	learn: 0.9221522	test: 1.1660116	best: 1.1641415 (168)	to

In [81]:
preds = []
for model in models:
    preds.append(model.predict_proba(X_val))

preds_default = []
for pred in np.mean(preds, axis=0):
    preds_default.append(np.where(pred == pred.max())[0][0])
print(compete_metric(y_val.values, preds_default))

preds_default = []
for pred in np.median(preds, axis=0):
    preds_default.append(np.where(pred == pred.max())[0][0])
print(compete_metric(y_val.values, preds_default))

0.7536131720133036
0.7537275755745712


In [82]:
preds = np.mean(preds, axis=0)

In [83]:
best_score = 0
best_weights = []

for j in range(1000):
    
    a1 = np.random.randint(0, 15)
    a2 = np.random.randint(0, 25)
    a3 = np.random.randint(15, 40)
    a4 = np.random.randint(15, 30)
    a5 = np.random.randint(0, 20)
                    
    preds_ = []
    for pred in preds:
        pred_ = pred + np.array([a1/100, a2/100, a3/100, a4/100, a5/100])
        max_pred_indx = np.where(pred_ == pred_.max())[0][0]
        preds_.append(max_pred_indx)
    metr = compete_metric(y_val.values, preds_)
    
    if metr > best_score:
        best_score = metr
        best_weights = [a1, a2, a3, a4, a5]
        print(best_score, best_weights)
                        
print(best_score, best_weights)

0.7750714165662067 [2, 11, 24, 20, 12]
0.778161449003403 [1, 3, 36, 28, 15]
0.7784477420484771 [11, 22, 31, 28, 4]
0.7789270919059526 [0, 13, 26, 25, 8]
0.7819441270888312 [3, 21, 33, 29, 1]
0.7828796829347107 [6, 11, 34, 29, 7]
0.7829537253871068 [1, 21, 31, 25, 2]
0.7829752579395103 [7, 19, 36, 29, 9]


KeyboardInterrupt: 

In [84]:
fip = pd.DataFrame()
fip['feature'] = X_train.columns
fip['value'] = np.mean([model.get_feature_importance() for model in models], axis=0)
fip = fip.sort_values('value', ascending=False)

In [85]:
# [7, 19, 36, 29, 9]
for i, model in enumerate(models):
    model.save_model('models/4_{}_model'.format(i))
fip.to_csv('models/fip_4.csv', index=False)

#### scoring

In [87]:
weights = [[0.03, 0.20, 0.32, 0.24, 0.03],
           [0.06, 0.24, 0.35, 0.28, 0.0],
           [0.05, 0.17, 0.33, 0.27, 0.01],
           [0.02, 0.09, 0.29, 0.26, 0.08],
           [0.07, 0.19, 0.36, 0.29, 0.09]]

In [96]:
preds_all = []

for i in range(5):
    
    models = []
    preds = np.zeros((data_test.shape[0], 5))
    
    for j in range(8):
        
        model = CatBoostClassifier().load_model('models/{}_{}_model'.format(i, j))
        preds += (model.predict_proba(data_test)) / 8
#          + np.array(weights[i])
    
    preds_all.append(preds)

preds_all = np.mean(preds_all, axis=0)

In [97]:
preds_ = []
for pred in preds_all:
    max_pred_indx = np.where(pred == pred.max())[0][0]
    preds_.append(max_pred_indx)

In [98]:
data_test['KT_RESULT'] = preds_

In [99]:
test_result = data_test[['idx', 'DATE', 'KT_RESULT']].copy()
test_result.DATE = test_result.DATE.astype(str)

test_result.KT_RESULT = preds_
test_result.KT_RESULT = test_result.KT_RESULT.apply(lambda x: 'КТ-'+str(x))

test_result

Unnamed: 0,idx,DATE,KT_RESULT
0,53,2020-05-05,КТ-1
1,91,2020-05-17,КТ-1
2,93,2020-05-17,КТ-0
3,138,2020-04-24,КТ-1
4,161,2020-05-16,КТ-0
...,...,...,...
33075,1296845,2020-07-02,КТ-1
33076,1297007,2020-07-03,КТ-0
33077,1297357,2020-07-05,КТ-1
33078,1297421,2020-07-01,КТ-1


In [100]:
kt_test = pd.merge(kt_test[['idx', 'DATE']], test_result.drop_duplicates(), on=['idx', 'DATE'], how='left')

In [101]:
kt_test

Unnamed: 0,idx,DATE,KT_RESULT
0,644661,2020-04-17,КТ-2
1,292115,2020-04-18,КТ-1
2,556023,2020-04-18,КТ-0
3,233104,2020-04-18,КТ-3
4,90502,2020-04-18,КТ-4
...,...,...,...
33075,1119183,2020-07-11,КТ-1
33076,473620,2020-07-11,КТ-1
33077,327819,2020-07-11,КТ-0
33078,452847,2020-07-11,КТ-1


In [102]:
kt_test.to_csv('sub_new_new_3.csv', index=False)

In [103]:
kt_test.groupby('KT_RESULT').idx.count()

KT_RESULT
КТ-0    12293
КТ-1    10654
КТ-2     4516
КТ-3     4819
КТ-4      798
Name: idx, dtype: int64

### score old

In [335]:
preds = models[0].predict_proba(X_val) / len(models)

for model in models[1:]:
    preds += model.predict_proba(X_val) / len(models)
    
preds_ = []
for pred in preds:
    pred_ = pred + np.array([0.07, 0.19, 0.36, 0.29, 0.09])
    max_pred_indx = np.where(pred_ == pred_.max())[0][0]
    preds_.append(max_pred_indx)

In [339]:
pd.Series(preds_).reset_index().groupby(0).index.count()

0
0    6113
1    3374
2    7339
3    2914
4      59
Name: index, dtype: int64

In [342]:
y_val.reset_index().groupby('KT_RESULT').index.count()

KT_RESULT
0    7198
1    8335
2    3176
3    1000
4      90
Name: index, dtype: int64

In [327]:
preds = models[0].predict_proba(data_test[X_train.columns]) / len(models)

for model in models[1:]:
    preds += model.predict_proba(data_test[X_train.columns]) / len(models)
    
preds_ = []
for pred in preds:
    pred_ = pred + np.array([0.07, 0.19, 0.36, 0.29, 0.09])
    max_pred_indx = np.where(pred_ == pred_.max())[0][0]
    preds_.append(max_pred_indx)

In [328]:
data_test['KT_RESULT'] = preds_

In [329]:
# kt_test.KT_RESULT = model.predict(data_test[X_train.columns])
# kt_test.KT_RESULT = 2

test_result = data_test[['idx', 'DATE', 'KT_RESULT']].copy()
test_result.DATE = test_result.DATE.astype(str)

test_result.KT_RESULT = preds_
test_result.KT_RESULT = test_result.KT_RESULT.apply(lambda x: 'КТ-'+str(x))

test_result

Unnamed: 0,idx,DATE,KT_RESULT
0,53,2020-05-05,КТ-1
1,91,2020-05-17,КТ-1
2,93,2020-05-17,КТ-0
3,138,2020-04-24,КТ-2
4,161,2020-05-16,КТ-0
...,...,...,...
33075,1296845,2020-07-02,КТ-1
33076,1297007,2020-07-03,КТ-0
33077,1297357,2020-07-05,КТ-1
33078,1297421,2020-07-01,КТ-2


In [330]:
kt_test = pd.merge(kt_test[['idx', 'DATE']], test_result.drop_duplicates(), on=['idx', 'DATE'], how='left')

In [334]:
kt_test.to_csv('sub_new_6.csv', index=False)

In [332]:
kt_test.groupby('KT_RESULT').idx.count()

KT_RESULT
КТ-0    10231
КТ-1     5591
КТ-2    12391
КТ-3     4771
КТ-4       96
Name: idx, dtype: int64

In [333]:
kt_train.groupby('KT_RESULT').idx.count()

KT_RESULT
КТ-0    48092
КТ-1    55685
КТ-2    21231
КТ-3     6678
КТ-4      603
Name: idx, dtype: int64

## Regression

In [900]:
X_train, X_val, y_train, y_val = train_test_split(data_train[data_train.columns[3:]], data_train['KT_RESULT'],
                                                 stratify=data_train['KT_RESULT'], test_size=0.2)

In [None]:
# MAE 0.56 / 0.65
# MAPE 0.55 / 0.61
# Poisson 0.61 / 0.68
# Quantile 0.56 / 0.65
# RMSE 0.61 / 0.68
# RMSEWithUncertainty 0.61 / 0.68
# LogLinQuantile 0.25 / 0.43

In [292]:
model = CatBoostRegressor(random_seed=17, verbose=50,
                          loss_function='RMSE')
model.fit(X_train, y_train, eval_set=(X_val, y_val))

Learning rate set to 0.112442
0:	learn: 0.8439656	test: 0.8441211	best: 0.8441211 (0)	total: 209ms	remaining: 3m 28s
50:	learn: 0.6523400	test: 0.6579450	best: 0.6579450 (50)	total: 7.65s	remaining: 2m 22s
100:	learn: 0.6304630	test: 0.6378625	best: 0.6378625 (100)	total: 14.5s	remaining: 2m 9s
150:	learn: 0.6164192	test: 0.6260306	best: 0.6260306 (150)	total: 21.2s	remaining: 1m 59s
200:	learn: 0.6083218	test: 0.6199645	best: 0.6199645 (200)	total: 27.4s	remaining: 1m 49s
250:	learn: 0.6028070	test: 0.6163956	best: 0.6163956 (250)	total: 33.6s	remaining: 1m 40s
300:	learn: 0.5985596	test: 0.6140303	best: 0.6140303 (300)	total: 40s	remaining: 1m 32s
350:	learn: 0.5947773	test: 0.6123020	best: 0.6123020 (350)	total: 46s	remaining: 1m 25s
400:	learn: 0.5912572	test: 0.6106187	best: 0.6106187 (400)	total: 52.2s	remaining: 1m 17s
450:	learn: 0.5883349	test: 0.6093178	best: 0.6093178 (450)	total: 58.4s	remaining: 1m 11s
500:	learn: 0.5859175	test: 0.6082551	best: 0.6082446 (499)	total: 1m 3

<catboost.core.CatBoostRegressor at 0x1ba2eeac0>

In [299]:
model.predict(X_val)

array([ 0.93987752, -0.04816522,  1.24081703, ...,  1.86151268,
        1.26725865,  0.64303593])

In [294]:
y_val.values

array([1, 0, 2, ..., 1, 2, 0])

In [295]:
preds_ = []

for pred in model.predict(X_val):
    if pred < 1:
        preds_.append(np.round(pred))
    elif pred > 3:
        preds_.append(np.round(pred))
    else:
        preds_.append(np.ceil(pred))

In [296]:
compete_metric(y_val.values, model.predict(X_val))

0.6932962631389123

In [301]:
compete_metric(y_val.values, np.ceil(model.predict(X_val)))

0.7428659034803832

In [298]:
compete_metric(y_val.values, preds_)

0.7634000123460071