In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from time import time


from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, VerboseCallback, DeltaXStopper
from skopt.space import Real, Categorical, Integer

In [2]:
df = pd.read_csv('./data/full_data.csv', index_col='ID')
categ = pd.read_csv('./data/categorical_features.txt',
                    sep=';', header=None).values.squeeze()
# for col in categ:
#     if df[col].isna().sum() > 0:
#         value = df[col].value_counts()[0]
#         df[col].fillna(value, inplace=True)

for header in categ:
    df[header] = df[header].astype('category').cat.codes

target = ['Артериальная гипертензия', 'ОНМК', 'Стенокардия, ИБС, инфаркт миокарда',
          'Сердечная недостаточность', 'Прочие заболевания сердца']

y = df[target]
X = df.drop(target, axis=1)
X.drop('Длительность сердечной недостаточности', axis=1, inplace=True)

seed = 10
X_train, X_test, y_train, y_test = train_test_split(
    X, y.values[:, 3], test_size=0.2, shuffle=True, random_state=seed)

# train_dataset = Pool(data=X_train,
#                      label=y_train,
#                      cat_features=categ)

# eval_dataset = Pool(data=X_test,
#                     label=y_test,
#                     cat_features=categ)

In [3]:
def report_perf(optimizer, X, y, title, callbacks=None):
    """
    A wrapper for measuring time and performances of different optmizers
    
    optimizer = a sklearn or a skopt optimizer
    X = the training set 
    y = our target
    title = a string label for the experiment
    """
    start = time()
    if callbacks:
        optimizer.fit(X, y, callback=callbacks)
    else:
        optimizer.fit(X, y)
    d=pd.DataFrame(optimizer.cv_results_)
    best_score = optimizer.best_score_
    best_score_std = d.iloc[optimizer.best_index_].std_test_score
    best_params = optimizer.best_params_
    print((title + " took %.2f seconds,  candidates checked: %d, best CV score: %.3f "
           +u"\u00B1"+" %.3f") % (time() - start, 
                                  len(optimizer.cv_results_['params']),
                                  best_score,
                                  best_score_std))    
    print('Best parameters:')
    print(best_params)
    return best_params

In [4]:
roc_auc = make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [5]:
clf = CatBoostClassifier(thread_count=2,
                         loss_function='Logloss',
                         task_type='GPU',
                         od_type = 'Iter',
                         verbose= False
                        )

In [6]:
search_spaces = {'iterations': Integer(10, 500),
                 'depth': Integer(2, 8),
                 'learning_rate': Real(0.01, 1.0, 'log-uniform'),
                 'random_strength': Real(1e-9, 10, 'log-uniform'),
                 'bagging_temperature': Real(0.0, 1.0),
                 'border_count': Integer(1, 255),
                 'l2_leaf_reg': Integer(2, 30),
                 'scale_pos_weight': Real(0.01, 1.0, 'uniform')}

In [7]:
opt = BayesSearchCV(clf,
                    search_spaces,
                    scoring=roc_auc,
                    cv=skf,
                    n_iter=100,
                    n_jobs=1,  # use just 1 job with CatBoost in order to avoid segmentation fault
                    return_train_score=False,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=42)

In [8]:
best_params = report_perf(opt, X_train, y_train, 'CatBoost',
                          callbacks=[VerboseCallback(100),
                                     DeadlineStopper(60*10)])

Iteration No: 1 started. Searching for the next optimal point.
Iteration No: 1 ended. Search finished for the next optimal point.
Time taken: 37.8970
Function value obtained: -0.8558
Current minimum: -0.8558
Iteration No: 2 started. Searching for the next optimal point.
Iteration No: 2 ended. Search finished for the next optimal point.
Time taken: 48.3730
Function value obtained: -0.8616
Current minimum: -0.8616
Iteration No: 3 started. Searching for the next optimal point.
Iteration No: 3 ended. Search finished for the next optimal point.
Time taken: 18.1277
Function value obtained: -0.8695
Current minimum: -0.8695
Iteration No: 4 started. Searching for the next optimal point.
Iteration No: 4 ended. Search finished for the next optimal point.
Time taken: 47.3077
Function value obtained: -0.8633
Current minimum: -0.8695
Iteration No: 5 started. Searching for the next optimal point.
Iteration No: 5 ended. Search finished for the next optimal point.
Time taken: 35.4224
Function value obt

In [9]:
model = CatBoostClassifier(thread_count=2,
                           loss_function='Logloss',
                           task_type='GPU',
                           od_type='Iter',
                           verbose=False,
                           **best_params
                           )
model.fit(X_train, y_train)
# Get predicted classes
preds_class = model.predict(X_test)

In [10]:
print(f"""accuracy {accuracy_score(y_test, preds_class.squeeze())}
precision {precision_score(y_test, preds_class.squeeze())}
recall {recall_score(y_test, preds_class.squeeze())}
f1_score {f1_score(y_test, preds_class.squeeze())}
{confusion_matrix(y_test,  preds_class.squeeze())}""")

accuracy 0.93125
precision 0.8
recall 0.16
f1_score 0.26666666666666666
[[294   1]
 [ 21   4]]


In [11]:
feat_imp = pd.DataFrame(model.feature_importances_,
                        index=X_train.columns, columns=['feat'])
feat_imp = feat_imp.sort_values('feat', ascending=False)
feat_imp[:10]

Unnamed: 0,feat
"Длительность стенокардии, ИБС, инфаркта миокарда",44.597039
Длительность артериальной гипертензии,13.372295
Лекарство / холестерин,7.849719
Длительность прочих заболеваний сердца,6.639936
Возраст,3.51376
Сила правая2,1.898552
FEV1,1.626894
16. Общ. Давление/ d,1.423344
ОБ1,1.399786
121. Овсянная круппа,1.237027


In [12]:
import pickle

with open('./data/model_serd_ned', 'wb') as f:
    pickle.dump({'name': 'Сердечная недостаточность',
                 'best_param': best_params,
                 'model': model}, f)