In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from time import time


from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, VerboseCallback, DeltaXStopper
from skopt.space import Real, Categorical, Integer

In [2]:
# Load the full dataset (rows indexed by the 'ID' column) and the list of
# categorical column names.
df = pd.read_csv('./data/full_data.csv', index_col='ID')
# ravel() rather than squeeze(): squeeze() collapses a file holding a single
# name into a 0-d array, which the loop below cannot iterate over.
categ = pd.read_csv('./data/categorical_features.txt',
                    sep=';', header=None).values.ravel()

# Replace every categorical column with its integer category codes.
# Note that `.cat.codes` encodes missing values as -1.
for header in categ:
    df[header] = df[header].astype('category').cat.codes

target = ['Артериальная гипертензия', 'ОНМК', 'Стенокардия, ИБС, инфаркт миокарда',
          'Сердечная недостаточность', 'Прочие заболевания сердца']
# The single label modelled in this notebook (one of the columns in `target`).
target_name = 'Стенокардия, ИБС, инфаркт миокарда'

y = df[target]
# Features: everything except the label columns and the duration column of the
# modelled disease.
# NOTE(review): presumably dropped because it leaks the label — confirm.
X = df.drop(columns=target + ['Длительность стенокардии, ИБС, инфаркта миокарда'])

seed = 10
# Look the label column up by name instead of a hard-coded position, so that
# reordering `target` cannot silently switch the label being modelled.
X_train, X_test, y_train, y_test = train_test_split(
    X, y.values[:, target.index(target_name)],
    test_size=0.2, shuffle=True, random_state=seed)

# NOTE(review): the features are passed to CatBoost as plain integer codes;
# `categ` is not declared via `cat_features` anywhere in this notebook.

In [3]:
def report_perf(optimizer, X, y, title, callbacks=None):
    """
    Fit a hyper-parameter search and print a short summary of the run.

    optimizer = a sklearn or a skopt search object; it must expose fit(),
                cv_results_, best_score_, best_index_ and best_params_
    X = the training set
    y = our target
    title = a string label for the experiment (printed verbatim)
    callbacks = optional callback(s) forwarded as optimizer.fit(callback=...);
                when falsy, fit() is called without the callback argument

    Returns optimizer.best_params_.
    """
    start = time()
    if callbacks:
        optimizer.fit(X, y, callback=callbacks)
    else:
        optimizer.fit(X, y)
    # Stop the clock right after fitting so the reported duration does not
    # include the bookkeeping below.
    elapsed = time() - start

    results = optimizer.cv_results_
    best_score = optimizer.best_score_
    best_score_std = results['std_test_score'][optimizer.best_index_]
    best_params = optimizer.best_params_
    # The title is passed as a format *argument*: concatenating it into the
    # format string would break on any title that contains a '%' character.
    print("%s took %.2f seconds,  candidates checked: %d, best CV score: %.3f \u00B1 %.3f"
          % (title, elapsed, len(results['params']), best_score, best_score_std))
    print('Best parameters:')
    print(best_params)
    return best_params

In [4]:
# Use scikit-learn's built-in ROC AUC scorer, referenced by name. The previous
# hand-built scorer relied on make_scorer(..., needs_threshold=True), a keyword
# that newer scikit-learn releases deprecated and then removed; the named
# scorer scores continuous outputs in the same way and works across versions.
roc_auc = 'roc_auc'
# 5-fold stratified cross-validation with a fixed shuffle seed, so the folds
# are the same on every run.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [13]:
# Settings that stay fixed during the search; every other CatBoost
# hyper-parameter is supplied by the optimiser.
catboost_fixed_params = {
    'loss_function': 'Logloss',
    'task_type': 'CPU',
    'thread_count': 2,
    'od_type': 'Iter',
    'verbose': False,
}
clf = CatBoostClassifier(**catboost_fixed_params)

In [14]:
# Search space handed to the Bayesian optimiser: one skopt dimension per
# CatBoost hyper-parameter being tuned.
search_spaces = dict(
    iterations=Integer(10, 500),
    depth=Integer(2, 8),
    learning_rate=Real(0.01, 1.0, prior='log-uniform'),
    random_strength=Real(1e-9, 10, prior='log-uniform'),
    bagging_temperature=Real(0.0, 1.0),
    border_count=Integer(1, 255),
    l2_leaf_reg=Integer(2, 30),
    # NOTE(review): this range only allows weights <= 1 for the positive
    # class — confirm that down-weighting positives is intended.
    scale_pos_weight=Real(0.01, 1.0, prior='uniform'),
)

In [15]:
# Bayesian hyper-parameter search over `search_spaces`, scored with `roc_auc`
# on the `skf` folds and driven by a Gaussian-process surrogate.
opt = BayesSearchCV(
    estimator=clf,
    search_spaces=search_spaces,
    optimizer_kwargs={'base_estimator': 'GP'},
    n_iter=300,
    scoring=roc_auc,
    cv=skf,
    n_jobs=1,  # use just 1 job with CatBoost in order to avoid segmentation fault
    refit=True,
    return_train_score=False,
    random_state=42,
)

In [16]:
# Callbacks for the search: progress logging plus a DeadlineStopper with a
# budget of 60 * 10 seconds.
search_callbacks = [VerboseCallback(100), DeadlineStopper(60 * 10)]
best_params = report_perf(opt, X_train, y_train, 'CatBoost',
                          callbacks=search_callbacks)

Iteration No: 1 started. Searching for the next optimal point.
Iteration No: 1 ended. Search finished for the next optimal point.
Time taken: 49.5489
Function value obtained: -0.8542
Current minimum: -0.8542
Iteration No: 2 started. Searching for the next optimal point.
Iteration No: 2 ended. Search finished for the next optimal point.
Time taken: 29.3602
Function value obtained: -0.8637
Current minimum: -0.8637
Iteration No: 3 started. Searching for the next optimal point.
Iteration No: 3 ended. Search finished for the next optimal point.
Time taken: 10.7344
Function value obtained: -0.8476
Current minimum: -0.8637
Iteration No: 4 started. Searching for the next optimal point.
Iteration No: 4 ended. Search finished for the next optimal point.
Time taken: 36.5010
Function value obtained: -0.8626
Current minimum: -0.8637
Iteration No: 5 started. Searching for the next optimal point.
Iteration No: 5 ended. Search finished for the next optimal point.
Time taken: 26.0123
Function value obt

In [17]:
# Rebuild the classifier from the same fixed settings used for the search,
# plus the tuned hyper-parameters, and train it on the training split.
final_fixed_params = dict(
    thread_count=2,
    loss_function='Logloss',
    task_type='CPU',
    od_type='Iter',
    verbose=False,
)
model = CatBoostClassifier(**final_fixed_params, **best_params)
model.fit(X_train, y_train)

# Class predictions for the held-out split.
preds_class = model.predict(X_test)

In [18]:
# Flatten the predictions once and reuse them for every metric.
preds_flat = preds_class.squeeze()
report_lines = [
    f"accuracy {accuracy_score(y_test, preds_flat)}",
    f"precision {precision_score(y_test, preds_flat)}",
    f"recall {recall_score(y_test, preds_flat)}",
    f"f1_score {f1_score(y_test, preds_flat)}",
    f"{confusion_matrix(y_test, preds_flat)}",
]
print("\n".join(report_lines))

accuracy 0.9375
precision 0.8571428571428571
recall 0.4
f1_score 0.5454545454545455
[[288   2]
 [ 18  12]]


In [27]:
# Widen text columns so long feature names are not truncated when displayed.
pd.options.display.max_colwidth = 1000

In [32]:
# Feature importances of the fitted model, labelled with the training column
# names and ordered from most to least important.
feat_imp = (
    pd.DataFrame(model.feature_importances_,
                 index=X_train.columns,
                 columns=['feat'])
    .sort_values('feat', ascending=False)
)
# Show the 20 most important features.
feat_imp.head(20)

Unnamed: 0,feat
Длительность сердечной недостаточности,8.83298
Лекарство / давление,5.364948
Триглицериды,4.420846
Лекарство / холестерин,4.394789
Длительность артериальной гипертензии,1.826125
17. Известно ли о наличии… / b,1.713869
115. Вареники,1.61892
17. Известно ли о наличии… / e,1.565787
10. Реклама газ. напитков / СМИ,1.245214
1. Курят / остановка,1.176326


In [21]:
# Save the fitted model to 'tuning_steno.cbm' in the working directory.
model.save_model('tuning_steno.cbm')

In [23]:
# Display the parameters of the fitted model, as reported by CatBoost.
model.get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'Logloss',
 'iterations': 324,
 'sampling_frequency': 'PerTree',
 'leaf_estimation_method': 'Newton',
 'od_pval': 0,
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'l2_leaf_reg': 6,
 'random_strength': 9.999999717180683e-10,
 'od_type': 'Iter',
 'rsm': 1,
 'boost_from_average': False,
 'model_size_reg': 0.5,
 'subsample': 0.800000011920929,
 'use_best_model': False,
 'od_wait': 20,
 'class_names': [0, 1],
 'random_seed': 0,
 'depth': 8,
 'posterior_sampling': False,
 'border_count': 2,
 'class_weights': [1, 0.595310389995575],
 'classes_count': 0,
 'auto_class_weights': 'None',
 'sparse_features_conflict_fraction': 0,
 'leaf_estimation_backtracking': 'AnyImprovement',
 'best_model_min_trees': 1,
 'model_shrink_rate': 0,
 'min_data_in_leaf': 1,
 'loss_function': 'Logloss',
 'learning_rate': 0

In [24]:
import pickle

# Bundle the target name, the tuned hyper-parameters and the fitted model,
# then pickle the bundle to 'model_steno.pcl'.
model_bundle = {
    'name': 'Стенокардия, ИБС, инфаркт миокарда',
    'best_param': best_params,
    'model': model,
}
with open('model_steno.pcl', 'wb') as out_file:
    pickle.dump(model_bundle, out_file)