## tidymodels to sklearn

Sklearn implementation of this tidymodels example:

In [20]:
import pandas as pd
import numpy as np

# Seed NumPy's global random number generator before any modelling is done
# (presumably so that steps relying on the global RNG are repeatable — confirm).
np.random.seed(seed=753)

# Load the data set; later cells expect a 'class' column holding the outcome.
cells = pd.read_csv(filepath_or_buffer="cells.csv")

In [2]:
from sklearn.model_selection import train_test_split

# Separate the outcome column from the predictors.
outcome = cells['class']
features = cells.drop(columns='class')

# Hold out 25% of the rows for testing; stratifying on the outcome keeps the
# class proportions the same in both splits, and random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    features, outcome, stratify=outcome, test_size=0.25, random_state=0
)

In [3]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer

# Columns whose dtype is not 'object' are the ones passed on to the model.
numeric_columns = features.columns[features.dtypes != 'object']

# A FunctionTransformer with no function passes the selected columns through
# unchanged; columns not listed are dropped by the column transformer's default.
tree_preprocess = make_column_transformer(
    (FunctionTransformer(), numeric_columns),
)

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline

# Chain preprocessing and the classifier. make_pipeline names each step after
# its lowercased class name ('columntransformer', 'decisiontreeclassifier'),
# which is what the parameter-grid keys and named_steps lookups rely on.
tree_pipeline = make_pipeline(tree_preprocess, DecisionTreeClassifier())

In [5]:
# Hyperparameter grid for the tree step of the pipeline. Keys follow the
# '<step name>__<parameter>' convention so the search can route each value
# to the DecisionTreeClassifier inside the pipeline.
param_grid = dict(
    decisiontreeclassifier__max_depth=[1, 4, 8, 11, 15],
    decisiontreeclassifier__ccp_alpha=[1e-10, 1.78e-8, 3.16e-6, 5.62e-4, 0.1],
)

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import get_scorer
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

# Metrics tracked during tuning, keyed by the name that appears in
# cv_results_ as 'mean_test_<key>'.
#
# The ROC AUC scorer comes from scikit-learn's built-in registry instead of
# make_scorer(roc_auc_score, needs_proba=True): the `needs_proba` argument was
# deprecated in scikit-learn 1.4 and later removed, whereas get_scorer('roc_auc')
# works on both old and new releases. For this pipeline (a decision tree, which
# has no decision_function) the built-in scorer uses predict_proba, as before.
#
# NOTE: the key 'accuray' is misspelled but is kept as-is because the results
# cell selects the 'mean_test_accuray' column derived from it.
tree_scorer = {
    'roc_auc': get_scorer('roc_auc'),
    'accuray': make_scorer(accuracy_score)
}

# 5-fold cross-validated grid search over param_grid. With several metrics,
# `refit` names the one used to pick the best parameters and refit the pipeline.
tree_tuner = GridSearchCV(
    tree_pipeline, param_grid, cv=5,
    scoring=tree_scorer,
    refit='roc_auc'
)

In [17]:
# Run the cross-validated grid search on the training split only; the test
# split is used later for the final evaluation. As the cell's last expression,
# the fitted GridSearchCV object is also displayed.
tree_tuner.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('functiontransformer',
                                                                         FunctionTransformer(),
                                                                         Index(['angle_ch_1', 'area_ch_1', 'avg_inten_ch_1', 'avg_inten_ch_2',
       'avg_inten_ch_3', 'avg_inten_ch_4', 'convex_hull_area_ratio_ch_1',
       'convex_hull_perim_ratio_ch_1', 'diff_inten_density_ch_1',
       'diff_inten_density_c...
       'var_inten_ch_3', 'var_inten_ch_4', 'width_ch_1'],
      dtype='object'))])),
                                       ('decisiontreeclassifier',
                                        DecisionTreeClassifier())]),
             param_grid={'decisiontreeclassifier__ccp_alpha': [1e-10, 1.78e-08,
                                                               3.16e-06,
                                        

In [8]:
# Hyperparameter combination selected by the search; with multiple scorers the
# selection is based on the metric named in refit ('roc_auc').
tree_tuner.best_params_

{'decisiontreeclassifier__ccp_alpha': 0.000562,
 'decisiontreeclassifier__max_depth': 4}

In [19]:
# Columns to show: the two tuned parameters and the mean cross-validated score
# for each metric (names follow the keys of the scoring dict).
results_columns = [
    'param_decisiontreeclassifier__ccp_alpha',
    'param_decisiontreeclassifier__max_depth',
    'mean_test_accuray',
    'mean_test_roc_auc',
]

# Five best parameter combinations, ranked by mean ROC AUC across folds.
(
    pd.DataFrame(tree_tuner.cv_results_)
    .sort_values('mean_test_roc_auc', ascending=False)
    .loc[:, results_columns]
    .head(5)
)

Unnamed: 0,param_decisiontreeclassifier__ccp_alpha,param_decisiontreeclassifier__max_depth,mean_test_accuray,mean_test_roc_auc
6,1.78e-08,4,0.793259,0.85586
11,3.16e-06,4,0.792599,0.855546
1,1e-10,4,0.79392,0.854492
16,0.000562,4,0.791939,0.853495
0,1e-10,1,0.757614,0.787783


In [10]:
# best_estimator_ is the pipeline refit by the search with the parameters that
# scored best on the refit metric (see refit='roc_auc' on the GridSearchCV).
# Pull out its fitted decision-tree step by the step name.
best_tree = tree_tuner.best_estimator_.named_steps['decisiontreeclassifier']

In [11]:
# Fitted column-transformer step of the same best pipeline; its transformers_
# attribute is read in a later cell to recover the names of the columns fed to the tree.
ct = tree_tuner.best_estimator_.named_steps['columntransformer']

In [12]:
# Pair each input column with the tree's importance for it, most important first.
# transformers_[0] is the (name, transformer, columns) entry for the single
# transformer in the column transformer; element [2] holds its columns.
feature_importances = (
    pd.DataFrame({
        'name': ct.transformers_[0][2],
        'importance': best_tree.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

In [13]:
import seaborn as sns
# Horizontal bar chart of the ten most important features
# (feature_importances is already sorted in descending order).
top_features = feature_importances.head(10)
sns.barplot(data=top_features, y='name', x='importance')

<matplotlib.axes._subplots.AxesSubplot at 0x10e1abdd8>

In [14]:
# Evaluate the refit best pipeline on the held-out test split with the same
# scorers that were used during tuning; one (metric, score) row per scorer.
test_scores = [
    (metric, score_fn(tree_tuner.best_estimator_, X_test, y_test))
    for metric, score_fn in tree_scorer.items()
]
pd.DataFrame.from_records(test_scores, columns=['metric', 'score'])

Unnamed: 0,metric,score
0,roc_auc,0.848043
1,accuray,0.782178
