## tidymodels to sklearn

Sklearn implementation of this tidymodels example: https://www.tidymodels.org/start/tuning/

In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG up front; none of the later cells pass an explicit
# random_state, so this is the only seed set in the notebook.
np.random.seed(753)
# Load the data set used throughout the notebook (path is relative to the
# working directory).
cells = pd.read_csv("cells.csv")

In [2]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors.
outcome = cells['class']
features = cells.drop(columns='class')

# Hold out 25% of the rows for the final evaluation, stratified on the outcome.
# No random_state is passed, so the shuffle draws from NumPy's global RNG.
X_train, X_test, y_train, y_test = train_test_split(
    features, outcome, test_size=0.25, stratify=outcome
)

In [3]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer

# Preprocessing for the tree: a FunctionTransformer with no function is an
# identity transform, so this step only selects columns. Every feature except
# 'case' is listed; unlisted columns are dropped by the column transformer's
# default remainder handling.
tree_preprocess = make_column_transformer(
    (FunctionTransformer(), features.columns.drop('case'))
)

In [4]:
from sklearn.tree import DecisionTreeClassifier

In [5]:
from sklearn.pipeline import make_pipeline

# Chain column selection and an untuned decision tree into one estimator.
# make_pipeline names each step after its lowercased class name, which is why
# the tuning grid addresses the tree as 'decisiontreeclassifier__<param>'.
tree_pipeline = make_pipeline(tree_preprocess, DecisionTreeClassifier())

In [6]:
# Tuning grid for the tree step of the pipeline: 5 depths x 5 cost-complexity
# pruning strengths = 25 candidate combinations. Keys use the
# '<step name>__<parameter>' convention so the search can reach into the pipeline.
param_grid = {
    'decisiontreeclassifier__max_depth': [1, 4, 8, 11, 15],
    'decisiontreeclassifier__ccp_alpha': [1e-10, 1.78e-8, 3.16e-6, 5.62e-4, 0.1],
}

In [7]:
from sklearn.pipeline import make_pipeline

# NOTE(review): this repeats, step for step, the earlier cell that already
# defines tree_pipeline; it just rebinds the name to a fresh, unfitted pipeline
# built from the same preprocessing step and a default decision tree.
tree_pipeline = make_pipeline(
    tree_preprocess,
    DecisionTreeClassifier()
)

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import get_scorer

# Metrics tracked during tuning. They are kept as scorer callables (rather than
# scoring-name strings) because the final cell calls each one directly on the
# held-out test set.
tree_scorer = {
    # Built-in ROC AUC scorer. The previous make_scorer(roc_auc_score,
    # needs_proba=True) relied on an argument that was deprecated in
    # scikit-learn 1.4 and removed in 1.6; get_scorer('roc_auc') works on both
    # old and new releases and, for a tree (no decision_function), scores the
    # positive-class probabilities from predict_proba just as before.
    'roc_auc': get_scorer('roc_auc'),
    # NOTE: the key is misspelled, but it determines the 'mean_test_accuray'
    # column name in cv_results_ that the results table selects, so it is
    # intentionally left unchanged here.
    'accuray': make_scorer(accuracy_score)
}

# Exhaustive search over param_grid with 5-fold cross-validation. With several
# metrics, `refit` names the one used to pick the best candidate and refit it
# on the full training data.
tree_tuner = GridSearchCV(
    tree_pipeline, param_grid, cv=5,
    scoring=tree_scorer,
    refit='roc_auc'
)

In [9]:
# Run the cross-validated grid search on the training split only; the test
# split stays untouched until the final evaluation cell.
tree_res = tree_tuner.fit(X_train, y_train)

In [10]:
# Hyperparameter combination that won on the refit metric ('roc_auc').
tree_res.best_params_

{'decisiontreeclassifier__ccp_alpha': 1.78e-08,
 'decisiontreeclassifier__max_depth': 4}

In [19]:
# Five best grid points by mean cross-validated ROC AUC, with the tuned
# parameters shown under shorter column names.
(
    pd.DataFrame(tree_res.cv_results_)
    .sort_values('mean_test_roc_auc', ascending=False)
    .rename(columns={
        'param_decisiontreeclassifier__ccp_alpha': 'cost',
        'param_decisiontreeclassifier__max_depth': 'max_depth',
    })
    [['cost', 'max_depth', 'mean_test_accuray', 'mean_test_roc_auc']]
    .head(5)
)

Unnamed: 0,cost,max_depth,mean_test_accuray,mean_test_roc_auc
6,1.78e-08,4,0.79724,0.839899
1,1e-10,4,0.796584,0.839053
11,3.16e-06,4,0.795257,0.837103
16,0.000562,4,0.795917,0.837055
0,1e-10,1,0.738461,0.779942


In [17]:
# Pull the fitted decision tree step out of the best (refit) pipeline.
best_tree = tree_res.best_estimator_.named_steps['decisiontreeclassifier']

In [13]:
# Pull the fitted column transformer step out of the best (refit) pipeline;
# the next cell reads the selected column names from it.
ct = tree_res.best_estimator_.named_steps['columntransformer']

In [14]:
# Pair each column handed to the tree with its importance, most important first.
# transformers_[0] is the (name, transformer, columns) entry for the single
# transformer in the column transformer; index 2 is its column list.
feature_importances = (
    pd.DataFrame({'name': ct.transformers_[0][2]})
    .assign(importance=best_tree.feature_importances_)
    .sort_values('importance', ascending=False)
)

In [15]:
import seaborn as sns
# Bar chart of the ten largest importances (feature_importances is already
# sorted in descending order, so head(10) is the top ten).
sns.barplot(x='importance', y='name', data=feature_importances.head(10))

<matplotlib.axes._subplots.AxesSubplot at 0x10cdaf2e8>

In [16]:
# Final check: score the refit best pipeline on the held-out test split with
# each tuning metric, one row per metric.
pd.DataFrame.from_records(
    [
        (metric_name, score_fn(tree_res.best_estimator_, X_test, y_test))
        for metric_name, score_fn in tree_scorer.items()
    ],
    columns=['metric', 'score'],
)

Unnamed: 0,metric,score
0,roc_auc,0.824385
1,accuray,0.776238
