## tidymodels to sklearn

A scikit-learn implementation of the tidymodels predictive modeling case study: https://www.tidymodels.org/start/case-study/

In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's global random number generator so that anything drawing from it
# later in the notebook gives the same result on every run.
np.random.seed(seed=753)

# Read the hotel bookings CSV published alongside the tidymodels case study.
hotels = pd.read_csv(
    filepath_or_buffer='https://tidymodels.org/start/case-study/hotels.csv'
)

In [3]:
# Dimensions of the loaded frame as (rows, columns).
hotels.shape

(50000, 23)

In [6]:
# Swap rows and columns so that each variable is listed on its own row.
hotels.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49990,49991,49992,49993,49994,49995,49996,49997,49998,49999
hotel,City_Hotel,City_Hotel,Resort_Hotel,Resort_Hotel,Resort_Hotel,City_Hotel,Resort_Hotel,City_Hotel,City_Hotel,City_Hotel,...,Resort_Hotel,Resort_Hotel,City_Hotel,City_Hotel,City_Hotel,Resort_Hotel,Resort_Hotel,City_Hotel,Resort_Hotel,City_Hotel
lead_time,217,2,95,143,136,67,47,56,80,6,...,283,197,414,225,73,172,48,155,140,12
stays_in_weekend_nights,1,0,2,2,1,2,0,0,0,2,...,2,2,0,2,0,0,0,0,2,2
stays_in_week_nights,3,1,5,6,4,2,2,3,4,2,...,8,8,2,4,2,2,4,4,5,1
adults,2,2,2,2,2,2,2,0,2,2,...,2,2,2,2,2,2,2,2,2,2
children,none,none,none,none,none,none,children,children,none,children,...,none,none,none,none,none,children,none,none,none,none
meal,BB,BB,BB,HB,HB,SC,BB,BB,BB,BB,...,BB,Undefined,HB,BB,SC,BB,FB,BB,HB,BB
country,DEU,PRT,GBR,ROU,PRT,GBR,ESP,ESP,FRA,FRA,...,GBR,GBR,DEU,BRA,FRA,PRT,PRT,DEU,GBR,DEU
market_segment,Offline_TA/TO,Direct,Online_TA,Online_TA,Direct,Online_TA,Direct,Online_TA,Online_TA,Online_TA,...,Offline_TA/TO,Offline_TA/TO,Groups,Online_TA,Online_TA,Direct,Direct,Offline_TA/TO,Direct,Online_TA
distribution_channel,TA/TO,Direct,TA/TO,TA/TO,Direct,TA/TO,Direct,TA/TO,TA/TO,TA/TO,...,TA/TO,TA/TO,TA/TO,TA/TO,TA/TO,Direct,Direct,TA/TO,Direct,TA/TO


In [34]:
# Class balance of the outcome: the number of bookings at each level of
# `children` and the share of all bookings that each level represents.
(
    hotels
    .groupby('children')
    .agg(count=('children', 'count'))
    .assign(prop=lambda counts: counts['count'] / counts['count'].sum())
)

Unnamed: 0_level_0,count,prop
children,Unnamed: 1_level_1,Unnamed: 2_level_1
children,4038,0.08076
none,45962,0.91924


In [36]:
from sklearn.model_selection import train_test_split

# Separate the `children` outcome from the remaining predictor columns.
outcome = hotels['children']
features = hotels.drop(columns='children')

# Hold out 25% of the rows for testing. Stratifying on the outcome keeps the
# class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    features, outcome, stratify=outcome, test_size=0.25
)

References on tuning pipeline hyperparameters with `GridSearchCV`:

- https://www.tomasbeuzen.com/post/scikit-learn-gridsearch-pipelines/
- https://stackoverflow.com/questions/16437022/how-to-tune-parameters-of-nested-pipelines-by-gridsearchcv-in-scikit-learn

In [41]:
# Display the predictor frame, i.e. `hotels` without the `children` outcome column.
features

Unnamed: 0,hotel,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,meal,country,market_segment,distribution_channel,is_repeated_guest,...,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,average_daily_rate,required_car_parking_spaces,total_of_special_requests,arrival_date
0,City_Hotel,217,1,3,2,BB,DEU,Offline_TA/TO,TA/TO,0,...,A,A,0,No_Deposit,0,Transient-Party,80.75,none,1,2016-09-01
1,City_Hotel,2,0,1,2,BB,PRT,Direct,Direct,0,...,D,K,0,No_Deposit,0,Transient,170.00,none,3,2017-08-25
2,Resort_Hotel,95,2,5,2,BB,GBR,Online_TA,TA/TO,0,...,A,A,2,No_Deposit,0,Transient,8.00,none,2,2016-11-19
3,Resort_Hotel,143,2,6,2,HB,ROU,Online_TA,TA/TO,0,...,A,A,0,No_Deposit,0,Transient,81.00,none,1,2016-04-26
4,Resort_Hotel,136,1,4,2,HB,PRT,Direct,Direct,0,...,F,F,0,No_Deposit,0,Transient,157.60,none,4,2016-12-28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,Resort_Hotel,172,0,2,2,BB,PRT,Direct,Direct,0,...,A,A,1,No_Deposit,0,Transient,73.39,none,1,2016-10-07
49996,Resort_Hotel,48,0,4,2,FB,PRT,Direct,Direct,0,...,A,B,2,No_Deposit,0,Transient,158.00,none,0,2015-09-01
49997,City_Hotel,155,0,4,2,BB,DEU,Offline_TA/TO,TA/TO,0,...,A,A,0,No_Deposit,0,Transient,82.50,none,1,2017-07-26
49998,Resort_Hotel,140,2,5,2,HB,GBR,Direct,Direct,0,...,G,G,0,No_Deposit,0,Transient,143.00,none,0,2016-04-28


In [43]:
from sklearn.compose import make_column_transformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Split the predictors by dtype: scaling only makes sense for numeric columns,
# while string-valued columns have to be encoded before a model can use them.
numeric_columns = features.select_dtypes(include='number').columns
categorical_columns = features.select_dtypes(exclude='number').columns

# Preprocessing for the logistic regression model.
#
# * Numeric columns: drop zero-variance columns, THEN standardise what is left.
#   The two steps are chained in a pipeline; listing them as separate
#   column-transformer entries would run them side by side on the same columns
#   and concatenate both outputs.
# * Non-numeric columns: one-hot encode. `handle_unknown='ignore'` keeps
#   `transform` from failing on a level that was absent when fitting.
#
# NOTE(review): `arrival_date` is a non-numeric column here, so it is one-hot
# encoded like any other string column. Deriving date features from it instead
# is left as a TODO.
log_preprocess = make_column_transformer(
    (make_pipeline(VarianceThreshold(), StandardScaler()), numeric_columns),
    (OneHotEncoder(handle_unknown='ignore'), categorical_columns)
)

In [2]:
from sklearn.model_selection import train_test_split

# NOTE(review): `cells` is not created by any earlier cell shown in this
# notebook, so on a fresh kernel this cell fails with NameError. The data has
# to be loaded first (TODO: confirm where it comes from).
# This cell also rebinds `features`, `outcome` and the train/test variables
# that were previously built from `hotels`.
outcome = cells['class']
features = cells.drop(columns='class')

# Hold out 25% of the rows for testing, stratified on the outcome so that both
# partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    features, outcome, stratify=outcome, test_size=0.25
)

In [3]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer

# Column selection for the tree model: a default FunctionTransformer is applied
# to every predictor except `case`, which is not listed and therefore is not
# handed to this transformer.
tree_preprocess = make_column_transformer(
    (FunctionTransformer(), features.columns.drop('case'))
)

In [4]:
from sklearn.tree import DecisionTreeClassifier

In [5]:
from sklearn.pipeline import make_pipeline

# Preprocessing followed by a decision tree with default settings. The steps
# are addressed later by their auto-generated names, `columntransformer` and
# `decisiontreeclassifier`.
tree_pipeline = make_pipeline(tree_preprocess, DecisionTreeClassifier())

In [6]:
# Hyperparameter grid for the search. Each key has the form
# `<pipeline step>__<parameter>`, so both entries target the decision tree step:
# five candidate tree depths crossed with five cost-complexity pruning strengths.
param_grid = dict(
    decisiontreeclassifier__max_depth=[1, 4, 8, 11, 15],
    decisiontreeclassifier__ccp_alpha=[1e-10, 1.78e-8, 3.16e-6, 5.62e-4, 1e-1],
)

In [7]:
from sklearn.pipeline import make_pipeline

# NOTE(review): this rebuilds `tree_pipeline` exactly as an earlier cell of the
# notebook already does; one of the two cells is redundant.
tree_pipeline = make_pipeline(tree_preprocess, DecisionTreeClassifier())

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import get_scorer
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

# Metrics recorded for every grid point.
#
# The ROC AUC scorer is taken from scikit-learn's scorer registry instead of
# being built with `make_scorer(roc_auc_score, needs_proba=True)`: the
# `needs_proba` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
# The registry scorer is still a callable `scorer(estimator, X, y)`, so it can
# also be applied directly to held-out data.
#
# NOTE(review): the key 'accuray' is misspelled, but it determines the
# `mean_test_accuray` column name that a later cell selects, so it is kept
# unchanged here. Rename both places together if it is ever corrected.
tree_scorer = {
    'roc_auc': get_scorer('roc_auc'),
    'accuray': make_scorer(accuracy_score)
}

# 5-fold cross-validated grid search over `param_grid`. With several metrics,
# `refit` names the one used to pick the best candidate, which is then refit.
tree_tuner = GridSearchCV(
    tree_pipeline, param_grid, cv=5,
    scoring=tree_scorer,
    refit='roc_auc'
)

In [9]:
# Run the grid search on the training partition and keep whatever `fit` returns
# for the result-inspection cells that follow.
tree_res = tree_tuner.fit(X=X_train, y=y_train)

In [10]:
# Hyperparameter combination selected by the search (ranked on 'roc_auc', the `refit` metric).
tree_res.best_params_

{'decisiontreeclassifier__ccp_alpha': 1.78e-08,
 'decisiontreeclassifier__max_depth': 4}

In [19]:
# Five best grid points by mean cross-validated ROC AUC, showing only the two
# tuned parameters (under shorter names) and the two mean test metrics.
(
    pd.DataFrame(tree_res.cv_results_)
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .rename(columns={
        'param_decisiontreeclassifier__ccp_alpha': 'cost',
        'param_decisiontreeclassifier__max_depth': 'max_depth',
    })
    .loc[:, ['cost', 'max_depth', 'mean_test_accuray', 'mean_test_roc_auc']]
    .head(5)
)

Unnamed: 0,cost,max_depth,mean_test_accuray,mean_test_roc_auc
6,1.78e-08,4,0.79724,0.839899
1,1e-10,4,0.796584,0.839053
11,3.16e-06,4,0.795257,0.837103
16,0.000562,4,0.795917,0.837055
0,1e-10,1,0.738461,0.779942


In [17]:
# Pull the decision tree step out of the best pipeline found by the search.
best_tree = tree_res.best_estimator_.named_steps.decisiontreeclassifier

In [13]:
# Pull the column transformer step out of the best pipeline found by the search.
ct = tree_res.best_estimator_.named_steps.columntransformer

In [14]:
# Pair the column names given to the first (and only) transformer with the
# tree's importance values, most important feature first.
feature_importances = pd.DataFrame({
    'name': ct.transformers_[0][2],
    'importance': best_tree.feature_importances_,
}).sort_values(by='importance', ascending=False)

In [15]:
import seaborn as sns
# Horizontal bar chart of the ten most important features.
sns.barplot(data=feature_importances.iloc[:10], y='name', x='importance')

<matplotlib.axes._subplots.AxesSubplot at 0x10cdaf2e8>

In [16]:
# Apply every scorer used during tuning to the refit best pipeline on the
# held-out test partition, giving one row per metric.
pd.DataFrame(
    [
        (metric, score_fn(tree_res.best_estimator_, X_test, y_test))
        for metric, score_fn in tree_scorer.items()
    ],
    columns=['metric', 'score'],
)

Unnamed: 0,metric,score
0,roc_auc,0.824385
1,accuray,0.776238
