In [1]:
from collections import Counter

import autosklearn
from autosklearn.classification import AutoSklearnClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
print('autosklearn: %s' % autosklearn.__version__)

autosklearn: 0.12.5


In [2]:
dtype = {
    'Usage': 'category',
    'Description': 'category',
    'status': 'category',
}
df = pd.read_csv('data/reservations.csv.gz', dtype=dtype, parse_dates=['created', 'arrival', 'departure'])
df.drop(columns='Usage', inplace=True)

df.loc[df['cancel_date'] == '0001-01-01T00:00:00', ['cancel_date']] = None
df['cancel_date'] = pd.to_datetime(df['cancel_date'])

df['arrival_year'] = df['arrival'].dt.year

In [3]:
appearances = {}
for contract_id in df['contract_id'].unique():
    subset_df = df.loc[df['contract_id'] == contract_id].sort_values(by=['arrival', 'created'])
    # save the last known state
    # fill in blanks for bad years
    # handle case where cancel year might come after a series of misses
    yearly_state = {arrival_year: status for (created, arrival_year, status) in subset_df[['created', 'arrival_year', 'status']].itertuples(index=False, name=None)}
    earliest = subset_df['arrival_year'].min()
    latest = min(subset_df['arrival_year'].max(), 2019)
    activity = [(year, yearly_state.get(year, 'no-show')) for year in range(earliest, latest + 1)]    
    if activity:
        resort_id = subset_df['resort_id'].values[0]
        appearances[str(contract_id)] = [resort_id] + activity

INFO:numexpr.utils:NumExpr defaulting to 4 threads.


In [4]:
rows = []
for r in list(appearances.values()):
    resort_id, activity = r[0], r[1:]
    row = [None] * 5
    row[-len(activity):] = [s for year, s in activity]
    rows.append([resort_id] + row)

In [5]:
rows[:10]

[[28, None, None, 'active', 'active', 'active'],
 [28, 'active', 'active', 'active', 'active', 'active'],
 [30, 'active', 'active', 'active', 'cancelled', 'cancelled'],
 [39, 'no-show', 'active', 'active', 'active', 'active'],
 [38, 'cancelled', 'active', 'cancelled', 'active', 'active'],
 [38, 'active', 'cancelled', 'active', 'active', 'active'],
 [39, 'cancelled', 'cancelled', 'cancelled', 'cancelled', 'cancelled'],
 [39, 'active', 'cancelled', 'cancelled', 'cancelled', 'no-show'],
 [39, 'cancelled', 'active', 'active', 'active', 'active'],
 [39, 'active', 'active', 'active', 'active', 'active']]

In [6]:
df = pd.DataFrame(rows, columns=['resort_id', 'year_2015', 'year_2016', 'year_2017', 'year_2018', 'year_2019'])

In [7]:
df.head()

Unnamed: 0,resort_id,year_2015,year_2016,year_2017,year_2018,year_2019
0,28,,,active,active,active
1,28,active,active,active,active,active
2,30,active,active,active,cancelled,cancelled
3,39,no-show,active,active,active,active
4,38,cancelled,active,cancelled,active,active


In [8]:
df.drop('resort_id', axis=1, inplace=True)

df.year_2015 = pd.Categorical(df.year_2015)
df['year_2015'] = df.year_2015.cat.codes

df.year_2016 = pd.Categorical(df.year_2016)
df['year_2016'] = df.year_2016.cat.codes

df.year_2017 = pd.Categorical(df.year_2017)
df['year_2017'] = df.year_2017.cat.codes

df.year_2018 = pd.Categorical(df.year_2018)
df['year_2018'] = df.year_2018.cat.codes

df.year_2019 = pd.Categorical(df.year_2019)
df['year_2019'] = df.year_2019.cat.codes

In [12]:
df.head()

Unnamed: 0,year_2015,year_2016,year_2017,year_2018,year_2019
0,-1,-1,0,0,0
1,0,0,0,0,0
2,0,0,0,1,1
3,2,0,0,0,0
4,1,0,1,0,0


In [13]:
df.columns = [''] * len(df.columns)

In [14]:
df.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,-1,-1,0,0,0
1,0,0,0,0,0
2,0,0,0,1,1
3,2,0,0,0,0
4,1,0,1,0,0


In [25]:
X = df.values[:,:-1]
y = pd.get_dummies(df.values[:, -1]).values

In [30]:
print(X.shape, y.shape)

(69073, 4) (69073, 3)


In [31]:
np.unique(y)

array([0, 1], dtype=uint8)

In [34]:
from sklearn.utils.multiclass import type_of_target

print(f'Type of target: {type_of_target(y)}')

Type of target: multilabel-indicator


In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

In [38]:
automl = AutoSklearnClassifier(time_left_for_this_task=60, per_run_time_limit=30, 
                               initial_configurations_via_metalearning=0, smac_scenario_args={'runcount_limit':1})

automl.fit(X_train, y_train, dataset_name='cv_load')

AutoSklearnClassifier(initial_configurations_via_metalearning=0,
                      per_run_time_limit=30,
                      smac_scenario_args={'runcount_limit': 1},
                      time_left_for_this_task=60)

In [39]:
print(automl.sprint_statistics())

auto-sklearn results:
  Dataset name: cv_load
  Metric: f1_macro
  Best validation score: 0.496055
  Number of target algorithm runs: 1
  Number of successful target algorithm runs: 1
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 0
  Number of target algorithms that exceeded the memory limit: 0



In [40]:
y_pred = automl.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_pred))

Accuracy:  0.5807413906558456


In [41]:
print('AUC: ', roc_auc_score(y_pred, y_test))

AUC:  0.7233055776772713
