In [1]:
from collections import Counter
import pandas as pd

In [2]:
dtype = {
    'Usage': 'category',
    'Description': 'category',
    'status': 'category',
}
df = pd.read_csv('data/reservations.csv.gz', dtype=dtype, parse_dates=['created', 'arrival', 'departure'])
df.drop(columns='Usage', inplace=True)

df.loc[df['cancel_date'] == '0001-01-01T00:00:00', ['cancel_date']] = None
df['cancel_date'] = pd.to_datetime(df['cancel_date'])

df['arrival_year'] = df['arrival'].dt.year

In [3]:
df.head()

Unnamed: 0,owner_id,contract_id,resort_id,Description,cancel_date,created,arrival,departure,status,arrival_year
0,10,10,28,Fixed-Annual,2017-12-19,2013-07-09,2015-10-17,2015-10-24,active,2015
1,10,10,28,Fixed-Annual,2017-12-19,2014-12-20,2016-10-15,2016-10-22,active,2016
2,10,10,28,Fixed-Annual,2017-12-19,2015-11-17,2017-10-21,2017-10-28,active,2017
3,100,100,28,Fixed-Annual,NaT,2013-07-09,2015-10-24,2015-10-31,active,2015
4,100,100,28,Fixed-Annual,NaT,2014-12-20,2016-10-22,2016-10-29,active,2016


In [4]:
df.status.value_counts()

active       215049
cancelled    118973
no-show       53850
Name: status, dtype: int64

In [5]:
appearances = {}
for contract_id in df['contract_id'].unique():
    subset_df = df.loc[df['contract_id'] == contract_id].sort_values(by=['arrival', 'created'])
    # save the last known state
    # fill in blanks for bad years
    # handle case where cancel year might come after a series of misses
    yearly_state = {arrival_year: status for (created, arrival_year, status) in subset_df[['created', 'arrival_year', 'status']].itertuples(index=False, name=None)}
    earliest = subset_df['arrival_year'].min()
    latest = min(subset_df['arrival_year'].max(), 2019)
    activity = [(year, yearly_state.get(year, 'no-show')) for year in range(earliest, latest + 1)]    
    if activity:
        resort_id = subset_df['resort_id'].values[0]
        appearances[str(contract_id)] = [resort_id] + activity

Each unique contract's status in every year.

In [6]:
rows = []
for r in list(appearances.values()):
    resort_id, activity = r[0], r[1:]
    row = [None] * 5
    row[-len(activity):] = [s for year, s in activity]
    rows.append([resort_id] + row)

In [7]:
rows[:10]

[[28, None, None, 'active', 'active', 'active'],
 [28, 'active', 'active', 'active', 'active', 'active'],
 [30, 'active', 'active', 'active', 'cancelled', 'cancelled'],
 [39, 'no-show', 'active', 'active', 'active', 'active'],
 [38, 'cancelled', 'active', 'cancelled', 'active', 'active'],
 [38, 'active', 'cancelled', 'active', 'active', 'active'],
 [39, 'cancelled', 'cancelled', 'cancelled', 'cancelled', 'cancelled'],
 [39, 'active', 'cancelled', 'cancelled', 'cancelled', 'no-show'],
 [39, 'cancelled', 'active', 'active', 'active', 'active'],
 [39, 'active', 'active', 'active', 'active', 'active']]

Each resort's status in each year, within each unique contract.

In [8]:
df = pd.DataFrame(rows, columns=['resort_id', 'year_2015', 'year_2016', 'year_2017', 'year_2018', 'year_2019'])

In [9]:
df.isnull().sum()/df.shape[0]

resort_id    0.000000
year_2015    0.515846
year_2016    0.205276
year_2017    0.098678
year_2018    0.048311
year_2019    0.000000
dtype: float64

In [10]:
df['year_2019'].value_counts()

active       38655
cancelled    18509
no-show      11909
Name: year_2019, dtype: int64

In [11]:
df.head()

Unnamed: 0,resort_id,year_2015,year_2016,year_2017,year_2018,year_2019
0,28,,,active,active,active
1,28,active,active,active,active,active
2,30,active,active,active,cancelled,cancelled
3,39,no-show,active,active,active,active
4,38,cancelled,active,cancelled,active,active


In [12]:
df.drop('resort_id', axis=1, inplace=True)

In [13]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(df, test_size=0.30, random_state=0)

In [14]:
train_data.to_csv('data/capital_train.csv', encoding='utf-8', index=False)
test_data.to_csv('data/capital_test.csv', encoding='utf-8', index=False)

In [15]:
train_data.year_2019.value_counts()

active       27073
cancelled    12947
no-show       8331
Name: year_2019, dtype: int64

In [16]:
test_data.year_2019.value_counts()

active       11582
cancelled     5562
no-show       3578
Name: year_2019, dtype: int64

In [17]:
label_column = 'year_2019'
print("Summary of class variable: \n", train_data[label_column].describe())

Summary of class variable: 
 count      48351
unique         3
top       active
freq       27073
Name: year_2019, dtype: object


In [18]:
import autogluon as ag
from autogluon import TabularPrediction as task

predictor = task.fit(train_data=train_data, label=label_column)

No output_directory specified. Models will be saved in: AutogluonModels/ag-20210330_072731/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20210330_072731/
AutoGluon Version:  0.0.15
Train Data Rows:    48351
Train Data Columns: 4
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == object).
	3 unique label values:  ['cancelled', 'active', 'no-show']
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Train Data Class Count: 3
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    6212.4 MB
	Train Data (Original)  Memory Usage: 11.06 MB (0.2% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtype

In [20]:
y_test = test_data[label_column]
test_data_nolab = test_data.drop(labels=[label_column],axis=1)

In [21]:
y_pred = predictor.predict(test_data_nolab)
print("Predictions:  ", y_pred)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)

Evaluation: accuracy on test data: 0.6706881575137535
Evaluations on test data:
{
    "accuracy": 0.6706881575137535,
    "accuracy_score": 0.6706881575137535,
    "balanced_accuracy_score": 0.5651247356842359,
    "matthews_corrcoef": 0.4102990853305558
}


Predictions:   ['active' 'cancelled' 'active' ... 'active' 'active' 'active']


Detailed (per-class) classification report:
{
    "active": {
        "precision": 0.7244427934621099,
        "recall": 0.8419098601277845,
        "f1-score": 0.7787716636051434,
        "support": 11582
    },
    "cancelled": {
        "precision": 0.5970003895597974,
        "recall": 0.5510607695073715,
        "f1-score": 0.5731114435302916,
        "support": 5562
    },
    "no-show": {
        "precision": 0.5084586466165414,
        "recall": 0.3024035774175517,
        "f1-score": 0.37924991237294076,
        "support": 3578
    },
    "accuracy": 0.6706881575137535,
    "macro avg": {
        "precision": 0.6099672765461496,
        "recall": 0.5651247356842359,
        "f1-score": 0.5770443398361252,
        "support": 20722
    },
    "weighted avg": {
        "precision": 0.6529426521669596,
        "recall": 0.6706881575137535,
        "f1-score": 0.6545862099826578,
        "support": 20722
    }
}


In [22]:
pred_probs = predictor.predict_proba(test_data_nolab)
positive_class = [label for label in predictor.class_labels if predictor.class_labels_internal_map[label]==1][0]
print(f"Predicted probabilities of class '{positive_class}':", pred_probs)

Predicted probabilities of class 'cancelled': [[0.7598184  0.17107713 0.06910452]
 [0.15061373 0.75407445 0.09531184]
 [0.87844694 0.07554352 0.04600956]
 ...
 [0.82452995 0.08081903 0.094651  ]
 [0.82452995 0.08081903 0.094651  ]
 [0.52298206 0.36374778 0.11327014]]


In [23]:
results = predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                        model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0          CatboostClassifier     0.6768       0.010369   3.654064                0.010369           3.654064            0       True          8
1     weighted_ensemble_k0_l1     0.6768       0.011087   4.173240                0.000718           0.519176            1       True         10
2          LightGBMClassifier     0.6744       0.018447   0.809488                0.018447           0.809488            0       True          6
3    ExtraTreesClassifierGini     0.6744       0.206662   1.135898                0.206662           1.135898            0       True          4
4    ExtraTreesClassifierEntr     0.6744       0.207086   1.032413                0.207086           1.032413            0       True          5
5    LightGBMClassifierCustom     0.6740       0.040228   1.442169  

In [24]:
print("AutoGluon infers problem type is: ", predictor.problem_type)
print("AutoGluon identified the following types of features:")
print(predictor.feature_metadata)

AutoGluon infers problem type is:  multiclass
AutoGluon identified the following types of features:
('category', []) : 4 | ['year_2015', 'year_2016', 'year_2017', 'year_2018']


In [25]:
predictor.leaderboard(test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatboostClassifier,0.670688,0.6768,0.03298,0.010369,3.654064,0.03298,0.010369,3.654064,0,True,8
1,weighted_ensemble_k0_l1,0.670688,0.6768,0.035819,0.011087,4.17324,0.00284,0.000718,0.519176,1,True,10
2,LightGBMClassifier,0.670061,0.6744,0.107309,0.018447,0.809488,0.107309,0.018447,0.809488,0,True,6
3,ExtraTreesClassifierGini,0.669578,0.6744,0.221292,0.206662,1.135898,0.221292,0.206662,1.135898,0,True,4
4,ExtraTreesClassifierEntr,0.669578,0.6744,0.22436,0.207086,1.032413,0.22436,0.207086,1.032413,0,True,5
5,LightGBMClassifierCustom,0.669385,0.674,0.308849,0.040228,1.442169,0.308849,0.040228,1.442169,0,True,9
6,RandomForestClassifierEntr,0.669289,0.6732,0.218579,0.207045,1.234125,0.218579,0.207045,1.234125,0,True,3
7,RandomForestClassifierGini,0.669289,0.6732,0.224014,0.207224,1.23691,0.224014,0.207224,1.23691,0,True,2
8,NeuralNetClassifier,0.66924,0.6736,1.106056,0.114016,60.229692,1.106056,0.114016,60.229692,0,True,1
9,LightGBMClassifierXT,0.667551,0.6728,0.039465,0.011456,0.591041,0.039465,0.011456,0.591041,0,True,7


In [30]:
predictor.feature_importance(test_data)

Computing raw permutation importance for 4 features on weighted_ensemble_k0_l1 ...
	0.12s	= Expected runtime
	0.12s	= Actual runtime


year_2018    0.125
year_2017    0.040
year_2016    0.018
year_2015    0.014
dtype: float64