In [1]:
from collections import Counter

import pandas as pd

# Load the raw reservations table and normalise its date columns.

# Value that cancel_date carries instead of a real timestamp. Year 0001 lies
# outside the range pandas' nanosecond timestamps can represent, so it has to
# be blanked before parsing. Presumably it marks "never cancelled" -- confirm
# against the data export.
NO_CANCEL_SENTINEL = '0001-01-01T00:00:00'

# Low-cardinality text columns are read as categoricals.
dtype = {
    'Usage': 'category',
    'Description': 'category',
    'status': 'category',
}
df = pd.read_csv(
    'data/reservations.csv.gz',
    dtype=dtype,
    parse_dates=['created', 'arrival', 'departure'],
)
# 'Usage' is not used by the rest of the notebook.
df = df.drop(columns='Usage')

# Blank out the sentinel (it becomes NaN, then NaT) and parse the remaining
# values as timestamps.
is_sentinel = df['cancel_date'] == NO_CANCEL_SENTINEL
df['cancel_date'] = pd.to_datetime(df['cancel_date'].mask(is_sentinel))

# Calendar year of the stay; the per-contract history below is keyed on it.
df['arrival_year'] = df['arrival'].dt.year

In [2]:
# Build, per contract, the sequence of yearly reservation states.
#
# Result: appearances[str(contract_id)] == [resort_id, (year, status), (year, status), ...]
# with one (year, status) pair for every year from the contract's first arrival
# year to its last, restricted to the FIRST_YEAR..LAST_YEAR window.

# Window matching the five per-year slots / year_2015..year_2019 columns that
# the following cells build from `appearances`.
FIRST_YEAR, LAST_YEAR = 2015, 2019

appearances = {}
# groupby walks the frame once; filtering with a boolean mask per contract id
# rescanned every row for every contract. sort=False yields the groups in
# first-appearance order, i.e. the order df['contract_id'].unique() gives.
for contract_id, contract_rows in df.groupby('contract_id', sort=False):
    subset_df = contract_rows.sort_values(by=['arrival', 'created'])
    # Rows are ordered by (arrival, created), so for each arrival year the
    # last row seen overwrites earlier ones: the dict keeps the last known state.
    yearly_state = dict(zip(subset_df['arrival_year'], subset_df['status']))
    # Clamp both ends to the window. Without the lower clamp a contract with
    # an arrival before FIRST_YEAR would produce more than five states and the
    # downstream six-column frame could not be built.
    earliest = max(subset_df['arrival_year'].min(), FIRST_YEAR)
    latest = min(subset_df['arrival_year'].max(), LAST_YEAR)
    # Years inside the contract's span that have no reservation are recorded
    # as 'no-show', so a later state (e.g. a cancellation) can still follow a
    # run of missed years.
    activity = [(year, yearly_state.get(year, 'no-show')) for year in range(earliest, latest + 1)]
    # Contracts whose whole history falls outside the window are skipped.
    if activity:
        resort_id = subset_df['resort_id'].values[0]
        appearances[str(contract_id)] = [resort_id] + activity

In [3]:
# Flatten each contract's history into a fixed-width record:
# [resort_id, slot_1, ..., slot_5], with the states right-aligned and the
# unused leading slots left as None.
# NOTE(review): the year stored with each state is discarded here, so the
# alignment is purely positional. A history that ends before the last year of
# the window would still occupy the right-most slots -- confirm this is intended.
rows = []
for entry in appearances.values():
    resort_id = entry[0]
    statuses = [status for _year, status in entry[1:]]
    padded = [None] * 5
    padded[-len(statuses):] = statuses
    rows.append([resort_id, *padded])

In [4]:
# Feature table: the resort id followed by one state column per year 2015-2019.
# Note that `df` is rebound here and no longer refers to the raw reservations frame.
year_columns = [f'year_{year}' for year in range(2015, 2020)]
df = pd.DataFrame(rows, columns=['resort_id', *year_columns])

In [5]:
# Share of missing values per column (the mean of the boolean null mask).
df.isna().mean()

resort_id    0.000000
year_2015    0.515846
year_2016    0.205276
year_2017    0.098678
year_2018    0.048311
year_2019    0.000000
dtype: float64

In [6]:
# Years for which a contract has no recorded state become an explicit
# 'missing' level, so every column is a plain categorical with no nulls.
# Rebinding instead of inplace=True keeps the step free of in-place mutation.
df = df.fillna('missing')

In [9]:
# Number of distinct values in each column (column-wise, nulls not counted).
df.nunique(axis=0, dropna=True)

resort_id    57
year_2015     4
year_2016     4
year_2017     4
year_2018     4
year_2019     3
dtype: int64

In [11]:
# Target is the 2019 state; every other column is a feature.
target_column = "year_2019"
y = df[target_column]
X = df.drop(columns=[target_column])

In [12]:
# Positional indices of all feature columns; each one is passed to CatBoost as
# a categorical feature.
n_features = X.shape[1]
cat_features = [index for index in range(n_features)]
print(cat_features)

[0, 1, 2, 3, 4]


In [13]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation; the fixed random_state makes the
# split repeatable.
split = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_val, y_train, y_val = split

In [14]:
from catboost import CatBoostClassifier

# Short baseline run: 10 boosting iterations at learning rate 0.5, with
# verbose=5 controlling how often progress is logged.
catboost_params = {'iterations': 10, 'learning_rate': 0.5, 'verbose': 5}
clf = CatBoostClassifier(**catboost_params)
# All columns are declared categorical; the validation split is supplied as
# eval_set so the loss on it is tracked during training.
clf.fit(X_train, y_train, cat_features=cat_features, eval_set=(X_val, y_val))

0:	learn: 0.9089395	test: 0.9088238	best: 0.9088238 (0)	total: 83.9ms	remaining: 755ms
5:	learn: 0.6921539	test: 0.6930650	best: 0.6930650 (5)	total: 219ms	remaining: 146ms
9:	learn: 0.6811982	test: 0.6819978	best: 0.6819978 (9)	total: 319ms	remaining: 0us

bestTest = 0.6819978385
bestIteration = 9



<catboost.core.CatBoostClassifier at 0x7f2c8c533390>

In [15]:
# Predicted class labels and per-class probabilities for the validation rows.
y_pred = clf.predict(X_val)
y_prob = clf.predict_proba(X_val)

In [16]:
from sklearn.metrics import accuracy_score

# Fraction of validation rows whose predicted class equals the true class.
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.7176913425345044


In [17]:
from sklearn.metrics import roc_auc_score

# Multiclass AUC from the predicted probabilities: one-vs-one over the class
# pairs, macro-averaged.
auc_options = {'multi_class': 'ovo', 'average': 'macro'}
auc = roc_auc_score(y_val, y_prob, **auc_options)
print('AUC:', auc)

AUC: 0.8310445038554471


In [18]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the validation split.
report = classification_report(y_true=y_val, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

      active       0.75      0.87      0.80     11582
   cancelled       0.63      0.52      0.57      5562
     no-show       0.73      0.52      0.61      3578

    accuracy                           0.72     20722
   macro avg       0.70      0.64      0.66     20722
weighted avg       0.71      0.72      0.71     20722

