In [None]:
import tensorflow as tf
from tensorflow import keras

import os
import tempfile

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Default size (in inches) for every matplotlib figure created in this notebook.
mpl.rcParams['figure.figsize'] = (12, 10)
# Colours of the active matplotlib property cycle; the plotting helpers below
# index into this list so related curves share a colour.
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

In [None]:
!gdown --id 1MIKKj8Gi-xUwhsYt6xEV6FSmX0_Le8iL
!unzip -q 'data-storm-20.zip'

In [None]:
# Read the labelled training and validation CSVs (first column is the index)
# and stack them into one frame; the notebook makes its own split further down.
raw_df = pd.read_csv('/content/Hotel-A-train.csv', index_col=0)
val_df = pd.read_csv('/content/Hotel-A-validation.csv', index_col=0)
# DataFrame.append was removed in pandas 2.0; pd.concat is the row-wise
# equivalent and keeps the original index labels.
raw_df = pd.concat([raw_df, val_df])
raw_df.head()

In [None]:
# Non-numeric columns: each one must be one-hot encoded or mapped to integers
# before it can be fed to a model.
object_cols = [
    'Gender',
    'Ethnicity',
    'Educational_Level',
    'Income',
    'Country_region',
    'Hotel_Type',
    'Meal_Type',
    'Visted_Previously',
    'Previous_Cancellations',
    'Deposit_type',
    'Booking_channel',
    'Required_Car_Parking',
    'Reservation_Status',
    'Use_Promotion',
]

# Date columns, in the order (check-in, check-out, booking); later cells index
# into this list positionally to derive features from them.
dates = [
    'Expected_checkin',
    'Expected_checkout',
    'Booking_date',
]

In [None]:
# Summary statistics for the columns listed in `object_cols`.
raw_df[object_cols].describe()

In [None]:
# Encode the target as integer class ids. Values that are already encoded pass
# through unchanged, so re-running this cell no longer turns the whole column
# into NaN; the final cast fails loudly on any unknown or missing label.
STATUS_CODES = {'Check-In': 0, 'Canceled': 1, 'No-Show': 2}
raw_df['Reservation_Status'] = (
    raw_df['Reservation_Status']
    .map(lambda status: STATUS_CODES.get(status, status))
    .astype('int64')
)

In [None]:
# Number of examples per class id; the unpacking requires exactly three bins.
status_counts = np.bincount(raw_df['Reservation_Status'])
checkin, canceled, noshow = status_counts
total = checkin + canceled + noshow

summary_template = 'Examples:\n    Total: {}\n    Check-In: {} ({:.2f}% of total)\n    Canceled: {} ({:.2f}% of total)\n     No-Show: {} ({:.2f}% of total)\n'
print(summary_template.format(
    total,
    checkin, 100 * checkin / total,
    canceled, 100 * canceled / total,
    noshow, 100 * noshow / total,
))

In [None]:
# Build the model-ready frame from a copy so `raw_df` stays untouched.
cleaned_df = raw_df.copy()

# One-hot encode the multi-valued categorical columns.
one_hot_encoded_lst = ['Ethnicity', 'Educational_Level',
       'Income', 'Country_region', 'Hotel_Type',
       'Meal_Type', 'Deposit_type', 'Booking_channel']
cleaned_df = pd.get_dummies(cleaned_df, columns=one_hot_encoded_lst)

# Two-valued columns become 0/1 flags (values outside the mapping become NaN).
cleaned_df['Gender'] = cleaned_df['Gender'].map({'F':0, 'M':1})
yes_no_codes = {'No':0, 'Yes':1}
for flag_col in ['Visted_Previously', 'Previous_Cancellations',
                 'Required_Car_Parking', 'Use_Promotion']:
  cleaned_df[flag_col] = cleaned_df[flag_col].map(yes_no_codes)

# Parse the three date columns: check-in, check-out, booking.
for date_col in dates[:3]:
  cleaned_df[date_col] = pd.to_datetime(cleaned_df[date_col])

# Date-derived features.
cleaned_df['Expected_stay'] = (cleaned_df[dates[1]] - cleaned_df[dates[0]]).dt.days
cleaned_df['Booking_to_checkingin'] = (cleaned_df[dates[0]] - cleaned_df[dates[2]]).dt.days
cleaned_df['Month_of_stay'] = cleaned_df[dates[0]].dt.month
cleaned_df['Actual_cost'] = cleaned_df['Expected_stay'] * (cleaned_df['Room_Rate']*(100 - cleaned_df['Discount_Rate']))

# weekend_stay is 1 when the weekday span check-in..check-out (Monday=0) touches
# Saturday (5) or Sunday (6): either the span wraps past Sunday, or it ends on
# a weekend day. This is the vectorized form of the earlier per-row loop, whose
# wrap-around branch appended the wrong loop variable.
# NOTE(review): only weekdays are compared, so a stay of seven or more nights
# that does not wrap is still flagged 0. Kept as is so the feature matches the
# one computed in prepare_submission_df.
weekdayin = cleaned_df[dates[0]].dt.dayofweek
weekdayout = cleaned_df[dates[1]].dt.dayofweek
cleaned_df['weekend_stay'] = (
    (weekdayout < weekdayin) | (weekdayout >= 5)
).astype('int64').to_numpy()

# The raw dates are no longer needed. `columns=` replaces the positional axis
# argument, which pandas 2.0 no longer accepts.
cleaned_df = cleaned_df.drop(columns=dates)

In [None]:
# Summary statistics of the engineered frame, before the log transform below.
cleaned_df.describe()

In [None]:
# Swap the two monetary columns for their natural logs. `eps` keeps log() finite
# when a value is exactly zero. pop() removes the source column, so this cell
# can only be run once per `cleaned_df`.
eps = 0.001
for money_col in ('Actual_cost', 'Room_Rate'):
  cleaned_df['Log ' + money_col] = np.log(cleaned_df.pop(money_col) + eps)

In [None]:
# Summary statistics again, now including the log-transformed columns.
cleaned_df.describe()

In [None]:
# Column dtypes of the engineered frame.
cleaned_df.dtypes

In [None]:
# `train_df` is only created by the split in the next cell, so displaying it
# here raised NameError on a fresh kernel. Preview the prepared frame instead.
cleaned_df.head()

In [None]:
# Fixed seed so the train/validation/test partition, and every metric computed
# from it, is the same on each run.
SPLIT_SEED = 42

# 80/20 train+val/test, then 80/20 train/val on the remainder.
train_df, test_df = train_test_split(cleaned_df, test_size=0.2, random_state=SPLIT_SEED)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=SPLIT_SEED)

# Pop the target out of each split; what remains in the frames are the features.
train_labels = np.array(train_df.pop('Reservation_Status'))
# Boolean masks over the training rows, one per class id. They are taken here,
# while the labels are still integer ids.
bool_checkin_labels = train_labels == 0
bool_canceled_labels = train_labels == 1
bool_noshow_labels = train_labels == 2
val_labels = np.array(val_df.pop('Reservation_Status'))
test_labels = np.array(test_df.pop('Reservation_Status'))

train_features = np.array(train_df)
val_features = np.array(val_df)
test_features = np.array(test_df)

In [None]:
# Convert the integer class ids of each split to one-hot rows of width 3.
# Run once only: applying it to labels that are already one-hot is not a no-op.
train_labels, val_labels, test_labels = (
    tf.keras.utils.to_categorical(split_labels, 3)
    for split_labels in (train_labels, val_labels, test_labels)
)

In [None]:
# Standardise with statistics fitted on the training split only, then clip
# every split to [-5, 5] so extreme values cannot dominate.
scaler = StandardScaler()
train_features = np.clip(scaler.fit_transform(train_features), -5, 5)
val_features = np.clip(scaler.transform(val_features), -5, 5)
test_features = np.clip(scaler.transform(test_features), -5, 5)

# Shape report: labels first, then features.
print('Training labels shape:', train_labels.shape)
print('Validation labels shape:', val_labels.shape)
print('Test labels shape:', test_labels.shape)

print('Training features shape:', train_features.shape)
print('Validation features shape:', val_features.shape)
print('Test features shape:', test_features.shape)

In [None]:
# Scaled training rows of each class, with the feature names restored.
checkin_df = pd.DataFrame(train_features[bool_checkin_labels], columns=train_df.columns)
canceled_df = pd.DataFrame(train_features[bool_canceled_labels], columns=train_df.columns)
noshow_df = pd.DataFrame(train_features[bool_noshow_labels], columns=train_df.columns)

# One hexbin joint plot per class. x and y are passed by keyword because
# current seaborn releases reject them as positional arguments.
for class_df, class_title in [(checkin_df, "Check-In distribution"),
                              (canceled_df, "Canceled distribution"),
                              (noshow_df, "No-Show distribution")]:
  sns.jointplot(x=class_df['Booking_to_checkingin'], y=class_df['Log Actual_cost'],
                kind='hex', xlim=(-5,5), ylim=(-5,5))
  plt.suptitle(class_title)

In [None]:
# Metrics tracked during training and evaluation.
# NOTE(review): tp/fp/tn/fn, precision and recall are computed by Keras on the
# one-hot label matrix at its default threshold, not per class. Confirm that is
# the intended reading.
METRICS = [
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'),
      # The model is a 3-class softmax trained on one-hot labels, so accuracy
      # must compare argmax(prediction) with argmax(label). BinaryAccuracy
      # scored each of the three outputs separately and overstated accuracy.
      keras.metrics.CategoricalAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
]

def make_model(metrics=METRICS, output_bias=None):
  """Build and compile the 3-class classifier.

  Architecture: Dense(16, relu) -> Dropout(0.5) -> Dense(3, softmax). The
  input width is read from the global `train_features`.

  Args:
    metrics: Keras metrics passed to `compile`.
    output_bias: optional initial value for the output layer's bias. When
      None the bias starts at zero.

  Returns:
    A compiled `keras.Sequential` model (Adam, categorical cross-entropy).
  """
  # Name the zero initializer explicitly: passing None to Dense is not the same
  # as leaving the argument at its default ('zeros').
  if output_bias is None:
    bias_initializer = 'zeros'
  else:
    bias_initializer = tf.keras.initializers.Constant(output_bias)

  model = keras.Sequential([
      keras.layers.Dense(
          16, activation='relu',
          input_shape=(train_features.shape[-1],)),
      keras.layers.Dropout(0.5),
      keras.layers.Dense(3, activation='softmax',
                         bias_initializer=bias_initializer),
  ])

  model.compile(
      # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
      # optimizers no longer accept.
      optimizer=keras.optimizers.Adam(learning_rate=1e-3),
      loss=keras.losses.CategoricalCrossentropy(),
      metrics=metrics)

  return model

In [None]:
# Shared training hyper-parameters.
EPOCHS = 100
BATCH_SIZE = 2048

# Stop once validation AUC has not improved for 10 epochs and roll the model
# back to the weights of its best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
# Build a model with the default metrics and bias, and print its layer summary.
model = make_model()
model.summary()

In [None]:
# Sanity check: softmax outputs of the untrained model for the first 10 training rows.
model.predict(train_features[:10]).tolist()

In [None]:
# Save the freshly initialised weights so every experiment below starts from
# the same point. The `.weights.h5` suffix is what Keras 3 requires for
# save_weights; older tf.keras treats the `.h5` ending as HDF5, so the same
# path works on both.
initial_weights = os.path.join(tempfile.mkdtemp(), 'initial.weights.h5')
model.save_weights(initial_weights)

In [None]:
# First validation row after scaling and clipping.
val_features[0]

In [None]:
# Short run (20 epochs) from the saved initial weights, with the output bias
# forced to zero.
model = make_model()
model.load_weights(initial_weights)
model.layers[-1].bias.assign([0.0, 0.0, 0.0])

zero_bias_history = model.fit(
    train_features, train_labels,
    validation_data=(val_features, val_labels),
    epochs=20,
    batch_size=BATCH_SIZE,
    verbose=0,
)

In [None]:
# Short run (20 epochs) from the saved initial weights with a "careful" output
# bias. Setting the bias to log(class prior) makes the initial softmax output
# roughly equal to the class frequencies. Before this fix no bias was applied,
# so the run did not differ from a plain reload of the initial weights.
model = make_model()
model.load_weights(initial_weights)
class_priors = np.array([checkin, canceled, noshow], dtype='float64') / total
model.layers[-1].bias.assign(np.log(class_priors).astype('float32'))
careful_bias_history = model.fit(
    train_features,
    train_labels,
    batch_size=BATCH_SIZE,
    epochs=20,
    validation_data=(val_features, val_labels),
    verbose=0)

In [None]:
def plot_loss(history, label, n):
  """Plot training and validation loss of one run on the current axes.

  Args:
    history: Keras History object returned by `model.fit`.
    label: run name, appended to 'Train ' / 'Val ' in the legend.
    n: index into the global `colors` list; both curves of a run share it.
  """
  # Use a log scale on y-axis to show the wide range of values.
  plt.semilogy(history.epoch, history.history['loss'],
               color=colors[n], label='Train ' + label)
  plt.semilogy(history.epoch, history.history['val_loss'],
               color=colors[n], label='Val ' + label,
               linestyle="--")
  plt.xlabel('Epoch')
  plt.ylabel('Loss')
  # Without a legend the per-run labels set above are never shown.
  plt.legend()

In [None]:
# Overlay the loss curves of the two 20-epoch runs on the same axes.
plot_loss(zero_bias_history, "Zero Bias", 0)
plot_loss(careful_bias_history, "Careful Bias", 1)

In [None]:
# Baseline: train from the saved initial weights on the unmodified training
# split, with early stopping on validation AUC.
model = make_model()
model.load_weights(initial_weights)

baseline_history = model.fit(
    train_features, train_labels,
    validation_data=(val_features, val_labels),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stopping],
)

In [None]:
def plot_metrics(history):
  """Draw a 2x2 grid of train/validation curves for loss, AUC, precision and recall.

  Args:
    history: Keras History object whose `history` dict holds each metric and
      its `val_`-prefixed counterpart.
  """
  for panel, metric in enumerate(('loss', 'auc', 'precision', 'recall'), start=1):
    plt.subplot(2, 2, panel)
    plt.plot(history.epoch, history.history[metric],
             color=colors[0], label='Train')
    plt.plot(history.epoch, history.history['val_'+metric],
             color=colors[0], linestyle="--", label='Val')
    plt.xlabel('Epoch')
    plt.ylabel(metric.replace("_"," ").capitalize())

    # Axis ranges: loss starts at 0 and keeps its automatic upper limit; AUC is
    # zoomed to [0.8, 1]; the remaining metrics use the full [0, 1] range.
    if metric == 'loss':
      upper = plt.ylim()[1]
      plt.ylim([0, upper])
    else:
      plt.ylim([0.8,1] if metric == 'auc' else [0,1])

    plt.legend()

In [None]:
# Training curves of the baseline run.
plot_metrics(baseline_history)

In [None]:
# Softmax outputs of the baseline model on the training and test splits.
train_predictions_baseline = model.predict(train_features, batch_size=BATCH_SIZE)
test_predictions_baseline = model.predict(test_features, batch_size=BATCH_SIZE)

In [None]:
def plot_cm(labels, predictions, p=0.5):
  """Plot the confusion matrix of one-hot `labels` against predicted scores.

  Both arrays are reduced to class ids with argmax along axis 1, so rows of
  the matrix are the actual class and columns the predicted class.

  Args:
    labels: one-hot label array, one column per class.
    predictions: model outputs with one column per class.
    p: unused; kept so existing calls that pass a threshold keep working. A
      single probability threshold has no meaning for an argmax decision over
      several classes.
  """
  actual_ids = labels.argmax(axis=1)
  # The previous code compared `argmax(...) > p`, which turned the predicted
  # class id into a boolean and merged classes 1 and 2 into one column.
  predicted_ids = predictions.argmax(axis=1)
  # Fix the label set so the matrix keeps one row/column per class even when
  # some class is never predicted.
  class_ids = list(range(labels.shape[1]))
  cm = confusion_matrix(actual_ids, predicted_ids, labels=class_ids)
  plt.figure(figsize=(5,5))
  sns.heatmap(cm, annot=True, fmt="d")
  plt.title('Confusion matrix (argmax decision)')
  plt.ylabel('Actual label')
  plt.xlabel('Predicted label')

In [None]:
# Score the baseline model on the held-out test split and list every metric.
baseline_results = model.evaluate(
    test_features, test_labels, batch_size=BATCH_SIZE, verbose=0)
for metric_name, metric_value in zip(model.metrics_names, baseline_results):
  print(metric_name, ': ', metric_value)
print()

# Confusion matrix of the test predictions computed earlier.
plot_cm(test_labels, test_predictions_baseline)

In [None]:
# Inverse-frequency class weights, total / (n_classes * class_count), so the
# weighted loss keeps roughly the same overall magnitude. The divisor is the
# number of classes: the earlier 2.0 was the binary-case constant, which scaled
# all three weights up by 1.5x.
n_classes = 3
weight_for_0 = total / (n_classes * checkin)
weight_for_1 = total / (n_classes * canceled)
weight_for_2 = total / (n_classes * noshow)

class_weight = {0: weight_for_0, 1: weight_for_1, 2: weight_for_2}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))
print('Weight for class 2: {:.2f}'.format(weight_for_2))

In [None]:
# Same setup as the baseline, but each example's loss is scaled by the weight
# of its class.
weighted_model = make_model()
weighted_model.load_weights(initial_weights)

weighted_history = weighted_model.fit(
    train_features, train_labels,
    validation_data=(val_features, val_labels),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stopping],
    class_weight=class_weight,
)

In [None]:
# Training curves of the class-weighted run.
plot_metrics(weighted_history)

In [None]:
# Softmax outputs of the class-weighted model on the training and test splits.
train_predictions_weighted = weighted_model.predict(train_features, batch_size=BATCH_SIZE)
test_predictions_weighted = weighted_model.predict(test_features, batch_size=BATCH_SIZE)

In [None]:
# Score the class-weighted model on the held-out test split and list every metric.
weighted_results = weighted_model.evaluate(
    test_features, test_labels, batch_size=BATCH_SIZE, verbose=0)
for metric_name, metric_value in zip(weighted_model.metrics_names, weighted_results):
  print(metric_name, ': ', metric_value)
print()

plot_cm(test_labels, test_predictions_weighted)

In [None]:
# Partition the training rows by class, using the masks built when the data
# was split.
class_masks = (bool_checkin_labels, bool_canceled_labels, bool_noshow_labels)

checkin_features, canceled_features, noshow_features = (
    train_features[mask] for mask in class_masks
)
checkin_labels, canceled_labels, noshow_labels = (
    train_labels[mask] for mask in class_masks
)

In [None]:
# Oversample "Canceled": draw row positions with replacement until there are as
# many rows as in the Check-In class.
ids = np.arange(canceled_features.shape[0])
choices = np.random.choice(ids, checkin_features.shape[0])

res_canceled_features, res_canceled_labels = canceled_features[choices], canceled_labels[choices]

res_canceled_features.shape

In [None]:
# Oversample "No-Show" the same way: positions drawn with replacement, as many
# as there are Check-In rows.
ids = np.arange(noshow_features.shape[0])
choices = np.random.choice(ids, checkin_features.shape[0])

res_noshow_features, res_noshow_labels = noshow_features[choices], noshow_labels[choices]

res_noshow_features.shape

In [None]:
# Stack the three equally sized class blocks (No-Show, Canceled, Check-In) ...
feature_blocks = [res_noshow_features, res_canceled_features, checkin_features]
label_blocks = [res_noshow_labels, res_canceled_labels, checkin_labels]
resampled_features = np.concatenate(feature_blocks, axis=0)
resampled_labels = np.concatenate(label_blocks, axis=0)

# ... then apply one shared random permutation so features and labels stay aligned.
order = np.arange(len(resampled_labels))
np.random.shuffle(order)
resampled_features, resampled_labels = resampled_features[order], resampled_labels[order]

resampled_features.shape

In [None]:
# Size of the shuffle buffer used by every per-class dataset.
BUFFER_SIZE = 100000

def make_ds(features, labels):
  """Wrap (features, labels) in a shuffled tf.data.Dataset that repeats forever."""
  pairs = tf.data.Dataset.from_tensor_slices((features, labels))
  return pairs.shuffle(BUFFER_SIZE).repeat()

# One endless example stream per class.
checkin_ds, canceled_ds, noshow_ds = (
    make_ds(class_features, class_labels)
    for class_features, class_labels in (
        (checkin_features, checkin_labels),
        (canceled_features, canceled_labels),
        (noshow_features, noshow_labels),
    )
)

In [None]:
# Print one (features, label) example from the Check-In stream as a sanity check.
for features, label in checkin_ds.take(1):
  print("Features:\n", features.numpy())
  print()
  print("Label: ", label.numpy())

In [None]:
# Interleave the three per-class streams with equal probability. The weights
# are sampling probabilities, so they are written to sum to 1 (the earlier
# 0.3/0.3/0.3 did not).
class_streams = [checkin_ds, canceled_ds, noshow_ds]
equal_weights = [1.0 / len(class_streams)] * len(class_streams)
resampled_ds = tf.data.experimental.sample_from_datasets(class_streams, weights=equal_weights)
# Batch the mixed stream and keep two batches prefetched.
resampled_ds = resampled_ds.batch(BATCH_SIZE).prefetch(2)

In [None]:
# Print the mean of one batch of labels from the resampled stream.
# NOTE(review): the labels were one-hot encoded with 3 classes earlier, so this
# mean is 1/3 whatever the class mix is; a per-column mean (axis=0) would show
# the balance. Confirm what this check was meant to verify.
for features, label in resampled_ds.take(1):
  print(label.numpy().mean())

In [None]:
# Batches needed to see about one Check-In-sized block of every class per
# "epoch": 3 balanced classes, not the 2 of the binary case. It is cast to int
# because steps_per_epoch is a step count and np.ceil returns a float.
resampled_steps_per_epoch = int(np.ceil(3.0 * checkin / BATCH_SIZE))
resampled_steps_per_epoch

In [None]:
# Train on the balanced stream, starting from the saved initial weights.
resampled_model = make_model()
resampled_model.load_weights(initial_weights)

# Reset the bias to zero, since this dataset is balanced.
output_layer = resampled_model.layers[-1]
output_layer.bias.assign([0, 0, 0])

# The validation data keeps its natural class balance.
val_ds = tf.data.Dataset.from_tensor_slices((val_features, val_labels)).cache()
val_ds = val_ds.batch(BATCH_SIZE).prefetch(2)

# The training stream repeats forever, so steps_per_epoch defines the epoch
# length; it is passed as an int. A stray trailing comma after this call used
# to turn `resampled_history` into a one-element tuple instead of a History.
resampled_history = resampled_model.fit(
    resampled_ds,
    epochs=EPOCHS,
    steps_per_epoch=int(resampled_steps_per_epoch),
    callbacks=[early_stopping],
    validation_data=val_ds)

In [None]:
# Retrain on the balanced stream with short pseudo-epochs, so early stopping
# checks the validation metric more often.
resampled_model = make_model()
resampled_model.load_weights(initial_weights)

# The balanced stream has no class skew, so start the output bias at zero.
output_layer = resampled_model.layers[-1]
output_layer.bias.assign([0, 0, 0])

resampled_history = resampled_model.fit(
    resampled_ds,
    validation_data=val_ds,
    callbacks=[early_stopping],
    # 20 batches per "epoch" and ten times as many of them: these are
    # evaluation intervals, not passes over the data.
    epochs=10*EPOCHS,
    steps_per_epoch=20,
)

In [None]:
# Training curves of the resampled run.
plot_metrics(resampled_history)

In [None]:
# Softmax outputs of the resampled model on the original training and test splits.
train_predictions_resampled = resampled_model.predict(train_features, batch_size=BATCH_SIZE)
test_predictions_resampled = resampled_model.predict(test_features, batch_size=BATCH_SIZE)

In [None]:
# Score the resampled model on the held-out test split and list every metric.
resampled_results = resampled_model.evaluate(
    test_features, test_labels, batch_size=BATCH_SIZE, verbose=0)
for metric_name, metric_value in zip(resampled_model.metrics_names, resampled_results):
  print(metric_name, ': ', metric_value)
print()

plot_cm(test_labels, test_predictions_resampled, p=0.66)

In [None]:
# Number of test rows whose highest-scoring class is 2 (No-Show in the label mapping).
test_predictions_resampled.argmax(axis=1).tolist().count(2)

In [None]:
# Rows to predict for the submission; the first CSV column becomes the index.
submission_df = pd.read_csv('/content/Hotel-A-test.csv', index_col=0)

In [None]:
def prepare_submission_df(raw_df, one_hot_encoded_lst, dates, scaler, feature_columns=None):
  """Turn a raw bookings frame into the scaled, clipped matrix the models consume.

  Repeats the feature engineering applied to the training data: one-hot and
  binary encoding, date-derived features, log transform of the monetary
  columns, then `scaler.transform` and clipping to [-5, 5].

  Args:
    raw_df: frame with the raw booking columns; it is not modified.
    one_hot_encoded_lst: names of the columns to one-hot encode. This argument
      is now honoured; it used to be overwritten by a hard-coded list.
    dates: column names in the order (check-in, check-out, booking).
    scaler: fitted object exposing `transform`, e.g. the StandardScaler fitted
      on the training features.
    feature_columns: optional ordered column names of the training feature
      matrix. When given, the engineered frame is re-indexed to exactly those
      columns (missing ones filled with 0, extra ones dropped) so the column
      layout cannot drift from what `scaler` was fitted on.

  Returns:
    2-D numpy array of scaled features clipped to [-5, 5].
  """
  cleaned_df = pd.get_dummies(raw_df.copy(), columns=list(one_hot_encoded_lst))

  # Two-valued columns become 0/1 flags (values outside the mapping become NaN).
  cleaned_df['Gender'] = cleaned_df['Gender'].map({'F':0, 'M':1})
  yes_no_codes = {'No':0, 'Yes':1}
  for flag_col in ['Visted_Previously', 'Previous_Cancellations',
                   'Required_Car_Parking', 'Use_Promotion']:
    cleaned_df[flag_col] = cleaned_df[flag_col].map(yes_no_codes)

  # Parse the three date columns: check-in, check-out, booking.
  for date_col in dates[:3]:
    cleaned_df[date_col] = pd.to_datetime(cleaned_df[date_col])

  # Date-derived features.
  cleaned_df['Expected_stay'] = (cleaned_df[dates[1]] - cleaned_df[dates[0]]).dt.days
  cleaned_df['Booking_to_checkingin'] = (cleaned_df[dates[0]] - cleaned_df[dates[2]]).dt.days
  cleaned_df['Month_of_stay'] = cleaned_df[dates[0]].dt.month
  cleaned_df['Actual_cost'] = cleaned_df['Expected_stay'] * (cleaned_df['Room_Rate']*(100 - cleaned_df['Discount_Rate']))

  # weekend_stay is 1 when the weekday span check-in..check-out (Monday=0)
  # touches Saturday (5) or Sunday (6): either the span wraps past Sunday, or
  # it ends on a weekend day. This is the vectorized form of the earlier per-row
  # loop, whose wrap-around branch appended the wrong loop variable.
  # NOTE(review): only weekdays are compared, so a stay of seven or more nights
  # that does not wrap is still flagged 0. Kept as is so the feature matches
  # the one used for training.
  weekdayin = cleaned_df[dates[0]].dt.dayofweek
  weekdayout = cleaned_df[dates[1]].dt.dayofweek
  cleaned_df['weekend_stay'] = (
      (weekdayout < weekdayin) | (weekdayout >= 5)
  ).astype('int64').to_numpy()

  # `columns=` replaces the positional axis argument, which pandas 2.0 no
  # longer accepts.
  cleaned_df = cleaned_df.drop(columns=list(dates))

  # Log-transform the monetary columns; eps keeps log() finite at zero.
  eps = 0.001
  cleaned_df['Log Actual_cost'] = np.log(cleaned_df.pop('Actual_cost')+eps)
  cleaned_df['Log Room_Rate'] = np.log(cleaned_df.pop('Room_Rate')+eps)

  if feature_columns is not None:
    cleaned_df = cleaned_df.reindex(columns=list(feature_columns), fill_value=0)

  submission_features = scaler.transform(np.array(cleaned_df))
  submission_features = np.clip(submission_features, -5, 5)

  return submission_features

In [None]:
# Build the submission feature matrix with the scaler fitted on the training features.
submission_features = prepare_submission_df(submission_df, one_hot_encoded_lst, dates, scaler)

In [None]:
def print_counts(pred):
  """Print how many rows of `pred` have class 0, 1 and 2 as their argmax."""
  predicted_ids = pred.argmax(axis = 1).tolist()
  checkins, canceled, noshow = (predicted_ids.count(class_id) for class_id in (0, 1, 2))
  print(f"Check-In Count :  {checkins}\nCanceled Count :  {canceled}\nNo Show Count :  {noshow}\n")

In [None]:
# Predicted class distribution of the baseline model on the submission rows.
baseline_submission_pred = model.predict(submission_features, batch_size=BATCH_SIZE)
print_counts(baseline_submission_pred)

In [None]:
# Predicted class distribution of the class-weighted model on the submission rows.
weighted_submission_pred = weighted_model.predict(submission_features, batch_size=BATCH_SIZE)
print_counts(weighted_submission_pred)

In [None]:
# Predicted class distribution of the resampled model on the submission rows.
resampled_submission_pred = resampled_model.predict(submission_features, batch_size=BATCH_SIZE)
print_counts(resampled_submission_pred)

In [None]:
def create_submission_df(pred, df):
  """Build the submission frame from model outputs.

  Args:
    pred: 2-D array of class scores, one row per row of `df`.
    df: frame whose index identifies the rows being predicted; only the index
      is used and `df` itself is not modified.

  Returns:
    DataFrame indexed like `df` with a single 'Reservation_status' column
    holding the argmax class id shifted by one (so 0/1/2 become 1/2/3).
  """
  # Start from an empty frame on the same index. The earlier
  # `df.drop(col_drop, 1)` relied on a positional axis argument that
  # pandas 2.0 no longer accepts.
  submission = pd.DataFrame(index=df.index)
  submission['Reservation_status'] = pred.argmax(axis = 1) + 1
  return submission

In [None]:
# Write the resampled model's predictions to the submission CSV.
create_submission_df(resampled_submission_pred, submission_df).to_csv('submission2-day2.csv')