# DiCE Model in Hotel Advertising - Notebook

In [6]:
# Import Packages
import dice_ml
from dice_ml.utils import helpers # helper functions
from sklearn.model_selection import train_test_split

import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

In [7]:
# Read Data
df = pd.read_csv("hotel_bookings.csv")

In [8]:
# Modify the outcome variable
df['not_canceled'] = 1 - df['is_canceled']

# Drop columns that are not useful

useless_col = ['days_in_waiting_list', 'arrival_date_year', 'arrival_date_year', 'assigned_room_type', 'booking_changes',
               'reservation_status', 'country', 'days_in_waiting_list', "reservation_status_date", "agent", "company", 
               "babies", "market_segment", "distribution_channel", 'is_canceled']

df.drop(useless_col, axis = 1, inplace = True)

In [9]:
df

Unnamed: 0,hotel,lead_time,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,meal,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,deposit_type,customer_type,adr,required_car_parking_spaces,total_of_special_requests,not_canceled
0,Resort Hotel,342,July,27,1,0,0,2,0.0,BB,0,0,0,C,No Deposit,Transient,0.00,0,0,1
1,Resort Hotel,737,July,27,1,0,0,2,0.0,BB,0,0,0,C,No Deposit,Transient,0.00,0,0,1
2,Resort Hotel,7,July,27,1,0,1,1,0.0,BB,0,0,0,A,No Deposit,Transient,75.00,0,0,1
3,Resort Hotel,13,July,27,1,0,1,1,0.0,BB,0,0,0,A,No Deposit,Transient,75.00,0,0,1
4,Resort Hotel,14,July,27,1,0,2,2,0.0,BB,0,0,0,A,No Deposit,Transient,98.00,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,23,August,35,30,2,5,2,0.0,BB,0,0,0,A,No Deposit,Transient,96.14,0,0,1
119386,City Hotel,102,August,35,31,2,5,3,0.0,BB,0,0,0,E,No Deposit,Transient,225.43,0,2,1
119387,City Hotel,34,August,35,31,2,5,2,0.0,BB,0,0,0,D,No Deposit,Transient,157.71,0,4,1
119388,City Hotel,109,August,35,31,2,5,2,0.0,BB,0,0,0,A,No Deposit,Transient,104.40,0,0,1


In [10]:
# Replace missing values:
# agent: If no agency is given, booking was most likely made without one.
# company: If none given, it was most likely private.
# rest schould be self-explanatory.
nan_replacements = {"children:": 0.0}
df_cln = df.fillna(nan_replacements)

# "meal" contains values "Undefined", which is equal to SC.
df_cln["meal"].replace("Undefined", "SC", inplace=True)

# Some rows contain entreis with 0 adults, 0 children and 0 babies. 
# I'm dropping these entries with no guests.
zero_guests = list(df_cln.loc[df_cln["adults"]
                   + df_cln["children"]==0].index)
df_cln.drop(df_cln.index[zero_guests], inplace=True)

In [11]:
# How much data is left?
df = df_cln
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119210 entries, 0 to 119389
Data columns (total 20 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119210 non-null  object 
 1   lead_time                       119210 non-null  int64  
 2   arrival_date_month              119210 non-null  object 
 3   arrival_date_week_number        119210 non-null  int64  
 4   arrival_date_day_of_month       119210 non-null  int64  
 5   stays_in_weekend_nights         119210 non-null  int64  
 6   stays_in_week_nights            119210 non-null  int64  
 7   adults                          119210 non-null  int64  
 8   children                        119206 non-null  float64
 9   meal                            119210 non-null  object 
 10  is_repeated_guest               119210 non-null  int64  
 11  previous_cancellations          119210 non-null  int64  
 12  previous_booking

In [12]:
def analyze_dataset(df):
    df = df.drop(columns=['not_canceled'])
    numeric_vars = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_vars = df.select_dtypes(include=['object']).columns.tolist()

    numeric_info = {}
    for var in numeric_vars:
        numeric_info[var] = {
            "min": df[var].min(),
            "max": df[var].max()
        }

    categorical_info = {}
    for var in categorical_vars:
        categorical_info[var] = df[var].unique().tolist()

    return numeric_vars, categorical_vars, numeric_info, categorical_info


In [15]:
numeric_vars, categorical_vars, numeric_info, categorical_info = analyze_dataset(df)

numeric_vars

['lead_time',
 'arrival_date_week_number',
 'arrival_date_day_of_month',
 'stays_in_weekend_nights',
 'stays_in_week_nights',
 'adults',
 'children',
 'is_repeated_guest',
 'previous_cancellations',
 'previous_bookings_not_canceled',
 'adr',
 'required_car_parking_spaces',
 'total_of_special_requests']

In [14]:
numeric_vars

['lead_time',
 'arrival_date_week_number',
 'arrival_date_day_of_month',
 'stays_in_weekend_nights',
 'stays_in_week_nights',
 'adults',
 'children',
 'is_repeated_guest',
 'previous_cancellations',
 'previous_bookings_not_canceled',
 'adr',
 'required_car_parking_spaces',
 'total_of_special_requests']

In [11]:
categorical_vars

['hotel',
 'arrival_date_month',
 'meal',
 'reserved_room_type',
 'deposit_type',
 'customer_type']

In [12]:
numeric_info

{'lead_time': {'min': 0, 'max': 737},
 'arrival_date_week_number': {'min': 1, 'max': 53},
 'arrival_date_day_of_month': {'min': 1, 'max': 31},
 'stays_in_weekend_nights': {'min': 0, 'max': 19},
 'stays_in_week_nights': {'min': 0, 'max': 50},
 'adults': {'min': 0, 'max': 55},
 'children': {'min': 0.0, 'max': 10.0},
 'is_repeated_guest': {'min': 0, 'max': 1},
 'previous_cancellations': {'min': 0, 'max': 26},
 'previous_bookings_not_canceled': {'min': 0, 'max': 72},
 'adr': {'min': -6.38, 'max': 5400.0},
 'required_car_parking_spaces': {'min': 0, 'max': 8},
 'total_of_special_requests': {'min': 0, 'max': 5}}

In [13]:
categorical_info

{'hotel': ['Resort Hotel', 'City Hotel'],
 'arrival_date_month': ['July',
  'August',
  'September',
  'October',
  'November',
  'December',
  'January',
  'February',
  'March',
  'April',
  'May',
  'June'],
 'meal': ['BB', 'FB', 'HB', 'SC'],
 'reserved_room_type': ['C', 'A', 'D', 'E', 'G', 'F', 'H', 'L', 'B'],
 'deposit_type': ['No Deposit', 'Refundable', 'Non Refund'],
 'customer_type': ['Transient', 'Contract', 'Transient-Party', 'Group']}

In [16]:
target = df["not_canceled"]
train_dataset, test_dataset, y_train, y_test = train_test_split(df,
                                                                target,
                                                                test_size=0.2,
                                                                random_state=0,
                                                                stratify=target)
x_train = train_dataset.drop("not_canceled", axis=1)
x_test = test_dataset.drop("not_canceled", axis=1)

train_dataset


Unnamed: 0,hotel,lead_time,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,meal,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,deposit_type,customer_type,adr,required_car_parking_spaces,total_of_special_requests,not_canceled
29378,Resort Hotel,58,October,44,23,2,2,2,0.0,BB,0,0,0,A,No Deposit,Transient,46.00,0,0,1
83100,City Hotel,0,January,4,18,1,1,1,0.0,BB,0,0,0,A,No Deposit,Transient,64.50,0,0,1
55183,City Hotel,78,August,33,7,1,0,2,0.0,BB,0,0,0,A,No Deposit,Transient,117.90,0,0,0
13553,Resort Hotel,129,August,33,19,2,5,1,0.0,HB,0,0,0,A,No Deposit,Transient-Party,192.80,0,0,0
35166,Resort Hotel,351,April,14,8,2,5,2,0.0,HB,0,0,0,D,No Deposit,Transient-Party,84.00,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112946,City Hotel,144,May,22,28,2,1,2,0.0,BB,0,0,0,A,No Deposit,Transient,130.67,0,0,1
108736,City Hotel,57,March,13,29,0,3,2,0.0,BB,0,0,0,A,No Deposit,Transient,97.20,0,1,1
71445,City Hotel,315,July,27,5,2,4,2,0.0,BB,0,0,0,A,No Deposit,Transient,103.50,0,1,0
13683,Resort Hotel,181,August,34,26,2,2,2,2.0,BB,0,0,0,C,No Deposit,Transient,220.00,0,1,0


In [17]:
d = dice_ml.Data(dataframe=train_dataset, continuous_features=numeric_vars, outcome_name="not_canceled")

In [18]:
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_vars)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', transformations),
                      ('classifier', RandomForestClassifier())])
model = clf.fit(x_train, y_train)

In [19]:
# Using sklearn backend
m = dice_ml.Model(model=model, backend="sklearn")
# Using method=random for generating CFs
exp = dice_ml.Dice(d, m, method="random")

In [21]:
e1 = exp.generate_counterfactuals(x_test[0:1], total_CFs=5, desired_class="opposite")
e1.visualize_as_dataframe(show_only_changes=True)

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  3.88it/s]

Query instance (original outcome : 0)





Unnamed: 0,hotel,lead_time,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,meal,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,deposit_type,customer_type,adr,required_car_parking_spaces,total_of_special_requests,not_canceled
0,City Hotel,16,November,45,4,0,2,1,0.0,BB,0,1,0,A,Non Refund,Transient,95.0,0,0,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,hotel,lead_time,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,meal,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,deposit_type,customer_type,adr,required_car_parking_spaces,total_of_special_requests,not_canceled
0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,No Deposit,-,340.5,-,-,1
1,-,-,-,7,-,-,-,-,-,-,-,-,-,-,No Deposit,-,-,-,-,1
2,-,-,-,-,-,-,23,-,-,-,-,-,-,-,No Deposit,-,-,-,-,1
3,-,-,-,41,-,-,-,-,-,-,-,-,-,-,No Deposit,-,-,-,-,1
4,-,-,-,-,-,-,-,-,-,-,-,-,-,-,No Deposit,-,-,-,5,1
