# Feature Engineering

In [None]:
# import library
import numpy as np 
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(style='darkgrid')

In [None]:
# Load the bookings data; the CSV's 'Unnamed: 0' column is used as the row index.
hotel = pd.read_csv(
    'hotel_bookings_no_missing.csv',
    index_col='Unnamed: 0',
)

## Feature Engineering

In [None]:
# Show the dtype of every column to tell categorical (object) features
# apart from numerical ones before encoding them.
hotel.dtypes

## Handling categorical features

In [None]:
# Encode the hotel type as a binary flag (Resort Hotel -> 0, City Hotel -> 1).
hotel_codes = {'Resort Hotel': 0, 'City Hotel': 1}
hotel['hotel'] = hotel['hotel'].replace(hotel_codes)

In [None]:
# One-hot encode the arrival year. No prefix is given, so the resulting
# column labels are the raw year values themselves.
year_dummy = pd.get_dummies(data=hotel['arrival_date_year'])

In [None]:
month = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
         'August', 'September', 'October', 'November', 'December']
# Replace month names by their calendar number (1 to 12).
month_numbers = dict(zip(month, range(1, 13)))
hotel['arrival_date_month'] = hotel['arrival_date_month'].replace(month_numbers)

# Group months into seasons.
spring = [3, 4, 5]
summer = [6, 7, 8]
fall = [9, 10, 11]
winter = [1, 2, 12]
# Only spring/summer/fall need an explicit lookup: every other value
# (including anything unexpected) is labelled "winter".
season_by_month = {}
for season_name, season_months in (("spring", spring), ("summer", summer), ("fall", fall)):
    for month_number in season_months:
        season_by_month[month_number] = season_name
hotel['seasons'] = hotel['arrival_date_month'].map(
    lambda i: season_by_month.get(i, "winter")
)

# Bar chart of bookings per season, split by cancellation status.
seasons = ['spring', 'summer', 'fall', 'winter']
plt.figure(figsize=(20, 10))
sns.countplot(data=hotel, x='seasons', hue='is_canceled', order=seasons)

In [None]:
# One-hot encode the season labels (one column per season, no prefix).
season_dummy = hotel['seasons'].pipe(pd.get_dummies)

In [None]:
# One-hot encode the meal plan; columns are named 'meal_<value>'.
meal_dummy = hotel['meal'].pipe(pd.get_dummies, prefix='meal')

In [None]:
# Finding the most predictive country with >80% majority label and more than 5000 entries

# Number of bookings per (country, is_canceled) pair
country_count = (
    hotel.groupby(['country', 'is_canceled'])['is_canceled']
    .size()
    .reset_index(name='count_cancelled')
)
# Total number of bookings per country
country_sum = hotel.groupby('country')['is_canceled'].count().reset_index(name='sum')
# Left join so every (country, label) row also carries its country total
country_joined = country_count.merge(country_sum, on='country', how='left')
# Share of each label within its country (vectorised column division)
country_joined['percentage'] = country_joined['count_cancelled'] / country_joined['sum']
# Keep only rows where one label holds more than 80% of a country that has
# more than 5000 bookings
is_predictive = (country_joined['percentage'] > 0.8) & (country_joined['sum'] > 5000)
country_joined = country_joined[is_predictive]

# most predictive countries. With >80% majority label and more than 5000 entries
country_joined

In [None]:
# FRA and DEU are the most predictive countries, so each gets its own 0/1
# indicator; every other country (and any missing value) is flagged as "other".
hotel['country_is_FRA'] = hotel['country'].eq('FRA').astype('int64')
hotel['country_is_DEU'] = hotel['country'].eq('DEU').astype('int64')
hotel['country_is_other'] = (~hotel['country'].isin(['DEU', 'FRA'])).astype('int64')

In [None]:
# One-hot encode market_segment; columns are named 'market_segment_<value>'.
market_segment_dummy = pd.get_dummies(
    data=hotel['market_segment'],
    prefix='market_segment',
)

In [None]:
# One-hot encode distribution_channel; columns are named
# 'distribution_channel_<value>'.
distribution_channel_dummy = pd.get_dummies(
    data=hotel['distribution_channel'],
    prefix='distribution_channel',
)

In [None]:
# One-hot encode deposit_type; columns are named 'deposit_type_<value>'.
deposit_type_dummy = pd.get_dummies(
    data=hotel['deposit_type'],
    prefix='deposit_type',
)

In [None]:
# Hypothesis: If reserved room type matches assigned room type, i.e. reserved_assigned_room_type_match = 1, it is more likely for the customer to NOT cancel the booking.
# Flag each booking with 1 when the reserved and assigned room types are equal
# and 0 otherwise. The comparison is cast straight to an integer, so the
# column is always produced, even when every row has the same outcome.
hotel['reserved_assigned_room_type_match'] = (
    hotel['reserved_room_type'] == hotel['assigned_room_type']
).astype('int64')

In [None]:
# One-hot encode customer_type; columns are named 'customer_type_<value>'.
cus_type_dummy = pd.get_dummies(
    data=hotel['customer_type'],
    prefix='customer_type',
)

In [None]:
# Append all one-hot encoded frames to the dataset as extra columns.
# NOTE: running this cell more than once appends the dummy columns again.
dummy_frames = [
    year_dummy,
    season_dummy,
    distribution_channel_dummy,
    market_segment_dummy,
    meal_dummy,
    deposit_type_dummy,
    cus_type_dummy,
]
hotel = pd.concat([hotel] + dummy_frames, axis=1, sort=True)

In [None]:
# feature - agent
# Find the agents whose bookings are dominated (>80%) by a single
# is_canceled label and that handled more than 3000 bookings.

# Number of bookings per (agent, is_canceled) pair
hotelAgentCount = (
    hotel.groupby(['agent', 'is_canceled'])['is_canceled']
    .size()
    .reset_index(name='count_cancelled')
)
# Total number of bookings per agent
hotelAgentSum = hotel.groupby('agent')['is_canceled'].count().reset_index(name='sum')
# Left join so every (agent, label) row also carries its agent total
hotelAgentJoined = hotelAgentCount.merge(hotelAgentSum, on='agent', how='left')

# Share of each label within its agent (vectorised column division)
hotelAgentJoined['percentage'] = hotelAgentJoined['count_cancelled'] / hotelAgentJoined['sum']

# all agents whose majority label is more than 80% and sum is more than 3000
hotelAgentJoined80 = hotelAgentJoined[hotelAgentJoined['percentage'] > 0.8]
hotelAgentJoined3000 = hotelAgentJoined80[hotelAgentJoined80['sum'] > 3000]
hotelAgentJoined3000

In [None]:
# Agents 7 and 14 are the most predictive agents, so each gets its own 0/1
# indicator; every other agent (and any missing value) is flagged as "other".
hotel['agent_14'] = hotel['agent'].eq(14.0).astype('int64')
hotel['agent_7'] = hotel['agent'].eq(7.0).astype('int64')
hotel['agent_is_other'] = (~hotel['agent'].isin([14.0, 7.0])).astype('int64')

In [None]:
# Mean of is_canceled for bookings made / not made through agent 14.
sns.barplot(data=hotel, x='agent_14', y='is_canceled')

In [None]:
# Mean of is_canceled for bookings made / not made through agent 7.
sns.barplot(data=hotel, x='agent_7', y='is_canceled')

In [None]:
# Mean of is_canceled for bookings from any agent other than 7 and 14.
sns.barplot(data=hotel, x='agent_is_other', y='is_canceled')

## Handling numerical features

In [None]:
from sklearn import preprocessing

**lead_time**

In [None]:
# Skewness of the raw lead_time distribution (before any transform).
hotel.loc[:, 'lead_time'].skew()

In [None]:
# Natural log of lead_time. Non-positive values are swapped for 1 first, so
# they come out as log(1) = 0; note that a lead_time of exactly 1 also maps to 0.
hotel['log_lead'] = np.log(hotel['lead_time'].where(hotel['lead_time'] > 0, 1))
# Skewness after the transform, for comparison with the raw column.
hotel['log_lead'].skew()

Apply a natural-log transform to `lead_time` to reduce its skew (lead times of 0 are left at 0).  
After outliers handling:

In [None]:
# Box plot of the log-transformed lead time.
# The vector is passed as `x=` because recent seaborn releases accept only
# `data` positionally.
plt.figure(figsize=(20, 10))
sns.boxplot(x=hotel['log_lead'])

**stays_in_weekend_nights**

In [None]:
# Cap stays_in_weekend_nights at 5: larger values are replaced by 5.
hotel['stays_in_weekend_nights'] = hotel['stays_in_weekend_nights'].clip(upper=5)

Data clipping: for each `stays_in_weekend_nights`, if it is larger than 5, change it to 5.  
After outliers handling:

In [None]:
# Box plot of stays_in_weekend_nights.
# The vector is passed as `x=` because recent seaborn releases accept only
# `data` positionally.
plt.figure(figsize=(20, 10))
sns.boxplot(x=hotel['stays_in_weekend_nights'])

In [None]:
# Number of bookings per stays_in_weekend_nights value.
# The vector is passed as `x=` because recent seaborn releases accept only
# `data` positionally.
plt.figure(figsize=(20, 10))
sns.countplot(x=hotel['stays_in_weekend_nights'])

**stays_in_week_nights**

In [None]:
# Cap stays_in_week_nights at 6: larger values are replaced by 6.
hotel['stays_in_week_nights'] = hotel['stays_in_week_nights'].clip(upper=6)

Data clipping: for each `stays_in_week_nights`, if it is larger than 6, change it to 6.  
After outliers handling:

In [None]:
# Box plot of stays_in_week_nights.
# The vector is passed as `x=` because recent seaborn releases accept only
# `data` positionally.
plt.figure(figsize=(20, 10))
sns.boxplot(x=hotel['stays_in_week_nights'])

In [None]:
# Number of bookings per stays_in_week_nights value.
# The vector is passed as `x=` because recent seaborn releases accept only
# `data` positionally.
plt.figure(figsize=(20, 10))
sns.countplot(x=hotel['stays_in_week_nights'])

**stays_in_total_nights**

In [None]:
# Total nights per booking: weekend nights plus week nights, using the
# columns as they currently stand in the frame.
# NOTE(review): the cells above cap these columns at 5 and 6, so the total
# is at most 11 - confirm that summing the capped values is intended.
hotel['stays_in_total_nights'] = hotel['stays_in_weekend_nights'].add(
    hotel['stays_in_week_nights']
)

In [None]:
# Number of bookings per stays_in_total_nights value.
# The vector is passed as `x=` because recent seaborn releases accept only
# `data` positionally.
plt.figure(figsize=(20, 10))
sns.countplot(x=hotel['stays_in_total_nights'])

**adr**

In [None]:
# Replace extreme adr values (more than 10 standard deviations above the
# mean) by the mean. Mean and std are computed before the replacement, so
# they still include the outliers themselves.
mean_adr = hotel['adr'].mean()
std_adr = hotel['adr'].std()
adr_outlier_cutoff = mean_adr + 10 * std_adr
hotel['adr'] = hotel['adr'].mask(hotel['adr'] > adr_outlier_cutoff, mean_adr)

Changing the outliers of `adr` to the data mean.

### Binning

In [None]:
# Put previous_cancellations into 4 categories: 0, 1-10, 11-20 and 21+.
# Bins are right-inclusive; the tiny first bin [0, 0.1] isolates the value 0.
# The last edge is np.inf so that every count above 20 lands in '21+'
# instead of turning into NaN once it exceeds a hard-coded maximum.
cut_labels_4 = ['0', '1-10', '11-20', '21+']
cut_bins = [0, 0.1, 10, 20, np.inf]
hotel['cut_previous_cancellations'] = pd.cut(
    hotel['previous_cancellations'],
    bins=cut_bins,
    labels=cut_labels_4,
    include_lowest=True,
)
# Cancelled / not-cancelled counts inside each bin
hotel.groupby('cut_previous_cancellations')['is_canceled'].value_counts()

In [None]:
# Mean of is_canceled within each previous_cancellations bin.
sns.barplot(data=hotel, x='cut_previous_cancellations', y='is_canceled')

In [None]:
# Put previous_bookings_not_canceled into 5 categories.
# Bins are right-inclusive (so the value 60 itself belongs to '41-60'); the
# tiny first bin [0, 0.1] isolates the value 0.
# The last edge is np.inf so that every count above 60 lands in '60+'
# instead of turning into NaN once it exceeds a hard-coded maximum.
cut_labels_5 = ['0', '1-20', '21-40', '41-60', '60+']
cut_bins_previous_bookings_not_canceled = [0, 0.1, 20, 40, 60, np.inf]
hotel['cut_previous_bookings_not_canceled'] = pd.cut(
    hotel['previous_bookings_not_canceled'],
    bins=cut_bins_previous_bookings_not_canceled,
    labels=cut_labels_5,
    include_lowest=True,
)
# Cancelled / not-cancelled counts inside each bin
hotel.groupby('cut_previous_bookings_not_canceled')['is_canceled'].value_counts()

In [None]:
# Mean of is_canceled within each previous_bookings_not_canceled bin.
sns.barplot(data=hotel, x='cut_previous_bookings_not_canceled', y='is_canceled')

In [None]:
# Put booking_changes into 6 categories: 0, 1-5, 6-10, 11-15, 16-20 and 21+.
# Bins are right-inclusive; the tiny first bin [0, 0.1] isolates the value 0.
# The last edge is np.inf so that every count above 20 lands in '21+'
# instead of turning into NaN once it exceeds a hard-coded maximum.
cut_labels_6 = ['0', '1-5', '6-10', '11-15', '16-20', '21+']
cut_bins_booking_changes = [0, 0.1, 5, 10, 15, 20, np.inf]
hotel['cut_booking_changes'] = pd.cut(
    hotel['booking_changes'],
    bins=cut_bins_booking_changes,
    labels=cut_labels_6,
    include_lowest=True,
)
# Cancelled / not-cancelled counts inside each bin
hotel.groupby('cut_booking_changes')['is_canceled'].value_counts()

In [None]:
# Mean of is_canceled within each booking_changes bin.
sns.barplot(data=hotel, x='cut_booking_changes', y='is_canceled')

### Normalization

In [None]:
# Min-max scale each numerical feature into its own '<column>_minmax' column.
# A separate scaler is fitted per column, so every feature is scaled using
# only its own minimum and maximum.
minmax_columns = [
    'log_lead',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'stays_in_total_nights',
    'adults',
    'children',
    'babies',
    'days_in_waiting_list',
    'adr',
    'required_car_parking_spaces',
    'total_of_special_requests',
]

for column in minmax_columns:
    # Double brackets keep the input two-dimensional, as the scaler expects.
    minmax_scaler = preprocessing.MinMaxScaler().fit(hotel[[column]])
    hotel[column + '_minmax'] = minmax_scaler.transform(hotel[[column]])


### Dropping out features

In [None]:
# Drop the raw columns that have been replaced by engineered features.
# NOTE(review): the original note here said 'arrival_date_day_of_month' is
# dropped because it looks random, but that column is not in either list
# below - confirm whether it should be added or was removed upstream.

# Categorical columns now represented by dummy / indicator columns.
cat_delete_columns = ['arrival_date_year',
                      'arrival_date_month',
                      'meal',
                      'seasons',
                      'distribution_channel',
                      'market_segment',
                      'agent',
                      'customer_type',
                      'country',
                      'deposit_type',
                      # NOTE(review): a generated dummy column is dropped too,
                      # presumably as a reference level - confirm.
                      'deposit_type_Refundable']

# Numerical columns now represented by log / min-max / binned versions.
num_delete_columns = ['lead_time',
                      'stays_in_weekend_nights',
                      'stays_in_week_nights',
                      'stays_in_total_nights',
                      'adults',
                      'children',
                      'babies',
                      'previous_cancellations',
                      'previous_bookings_not_canceled',
                      'booking_changes',
                      'days_in_waiting_list',
                      'adr',
                      'required_car_parking_spaces',
                      'total_of_special_requests'
                     ]

delete_columns = cat_delete_columns + num_delete_columns

# drop() returns a new frame, so `hotel` itself keeps all of its columns.
hoteldrop = hotel.drop(columns=delete_columns)

In [None]:
# Summarise the remaining columns (names, non-null counts, dtypes) to check
# what is left after dropping the raw features.
hoteldrop.info()

In [None]:
# Write the engineered dataset to a new CSV file in the working directory.
hoteldrop.to_csv(path_or_buf="clean_hotel_bookings.csv")

## Correlations (after feature engineering)

In [None]:
# Pairwise correlations among the engineered columns.
data = hoteldrop.copy()
# Only numeric and boolean columns can be correlated; selecting them
# explicitly keeps this working on pandas versions where corr() no longer
# skips non-numeric columns (e.g. the binned 'cut_*' categories) by itself.
data.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Correlation of every numeric / boolean column with is_canceled, sorted
# from most negative to most positive. Non-numeric columns are excluded
# explicitly so corr() does not fail on them.
data.select_dtypes(include=['number', 'bool']).corr()['is_canceled'].sort_values()