# MSiA 420 - Predictive Analytics II - Final Project
## Group 6: Alejandra Lelo de Larrea Ibarra, Kiran Jyothi Sheena, Lixuan (Ellen) Chen, Wencheng Zhang

# Cleaning data

In [1]:
# Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns

In [102]:
# Load the raw hotel bookings and the country lookup table used later for region mapping
data = pd.read_csv("../02_Data/hotel_bookings.csv")
country = pd.read_csv("../02_Data/country_mapping.csv")

# Report the (rows, columns) shape of the bookings table
print(f"Dimensions: {data.shape}")

# Preview the first rows (last expression of the cell, so it is rendered as a table)
data.head()

Dimensions: (119390, 32)


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [108]:
# --- CLEAN DATA ---
# Work on a copy so the raw `data` frame stays untouched and this cell can be re-run
data_clean = data.copy()

# Create numerical column (arrival_month) representing the month.
# Series.map recodes every label through the dictionary; a label that is not a
# key becomes NaN, which the check below turns into an explicit error instead of
# letting a stray text value flow into the date construction.
month_dict = {'January':1,'February':2,'March':3,'April':4,'May':5,'June':6,'July':7,'August':8,'September':9,
              'October':10,'November':11,'December':12}
data_clean['arrival_month'] = data_clean['arrival_date_month'].map(month_dict)

unmapped_months = data_clean.loc[data_clean['arrival_month'].isna(), 'arrival_date_month'].unique()
if len(unmapped_months) > 0:
    raise ValueError(f"Unrecognized arrival_date_month values: {list(unmapped_months)}")
data_clean['arrival_month'] = data_clean['arrival_month'].astype('int64')

# Create arrival_date column
# Note: it might not be needed for regression, could be dropped later.
# The date is assembled from its numeric year/month/day components rather than
# by concatenating strings and parsing them back, so no date format has to be inferred.
data_clean['arrival_date'] = pd.to_datetime(
    data_clean[['arrival_date_year', 'arrival_month', 'arrival_date_day_of_month']].rename(
        columns={'arrival_date_year': 'year',
                 'arrival_month': 'month',
                 'arrival_date_day_of_month': 'day'}))


# lead_time: take logs (the +1 keeps a lead time of 0 finite). -> log_lead_time
data_clean['log_lead_time'] = np.log(data_clean['lead_time']+1)

# Total nights stayed = weekend nights + week nights
data_clean['total_nights'] = data_clean['stays_in_weekend_nights'] + data_clean['stays_in_week_nights']

# Create total_nights_bucket for total nights stayed.
# Bins are left-closed (right=False): [0,6), [6,11), [11,16), [16,21).
night_bins = [0,6,11,16,21]
night_labels = ['A. 0-5','B. 6-10','C. 11-15','D. 16-20']
overflow_label = 'E. > 20'
nights_bucket = pd.cut(data_clean['total_nights'],bins= night_bins,labels=night_labels,
                       right=False,include_lowest=True)

# Values that fall outside the bins come back from pd.cut as missing. Fill them
# with the overflow label *before* converting to text, so the result does not
# depend on how a missing value happens to be rendered as a string.
data_clean['total_nights_bucket'] = nights_bucket.astype(object).fillna(overflow_label).astype('str')

# ELLEN'S CLEANING HERE

# Code bookings with missing children values as 0 children
data_clean['children'] = data_clean['children'].fillna(value = 0)

# Add a column "domestic" derived from country: guests from Portugal (PRT) are domestic.
# Bookings with unknown origin are treated as not domestic (488 bookings)
data_clean['domestic'] = np.where(
    data_clean['country'] == 'PRT', 'domestic', 'international')

# Use a country (ISO 3166) -> continent mapping table to assign continent/regions for the guests
# Source: https://github.com/lukes/ISO-3166-Countries-with-Regional-Codes/blob/master/all/all.csv
# validate='many_to_one' makes the merge fail loudly if the lookup table ever
# contains a duplicated alpha-3 code, which would otherwise duplicate bookings.
data_clean = data_clean.merge(country[['alpha-3','region', 'sub-region']],
                              how = 'left', left_on = 'country', right_on = 'alpha-3',
                              validate = 'many_to_one').drop(columns = 'alpha-3')

# Rename region to continent, sub-region to sub_continent
data_clean = data_clean.rename(columns = {'region':'continent', 'sub-region':'sub_continent'})

# Some country codes are not mapped by the table from the online source; to list them run
#   data_clean[data_clean.continent.isna()].country.unique()
# Their continents are hard-coded: CN is Asia / Eastern Asia, TMP (East Timor) is
# Asia / South-eastern Asia, ATA is Antarctica / Antarctica.
# The masks are built from the merged frame's own 'country' column so that rows
# and labels are guaranteed to line up.
manual_regions = {
    'CN':  ('Asia', 'Eastern Asia'),         # China
    'TMP': ('Asia', 'South-eastern Asia'),   # East Timor
    'ATA': ('Antarctica', 'Antarctica'),     # Antarctica
}
for country_code, (continent_name, sub_continent_name) in manual_regions.items():
    has_code = data_clean['country'] == country_code
    data_clean.loc[has_code, 'continent'] = continent_name
    data_clean.loc[has_code, 'sub_continent'] = sub_continent_name

# Missing values: anything still unmapped is labelled 'unknown'
data_clean['continent'] = data_clean['continent'].fillna(value = 'unknown')
data_clean['sub_continent'] = data_clean['sub_continent'].fillna(value = 'unknown')

# Wencheng's cleaning

# Kiran's cleaning
# company: change to dummy -> booked_by_company = 1 (if yes), 0 (if no)
data_clean["booked_by_company"] = np.where(data_clean['company'].isna(), 0, 1)

# days_in_waiting_list: take logs. -> log_days_in_waiting_list.
# NOTE: we could go back and remove outliers.
data_clean["log_days_in_waiting_list"] = np.log(data_clean["days_in_waiting_list"]+1)

# adr: take logs. 5400 and -6.36 were identified as outliers, so those rows are removed.
# The filter is applied once and followed by .copy(), so the column assignments
# below write to an independent frame rather than to a slice of the previous
# one (which triggers pandas' SettingWithCopyWarning).
adr_outlier = 5400
keep_adr = (data_clean["adr"] >= 0) & (data_clean["adr"] != adr_outlier)  # drop negative prices and the big price
data_clean = data_clean.loc[keep_adr].copy()
data_clean["log_adr"] = np.log(data_clean["adr"]+1)

# required_car_parking_spaces: change to dummy -> required_car_parking = 1 (if yes) 0 (if no)
data_clean["required_car_parking"] = np.where(data_clean['required_car_parking_spaces']>0, 1, 0)

In [104]:
# Inspect the cleaned frame; the rendered table is truncated to the first and last rows/columns
data_clean

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,log_lead_time,total_nights,total_nights_bucket,domestic,continent,sub_continent,booked_by_company,log_days_in_waiting_list,log_adr,required_car_parking
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,5.837730,0,A. 0-5,1,Europe,Southern Europe,0,0.0,0.000000,0
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,6.603944,0,A. 0-5,1,Europe,Southern Europe,0,0.0,0.000000,0
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,2.079442,1,A. 0-5,0,Europe,Northern Europe,0,0.0,4.330733,0
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,2.639057,1,A. 0-5,0,Europe,Northern Europe,0,0.0,4.330733,0
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,2.708050,2,A. 0-5,0,Europe,Northern Europe,0,0.0,4.595120,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,2017,August,35,30,2,5,2,...,3.178054,7,B. 6-10,0,Europe,Western Europe,0,0.0,4.576153,0
119386,City Hotel,0,102,2017,August,35,31,2,5,3,...,4.634729,7,B. 6-10,0,Europe,Western Europe,0,0.0,5.422436,0
119387,City Hotel,0,34,2017,August,35,31,2,5,2,...,3.555348,7,B. 6-10,0,Europe,Western Europe,0,0.0,5.067079,0
119388,City Hotel,0,109,2017,August,35,31,2,5,2,...,4.700480,7,B. 6-10,0,Europe,Northern Europe,0,0.0,4.657763,0


In [105]:
# --- Remove old columns and useless columns  ---
# Variables to remove: raw columns that were replaced by engineered features above,
# plus columns not used further.
cols_rm = ["company",
           "arrival_date_month",'arrival_date_week_number','arrival_date_day_of_month',"lead_time",
           'stays_in_week_nights','stays_in_weekend_nights', 'total_nights_bucket', # total_nights_bucket can be replaced by total_nights
           'country',
           # TODO: Wencheng's columns to be removed go here
           "days_in_waiting_list", "adr", "reservation_status", "reservation_status_date", "required_car_parking_spaces"]

# Drop columns.
# Rebinding the result (instead of inplace=True) together with errors="ignore"
# makes this cell safe to re-run: columns already removed by a previous
# execution are skipped instead of raising a KeyError.
data_clean = data_clean.drop(columns = cols_rm, errors = "ignore")

In [109]:
# --- Fix data types ---
# Columns that should be stored with the pandas "category" dtype
categorical_vars = [
    "is_canceled",
    "arrival_month",  # add total_nights_bucket if not removed
    "meal",
    "market_segment",
    "distribution_channel",
    "continent",
    "sub_continent",
    'domestic',
    # TODO: Wencheng's categorical variables go here
    "customer_type",
    "booked_by_company",
    "required_car_parking",
]

# Convert each listed column to category type, one column at a time
for column_name in categorical_vars:
    data_clean[column_name] = data_clean[column_name].astype("category")

# Check the resulting datatypes
data_clean.dtypes

hotel                                     object
is_canceled                             category
lead_time                                  int64
arrival_date_year                          int64
arrival_date_month                        object
arrival_date_week_number                   int64
arrival_date_day_of_month                  int64
stays_in_weekend_nights                    int64
stays_in_week_nights                       int64
adults                                     int64
children                                 float64
babies                                     int64
meal                                    category
country                                   object
market_segment                          category
distribution_channel                    category
is_repeated_guest                          int64
previous_cancellations                     int64
previous_bookings_not_canceled             int64
reserved_room_type                        object
assigned_room_type  

In [21]:
# Write the cleaned bookings to disk as CSV, leaving the row index out of the file.
# NOTE(review): CSV stores no dtype information, so the category conversions
# presumably have to be re-applied by whoever reloads this file - confirm downstream.
clean_csv_path = "../02_Data/hotel_bookings_clean.csv"
data_clean.to_csv(clean_csv_path, index = False)