In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Reading our data

In [None]:
# Load the raw bookings CSV; the path is relative to the working directory.
bookings_data = pd.read_csv(filepath_or_buffer='./data/hotel_bookings.csv')

In [None]:
# Peek at the first five rows to get a feel for the columns and their values.
bookings_data.iloc[:5]

In [None]:
# List every column name available in the bookings table.
bookings_data.columns

**NB!** First, understand the data you have. Find any available metadata.

# Let's have another look at defining a problem...

We discussed that before starting our data storytelling journey, we have to understand what problem we are trying to solve.

However, sometimes problems are discovered only after we have **explored** our data - a process called **Exploratory Data Analysis (EDA)**.

# Exploratory Data Analysis

Our dataset consists of data from two different hotels located in Portugal: one Resort Hotel and one City Hotel.

The data contains "bookings **due to arrive** between the 1st of July of 2015 and the 31st of August 2017".

In [None]:
# Distinct values of the hotel column.
bookings_data['hotel'].unique()

In [None]:
# Distinct arrival years present in the data.
bookings_data['arrival_date_year'].unique()

In [None]:
# Which months of 2015 have arrivals?
bookings_data.loc[
    bookings_data['arrival_date_year'] == 2015,
    'arrival_date_month',
].unique()

In [None]:
# Which months of 2017 have arrivals?
bookings_data.loc[
    bookings_data['arrival_date_year'] == 2017,
    'arrival_date_month',
].unique()

## 1. Where do our guests come from?

In [None]:
# Number of bookings per country of origin, most frequent first.
# value_counts() already sorts in descending order and skips missing countries.
#
# The column is named explicitly via to_frame(name=...): the default name of
# the value_counts() result differs between pandas versions ('country' before
# 2.0, 'count' from 2.0 on), so renaming 'country' -> 'count' afterwards only
# has an effect on older versions. rename_axis(None) clears the index name so
# it cannot clash with the 'country' column added below.
customers_by_country = (
    bookings_data['country']
    .value_counts()
    .rename_axis(None)
    .to_frame(name='count')
)

# NOTE: despite its name, total_guests is the total number of bookings with a
# known country, not a head count.
total_guests = customers_by_country['count'].sum()
customers_by_country['guest_%'] = (customers_by_country['count'] / total_guests * 100).round(2)

# Keep the country code as a regular column as well, so it can serve as plot labels.
customers_by_country['country'] = customers_by_country.index

In [None]:
# TODO: repeat the country breakdown on non-cancelled bookings only, e.g. bookings_data[bookings_data['is_canceled'] == 0]

In [None]:
# Inspect the per-country booking counts and percentage shares.
customers_by_country

In [None]:
# Just the booking counts, indexed by country.
customers_by_country.loc[:, 'count']

In [None]:
# Naive pie chart: one wedge (and one label) per country in the table.
wedge_sizes = customers_by_country['count']
wedge_labels = customers_by_country['country']
plt.pie(x=wedge_sizes, labels=wedge_labels)
plt.show()

Always optimize your pie charts!

In [None]:
# The table is ordered by booking count, so the first five rows are the top five countries.
top_5_countries = customers_by_country.iloc[:5]

In [None]:
# Pie chart restricted to the five largest countries.
# Caution: autopct prints each wedge's share of the plotted total, i.e. of
# these five countries only, not of all bookings.
pie_options = {
    'x': top_5_countries['count'],
    'labels': top_5_countries['country'],
    'autopct': '%1.1f%%',
}
plt.pie(**pie_options)
plt.title('Customers by country')
plt.show()

Be very careful with pie charts! They might seem correct, but they might not be...

There is one more problem --> remember that these are the results for all the bookings --> let's remove the cancelled ones.

## 2. How much do guests pay for a room per night?

Be careful! Take into account the following:
- we have two different hotels
- there is seasonality in the data

In [None]:
# Preview the three guest-count columns side by side.
# Written as three separate bare expressions, only the last one (babies) was
# displayed by the notebook; selecting the columns together shows all of them.
bookings_data[['adults', 'children', 'babies']].head()

In [None]:
# Treat +/-inf as missing so that reductions such as mean()/std() skip
# infinite values (e.g. the result of a division by zero).
# NOTE(review): this option is deprecated in recent pandas releases (2.1+);
# consider converting infinities to NaN explicitly instead.
pd.set_option('use_inf_as_na', True)

# Split the non-canceled bookings by hotel.
# .copy() makes each subset an independent frame, so adding columns to it
# later neither touches bookings_data nor triggers SettingWithCopyWarning.
not_canceled = bookings_data['is_canceled'] == 0
resort_hotel_data = bookings_data.loc[(bookings_data['hotel'] == 'Resort Hotel') & not_canceled].copy()
city_hotel_data = bookings_data.loc[(bookings_data['hotel'] == 'City Hotel') & not_canceled].copy()

In [None]:
# adr = average daily rate

In [None]:
def _adr_per_person(frame):
    """Return the average daily rate divided by the number of guests.

    Guests are counted as adults + children (babies are not included).
    Rows whose guest count is zero or missing yield NaN rather than +/-inf,
    so mean()/std() skip them without relying on a global pandas option.
    """
    guests = frame['adults'] + frame['children']
    # where() keeps positive guest counts and turns everything else into NaN.
    return frame['adr'] / guests.where(guests > 0)


# assign() returns a new frame instead of writing a column into a filtered
# subset of bookings_data, which avoids SettingWithCopyWarning.
resort_hotel_data = resort_hotel_data.assign(adr_per_person=_adr_per_person(resort_hotel_data))
city_hotel_data = city_hotel_data.assign(adr_per_person=_adr_per_person(city_hotel_data))

In [None]:
# Mean rate per person: (Resort Hotel, City Hotel).
tuple(
    hotel_frame['adr_per_person'].mean()
    for hotel_frame in (resort_hotel_data, city_hotel_data)
)

In [None]:
# Standard deviation of the rate per person: (Resort Hotel, City Hotel).
tuple(
    hotel_frame['adr_per_person'].std()
    for hotel_frame in (resort_hotel_data, city_hotel_data)
)

## 3. How many cancellations do we have?

In [None]:
# Number of Resort Hotel bookings per is_canceled value.
is_resort = bookings_data['hotel'] == 'Resort Hotel'
rh_cancelations = bookings_data.loc[is_resort, 'is_canceled'].value_counts()
rh_cancelations

In [None]:
# Number of City Hotel bookings per is_canceled value.
is_city = bookings_data['hotel'] == 'City Hotel'
ch_cancelations = bookings_data.loc[is_city, 'is_canceled'].value_counts()

In [None]:
# Share of Resort Hotel bookings that were canceled: canceled / all bookings.
# The count is looked up by label (is_canceled == 1) instead of by position:
# value_counts() orders by frequency, so positional access would silently swap
# numerator and denominator if cancellations were the majority. Dividing by
# the total (not by the non-canceled count) gives a rate rather than odds.
# Assumes is_canceled is coded 0/1, with 0 meaning "not canceled" as used
# elsewhere in this notebook.
rh_cancelations.get(1, 0) / rh_cancelations.sum()

In [None]:
# Share of City Hotel bookings that were canceled: canceled / all bookings.
# The count is looked up by label (is_canceled == 1) instead of by position:
# value_counts() orders by frequency, so positional access would silently swap
# numerator and denominator if cancellations were the majority. Dividing by
# the total (not by the non-canceled count) gives a rate rather than odds.
# Assumes is_canceled is coded 0/1, with 0 meaning "not canceled" as used
# elsewhere in this notebook.
ch_cancelations.get(1, 0) / ch_cancelations.sum()

## 4. How do deposits affect cancellation?

In [None]:
# Remove extreme-price outliers (adr above the threshold) before analysing cancellations.
# A boolean mask plus rebinding replaces drop(..., inplace=True): the same rows
# are kept (rows with a missing adr included), but the previously loaded frame
# is not mutated in place. .copy() keeps the result an independent frame.
ADR_OUTLIER_THRESHOLD = 1000
bookings_data = bookings_data.loc[~(bookings_data['adr'] > ADR_OUTLIER_THRESHOLD)].copy()

In [None]:
# Summary statistics of the is_canceled flag for each deposit type.
deposit_groups = bookings_data.groupby("deposit_type")
deposit_cancel_data = deposit_groups["is_canceled"].describe()

In [None]:
# Show the per-deposit-type summary of is_canceled (count, mean, std, quartiles).
deposit_cancel_data

In [None]:
# Bar chart of the mean of is_canceled per deposit type, scaled to percent.
# With a 0/1 flag this mean is the fraction of canceled bookings.
plt.bar(deposit_cancel_data.index, deposit_cancel_data["mean"] * 100)
# A figure should be readable on its own: add a title and axis labels.
plt.title('Cancellation rate by deposit type')
plt.xlabel('Deposit type')
plt.ylabel('Canceled bookings (%)')
plt.show()

## 5. How does the average daily rate affect cancellation?

In [None]:
# Summary statistics of the is_canceled flag, one group per distinct adr value.
adr_groups = bookings_data.groupby("adr")
adr_cancel_data = adr_groups["is_canceled"].describe()

In [None]:
# Scatter plot: one point per distinct adr value, showing the mean of
# is_canceled for that value scaled to percent.
plt.figure(figsize=(10, 5))
plt.scatter(adr_cancel_data.index, adr_cancel_data["mean"] * 100)
# A figure should be readable on its own: add a title and axis labels.
plt.title('Cancellation rate by average daily rate')
plt.xlabel('Average daily rate (adr)')
plt.ylabel('Canceled bookings (%)')
plt.show()