# Capstone Part Two - Data Cleaning

## Checking that all data types are in order and ensuring there are no duplicates

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the combined hotel bookings CSV into the working DataFrame.
combinedhotel_df = pd.read_csv(filepath_or_buffer='combinedhotel.csv')

In [3]:
# Report the frame's dimensions; .shape is a (rows, columns) tuple.
row_count, column_count = combinedhotel_df.shape
print(f"the dataset has {row_count} rows and {column_count} columns")

the dataset has 118902 rows and 32 columns


In [4]:
# Preview the first five rows of the loaded frame.
combinedhotel_df.head(5)

Unnamed: 0,hotel,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,...,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate
0,resort hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,No Agent,Not Company,0,Transient,0.0,0,0,Check-Out,01/07/2015
1,resort hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,No Agent,Not Company,0,Transient,0.0,0,0,Check-Out,01/07/2015
2,resort hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,No Agent,Not Company,0,Transient,75.0,0,0,Check-Out,02/07/2015
3,resort hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304,Not Company,0,Transient,75.0,0,0,Check-Out,02/07/2015
4,resort hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240,Not Company,0,Transient,98.0,0,1,Check-Out,03/07/2015


Confirming that the dataset has no nulls

In [5]:
# Count missing values per column (isna is the canonical spelling of isnull).
combinedhotel_df.isna().sum()

hotel                          0
IsCanceled                     0
LeadTime                       0
ArrivalDateYear                0
ArrivalDateMonth               0
ArrivalDateWeekNumber          0
ArrivalDateDayOfMonth          0
StaysInWeekendNights           0
StaysInWeekNights              0
Adults                         0
Children                       0
Babies                         0
Meal                           0
Country                        0
MarketSegment                  0
DistributionChannel            0
IsRepeatedGuest                0
PreviousCancellations          0
PreviousBookingsNotCanceled    0
ReservedRoomType               0
AssignedRoomType               0
BookingChanges                 0
DepositType                    0
Agent                          0
Company                        0
DaysInWaitingList              0
CustomerType                   0
ADR                            0
RequiredCarParkingSpaces       0
TotalOfSpecialRequests         0
Reservatio

## Datatype checks

Identifying the boolean (0/1) columns

In [6]:
# Columns that are cast to bool in the following cell.
binary_columns = [
    'IsCanceled',
    'IsRepeatedGuest',
]

Casting the boolean columns to the `bool` dtype

In [7]:
# Convert the flag columns to bool and write them back over the originals.
binary_as_bool = combinedhotel_df[binary_columns].astype(bool)
combinedhotel_df[binary_columns] = binary_as_bool

In [8]:
# Confirm the flag columns now report the bool dtype.
combinedhotel_df.loc[:, binary_columns].dtypes

IsCanceled         bool
IsRepeatedGuest    bool
dtype: object

In [9]:
# Columns whose dtypes are inspected as numeric in the following cell.
numerical_columns = [
    'LeadTime',
    'StaysInWeekendNights',
    'StaysInWeekNights',
    'Adults',
    'Children',
    'Babies',
    'PreviousCancellations',
    'PreviousBookingsNotCanceled',
    'BookingChanges',
    'DaysInWaitingList',
    'ADR',
    'RequiredCarParkingSpaces',
    'TotalOfSpecialRequests',
]

In [10]:
# Show the dtype of each column listed in numerical_columns.
combinedhotel_df.loc[:, numerical_columns].dtypes

LeadTime                         int64
StaysInWeekendNights             int64
StaysInWeekNights                int64
Adults                           int64
Children                         int64
Babies                           int64
PreviousCancellations            int64
PreviousBookingsNotCanceled      int64
BookingChanges                   int64
DaysInWaitingList                int64
ADR                            float64
RequiredCarParkingSpaces         int64
TotalOfSpecialRequests           int64
dtype: object

Casting datetime columns as datetime

In [11]:
# Display the frame's column labels.
combinedhotel_df.columns

Index(['hotel', 'IsCanceled', 'LeadTime', 'ArrivalDateYear',
       'ArrivalDateMonth', 'ArrivalDateWeekNumber', 'ArrivalDateDayOfMonth',
       'StaysInWeekendNights', 'StaysInWeekNights', 'Adults', 'Children',
       'Babies', 'Meal', 'Country', 'MarketSegment', 'DistributionChannel',
       'IsRepeatedGuest', 'PreviousCancellations',
       'PreviousBookingsNotCanceled', 'ReservedRoomType', 'AssignedRoomType',
       'BookingChanges', 'DepositType', 'Agent', 'Company',
       'DaysInWaitingList', 'CustomerType', 'ADR', 'RequiredCarParkingSpaces',
       'TotalOfSpecialRequests', 'ReservationStatus', 'ReservationStatusDate'],
      dtype='object')

In [12]:
# Columns whose dtypes are inspected as categorical in the following cell.
categorical_columns = [
    'Meal',
    'Country',
    'MarketSegment',
    'DistributionChannel',
    'ReservedRoomType',
    'AssignedRoomType',
    'DepositType',
    'Agent',
    'Company',
    'CustomerType',
    'ReservationStatus',
]

In [13]:
# Show the dtype of each column listed in categorical_columns.
combinedhotel_df.loc[:, categorical_columns].dtypes

Meal                   object
Country                object
MarketSegment          object
DistributionChannel    object
ReservedRoomType       object
AssignedRoomType       object
DepositType            object
Agent                  object
Company                object
CustomerType           object
ReservationStatus      object
dtype: object

In [14]:
# Arrival-date component columns (year, month, week number, day of month).
date_not_datetime_columns = [
    'ArrivalDateYear',
    'ArrivalDateMonth',
    'ArrivalDateWeekNumber',
    'ArrivalDateDayOfMonth',
]

In [15]:
# Show the dtype of each arrival-date component column.
combinedhotel_df.loc[:, date_not_datetime_columns].dtypes

ArrivalDateYear           int64
ArrivalDateMonth         object
ArrivalDateWeekNumber     int64
ArrivalDateDayOfMonth     int64
dtype: object

In [16]:
# Display the dtype of every column in the frame.
combinedhotel_df.dtypes

hotel                           object
IsCanceled                        bool
LeadTime                         int64
ArrivalDateYear                  int64
ArrivalDateMonth                object
ArrivalDateWeekNumber            int64
ArrivalDateDayOfMonth            int64
StaysInWeekendNights             int64
StaysInWeekNights                int64
Adults                           int64
Children                         int64
Babies                           int64
Meal                            object
Country                         object
MarketSegment                   object
DistributionChannel             object
IsRepeatedGuest                   bool
PreviousCancellations            int64
PreviousBookingsNotCanceled      int64
ReservedRoomType                object
AssignedRoomType                object
BookingChanges                   int64
DepositType                     object
Agent                           object
Company                         object
DaysInWaitingList        

New 'ArrivalDate' column to capture full date

In [17]:
# Assemble a 'day/MonthName/year' string from the three component columns,
# then parse it into a single datetime column.
arrival_day_text = combinedhotel_df['ArrivalDateDayOfMonth'].astype(str)
arrival_month_text = combinedhotel_df['ArrivalDateMonth'].astype(str)
arrival_year_text = combinedhotel_df['ArrivalDateYear'].astype(str)
arrival_date_text = arrival_day_text + '/' + arrival_month_text + '/' + arrival_year_text
# %B matches the full month name held in ArrivalDateMonth.
combinedhotel_df['ArrivalDate'] = pd.to_datetime(arrival_date_text, format='%d/%B/%Y')

In [18]:
# Names of the two date columns handled in this section.
datetime_columns = [
    'ReservationStatusDate',
    'ArrivalDate',
]

In [19]:
# Parse the day/month/year strings in ReservationStatusDate into datetimes.
status_date_text = combinedhotel_df['ReservationStatusDate']
combinedhotel_df['ReservationStatusDate'] = pd.to_datetime(
    status_date_text,
    format='%d/%m/%Y',
)

Inspecting data types 

In [20]:
# Display every column's dtype after the datetime conversions above.
combinedhotel_df.dtypes

hotel                                  object
IsCanceled                               bool
LeadTime                                int64
ArrivalDateYear                         int64
ArrivalDateMonth                       object
ArrivalDateWeekNumber                   int64
ArrivalDateDayOfMonth                   int64
StaysInWeekendNights                    int64
StaysInWeekNights                       int64
Adults                                  int64
Children                                int64
Babies                                  int64
Meal                                   object
Country                                object
MarketSegment                          object
DistributionChannel                    object
IsRepeatedGuest                          bool
PreviousCancellations                   int64
PreviousBookingsNotCanceled             int64
ReservedRoomType                       object
AssignedRoomType                       object
BookingChanges                    

In [21]:
# Reorder the columns so the target feature 'IsCanceled' comes last;
# every other column keeps its relative position.
feature_columns = [name for name in combinedhotel_df.columns if name != 'IsCanceled']
combinedhotel_df = combinedhotel_df[feature_columns + ['IsCanceled']]

In [22]:
# Inspect five randomly chosen rows of the frame.
# random_state pins the draw so the same rows are shown on every
# Restart & Run All; without it the displayed sample changes per run.
combinedhotel_df.sample(5, random_state=42)

Unnamed: 0,hotel,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate,ArrivalDate,IsCanceled
6393,resort hotel,58,2016,June,24,7,2,8,2,0,...,Not Company,0,Transient,119.0,0,0,Canceled,2016-04-10,2016-06-07,True
42796,city hotel,56,2015,September,38,17,0,2,2,0,...,Not Company,0,Transient-Party,82.0,0,0,Check-Out,2015-09-19,2015-09-17,False
94542,city hotel,180,2016,August,33,10,0,4,3,0,...,Not Company,0,Transient,130.05,0,1,Check-Out,2016-08-14,2016-08-10,False
66420,city hotel,156,2017,April,17,26,0,3,2,0,...,Not Company,0,Transient,100.0,0,0,Canceled,2016-11-21,2017-04-26,True
46248,city hotel,9,2016,January,4,17,2,0,2,0,...,Not Company,0,Transient,77.25,0,0,Canceled,2016-01-14,2016-01-17,True


## Checking for Duplication

In [23]:
# Count the rows that exactly repeat an earlier row, then report the total.
duplicate_count = combinedhotel_df.duplicated().sum()
print(f' there are {duplicate_count} duplicated rows')

 there are 31958 duplicated rows


In [24]:
# Target-class breakdown of the rows flagged as duplicates.
combinedhotel_df.loc[combinedhotel_df.duplicated(), 'IsCanceled'].value_counts()

IsCanceled
True     20166
False    11792
Name: count, dtype: int64

In [25]:
# Target-class breakdown of the whole frame, for comparison with the duplicates.
combinedhotel_df['IsCanceled'].value_counts()

IsCanceled
False    74745
True     44157
Name: count, dtype: int64

In [26]:
# Display the column labels of the reordered frame.
combinedhotel_df.columns

Index(['hotel', 'LeadTime', 'ArrivalDateYear', 'ArrivalDateMonth',
       'ArrivalDateWeekNumber', 'ArrivalDateDayOfMonth',
       'StaysInWeekendNights', 'StaysInWeekNights', 'Adults', 'Children',
       'Babies', 'Meal', 'Country', 'MarketSegment', 'DistributionChannel',
       'IsRepeatedGuest', 'PreviousCancellations',
       'PreviousBookingsNotCanceled', 'ReservedRoomType', 'AssignedRoomType',
       'BookingChanges', 'DepositType', 'Agent', 'Company',
       'DaysInWaitingList', 'CustomerType', 'ADR', 'RequiredCarParkingSpaces',
       'TotalOfSpecialRequests', 'ReservationStatus', 'ReservationStatusDate',
       'ArrivalDate', 'IsCanceled'],
      dtype='object')

In [28]:
# Inspect five of the rows flagged as duplicates.
# random_state pins the draw so the same rows are shown on every
# Restart & Run All; without it the displayed sample changes per run.
combinedhotel_df[combinedhotel_df.duplicated()].sample(5, random_state=42)

Unnamed: 0,hotel,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate,ArrivalDate,IsCanceled
6266,resort hotel,102,2016,May,23,31,2,5,2,0,...,Not Company,0,Transient,80.0,0,0,Canceled,2016-04-14,2016-05-31,True
32418,resort hotel,36,2017,February,5,4,2,2,2,0,...,Not Company,0,Transient-Party,39.6,0,3,Check-Out,2017-02-08,2017-02-04,False
88911,city hotel,198,2016,May,21,20,0,1,2,0,...,Not Company,98,Transient-Party,65.0,0,0,Check-Out,2016-05-21,2016-05-20,False
55459,city hotel,412,2016,August,35,25,0,2,2,0,...,Not Company,0,Transient,62.0,0,0,Canceled,2015-10-21,2016-08-25,True
112500,city hotel,11,2017,May,22,29,1,2,1,0,...,Not Company,0,Transient,125.0,0,0,Check-Out,2017-06-01,2017-05-29,False


Because the dataset does not have a booking ID, we are unable to check whether these are really duplicates or just multiple similar booking patterns from different customers due to customer mistakes.
Hence, the final conclusion is to drop the duplicates to avoid affecting the quality of the analysis.

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# ignore_index=True renumbers the result 0..n-1 so the in-memory frame has no
# gaps in its index and matches the CSV written later with index=False.
combhotel_clean_df = combinedhotel_df.drop_duplicates(ignore_index=True)

Final checks on the clean dataframe before performing exploratory data analysis

In [36]:
# Rows and columns overview of the cleaned frame; .shape is a (rows, columns) tuple.
combhotel_clean_df.shape

(86944, 33)

In [35]:
# Count nulls per column and print only the columns that have any;
# an empty Series means the cleaned frame has no nulls.
nullcheck = combhotel_clean_df.isna().sum()
print(nullcheck.loc[nullcheck.gt(0)])

Series([], dtype: int64)


In [38]:
# Data type check: display the dtype of every column in the cleaned frame.
combhotel_clean_df.dtypes

hotel                                  object
LeadTime                                int64
ArrivalDateYear                         int64
ArrivalDateMonth                       object
ArrivalDateWeekNumber                   int64
ArrivalDateDayOfMonth                   int64
StaysInWeekendNights                    int64
StaysInWeekNights                       int64
Adults                                  int64
Children                                int64
Babies                                  int64
Meal                                   object
Country                                object
MarketSegment                          object
DistributionChannel                    object
IsRepeatedGuest                          bool
PreviousCancellations                   int64
PreviousBookingsNotCanceled             int64
ReservedRoomType                       object
AssignedRoomType                       object
BookingChanges                          int64
DepositType                       

In [42]:
# Final duplicate check: count rows that repeat an earlier row (expected 0).
combhotel_clean_df.duplicated(keep='first').sum()

np.int64(0)

In [44]:
# Write the cleaned frame to CSV, omitting the index column.
combhotel_clean_df.to_csv(path_or_buf='combhotel_clean_df.csv', index=False)