In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#from scipy import stats
#from imblearn import under_sampling, over_sampling
#from sklearn.model_selection import train_test_split


from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
dfhotel = pd.read_csv('data.csv')

dfhotel.sample(5)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_weekdays_nights,adults,...,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status
83174,City Hotel,0,0,2018,March,4,22,0,1,2,...,0,No Deposit,9.0,,0,Personal,91.0,0,0,Check-Out
72167,City Hotel,1,139,2019,September,29,22,2,5,2,...,0,No Deposit,9.0,,0,Personal,166.5,0,0,Canceled
11968,Resort Hotel,1,60,2019,August,23,5,3,6,1,...,0,No Deposit,240.0,,0,Personal,147.0,0,2,Canceled
43468,City Hotel,0,74,2017,November,38,18,0,2,2,...,0,No Deposit,6.0,,0,Family,109.0,0,2,Check-Out
54189,City Hotel,1,173,2018,September,28,8,0,2,2,...,0,No Deposit,9.0,,0,Personal,85.5,0,0,Canceled


In [4]:
print(dfhotel.shape)

(119390, 29)


Datarawnya memiliki 119390 data dan 29 kolom  <br>


In [7]:
dfhotel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 29 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_weekdays_nights        119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

## **1. Data Cleansing**
***

#### A. Missing Handle Value


In [8]:
dfhotel.isnull().sum().sort_values(ascending = False)

company                           112593
agent                              16340
city                                 488
children                               4
hotel                                  0
is_repeated_guest                      0
total_of_special_requests              0
required_car_parking_spaces            0
adr                                    0
customer_type                          0
days_in_waiting_list                   0
deposit_type                           0
booking_changes                        0
previous_bookings_not_canceled         0
previous_cancellations                 0
market_segment                         0
distribution_channel                   0
is_canceled                            0
meal                                   0
babies                                 0
adults                                 0
stays_in_weekdays_nights               0
stays_in_weekend_nights                0
arrival_date_day_of_month              0
arrival_date_wee

In [9]:
#Percentage of missing values by column

round((dfhotel.isnull().sum().sort_values(ascending = False) * 100) / len(dfhotel), 2)

company                           94.31
agent                             13.69
city                               0.41
children                           0.00
hotel                              0.00
is_repeated_guest                  0.00
total_of_special_requests          0.00
required_car_parking_spaces        0.00
adr                                0.00
customer_type                      0.00
days_in_waiting_list               0.00
deposit_type                       0.00
booking_changes                    0.00
previous_bookings_not_canceled     0.00
previous_cancellations             0.00
market_segment                     0.00
distribution_channel               0.00
is_canceled                        0.00
meal                               0.00
babies                             0.00
adults                             0.00
stays_in_weekdays_nights           0.00
stays_in_weekend_nights            0.00
arrival_date_day_of_month          0.00
arrival_date_week_number           0.00


1. Kolom 'Company' dan 'agent' memiliki jumlah nilai yang hilang yang sangat tinggi yaitu 94,31% dan 13,69%.kolom tersebut ingin saya hapus, tetapi saya tidak menghapusnya melainkan akan mengganti nilai null pada kolom-kolom tersebut dengan nol (karena mereka memiliki tipe data float64). Karena,setelah saya membaca  artikel [Hotel booking demand](https://www.sciencedirect.com/science/article/pii/S2352340918315191) menyatakan sebagai berikut:

>Pada beberapa variabel kategorikal seperti Agen atau Perusahaan, "NULL" ditampilkan sebagai salah satu kategori. Hal ini tidak boleh dianggap sebagai nilai yang hilang, melainkan sebagai "tidak berlaku". Sebagai contoh, jika pemesanan "Agen" didefinisikan sebagai "NULL", itu berarti bahwa pemesanan tersebut tidak berasal dari agen perjalanan.


2. Di sisi lain, kolom 'city' memiliki 0,41% nilai yang hilang (488 baris). Dalam kasus ini, sebagai variabel kategorikal, saya memilih mode untuk mengganti nilai 0(nol).

3. Kolom 'children' juga memiliki beberapa nilai yang hilang, tetapi mereka bahkan tidak mewakili 0,01% . Oleh karena itu, saya memilih untuk menghapus baris yang terpengaruh sejumlah 4 baris. 

In [5]:
#Mengganti nilai NULL di kolom company dan agent dengan 0
values = {'company': 0, 'agent': 0}
dfhotel.fillna(value = values, inplace = True)


#Mengganti nilai NULL di kolom negara dengan nilai 0 (nol)
dfhotel['city'].fillna(value = dfhotel['city'].mode()[0], inplace = True)


#Menghapus baris oleh nilai NULL di kolom childer

dfhotel.dropna(subset = ['children'], inplace = True)

In [6]:
#Check setelah perubahan

dfhotel.isnull().sum()

hotel                             0
is_canceled                       0
lead_time                         0
arrival_date_year                 0
arrival_date_month                0
arrival_date_week_number          0
arrival_date_day_of_month         0
stays_in_weekend_nights           0
stays_in_weekdays_nights          0
adults                            0
children                          0
babies                            0
meal                              0
city                              0
market_segment                    0
distribution_channel              0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
booking_changes                   0
deposit_type                      0
agent                             0
company                           0
days_in_waiting_list              0
customer_type                     0
adr                               0
required_car_parking_spaces       0
total_of_special_requests   

*sudah tidak ada lagi missing value di datasheet*

#### Data Tidak Konsisten

In [16]:
# mengelompokkan kolom kategorikal

cat_columns = ['hotel', 'is_canceled', 'meal', 'city', 'market_segment', 'distribution_channel', 'is_repeated_guest','deposit_type', 'customer_type', 'reservation_status']

In [17]:
# Unique values in each categorical column

print(f"Data unik di kolom kategorikal\n".upper())

for cat_column in cat_columns:
    unique_values = dfhotel[cat_column].unique()
    print(f"\n{cat_column}: \n{unique_values}\n")
    print('-' * 70)

DATA UNIK DI KOLOM KATEGORIKAL


hotel: 
['Resort Hotel' 'City Hotel']

----------------------------------------------------------------------

is_canceled: 
[0 1]

----------------------------------------------------------------------

meal: 
['Breakfast' 'Full Board' 'Dinner' 'No Meal' 'Undefined']

----------------------------------------------------------------------

city: 
['Kota Denpasar' 'Kabupaten Bangka' 'Kabupaten Sleman' 'Kota Batu'
 'Kota Malang' 'Kota Yogyakarta' 'Kabupaten Tangerang' 'Kota Semarang'
 'Kota Serang' 'Kota Tangerang' 'Kota Bogor' 'Kabupaten Bandung'
 'Kabupaten Magelang' 'Kabupaten Sumedang' 'Kota Jakarta Timur'
 'Kabupaten Purwakarta' 'Kabupaten Kepulauan Seribu' 'Kabupaten Belitung'
 'Kota Jakarta Pusat' 'Kota Jakarta Selatan' 'Kota Surabaya'
 'Kota Jakarta Utara' 'Kota Gorontalo' 'Kota Jambi' 'Kota Jakarta Barat'
 'Kota Bengkulu' 'Kabupaten Bandung Barat' 'Kabupaten Bekasi'
 'Kabupaten Bogor' 'Kabupaten Ciamis' 'Kabupaten Cianjur'
 'Kabupaten Cirebon' 'K

Berdasarkan hal ini, di kolom 'meal' ada 5 hasil: ['BB' 'FB' 'HB' 'SC' 'undefined'].

Namun, kategori 'undefined' sebenarnya sesuai dengan 'SC'( Self Catering atau dalam bahasan indonesia katering mandiri, maka tidak ada makanan yang disertakan). oleh sebab itu saya menggantinya menjadi 'SC'.

In [19]:
#Mengganti 'undefined' menjadi 'SC'

dfhotel['meal'].replace(to_replace = 'Undefined', value = 'SC', inplace = True)

In [20]:
#check kembali

dfhotel['meal'].unique()

array(['Breakfast', 'Full Board', 'Dinner', 'No Meal', 'SC'], dtype=object)

> Tidak ada lagi data yang tidak konsisten dalam dataset

#### Data tidak sesuai

In [24]:
dfhotel.describe()

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_weekdays_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
count,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0
mean,0.370395,104.014801,2018.156593,27.165003,15.798553,0.927605,2.50031,1.85639,0.10389,0.007949,0.031913,0.087121,0.137102,0.221131,74.830633,10.775518,2.321227,101.833541,0.06252,0.57134
std,0.482913,106.863286,0.707456,13.605334,8.780783,0.998618,1.908289,0.579261,0.398561,0.097438,0.17577,0.84435,1.497462,0.652315,107.142996,53.944751,17.595011,50.534664,0.245295,0.792798
min,0.0,0.0,2017.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.38,0.0,0.0
25%,0.0,18.0,2018.0,16.0,8.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,69.29,0.0,0.0
50%,0.0,69.0,2018.0,28.0,16.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,94.59,0.0,0.0
75%,1.0,160.0,2019.0,38.0,23.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,152.0,0.0,0.0,126.0,0.0,1.0
max,1.0,737.0,2019.0,53.0,31.0,19.0,50.0,55.0,10.0,10.0,1.0,26.0,72.0,21.0,535.0,543.0,391.0,5400.0,8.0,5.0


1. kita dapat melihat bahwa kolom 'previous_cancellations' memiliki nilai paling tinggi 26 pembatalan, yang mengimplikasikan bahwa beberapa pelanggan melakukan 26 kali pembatalan, yang mana hal ini tidak mungkin terjadi.
<br>
2.  kolom 'adult' memiliki nilai tertinngi 55 dan terendah 0 orang. Nilai minimum ini mengasumsikan bahwa ada reservasi hotel untuk 0 orang dewasa, yang tidak mungkin karena harus ada minimal 1 orang dewasa per reservasi (jelas anak-anak tidak dapat memesan kamar hotel). Oleh karena itu, saya akan menghilangkan baris yang jumlah orang dewasanya sama dengan 0.

In [23]:
#mendrop baris dengan 0 adult

dfhotel.drop(dfhotel[dfhotel['adults'] == 0].index, inplace = True)

In [24]:
#checking ulang

len(dfhotel[dfhotel['adults'] == 0])

0

# Tugas 2