# MSiA 420 - Predictive Analytics II - Final Project
## Group 6: Alejandra Lelo de Larrea Ibarra, Kiran Jyothi Sheena, Lixuan (Ellen) Chen, Wencheng Zhang

# Cleaning data & Feature Engineering

In [48]:
# Libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns

from sklearn.preprocessing import StandardScaler
#from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score, recall_score, precision_score, average_precision_score, f1_score, classification_report, accuracy_score, plot_roc_curve, plot_precision_recall_curve, plot_confusion_matrix

## Read dummy-encoded data (not yet scaled)

In [50]:
# Load the dummy-encoded bookings table and preview its first five rows
df = pd.read_csv(filepath_or_buffer="../02_Data/hotel_bookings_dummy.csv")
df.head(n=5)

Unnamed: 0,adults,agent_risk,arrival_date_year,arrival_month,babies,booked_by_agent,booked_by_company,booking_changes,children,domestic,...,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,market_segment_Undefined,meal_FB,meal_HB,meal_SC,meal_Undefined
0,2,0,2015,7,0,0,0,3,0,1,...,0,1,0,0,0,0,0,0,0,0
1,2,0,2015,7,0,0,0,4,0,1,...,0,1,0,0,0,0,0,0,0,0
2,1,0,2015,7,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,1,0,2015,7,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,2,2,2015,7,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [51]:
# Dimensions of the data as (rows, columns); the column count includes the response is_canceled
df.shape

(119388, 44)

In [52]:
# Descriptive stats (count, mean, std, min, quartiles, max) for each numeric column
df.describe()

Unnamed: 0,adults,agent_risk,arrival_date_year,arrival_month,babies,booked_by_agent,booked_by_company,booking_changes,children,domestic,...,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,market_segment_Undefined,meal_FB,meal_HB,meal_SC,meal_Undefined
count,119388.0,119388.0,119388.0,119388.0,119388.0,119388.0,119388.0,119388.0,119388.0,119388.0,...,119388.0,119388.0,119388.0,119388.0,119388.0,119388.0,119388.0,119388.0,119388.0,119388.0
mean,1.856401,1.569856,2016.156548,6.552543,0.007949,0.863135,0.056932,0.221103,0.103888,0.406984,...,0.044351,0.105589,0.16593,0.202851,0.473054,1.7e-05,0.006684,0.121143,0.089205,0.009792
std,0.579266,1.032156,0.707478,3.09061,0.097437,0.343706,0.231714,0.652287,0.398558,0.491274,...,0.205875,0.307312,0.372019,0.402124,0.499275,0.004093,0.081483,0.326295,0.285041,0.098467
min,0.0,0.0,2015.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,1.0,2016.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,1.0,2016.0,7.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,3.0,2017.0,9.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,55.0,3.0,2017.0,12.0,10.0,1.0,1.0,21.0,10.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [53]:
# Print column names (the features plus the response, is_canceled)
df.columns

Index(['adults', 'agent_risk', 'arrival_date_year', 'arrival_month', 'babies',
       'booked_by_agent', 'booked_by_company', 'booking_changes', 'children',
       'domestic', 'got_room_booked', 'hotel', 'is_canceled',
       'is_repeated_guest', 'log_adr', 'log_days_in_waiting_list',
       'log_lead_time', 'previous_bookings', 'previous_cancellations',
       'required_car_parking', 'total_nights', 'total_of_special_requests',
       'continent_Americas', 'continent_Antarctica', 'continent_Asia',
       'continent_Europe', 'continent_Oceania', 'continent_unknown',
       'customer_type_Group', 'customer_type_Transient',
       'customer_type_Transient-Party', 'deposit_type_Non Refund',
       'deposit_type_Refundable', 'market_segment_Complementary',
       'market_segment_Corporate', 'market_segment_Direct',
       'market_segment_Groups', 'market_segment_Offline TA/TO',
       'market_segment_Online TA', 'market_segment_Undefined', 'meal_FB',
       'meal_HB', 'meal_SC', 'meal_Undefined'],
      dtype='object')

## Splitting into training and test sets

In [54]:
# Get the row positions that belong to the training set
# (column Train_Index of the index file).
train_ind = pd.read_csv("../02_Data/train_index.csv")

# Split train.
# drop=True discards the old row labels instead of inserting them as an extra
# "index" column; that column would otherwise be treated as a feature by
# anything fitted on `train` and would not exist in `test`.
# Chaining (rather than inplace=True) also avoids mutating a slice of `df`.
train = df.iloc[train_ind.Train_Index, :].reset_index(drop=True)

# Split test: every row of df whose position is not listed in Train_Index.
# df keeps the default 0..n-1 index from read_csv, so index labels and
# positions coincide. The original row labels are kept on `test`.
test = df.iloc[df.index.difference(train_ind.Train_Index), :]
test

Unnamed: 0,adults,agent_risk,arrival_date_year,arrival_month,babies,booked_by_agent,booked_by_company,booking_changes,children,domestic,...,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,market_segment_Undefined,meal_FB,meal_HB,meal_SC,meal_Undefined
2,1,0,2015,7,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,1,0,2015,7,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
8,2,2,2015,7,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
9,2,1,2015,7,0,1,0,0,0,1,...,0,0,0,1,0,0,0,1,0,0
10,2,2,2015,7,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119351,3,3,2017,8,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
119353,1,1,2017,8,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
119372,2,3,2017,8,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
119374,3,3,2017,8,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [55]:
# Compare the share (in %) of each is_canceled class between the two splits
for header, frame in (("Train Class Response:", train), ("Test Class Response:", test)):
    print(header)
    class_counts = frame["is_canceled"].value_counts()
    print(class_counts / frame.shape[0] * 100)

Train Class Response:
0    62.941441
1    37.058559
Name: is_canceled, dtype: float64
Test Class Response:
0    63.027181
1    36.972819
Name: is_canceled, dtype: float64


## Scaling training data 

In [57]:
# Fit the standard scaler on every training column except the response
train_predictors = train.drop(columns="is_canceled")
scaler = StandardScaler().fit(train_predictors)
scaler

StandardScaler()

In [None]:
# Reference: https://www.justintodata.com/logistic-regression-example-in-python/