# 3. Data preparation

### Import the needed packages

In [53]:
import os
import json
from pickle import load, dump

import numpy as np
import pandas as pd
from pandas import DataFrame
from ydata.metadata import Metadata

from functions.get_dummies import GetDummies

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2

### Getting the environment variable values

In [54]:
# Runtime configuration, read from the environment with defaults:
#   DATA_SPLIT - name of the split to prepare; used to build the input and
#                output pickle file names.
#   KEEP_DUP   - value passed as `keep=` to DataFrame.drop_duplicates.
env = os.environ
data_split = env.get('DATA_SPLIT', 'train')
dups = env.get('KEEP_DUP', 'first')

### Auxiliary functions definition

In [55]:
# Function to load a pickled dataset
def load_sets(path: str):
    """Unpickle and return the object stored at *path*.

    The file is opened in binary mode and closed before returning.
    Only use this on trusted files: unpickling can execute arbitrary code.
    """
    with open(path, 'rb') as handle:
        payload = load(handle)
    return payload
    
def pickle_sets(df: pd.DataFrame, y: pd.Series, path:str):
    """Pickle the pair ``(df, y)`` as a single tuple into the file at *path*.

    The file is opened in binary write mode, so an existing file is overwritten.
    """
    with open(path, 'wb') as handle:
        dump((df, y), handle)

In [56]:
def profile_json(df: DataFrame):
    """Build a profile report for *df* and return its JSON output parsed into Python objects."""
    # NOTE(review): ProfileReport is not imported in the visible part of this
    # file, so calling this function raises NameError unless the name is
    # provided elsewhere - confirm the intended import.
    report = ProfileReport(df = df)
    return json.loads(report.json)

def calculate_profile_diff(profiles: list):
    """Placeholder: no diff is computed yet, the given *profiles* are returned unchanged."""
    return profiles

In [57]:
def unique_levels(x):
    """Return how many entries ``x.value_counts()`` produces, i.e. the number of levels of *x*."""
    level_counts = x.value_counts()
    return level_counts.count()

## Read the dataset, metadata & profiling

In [108]:
# Load the previously saved metadata object for the bookings dataset; its
# warnings and variable-type lists drive the preparation steps below.
metadata = Metadata.load('bookings_metadata.pkl')

In [109]:
# Load the (features, target) pair for the configured split and temporarily
# attach the target as a column of the feature frame.
loaded_split = load_sets(f'{data_split}.pkl')
data, y = loaded_split
data['is_canceled'] = y

## Prepare the data

### Check & act upon the metadata warnings

In [110]:
# Remove duplicated rows, but only when the metadata reports at least one
# 'duplicates' warning.
has_duplicates = any(
    kind == 'duplicates' and len(found) > 0
    for kind, found in metadata.warnings.items()
)
if has_duplicates:
    # Which occurrence survives is controlled by the KEEP_DUP setting.
    data.drop_duplicates(keep=dups, inplace=True)

In [111]:
# Take the target back out of the frame, then remove it together with
# 'booking_changes' from the feature columns.
y = data['is_canceled']
dropped_columns = ['is_canceled', 'booking_changes']
data.drop(columns=dropped_columns, inplace=True)

In [112]:
# Restrict the metadata to the columns that are still present in the features.
excluded_columns = ['is_canceled', 'booking_changes']
kept_columns = [col for col in metadata.columns if col not in excluded_columns]
fltr_metadata = metadata[kept_columns]

In [113]:
## Split the features into numerical and categorical frames, using the
## variable lists exposed by the filtered metadata
numerical_columns = fltr_metadata.numerical_vars
categorical_columns = fltr_metadata.categorical_vars
num = data[numerical_columns]
char = data[categorical_columns]

In [114]:
# Re-assign numerical columns by cardinality: a numerical column with at most
# MAX_CATEGORICAL_LEVELS levels is handled as categorical from here on.
MAX_CATEGORICAL_LEVELS = 20

# One row per numerical column, holding its number of levels.
df_value_counts = pd.DataFrame(num.apply(unique_levels))
df_value_counts.columns = ['feature_levels']
# The original cell also called sort_values() here without keeping the result;
# that call had no effect and was removed, so the column order is unchanged.

# Low-cardinality numerical columns -> treated as categorical.
slice1 = df_value_counts.loc[df_value_counts['feature_levels'] <= MAX_CATEGORICAL_LEVELS]
cat_list = slice1.index
cat = num.loc[:, cat_list]

# The remaining columns stay numerical.
slice2 = df_value_counts.loc[df_value_counts['feature_levels'] > MAX_CATEGORICAL_LEVELS]
num_list = slice2.index
num = num.loc[:, num_list]

# Append the re-assigned columns to the categorical frame.
char = pd.concat([char, cat], axis=1, join='inner')

In [124]:
# Turn NaN into the empty string in the categorical frame; the categorical
# imputer in this notebook is configured with missing_values=''.
char = char.replace(to_replace=np.nan, value='')

In [115]:
## Drop variables with more than 25% missing data
# (applied to the numerical frame only)
missing_share = num.isnull().mean()
keep_mask = missing_share <= 0.25
num = num.loc[:, keep_mask]

In [116]:
# Record which columns ended up categorical and which numerical; this mapping
# is pickled at the end of the notebook.
cols_split = dict(cat=list(char.columns), num=list(num.columns))

In [125]:
## Define the imputers for the missing data identified:
## most frequent value for categorical columns (missing marked as ''),
## column mean for numerical columns (missing marked as NaN)
categorical_imputer = SimpleImputer(missing_values='', strategy='most_frequent')
numerical_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputers = {'cat': categorical_imputer, 'num': numerical_imputer}

In [127]:
# Fit each imputer on its own frame and wrap the resulting array back into a
# DataFrame that keeps the original index and column labels.
char_i = pd.DataFrame(
    imputers['cat'].fit_transform(char),
    index=char.index,
    columns=char.columns,
)
num_i = pd.DataFrame(
    imputers['num'].fit_transform(num),
    index=num.index,
    columns=num.columns,
)

### Feature Selection

In [130]:
## Remove variables with zero variance
varselector = VarianceThreshold(threshold=0)
# Only the fitted support mask is used, so fit() replaces the original
# fit_transform() call whose return value was discarded.
varselector.fit(num_i)

cols = varselector.get_support(indices=True)

## select_cols
# Copy the selection so the indicator columns below are added to an
# independent frame rather than to a slice of the previous one.
num_i = num_i.iloc[:, cols].copy()

## Create new features: 0/1 indicators derived from count-like columns
num_i['day_wait_ind'] = np.where(num_i['days_in_waiting_list'] > 0, 1, 0)
num_i['previous_bookings_not_canceled_ind'] = np.where(num_i['previous_bookings_not_canceled'] > 0, 1, 0)

# Source columns that are dropped once their indicators exist.
num_varlist = ['arrival_date_day_of_month', 'arrival_date_week_number',
               'days_in_waiting_list', 'previous_bookings_not_canceled']

# 'booking_changes' is optional, so its presence is tested explicitly instead
# of relying on a bare `except:` that would also hide unrelated errors.
if 'booking_changes' in num_i.columns:
    # NOTE(review): the sibling indicators test `> 0`; `< 0` is kept from the
    # original but looks like it may be a typo - confirm.
    num_i['booking_changes_ind'] = np.where(num_i['booking_changes'] < 0, 1, 0)
    num_varlist.append('booking_changes')

num_i.drop(num_varlist, axis=1, inplace=True)

In [131]:
# Display the imputed categorical frame (the notebook output follows)
char_i

Unnamed: 0,hotel,arrival_date_month,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,customer_type,reservation_status,arrival_date_year,stays_in_weekend_nights,adults,children,babies,is_repeated_guest,previous_cancellations,required_car_parking_spaces,total_of_special_requests
82752,City Hotel,October,BB,PRT,Offline TA/TO,TA/TO,A,A,Non Refund,Transient,Canceled,2016,1,2,0.0,0,0,1,0,0
101242,City Hotel,November,BB,PRT,Corporate,Corporate,D,D,No Deposit,Transient,Check-Out,2016,0,1,0.0,0,1,0,0,1
48725,City Hotel,March,SC,PRT,Online TA,TA/TO,A,A,No Deposit,Transient-Party,Check-Out,2016,1,2,0.0,0,0,0,0,2
70855,City Hotel,June,HB,FRA,Online TA,TA/TO,E,E,No Deposit,Transient,Canceled,2017,1,3,0.0,0,0,0,0,2
42541,City Hotel,September,BB,PRT,Direct,Direct,A,A,No Deposit,Transient,No-Show,2015,1,1,0.0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32471,Resort Hotel,January,BB,GBR,Online TA,TA/TO,D,D,No Deposit,Transient,Check-Out,2017,4,1,0.0,0,0,0,0,1
6259,Resort Hotel,May,HB,GBR,Offline TA/TO,TA/TO,E,E,No Deposit,Transient,Check-Out,2016,2,2,1.0,0,0,0,0,1
39452,Resort Hotel,August,BB,AUS,Online TA,TA/TO,A,A,No Deposit,Transient,Check-Out,2017,1,2,0.0,0,0,0,0,2
1106,Resort Hotel,August,HB,PRT,Online TA,TA/TO,D,D,No Deposit,Transient,Canceled,2015,2,2,0.0,0,0,0,0,0


In [133]:
# Categorical features: drop the columns that are not used further on.
charlist = ['arrival_date_month', 'country', 'assigned_room_type', 'reservation_status',
            'arrival_date_year']

# 'reservation_status_date' is optional, so it is added to the drop list only
# when present. This replaces a bare `except:` that retried the drop with a
# second, duplicated list and could hide unrelated errors.
if 'reservation_status_date' in char_i.columns:
    charlist.insert(charlist.index('arrival_date_year'), 'reservation_status_date')

# A missing mandatory column still raises KeyError, as in the original.
char_i.drop(charlist, axis=1, inplace=True)

In [134]:
# Encode all the categorical columns.
# NOTE(review): despite the variable name `le`, the encoder used here is the
# project's GetDummies helper, not sklearn's LabelEncoder; presumably it
# produces dummy (one-hot style) columns - confirm against functions.get_dummies.
le = GetDummies()

# Fit on the categorical frame, then transform that same frame.
le.fit(char_i)
char_encoded = le.transform(char_i)

In [135]:
# Keep the 30 encoded categorical features with the highest chi-squared score
# against the target.
selector = SelectKBest(score_func=chi2, k=30)
selector.fit(char_encoded, y)

# Positions of the selected columns, used to build a frame with only them.
cols = selector.get_support(indices=True)
select_features_df_char = char_encoded.iloc[:, cols]

In [136]:
## Build the final DF: selected categorical features next to the numerical
## ones, keeping only the rows present in both frames
final_frames = [select_features_df_char, num_i]
X_train = pd.concat(final_frames, axis=1, join='inner')

In [137]:
# Names of the columns that made it into the final frame, per variable kind.
selected_vars = dict(
    cat=list(select_features_df_char.columns),
    num=list(num_i.columns),
)

### Output methods for data preparation

In [138]:
# Persist the objects produced during preparation, one pickle file each.
artifacts = {
    'cols_split.pkl': cols_split,
    'imputers.pkl': imputers,
    'encoders.pkl': le,
    'selected_vars.pkl': selected_vars,
}

for filename, artifact in artifacts.items():
    with open(filename, 'wb') as f:
        dump(artifact, f)

## Output data

In [139]:
## Save the prepared features and the target for the configured split
output_path = f'dataprep_{data_split}.pkl'
pickle_sets(df=X_train, y=y, path=output_path)