# Data preparation

## Consume both the metadata and the dataset profiling

In [398]:
import os
from pickle import load, dump
import json 

import numpy as np
import pandas as pd
from pandas import DataFrame
from ydata.metadata import Metadata

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2

from sklearn.base import BaseEstimator, TransformerMixin

class GetDummies(BaseEstimator, TransformerMixin):
    """One-hot encoder built on ``pd.get_dummies`` that remembers its training columns.

    ``fit`` records the dummy columns produced for the training frame (with the
    first level of every encoded column dropped). ``transform`` always returns
    exactly those columns, in that order, filling columns that the new frame
    does not produce with 0.
    """

    def __init__(self):
        # Dummy column labels learned by fit(); stays None until fit() is called.
        self.columns = None

    def fit(self, X, y=None):
        """Record the dummy columns generated from ``X``; ``y`` is ignored.

        Returns the fitted transformer itself.
        """
        self.columns = pd.get_dummies(X, drop_first=True).columns
        return self

    def transform(self, X):
        """Return ``X`` one-hot encoded and aligned to the columns seen in ``fit``."""
        # Encode every level and let reindex() discard the baseline columns.
        # Passing drop_first=True here would drop whichever level comes first
        # in *this* frame, which is not necessarily the level dropped in
        # fit(); rows holding that level would then be encoded as all zeros.
        X_new = pd.get_dummies(X)
        return X_new.reindex(columns=self.columns, fill_value=0)

In [399]:
#Function to load a pickled set
def load_sets(path: str):
    """Unpickle and return the object stored in the file at ``path``."""
    with open(path, mode='rb') as handle:
        contents = load(handle)
    return contents
    
def pickle_sets(df: pd.DataFrame, y: pd.Series, path:str):
    """Pickle ``df`` and ``y`` together, as a ``(df, y)`` tuple, into the file at ``path``."""
    with open(path, mode='wb') as handle:
        dump((df, y), handle)

In [400]:
def profile_json(df: DataFrame):
    """Build a ``ProfileReport`` for ``df`` and return its JSON export parsed into Python objects.

    NOTE(review): ``ProfileReport`` is not imported in the visible part of this
    file, so calling this function fails with a NameError unless the name is
    provided elsewhere - confirm the intended import.
    """
    report = ProfileReport(df = df)
    return json.loads(report.json)

def calculate_profile_diff(profiles: list):
    """Return ``profiles`` unchanged.

    No comparison is computed yet; the input list is handed back as is.
    """
    return profiles

In [401]:
# Load the previously saved Metadata object; its warnings are consulted below.
metadata = Metadata.load('metadata.pkl')

## Prepare the data

In [402]:
# Name of the split to prepare; falls back to 'train' when DATA_SPLIT is not set.
data_split = os.environ.get('DATA_SPLIT', 'train')

# '<split>.pkl' is unpacked into the feature frame and the target.
data, y = load_sets(f'{data_split}.pkl')

In [403]:
# Remove the 'Unnamed: 0' column (presumably a leftover exported index - confirm).
data.drop('Unnamed: 0', axis=1, inplace=True)

In [404]:
# Attach the target as a column so it stays with its row during de-duplication;
# it is split off again afterwards.
data['is_canceled']=y

### Check & act upon the metadata warnings

In [405]:
#remove duplicates
# KEEP_DUP is passed as `keep` to drop_duplicates; defaults to 'first' when unset.
dups = os.environ.get('KEEP_DUP', 'first')
for warning_type, warnings in metadata.warnings.items():
    # Only de-duplicate when the metadata reported at least one duplicates warning.
    if warning_type.value == 'duplicates' and len(warnings)>0:
        ##filter the dataset duplicates
        data.drop_duplicates(keep=dups, inplace=True)

In [406]:
# Take the target back out of the frame, then discard it together with
# 'booking_changes' from the features.
y = data.loc[:, 'is_canceled']
data.drop(columns=['is_canceled', 'booking_changes'], inplace=True)

In [407]:
## Split the features by dtype: numeric columns vs. object (string-like) columns
num = data.select_dtypes(include=['number'])
char = data.select_dtypes(include=['object'])

In [408]:
#Check num and categorical
# check how many unique values each column has
def unique_levels(x):
    """Return the number of distinct values in ``x`` as reported by ``value_counts()``."""
    level_counts = x.value_counts()
    return level_counts.count()

# Number of distinct values of every numeric column, one row per column.
df_value_counts = pd.DataFrame(num.apply(unique_levels))
df_value_counts.columns = ['feature_levels']
# (The original sorted this table without keeping the result; that no-op was
# removed so the column order of the splits below is unchanged.)

# Numeric columns with at most 20 distinct values are handled as categorical.
slice1 = df_value_counts.loc[df_value_counts['feature_levels']<=20]
cat_list = slice1.index
cat = num.loc[:, cat_list]

# Columns with more than 20 distinct values stay in the numeric group.
slice2 = df_value_counts.loc[df_value_counts['feature_levels']>20]
num_list = slice2.index
num = num.loc[:, num_list]

# Add the low-cardinality numeric columns to the categorical frame.
char = pd.concat([char, cat], axis = 1, join = 'inner')

In [409]:
## Drop variables that have more than 25% missing data
# Only the numerical frame is filtered (the original note states that only
# numerical columns have missing data).
missing_share = num.isnull().mean()
num = num.loc[:, missing_share <= 0.25]

In [410]:
# Column names of each feature group, as plain lists.
cols_split = dict(cat=list(char.columns), num=list(num.columns))

In [411]:
##Impute missing data
# One imputer per feature group: most frequent value for the categorical
# columns, mean for the numerical ones.
imputers= {'cat': SimpleImputer(missing_values=np.nan, strategy='most_frequent'),
           'num': SimpleImputer(missing_values=np.nan, strategy='mean')}

# Fit each imputer on its own group and rebuild a frame carrying the original
# index and column labels.
char_i = pd.DataFrame(imputers['cat'].fit_transform(char), columns=char.columns, index=char.index)
num_i = pd.DataFrame(imputers['num'].fit_transform(num), columns=num.columns, index=num.index)

In [413]:
# Display the imputed numerical features.
num_i

Unnamed: 0,lead_time,arrival_date_week_number,arrival_date_day_of_month,stays_in_week_nights,previous_bookings_not_canceled,agent,days_in_waiting_list,adr,day_wait_ind
79839,34.0,50.0,8.0,2.0,0.0,19.000000,0.0,90.00,0
76585,208.0,31.0,28.0,1.0,0.0,9.000000,0.0,76.50,0
61731,18.0,52.0,24.0,1.0,0.0,85.000000,0.0,80.00,0
24224,236.0,20.0,12.0,3.0,0.0,315.000000,0.0,66.00,0
19582,0.0,52.0,26.0,1.0,0.0,93.744664,0.0,50.00,0
...,...,...,...,...,...,...,...,...,...
42287,63.0,36.0,4.0,2.0,0.0,21.000000,0.0,116.00,0
50797,10.0,20.0,8.0,1.0,0.0,9.000000,0.0,126.00,0
9145,40.0,44.0,29.0,1.0,0.0,240.000000,0.0,85.73,0
91449,79.0,25.0,16.0,4.0,0.0,9.000000,0.0,152.40,0


### Feature Selection

In [414]:
## Remove variables with zero variance
varselector = VarianceThreshold(threshold = 0)
# Only the fitted support mask is needed, so fit() is enough here.
varselector.fit(num_i)

cols = varselector.get_support(indices=True)

##select_cols
# .copy() so the indicator columns below are added to an independent frame
# rather than to a slice of the previous one.
num_i = num_i.iloc[:, cols].copy()

##Create new features
# Binary indicators: 1 when the underlying value is greater than 0, else 0.
num_i['day_wait_ind'] = np.where(num_i['days_in_waiting_list']>0, 1, 0)
num_i['previous_bookings_not_canceled_ind'] = np.where(num_i['previous_bookings_not_canceled']>0, 1, 0)

# Raw columns that are dropped from the numerical frame below.
num_varlist = ['arrival_date_day_of_month', 'arrival_date_week_number', 'days_in_waiting_list', 'previous_bookings_not_canceled']

# 'booking_changes' is optional in this frame, so test for it explicitly
# instead of relying on a bare except.
if 'booking_changes' in num_i.columns:
    # NOTE(review): the original compared with `< 0`, unlike the two indicators
    # above; changed to `> 0` for consistency - confirm the intended condition.
    num_i['booking_changes_ind'] = np.where(num_i['booking_changes']>0, 1, 0)
    num_varlist.append('booking_changes')

num_i.drop(num_varlist, axis=1, inplace=True)

In [None]:
#Categorical features
# Columns removed from the categorical frame; drop() raises KeyError if any of
# them is missing.
# NOTE(review): the original wrapped this in try/except and retried with
# 'booking_changes' appended. That retry drops a superset of the same labels,
# so it could never succeed where the first attempt failed. If the intent was
# to also drop 'booking_changes' when present, add it explicitly - confirm.
charlist = ['arrival_date_month', 'country', 'assigned_room_type', 'reservation_status',
            'reservation_status_date', 'arrival_date_year']
char_i.drop(charlist, axis=1, inplace=True)

In [None]:
#One-hot encoding
#Encode all the categorical columns as dummy variables; the fitted encoder is kept in `le`
le = GetDummies().fit(char_i)
char_encoded = le.transform(char_i)

In [None]:
# Score every encoded column against the target with chi2 and keep the 30 best.
# k is capped at the number of available columns so that a frame with fewer
# than 30 encoded columns is still accepted.
selector = SelectKBest(chi2, k=min(30, char_encoded.shape[1]))
# Only the fitted support mask is needed, so fit() is enough here.
selector.fit(char_encoded, y)

#get columns to create new df with them only
cols = selector.get_support(indices=True)
select_features_df_char = char_encoded.iloc[:,cols]

In [None]:
##Setting the final DF
# Put the selected encoded categorical features and the numerical features side
# by side; join='inner' keeps only the index labels present in both frames.
X_train = pd.concat([select_features_df_char, num_i], axis=1, join='inner')

In [None]:
# Names of the columns that made it into the final frame, per feature group.
selected_vars = {
    group: list(frame.columns)
    for group, frame in (('cat', select_features_df_char), ('num', num_i))
}

### Output methods for data preparation

In [396]:
# Persist the column split, the fitted imputers, the fitted encoder and the
# selected column names, one pickle file per object.
artefacts = {
    'cols_split.pkl': cols_split,
    'imputers.pkl': imputers,
    'encoders.pkl': le,
    'selected_vars.pkl': selected_vars,
}
for filename, artefact in artefacts.items():
    with open(filename, 'wb') as f:
        dump(artefact, f)

## Output data

In [397]:
## Pickle the prepared features together with the target as 'dataprep_<split>.pkl'
pickle_sets(df=X_train, y=y, path=f'dataprep_{data_split}.pkl')