# Data preparation

## Consume both the metadata and the dataset profiling

In [99]:
import os
from pickle import load, dump
import json 

import numpy as np
import pandas as pd
from pandas import DataFrame
from ydata.metadata import Metadata

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.base import BaseEstimator, TransformerMixin

class GetDummies(BaseEstimator, TransformerMixin):
    """Dummy-encode a frame with ``pd.get_dummies`` using a column layout fixed at fit time.

    ``fit`` records the dummy columns generated for the fitting data (with
    ``drop_first=True``); ``transform`` re-encodes new data and reindexes it
    to those columns, filling columns absent from the new data with 0 and
    discarding columns that were not recorded by ``fit``.
    """

    def __init__(self):
        # Set by fit(); stays None until then.
        self.columns = None

    def fit(self, X, y=None):
        """Record the dummy column index produced for ``X``; ``y`` is ignored."""
        fitted_frame = pd.get_dummies(X, drop_first=True)
        self.columns = fitted_frame.columns
        return self

    def transform(self, X):
        """Return ``X`` dummy-encoded and aligned to the columns recorded by ``fit``."""
        encoded = pd.get_dummies(X, drop_first=True)
        aligned = encoded.reindex(columns=self.columns, fill_value=0)
        return aligned

In [100]:
# Load a pickled object (e.g. a dataset split) from disk
def load_sets(path: str):
    """Unpickle and return the object stored in the file at ``path``.

    Callers in this file unpack the result as ``(data, y)``.

    NOTE(review): unpickling can execute arbitrary code — only use this on
    files from a trusted source.
    """
    with open(path, 'rb') as handle:
        contents = load(handle)
    return contents
    
def pickle_sets(df: pd.DataFrame, y: pd.Series, path:str):
    """Pickle ``df`` and ``y`` together, as a ``(df, y)`` tuple, into the file at ``path``."""
    with open(path, 'wb') as handle:
        dump((df, y), handle)

In [101]:
def profile_json(df: DataFrame):
    """Build a ``ProfileReport`` for ``df`` and return its ``json`` output parsed with ``json.loads``.

    NOTE(review): ``ProfileReport`` is not imported anywhere in the visible
    part of this file — calling this function as-is would raise ``NameError``.
    """
    report = ProfileReport(df = df)
    return json.loads(report.json)

def calculate_profile_diff(profiles: list):
    """Return ``profiles`` unchanged.

    Placeholder: no difference between the profiles is computed yet.
    """
    return profiles

In [102]:
# Load the Metadata object stored in 'metadata.pkl'.
# NOTE(review): `metadata` is not referenced again in the visible part of this file.
metadata = Metadata.load('metadata.pkl')

## Prepare the data

In [103]:
# Choose which split to prepare: taken from the DATA_SPLIT environment
# variable, falling back to 'validation' when the variable is not set.
# (A keyed lookup with a default replaces the former bare `except:`, which
# would also have hidden unrelated errors.)
data_split = os.environ.get('DATA_SPLIT', 'validation')

# The pickle is expected to hold a two-item (data, y) tuple — presumably one
# written by pickle_sets; verify against the step that produces the file.
data, y = load_sets(f'{data_split}.pkl')

In [104]:
# Drop these two columns from `data` in place.
# NOTE(review): a later cell tries to read num_i['booking_changes']; with the
# column dropped here that lookup presumably always takes its fallback branch.
data.drop(columns=['Unnamed: 0', 'booking_changes'], inplace=True)

### Load the methods and preprocessing pipeline

In [105]:
# Restore the preprocessing artifacts from their pickle files.
# NOTE(review): unpickling can execute arbitrary code — these files must come
# from a trusted source.

# Imputers: used below as a mapping with 'cat' and 'num' entries.
with open('imputers.pkl', 'rb') as imputers_file:
    imputers = load(imputers_file)

# Encoder object applied to the categorical features.
# NOTE(review): presumably this pickle references the GetDummies class defined
# in this file — confirm before moving or renaming that class.
with open('encoders.pkl', 'rb') as encoders_file:
    encoders = load(encoders_file)

# Selected variable names, indexed below by 'num' and 'cat'.
with open('selected_vars.pkl', 'rb') as selected_vars_file:
    selected_vars = load(selected_vars_file)

# Column names per feature type, indexed below by 'num' and 'cat'.
with open('cols_split.pkl', 'rb') as cols_split_file:
    cols_split = load(cols_split_file)

In [106]:
## Separate the numerical and the categorical feature columns
numeric_columns = cols_split['num']
num = data[numeric_columns]
categorical_columns = cols_split['cat']
char = data[categorical_columns]

In [108]:
# Apply the loaded imputers and wrap each result in a DataFrame that carries
# the index and column labels of the corresponding input frame.
# The imputers are looked up by key instead of being discovered by looping over
# the dict, so a missing 'cat' or 'num' entry fails right here with a KeyError
# rather than leaving char_i / num_i undefined for the cells that follow.
char_i = pd.DataFrame(imputers['cat'].transform(char), index=char.index, columns=char.columns)
num_i = pd.DataFrame(imputers['num'].transform(num), index=num.index, columns=num.columns)

In [109]:
## Create new indicator features (1 where the condition holds, 0 otherwise)
num_i['day_wait_ind'] = np.where(num_i['days_in_waiting_list']>0, 1, 0)
num_i['previous_bookings_not_canceled_ind'] = np.where(num_i['previous_bookings_not_canceled']>0, 1, 0)

# 'booking_changes' may be absent from num_i. Only that case is tolerated, so
# catch KeyError specifically instead of swallowing every exception, and keep
# the try body down to the lookup that can raise it.
# NOTE(review): this indicator tests `< 0` while the two above test `> 0` —
# confirm it matches the feature definition used when the model was trained.
try:
    booking_changes = num_i['booking_changes']
except KeyError:
    print('No booking changes ind')
else:
    num_i['booking_changes_ind'] = np.where(booking_changes<0, 1, 0)

In [110]:
# Encode the imputed categorical features with the loaded encoder object.
# NOTE(review): the next cell indexes the result by column name, so
# transform() presumably returns a DataFrame — confirm.

char_encoded = encoders.transform(char_i)

In [111]:
## Keep only the columns listed in selected_vars

selected_num = selected_vars['num']
num_data = num_i[selected_num]
selected_cat = selected_vars['cat']
char_data = char_encoded[selected_cat]

In [112]:
# Join the categorical and numerical features side by side, keeping only the
# rows whose index label is present in both frames.
X_validation = pd.concat(objs=[char_data, num_data], axis='columns', join='inner')

## Output data

In [113]:
## Persist the prepared features together with the target (TODO: add more details here)
# NOTE(review): the output name is fixed to 'dataprep_validation.pkl' even when
# DATA_SPLIT selects a different split — confirm this is intended.
pickle_sets(df=X_validation, y=y, path='dataprep_validation.pkl')