# 5. Data preparation

### Import needed packages

In [24]:
import os
from pickle import load, dump
import json 

import numpy as np
import pandas as pd
from pandas import DataFrame

from ydata.metadata import Metadata

from functions.save_load import load_sets, pickle_sets

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from functions.get_dummies import GetDummies

### Getting the environment variable values

In [5]:
# Name of the data split to prepare; read from DATA_SPLIT and defaulting to 'validation'.
data_split = os.getenv('DATA_SPLIT', 'validation')

### Auxiliary functions definition

In [7]:
def profile_json(df: DataFrame):
    """Profile ``df`` and return the report decoded from its JSON form.

    Args:
        df: The DataFrame to profile.

    Returns:
        The Python object produced by ``json.loads`` on the report's
        ``json`` attribute.
    """
    # NOTE(review): ProfileReport is not imported in the visible part of this
    # file -- confirm where it is expected to come from before calling this.
    report = ProfileReport(df = df)
    return json.loads(report.json)

def calculate_profile_diff(profiles: list):
    """Return ``profiles`` unchanged.

    No difference is computed here: the argument is passed straight through
    to the caller.
    NOTE(review): presumably a placeholder for a real profile comparison --
    confirm the intended behavior.

    Args:
        profiles: A list of profiles.

    Returns:
        The same ``profiles`` object that was passed in.
    """
    return profiles

## Get metadata and dataset profiling

In [8]:
# Load the Metadata object stored in 'bookings_metadata.pkl'.
metadata = Metadata.load('bookings_metadata.pkl')

In [9]:
# Load the features and target saved for the selected split ('<data_split>.pkl').
data, y = load_sets(f'{data_split}.pkl')
# Remove the 'booking_changes' column in place before any further preparation.
data.drop(columns=['booking_changes'], inplace=True)

## Prepare the data

### Load the methods and preprocessing pipeline

In [12]:
def _read_pickle(filename):
    """Return the object unpickled from ``filename``."""
    # NOTE: unpickling can execute arbitrary code; only load artifacts
    # produced by a trusted pipeline step.
    with open(filename, 'rb') as handle:
        return load(handle)


# Preprocessing artifacts: imputers, encoders, the selected variables and the
# numerical/categorical column split.
imputers = _read_pickle('imputers.pkl')
encoders = _read_pickle('encoders.pkl')
selected_vars = _read_pickle('selected_vars.pkl')
cols_split = _read_pickle('cols_split.pkl')

In [15]:
# Split the features into numerical and categorical subsets using the column
# lists stored in cols_split.
num = data[cols_split['num']]

# Missing categorical values are replaced by empty strings.
char = data[cols_split['cat']].replace(np.nan, '')

In [16]:
# Apply each imputer to its matching feature subset, rebuilding a DataFrame
# that keeps the subset's original index and column names. Imputers stored
# under any key other than 'cat' or 'num' are ignored.
_frames_by_kind = {'cat': char, 'num': num}
for kind, fitted_imputer in imputers.items():
    if kind not in _frames_by_kind:
        continue
    source = _frames_by_kind[kind]
    imputed = pd.DataFrame(
        fitted_imputer.transform(source),
        index=source.index,
        columns=source.columns,
    )
    if kind == 'cat':
        char_i = imputed
    else:
        num_i = imputed

In [17]:
# Create binary indicator features (1/0) from the imputed numerical columns.
num_i['day_wait_ind'] = np.where(num_i['days_in_waiting_list'] > 0, 1, 0)
num_i['previous_bookings_not_canceled_ind'] = np.where(
    num_i['previous_bookings_not_canceled'] > 0, 1, 0
)

# 'booking_changes' may be absent from the numerical features. Only that case
# (a KeyError on the column lookup) is tolerated; any other error now
# propagates instead of being hidden by a bare ``except``.
try:
    booking_changes = num_i['booking_changes']
except KeyError:
    print('No booking changes ind')
else:
    # NOTE(review): this flags values < 0, whereas the indicators above flag
    # values > 0 -- confirm the comparison direction is intended.
    num_i['booking_changes_ind'] = np.where(booking_changes < 0, 1, 0)

No booking changes ind


In [18]:
# Encode the imputed categorical features with the encoders loaded from 'encoders.pkl'.

char_encoded = encoders.transform(char_i)

In [21]:
# Keep only the variables listed in selected_vars for each feature kind.
num_data = num_i[selected_vars['num']]
char_data = char_encoded[selected_vars['cat']]

In [22]:
# Put the categorical and numerical features side by side; the inner join
# keeps only the rows whose index label appears in both frames.
X_validation = pd.concat(objs=[char_data, num_data], axis='columns', join='inner')

## Pipeline Output

In [23]:
# Persist the prepared features together with the target for the next step.
# NOTE(review): the output name is fixed to 'dataprep_validation.pkl' even
# though the input file follows DATA_SPLIT -- confirm this is intended.
pickle_sets(df=X_validation, y=y, path='dataprep_validation.pkl')