# Creating the train, validation and holdout sets

### Import the needed packages

In [17]:
import os 
from pickle import load, dump, HIGHEST_PROTOCOL

import pandas as pd
from sklearn.model_selection import train_test_split

### Getting environment variable values

In [18]:
# Share of the full dataset given to the training set (env TRAIN_SIZE, default 0.8)
train_size = float(os.getenv('TRAIN_SIZE', 0.8))
# Read from env TEST_SIZE (default 0.5). It is passed as `test_size` of the
# second split, so it is the share of the non-training rows that goes to the
# holdout set; the validation set receives the rest.
validation_size = float(os.getenv('TEST_SIZE', 0.5))

## Read the dataset & metadata

In [19]:
# NOTE(review): unpickling can execute arbitrary code - confirm 'bookings.pkl'
# always comes from a trusted source.
with open('bookings.pkl', 'rb') as source:
    data = load(source)

# The unpickled object exposes to_pandas(); presumably this yields a pandas
# DataFrame (it is used as one below) - TODO confirm the producer's type.
df = data.to_pandas()

In [20]:
# Name of the column used as the target (y)
target_column = 'is_canceled'

# Features: every column except the target, taken as an explicit copy
X = df.drop(columns=[target_column]).copy()
# Target: the 'is_canceled' column on its own
y = df[target_column]

## Split the dataset into train, validation & holdout sets

In [21]:
## Optional seed: set the RANDOM_STATE environment variable to an integer to
## make both splits reproducible. When it is unset or empty, random_state is
## None and the splits stay unseeded.
_seed = os.environ.get('RANDOM_STATE')
random_state = int(_seed) if _seed else None

## First split - train vs. the remainder (validation + holdout), stratified on y
X_train, X_rem, y_train, y_rem = train_test_split(
    X, y, train_size=train_size, stratify=y, random_state=random_state
)

## Second split - the remainder is divided into validation and holdout;
## `validation_size` is the share of the remainder that goes to the holdout set
X_val, X_holdout, y_val, y_holdout = train_test_split(
    X_rem, y_rem, test_size=validation_size, stratify=y_rem, random_state=random_state
)

In [22]:
# Print the shape of the train, validation and holdout feature sets, in that order
for split in (X_train, X_val, X_holdout):
    print(split.shape)

(95512, 31)
(11939, 31)
(11939, 31)


## Pipeline outputs

In [23]:
def pickle_sets(df: pd.DataFrame, y: pd.Series, path:str):
    """Pickle a features/target pair to ``path`` as a single ``(df, y)`` tuple.

    Args:
        df: Features of the set being saved.
        y: Target values of the set being saved.
        path: Destination file; it is opened in binary write mode, so an
            existing file at that path is overwritten.
    """
    with open(path, 'wb') as sink:
        dump((df, y), sink)

In [24]:
## Write each (features, target) pair to its own pickle file
for features, target, destination in (
    (X_train, y_train, 'train.pkl'),
    (X_val, y_val, 'validation.pkl'),
    (X_holdout, y_holdout, 'holdout.pkl'),
):
    pickle_sets(df=features, y=target, path=destination)