# Split the datasets

We prepare each dataset by splitting it into a training set and a holdout set.   
For each dataset, we calculate the metadata object for the whole dataset, the training set and the holdout.   

In [1]:
import os
from pathlib import Path

from ydata.dataset.holdout import Holdout
from ydata.metadata.column import Column
from ydata.metadata import Metadata
from ydata.dataset import Dataset
from ydata.utils.data_types import DataType

from common.config import *
from common.utils import load_dataframe

def _longtext_as_categorical(metadata):
    """Re-declare every LONGTEXT column of *metadata* as CATEGORICAL, in place.

    Each affected entry of ``metadata.columns`` is replaced by a new ``Column``
    that keeps the column name and variable type and only changes the data
    type. The same *metadata* object is returned so calls can be chained.
    """
    # Iterate over a snapshot: entries are re-assigned inside the loop, and a
    # snapshot keeps that safe whatever mapping type ``columns`` is.
    for col_name, column in list(metadata.columns.items()):
        if column.datatype == DataType.LONGTEXT:
            metadata.columns[col_name] = Column(
                col_name, DataType.CATEGORICAL, column.vartype
            )
    return metadata


def _build_and_save_metadata(dataset, path):
    """Compute the metadata of *dataset*, apply the LONGTEXT override and save it to *path*."""
    metadata = _longtext_as_categorical(Metadata(dataset))
    metadata.save(path)
    return metadata


datasets_config = get_datsets_config()

for name in datasets_config.keys():
    print(f'# Get {name}')
    metadata_path = str(Path(DATASET_PATH) / f'{name}_all.metadata.pkl')
    dataset = Dataset(load_dataframe(name))

    # Metadata of the whole dataset: reuse the saved file when there is one,
    # otherwise compute it and save it for the next run.
    if os.path.isfile(metadata_path):
        print(' -> Load metadada')
        metadata = Metadata.load(metadata_path)
    else:
        metadata = _build_and_save_metadata(dataset, metadata_path)

    metadata_train_path = str(Path(DATASET_PATH) / f'{name}_train.metadata.pkl')
    metadata_holdout_path = str(Path(DATASET_PATH) / f'{name}_holdout.metadata.pkl')
    dataset_train_path = Path(DATASET_PATH) / f'{name}_train.csv'
    dataset_holdout_path = Path(DATASET_PATH) / f'{name}_holdout.csv'

    split_files = (
        metadata_train_path,
        metadata_holdout_path,
        dataset_train_path,
        dataset_holdout_path,
    )

    # If any file of the split is missing, recreate the whole split from
    # scratch so the four files always come from the same split.
    if not all(os.path.isfile(path) for path in split_files):
        train, holdout = Holdout().get_split(dataset, metadata)

        metadata_train = _build_and_save_metadata(train, metadata_train_path)
        metadata_holdout = _build_and_save_metadata(holdout, metadata_holdout_path)

        train_df = train.to_pandas()
        train_df.to_csv(dataset_train_path, index=False)

        holdout_df = holdout.to_pandas()
        holdout_df.to_csv(dataset_holdout_path, index=False)
    else:
        print(' -> Holdout files already exists...')

# Get sdv.adult
 -> Load metadada
 -> Holdout files already exists...
