# Data Preparation

Demonstrates how to split datasets with CsvExampleGen.



In [None]:
# Set the IPCompleter.greedy option of the IPython tab-completer to True.
%config IPCompleter.greedy=True

In [None]:
!pip install tensorflow
!pip install tfx
!pip install tensorflow-model-analysis

In [None]:
import tensorflow as tf 
import csv
import os, pwd
from tfx.utils.dsl_utils import external_input
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
from tfx.components import (
    CsvExampleGen,
    FileBasedExampleGen,
    ImportExampleGen
)
from tfx.proto import example_gen_pb2

## Errors when running the code below
If you receive an error `RuntimeError: Files in same split /Users/{usr}/Github/building-machine-learning-pipelines/data/* have different header.`, then execute the following command in the directory specified in the error.

`ls -la`

This will show that there is a hidden `.ipynb_checkpoints` directory there. You must delete that checkpoint folder for this code to work. TFX reads every file and folder in the directory and raises an error if any of them is not a CSV file.

```
drwxr-xr-x   4       staff       128 Feb 20 08:25 .
drwxr-xr-x  24 user  staff       768 Feb  8 21:55 ..
drwxr-xr-x   2 user  staff        64 Feb 20 08:25 .ipynb_checkpoints
-rw-r--r--@  1 user  staff  78956235 Feb 19 22:24 consumer_complaints_with_narrative.csv

```

## Splitting the Dataset
Splitting the input dataset into multiple **TFRecord** files.

In [None]:
# Show the kernel's current working directory, followed by os.pardir
# (the parent-directory marker string, not a resolved path).
for location in (os.getcwd(), os.pardir):
    print(location)

In [None]:
# Resolve the data directory under the current user's home directory
# (looked up through the password database via pwd).
base_dir = pwd.getpwuid(os.getuid()).pw_dir
data_dir_str = 'Github/building-machine-learning-pipelines/data'
data_dir = os.path.join(base_dir, data_dir_str)
original_data_file = os.path.join(
    data_dir, 'consumer_complaints_with_narrative.csv')

# Output split configuration: train/eval/test with hash_buckets of 6, 2 and 2.
split_config = example_gen_pb2.SplitConfig(splits=[
    example_gen_pb2.SplitConfig.Split(name=split_name, hash_buckets=bucket_count)
    for split_name, bucket_count in (('train', 6), ('eval', 2), ('test', 2))
])
output = example_gen_pb2.Output(split_config=split_config)

# Feed the whole data directory to CsvExampleGen with the split configuration.
examples = external_input(data_dir)
example_gen = CsvExampleGen(input=examples, output_config=output)

# Execute the component interactively; the run result is the cell's output.
context = InteractiveContext()
context.run(example_gen)


## View the output of the data


In [None]:
# Print each artifact registered on the component's 'examples' output channel.
example_artifacts = example_gen.outputs['examples'].get()
for artifact in example_artifacts:
    print(artifact)

## Preserving the input split
If your dataset is already split, you can preserve the existing split by using `input_config`. This assumes that the data directory contains `train`, `eval` and `test` subdirectories, each holding CSV files.


In [None]:
# Resolve the data directory under the current user's home directory.
base_dir = pwd.getpwuid(os.getuid()).pw_dir
data_dir_str = 'Github/building-machine-learning-pipelines/data'
data_dir = os.path.join(base_dir, data_dir_str)
original_data_file = os.path.join(data_dir, 'consumer_complaints_with_narrative.csv')

# Map each split name to a glob pattern inside the data directory.
# The variable is called `input_config` (not `input`) so that Python's
# builtin input() is not shadowed for the rest of the kernel session.
input_config = example_gen_pb2.Input(splits=[
    example_gen_pb2.Input.Split(name='train', pattern='train/*'),
    example_gen_pb2.Input.Split(name='eval', pattern='eval/*'),
    example_gen_pb2.Input.Split(name='test', pattern='test/*')])

examples = external_input(data_dir)
# `input_config` must receive the Input proto built in this cell; passing the
# Output split configuration from the hash-bucket example would ignore the
# train/eval/test patterns defined above.
example_gen = CsvExampleGen(input=examples, input_config=input_config)

# Execute the component interactively; the run result is the cell's output.
context = InteractiveContext()
context.run(example_gen)



## Spanning a dataset
This shows how to span a data set. 

If a process creates a new file containing the previous dataset plus the new records, then you can use this code to include the new dataset for training.

In [None]:
from tfx.utils.dsl_utils import external_input
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
from tfx.components import (
    CsvExampleGen
)
from tfx.proto import example_gen_pb2

In [None]:
# Locate the data directory relative to the notebook's working directory
# (<cwd>/../data).
base_dir = os.getcwd()
data_dir = os.path.join(os.pardir, "data")

# A single input split whose pattern contains the {SPAN} placeholder.
# The variable is called `input_config` (not `input`) so that Python's
# builtin input() is not shadowed for the rest of the kernel session.
input_config = example_gen_pb2.Input(
    splits=[example_gen_pb2.Input.Split(pattern='export-{SPAN}/*')])

examples = external_input(os.path.join(base_dir, data_dir))
# The keyword is `input` (as in the other CsvExampleGen calls in this
# notebook); the misspelling `innput` did not supply the component's input.
example_gen = CsvExampleGen(input=examples, input_config=input_config)

# Execute the component interactively; the run result is the cell's output.
context = InteractiveContext()
context.run(example_gen)