# Convert custom data to TFRecord

This notebook demonstrates how to convert custom CSV data to the TFRecord format and then ingest the resulting file into a TFX pipeline with `ImportExampleGen`.


In [None]:
# Editor convenience only: sets IPython's IPCompleter.greedy option
# (presumably for more aggressive tab completion — TODO confirm it is still
# supported by the installed IPython version).
%config IPCompleter.greedy=True

In [None]:
!pip install tensorflow
!pip install tfx
!pip install tensorflow-model-analysis

In [27]:
import tensorflow as tf 
import csv
import os, pwd
from tfx.utils.dsl_utils import external_input
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
from tfx.components import (
    FileBasedExampleGen,
    ImportExampleGen
)

## Helper Functions

In [None]:
def _bytes_feature(value):
    """Wrap a string (or raw bytes) in a single-element bytes_list Feature.

    Args:
        value: A `str`, which is encoded with `str.encode()` (UTF-8 by
            default), or a `bytes` object, which is stored unchanged.

    Returns:
        A `tf.train.Feature` whose `bytes_list` holds the one encoded value.
    """
    # bytes has no .encode(); pass it through so already-encoded data works too.
    encoded = value if isinstance(value, bytes) else value.encode()
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[encoded]))

def _float_feature(value):
    """Build a `tf.train.Feature` holding `value` as a one-element float_list."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Build a `tf.train.Feature` holding `value` as a one-element int64_list."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

In [None]:
def clean_rows(row):
    """Fill in a placeholder zip code when the row has none.

    The row is modified in place: a falsy ``row["zip_code"]`` (for example an
    empty string or ``None``) is replaced with the string ``"99999"``.

    Args:
        row: A mapping with a ``"zip_code"`` key.

    Returns:
        The same ``row`` object that was passed in.
    """
    current_zip = row["zip_code"]
    if current_zip:
        # A zip code is already present; leave the row untouched.
        return row
    row["zip_code"] = "99999"
    return row

def convert_zipcode_to_int(zipcode):
    """Convert a zip code to an integer, treating masked ``XX`` digits as ``00``.

    Args:
        zipcode: The zip code. When it is a string containing ``"XX"``, every
            ``"XX"`` is replaced by ``"00"`` before conversion; any other
            value is handed to ``int()`` unchanged.

    Returns:
        The zip code as an ``int``.

    Raises:
        ValueError: If the (possibly substituted) value is not a valid integer
            literal.
    """
    is_masked = isinstance(zipcode, str) and "XX" in zipcode
    normalized = zipcode.replace("XX", "00") if is_masked else zipcode
    return int(normalized)

## Convert the CSV file to TFRecord format

In [42]:
# Resolve the input CSV and the output TFRecord directory under the current
# user's home directory (looked up from the password database).
base_dir = pwd.getpwuid(os.getuid()).pw_dir
data_dir_str = 'Github/building-machine-learning-pipelines/data'
data_dir = os.path.join(base_dir, data_dir_str)
original_data_file = os.path.join(data_dir, 'consumer_complaints_with_narrative.csv')
tfrecord_path = os.path.join(data_dir, 'tfrecords')
try:
    os.mkdir(tfrecord_path)
except FileExistsError:
    # Only "path already exists" is expected here. Any other failure (missing
    # parent directory, permissions, ...) propagates instead of being hidden
    # behind a misleading message.
    print(tfrecord_path + ' already exists.')

tfrecord_filename = os.path.join(tfrecord_path, 'consumer-complaints.tfrecord')

# Both files are managed by `with`, so the TFRecord writer is closed even when
# a row fails to convert. newline="" lets the csv module handle line endings
# itself, as the csv documentation recommends for files passed to a reader.
with open(original_data_file, newline="") as csv_file, \
        tf.io.TFRecordWriter(tfrecord_filename) as tfrecord_writer:
    reader = csv.DictReader(csv_file, delimiter=",", quotechar='"')
    for row in reader:
        row = clean_rows(row)
        # One tf.train.Example per CSV row: zip_code is stored as int64,
        # every other column as a bytes feature.
        example = tf.train.Example(features=tf.train.Features(feature={
            "product": _bytes_feature(row["product"]),
            "sub_product": _bytes_feature(row["sub_product"]),
            "issue": _bytes_feature(row["issue"]),
            "sub_issue": _bytes_feature(row["sub_issue"]),
            "state": _bytes_feature(row["state"]),
            "zip_code": _int64_feature(convert_zipcode_to_int(row["zip_code"])),
            "company": _bytes_feature(row["company"]),
            "company_response": _bytes_feature(row["company_response"]),
            "consumer_complaint_narrative": _bytes_feature(row["consumer_complaint_narrative"]),
            "timely_response": _bytes_feature(row["timely_response"]),
            "consumer_disputed": _bytes_feature(row["consumer_disputed"])
        }))
        tfrecord_writer.write(example.SerializeToString())

# Ingest the freshly written TFRecord directory with TFX. The last expression
# is left bare so the notebook renders the execution result.
context = InteractiveContext()
example_gen = ImportExampleGen(input_base=tfrecord_path)
context.run(example_gen)


/Users/seanwilliams/Github/building-machine-learning-pipelines/data/tfrecords already exists.




0,1
.execution_id,1
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } ImportExampleGen at 0x7fa5e32ddb50.inputs{}.outputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fa593e9c2b0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /var/folders/75/_jjmf7hn5n789yrm6nbscs000000gn/T/tfx-interactive-2021-02-19T22_59_23.406358-g7pxp1t4/ImportExampleGen/examples/1) at 0x7fa5d05cd1c0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/var/folders/75/_jjmf7hn5n789yrm6nbscs000000gn/T/tfx-interactive-2021-02-19T22_59_23.406358-g7pxp1t4/ImportExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0.exec_properties['input_base']/Users/seanwilliams/Github/building-machine-learning-pipelines/data/tfrecords['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } 
}['output_data_format']6['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:95287115,xor_checksum:1613797163,sum_checksum:1613797163"
.component.inputs,{}
.component.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fa593e9c2b0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /var/folders/75/_jjmf7hn5n789yrm6nbscs000000gn/T/tfx-interactive-2021-02-19T22_59_23.406358-g7pxp1t4/ImportExampleGen/examples/1) at 0x7fa5d05cd1c0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/var/folders/75/_jjmf7hn5n789yrm6nbscs000000gn/T/tfx-interactive-2021-02-19T22_59_23.406358-g7pxp1t4/ImportExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fa593e9c2b0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /var/folders/75/_jjmf7hn5n789yrm6nbscs000000gn/T/tfx-interactive-2021-02-19T22_59_23.406358-g7pxp1t4/ImportExampleGen/examples/1) at 0x7fa5d05cd1c0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/var/folders/75/_jjmf7hn5n789yrm6nbscs000000gn/T/tfx-interactive-2021-02-19T22_59_23.406358-g7pxp1t4/ImportExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"
.exec_properties,"['input_base']/Users/seanwilliams/Github/building-machine-learning-pipelines/data/tfrecords['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:95287115,xor_checksum:1613797163,sum_checksum:1613797163"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fa593e9c2b0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /var/folders/75/_jjmf7hn5n789yrm6nbscs000000gn/T/tfx-interactive-2021-02-19T22_59_23.406358-g7pxp1t4/ImportExampleGen/examples/1) at 0x7fa5d05cd1c0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/var/folders/75/_jjmf7hn5n789yrm6nbscs000000gn/T/tfx-interactive-2021-02-19T22_59_23.406358-g7pxp1t4/ImportExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /var/folders/75/_jjmf7hn5n789yrm6nbscs000000gn/T/tfx-interactive-2021-02-19T22_59_23.406358-g7pxp1t4/ImportExampleGen/examples/1) at 0x7fa5d05cd1c0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/var/folders/75/_jjmf7hn5n789yrm6nbscs000000gn/T/tfx-interactive-2021-02-19T22_59_23.406358-g7pxp1t4/ImportExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /var/folders/75/_jjmf7hn5n789yrm6nbscs000000gn/T/tfx-interactive-2021-02-19T22_59_23.406358-g7pxp1t4/ImportExampleGen/examples/1) at 0x7fa5d05cd1c0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/var/folders/75/_jjmf7hn5n789yrm6nbscs000000gn/T/tfx-interactive-2021-02-19T22_59_23.406358-g7pxp1t4/ImportExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/var/folders/75/_jjmf7hn5n789yrm6nbscs000000gn/T/tfx-interactive-2021-02-19T22_59_23.406358-g7pxp1t4/ImportExampleGen/examples/1
.span,0
.split_names,"[""train"", ""eval""]"
.version,0

0,1
['input_base'],/Users/seanwilliams/Github/building-machine-learning-pipelines/data/tfrecords
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }"
['output_data_format'],6
['custom_config'],
['range_config'],
['span'],0
['version'],
['input_fingerprint'],"split:single_split,num_files:1,total_bytes:95287115,xor_checksum:1613797163,sum_checksum:1613797163"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fa593e9c2b0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /var/folders/75/_jjmf7hn5n789yrm6nbscs000000gn/T/tfx-interactive-2021-02-19T22_59_23.406358-g7pxp1t4/ImportExampleGen/examples/1) at 0x7fa5d05cd1c0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/var/folders/75/_jjmf7hn5n789yrm6nbscs000000gn/T/tfx-interactive-2021-02-19T22_59_23.406358-g7pxp1t4/ImportExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /var/folders/75/_jjmf7hn5n789yrm6nbscs000000gn/T/tfx-interactive-2021-02-19T22_59_23.406358-g7pxp1t4/ImportExampleGen/examples/1) at 0x7fa5d05cd1c0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/var/folders/75/_jjmf7hn5n789yrm6nbscs000000gn/T/tfx-interactive-2021-02-19T22_59_23.406358-g7pxp1t4/ImportExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /var/folders/75/_jjmf7hn5n789yrm6nbscs000000gn/T/tfx-interactive-2021-02-19T22_59_23.406358-g7pxp1t4/ImportExampleGen/examples/1) at 0x7fa5d05cd1c0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/var/folders/75/_jjmf7hn5n789yrm6nbscs000000gn/T/tfx-interactive-2021-02-19T22_59_23.406358-g7pxp1t4/ImportExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/var/folders/75/_jjmf7hn5n789yrm6nbscs000000gn/T/tfx-interactive-2021-02-19T22_59_23.406358-g7pxp1t4/ImportExampleGen/examples/1
.span,0
.split_names,"[""train"", ""eval""]"
.version,0
