##### How to convert a large CSV to TFRecord format

In [1]:
import csv

import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# Source CSV and the TFRecord file that will be generated from it.
inp = "/home/tbrownex/data/LSTM/jena_climate.csv"
out = "/home/tbrownex/data/LSTM/jena_climate.tfrecord"

In [3]:
# Only the column names are needed here, so read just the header row
# (nrows=0) instead of loading the whole, potentially large, CSV into memory.
# NOTE: df therefore holds the columns but no data rows.
df = pd.read_csv(inp, nrows=0)

# Build a dictionary specifying column type. Only 1 column is a string, so
# set them all to float and then fix the string column.
featureDict = {col: tf.float32 for col in df.columns}
featureDict['Date Time'] = tf.string

In [4]:
def parseRow(row):
    """Map the positional fields of one CSV row to a dict keyed by column name.

    ``row`` is indexed by position 0..14 and the values are stored unchanged.
    Fields beyond index 14 are ignored; a list with fewer than 15 fields
    raises ``IndexError``.
    """
    columns = (
        'Date Time',
        'p (mbar)',
        'T (degC)',
        'Tpot (K)',
        'Tdew (degC)',
        'rh (%)',
        'VPmax (mbar)',
        'VPact (mbar)',
        'VPdef (mbar)',
        'sh (g/kg)',
        'H2OC (mmol/mol)',
        'rho (g/m**3)',
        'wv (m/s)',
        'max. wv (m/s)',
        'wd (deg)',
    )
    return {name: row[position] for position, name in enumerate(columns)}

In [5]:
def gen():
    """Yield one feature dict per data row of the CSV file at ``inp``.

    The file is streamed row by row, so it never has to fit in memory.
    The first row (the header) is skipped; every remaining non-blank row
    is split into fields and handed to ``parseRow``. Field values are
    yielded as strings.
    """
    # newline='' lets the csv module handle line endings itself, so the last
    # field no longer carries a trailing newline.
    with open(inp, newline='') as f:
        reader = csv.reader(f)
        # Skip the header. The default keeps an empty file from raising
        # StopIteration inside the generator.
        next(reader, None)
        for fields in reader:
            if not fields:
                # csv.reader yields an empty list for a blank line.
                continue
            yield parseRow(fields)

In [6]:
# Build a dataset from gen(); featureDict supplies the dtype of each entry
# in the dicts that the generator yields.
serialized_features_dataset = tf.data.Dataset.from_generator(
    generator=gen,
    output_types=featureDict,
)

In [7]:
def _float_feature(value):
    """Returns a float_list Feature wrapping a single float / double."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)
def _bytes_feature(value):
    """Returns a bytes_list Feature from a string / byte.

    Accepts an EagerTensor holding a string, a ``bytes`` object, or a
    ``str`` (which is UTF-8 encoded first).
    """
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
    if isinstance(value, str):
        # BytesList only accepts bytes, so encode text explicitly.
        value = value.encode('utf-8')
    # The return must sit outside the `if`; otherwise non-tensor input
    # would fall through and yield None.
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

In [9]:
# Serialize every dataset element to a tf.train.Example and append it to
# the TFRecord file at `out`.
with tf.io.TFRecordWriter(out) as writer:
    for record in serialized_features_dataset:
        # String tensors become bytes features; everything else is a float.
        features = {
            name: (
                _bytes_feature(tensor)
                if tensor.dtype == 'string'
                else _float_feature(tensor)
            )
            for name, tensor in record.items()
        }
        example = tf.train.Example(features=tf.train.Features(feature=features))
        writer.write(example.SerializeToString())

##### Read it back in

In [9]:
# Open the TFRecord file at `out` as a dataset of raw records.
raw_dataset = tf.data.TFRecordDataset(filenames=out)

In [10]:
def _parse_function(example_proto):
    """Parse one serialized `tf.train.Example` proto.

    The layout of the record is taken from the module-level
    ``feature_description`` dictionary, which must be defined before this
    function is called.
    """
    return tf.io.parse_single_example(
        serialized=example_proto,
        features=feature_description,
    )

In [6]:
def printDS(parsed_dataset, count=10):
    """Print the first ``count`` elements of ``parsed_dataset``.

    Args:
        parsed_dataset: Object exposing ``take(n)`` that returns an iterable
            of elements.
        count: Number of leading elements to print. Defaults to 10, the
            value that used to be hard-coded.
    """
    for element in parsed_dataset.take(count):
        print(element)

In [11]:
# The keys of featureDict are the column names.
# Describe every column as a scalar float first, then override the one
# column that is not a float (Date Time is a string).
feature_description = {
    name: tf.io.FixedLenFeature([], tf.float32, default_value=0)
    for name in featureDict
}
feature_description['Date Time'] = tf.io.FixedLenFeature([], tf.string, default_value=' ')

In [12]:
# Apply _parse_function to every raw record.
parsed_dataset = raw_dataset.map(map_func=_parse_function)
# Uncomment to inspect the first few parsed records:
#printDS(parsed_dataset)