#### Read a CSV file and convert it to a TensorFlow binary (TFRecord) file
Reference: https://indico.io/blog/tensorflow-data-inputs-part1-placeholders-protobufs-queues/

In [1]:
import tensorflow as tf
import pandas as pd

In [2]:
# Load the case-study CSV into a dataframe and report its dimensions.
df = pd.read_csv("/home/tom/data/haganCaseStudy1.csv")
n_records, n_columns = df.shape
print('{} records and {} columns'.format(n_records, n_columns))

67 records and 3 columns


In [3]:
# Tag every row with a 1-based example number in a new 'num' column.
df['num'] = range(1, df.shape[0] + 1)

In [4]:
# Pull each feature column and the target out of the dataframe as its own
# Series. The example number is cast to float so that all four columns share
# a floating-point dtype.
v1, v2, Y = df['v1'], df['v2'], df['y']
num = df['num'].astype('float')

In [5]:
# Pack the whole dataset into ONE tf.train.Example and write it to a binary
# TFRecord file. FloatList takes an iterable, so each feature carries an entire
# dataframe column rather than a single value.
#
# The Example is built and serialized BEFORE the output file is opened, so a
# failure while building it cannot leave behind an empty/truncated file or an
# unclosed writer.
example = tf.train.Example(features=tf.train.Features(
        feature={
            'num': tf.train.Feature(float_list=tf.train.FloatList(value=num)),
            'v1': tf.train.Feature(float_list=tf.train.FloatList(value=v1)),
            'v2': tf.train.Feature(float_list=tf.train.FloatList(value=v2)),
            'Y': tf.train.Feature(float_list=tf.train.FloatList(value=Y))
        }))

serialized = example.SerializeToString()

writer = tf.python_io.TFRecordWriter("/home/tom/data/haganCaseStudy1.tfrecords")
try:
    writer.write(serialized)
finally:
    # Always release the file handle, even if the write itself raises.
    writer.close()

#### Now read the TFRecord file back in and split it into train and test sets

In [10]:
import random
# Configuration for reading the TFRecord file back and splitting it.
# DATA_LOC is joined to DATA_FILE by plain string concatenation when the file
# is opened, so it must keep its trailing slash.
DATA_LOC  = "/home/tom/data/"
DATA_FILE = "haganCaseStudy1.tfrecords"
# NOTE(review): BATCH_SIZE is not referenced by any cell visible here --
# presumably intended for later batching; confirm or remove.
BATCH_SIZE    = 2
# Fraction of records assigned to the test set (passed to create_mask as pct).
TEST_PCT      = 0.25

In [11]:
# "size" is the number of records in the input file
# "pct" is the percent of records to make Test records
def create_mask(size, pct, seed=None):
    """Build a shuffled 0/1 mask marking which records go to the test set.

    Args:
        size: Number of records in the input file; the length of the mask.
        pct: Fraction of records to mark as test, between 0 and 1 inclusive.
        seed: Optional seed for a private random generator, which makes the
            mask reproducible. When None (the default) the module-level
            ``random`` state is used for the shuffle.

    Returns:
        A list of ``size`` ints in which ``int(size * pct)`` entries are 1
        (test) and the remaining entries are 0 (train), in random order.

    Raises:
        ValueError: If ``pct`` is outside the range [0, 1]. Without this check
            an out-of-range value would yield a mask whose length differs
            from ``size``.
    """
    if not 0 <= pct <= 1:
        raise ValueError("pct must be between 0 and 1, got {}".format(pct))

    test_set_size = int(size * pct)
    # Ones first, zeros after -- the shuffle below randomizes the positions.
    mask = [1] * test_set_size + [0] * (size - test_set_size)

    # A dedicated generator keeps a seeded call from disturbing global state.
    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle
    shuffle(mask)
    return mask

In [12]:
# Walk the TFRecord file and decode each serialized Example, pulling out the
# example number, the two features and the target. The names a, b, c and d are
# rebound on every pass, so afterwards they hold the values from the last
# record in the file.
for serialized_rec in tf.python_io.tf_record_iterator(DATA_LOC + DATA_FILE):
    example = tf.train.Example()
    example.ParseFromString(serialized_rec)
    feature_map = example.features.feature
    a, b, c, d = (
        feature_map['num'].float_list.value,
        feature_map['v1'].float_list.value,
        feature_map['v2'].float_list.value,
        feature_map['Y'].float_list.value,
    )

In [13]:
# One 1-D float placeholder per column; they are fed with the values read
# back from the TFRecord file when the session runs.
num = tf.placeholder("float", shape=[None])
v1  = tf.placeholder("float", shape=[None])
v2  = tf.placeholder("float", shape=[None])
y   = tf.placeholder("float", shape=[None])

# Stack the four columns side by side (axis=1) into a single tensor.
t_list = [num, v1, v2, y]
T = tf.pack(t_list, axis=1)

# Random 0/1 row mask: rows marked 0 land in partition 0 (train) and rows
# marked 1 land in partition 1 (test).
mask = create_mask(len(a), TEST_PCT)
train, test = tf.dynamic_partition(T, mask, 2)

# Evaluate both partitions in one run, feeding the decoded columns.
feed = {num: a, v1: b, v2: c, y: d}
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    tr, te = sess.run([train, test], feed_dict=feed)

In [14]:
# Report how many rows ended up in each partition.
for label, part in (('Training set rows: {}', tr),
                    ('Testing  set rows: {}', te)):
    print(label.format(part.shape[0]))

Training set rows: 51
Testing  set rows: 16
