# How to load csv data in tensorflow?

In [36]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [42]:
wine_path = './winequality-red.csv'

In [18]:
wines = pd.read_csv('winequality-red.csv')

# First way

1. Read the csv as a pandas dataframe.
2. Convert the dataframe to a dictionary.
3. Create tensor slices from the created dictionary. 

In [29]:
wine_slices = tf.data.Dataset.from_tensor_slices(dict(wines))
wine_slices=wine_slices.batch(1)

In [30]:
for feature_batch in wine_slices.take(1):
    for key, value in feature_batch.items():
        print("{}: {}".format(key, value))

fixed acidity: [7.4]
volatile acidity: [0.7]
citric acid: [0.]
residual sugar: [1.9]
chlorides: [0.076]
free sulfur dioxide: [11.]
total sulfur dioxide: [34.]
density: [0.9978]
pH: [3.51]
sulphates: [0.56]
alcohol: [9.4]
quality: [5]


# Second Way

The second way is useful for scaling up to a large set of files or when one needs a loader that integrates with Tensorflow and the tf.data API.

The implementation involves using the tf.data.experimental.make_csv_dataset function.

The only column you need to identify explicitly is the one with the value that the model is intended to predict (named label_column here).


In [39]:
# experimental.make_csv_dataset

labels = list(range(0,10))





In [40]:
labels

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [55]:
def get_dataset(file_path, label_column, **kwargs):
    """
    Creates a tf dataset from csv in file_path.
    
    file_path : The path to the csv file including the csv.
    
    label_columns : The target column label. 

    """
    
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size = 5,
        label_name = label_column,
        na_value="?",
        num_epochs=1,
        **kwargs)
    
    return dataset

In [56]:
wine_tf_data = get_dataset(file_path = wine_path, label_column = 'quality' )

In [46]:
wine_data

<PrefetchDataset shapes: (OrderedDict([(fixed acidity, (None,)), (volatile acidity, (None,)), (citric acid, (None,)), (residual sugar, (None,)), (chlorides, (None,)), (free sulfur dioxide, (None,)), (total sulfur dioxide, (None,)), (density, (None,)), (pH, (None,)), (sulphates, (None,)), (alcohol, (None,))]), (None,)), types: (OrderedDict([(fixed acidity, tf.float32), (volatile acidity, tf.float32), (citric acid, tf.float32), (residual sugar, tf.float32), (chlorides, tf.float32), (free sulfur dioxide, tf.float32), (total sulfur dioxide, tf.float32), (density, tf.float32), (pH, tf.float32), (sulphates, tf.float32), (alcohol, tf.float32)]), tf.int32)>

In [53]:
def show_batch(dataset):
    for batch,label in dataset.take(1):
        for key, value in batch.items():
            print("{:20s}: {}".format(key,value.numpy()))

In [57]:
show_batch(wine_tf_data)

fixed acidity       : [6.3 9.4 7.4 6.3 7.1]
volatile acidity    : [0.57  0.4   0.55  0.98  0.875]
citric acid         : [0.28 0.47 0.19 0.01 0.05]
residual sugar      : [2.1 2.5 1.8 2.  5.7]
chlorides           : [0.048 0.087 0.082 0.057 0.082]
free sulfur dioxide : [13.  6. 15. 15.  3.]
total sulfur dioxide: [49. 20. 34. 33. 14.]
density             : [0.99374 0.99772 0.99655 0.99488 0.99808]
pH                  : [3.41 3.15 3.49 3.6  3.4 ]
sulphates           : [0.6  0.5  0.68 0.46 0.52]
alcohol             : [12.8 10.5 10.5 11.2 10.2]


In [58]:
wine_tf_data.

<PrefetchDataset shapes: (OrderedDict([(fixed acidity, (None,)), (volatile acidity, (None,)), (citric acid, (None,)), (residual sugar, (None,)), (chlorides, (None,)), (free sulfur dioxide, (None,)), (total sulfur dioxide, (None,)), (density, (None,)), (pH, (None,)), (sulphates, (None,)), (alcohol, (None,))]), (None,)), types: (OrderedDict([(fixed acidity, tf.float32), (volatile acidity, tf.float32), (citric acid, tf.float32), (residual sugar, tf.float32), (chlorides, tf.float32), (free sulfur dioxide, tf.float32), (total sulfur dioxide, tf.float32), (density, tf.float32), (pH, tf.float32), (sulphates, tf.float32), (alcohol, tf.float32)]), tf.int32)>

In [16]:
for feature_batch, label_batch in wine_batches.take(1):
    print("'quality': {}".format(label_batch))
    print("features:")
    for key, value in feature_batch.items():
        print("  {!r:20s}: {}".format(key, value))

'quality': [6 5 5 5]
features:
  'fixed acidity'     : [9.  7.7 7.2 7.3]
  'volatile acidity'  : [0.4   0.54  0.725 0.835]
  'citric acid'       : [0.41 0.26 0.05 0.03]
  'residual sugar'    : [2.   1.9  4.65 2.1 ]
  'chlorides'         : [0.058 0.089 0.086 0.092]
  'free sulfur dioxide': [15. 23.  4. 10.]
  'total sulfur dioxide': [ 40. 147.  11.  19.]
  'density'           : [0.99414 0.99636 0.9962  0.9966 ]
  'pH'                : [3.22 3.26 3.41 3.39]
  'sulphates'         : [0.6  0.59 0.39 0.47]
  'alcohol'           : [12.2  9.7 10.9  9.6]
