In [1]:
import json
import math
import os
from pprint import pprint
import numpy as np
import tensorflow as tf


# Loading data from memory
## Create the dataset

In [2]:
# Synthetic, noise-free data: N_POINTS inputs 0..N_POINTS-1 and their targets
# on the line y = 2x + 10.
N_POINTS = 10
X = tf.constant(range(N_POINTS), dtype=tf.float32)
Y = X * 2 + 10
# A single newline-separated print shows both tensors, one per line.
print(X, Y, sep="\n")

tf.Tensor([0. 1. 2. 3. 4. 5. 6. 7. 8. 9.], shape=(10,), dtype=float32)
tf.Tensor([10. 12. 14. 16. 18. 20. 22. 24. 26. 28.], shape=(10,), dtype=float32)


Implement a function that takes the following inputs and returns a `tf.data.Dataset`:
* the number of passes over the dataset to train on: `epochs`
* the size of the batches: `batch_size`

**Note:** the last batch may contain fewer than `batch_size` elements.
To discard that incomplete batch, use:
```python
dataset = dataset.batch(batch_size, drop_remainder=True)
```

In [3]:
def create_dataset(X, Y, epochs, batch_size):
    """Build a batched ``tf.data.Dataset`` of ``(x, y)`` pairs held in memory.

    Args:
        X: Features; sliced along the first axis, one element per example.
        Y: Labels, paired element-wise with ``X``.
        epochs: Number of times the example stream is repeated.
        batch_size: Number of examples per batch.

    Returns:
        A dataset yielding ``(x_batch, y_batch)`` tuples. The repeat is applied
        before batching, so a batch can span an epoch boundary; a trailing
        batch shorter than ``batch_size`` is dropped.
    """
    examples = tf.data.Dataset.from_tensor_slices((X, Y))
    repeated = examples.repeat(epochs)
    batched = repeated.batch(batch_size, drop_remainder=True)
    return batched

In [4]:
BATCH_SIZE = 3
EPOCH = 2

dataset = create_dataset(X, Y, EPOCH, BATCH_SIZE)

# Sanity check: every batch must be full, because create_dataset drops a short
# trailing batch.
n_batches = 0
for x, y in dataset:
    print("x:{}".format(x.numpy()), "y:{}".format(y.numpy()))
    assert len(x) == BATCH_SIZE
    assert len(y) == BATCH_SIZE
    n_batches += 1

# The per-batch asserts pass vacuously on an empty dataset, so also pin the
# batch count: N_POINTS * EPOCH examples in groups of BATCH_SIZE, remainder
# dropped.
assert n_batches == (N_POINTS * EPOCH) // BATCH_SIZE

x:[0. 1. 2.] y:[10. 12. 14.]
x:[3. 4. 5.] y:[16. 18. 20.]
x:[6. 7. 8.] y:[22. 24. 26.]
x:[9. 0. 1.] y:[28. 10. 12.]
x:[2. 3. 4.] y:[14. 16. 18.]
x:[5. 6. 7.] y:[20. 22. 24.]


# Create Loss Function and Gradients

In [6]:
def loss_mse(X, Y, w0, w1):
    """Mean squared error of the linear model ``w0 * X + w1`` against ``Y``.

    Args:
        X: Input values.
        Y: Target values, compared element-wise with the predictions.
        w0: Multiplicative coefficient (slope) applied to ``X``.
        w1: Additive term (intercept).

    Returns:
        The mean of the squared residuals as computed by ``tf.reduce_mean``.
    """
    residuals = (w0 * X + w1) - Y
    return tf.reduce_mean(residuals ** 2)

In [9]:
def compute_gradients(X, Y, w0, w1):
    """Differentiate the MSE loss with respect to both model parameters.

    Args:
        X: Input values for the batch.
        Y: Target values for the batch.
        w0: Slope parameter (the notebook passes a ``tf.Variable``).
        w1: Intercept parameter (the notebook passes a ``tf.Variable``).

    Returns:
        The result of ``tape.gradient`` for the sources ``[w0, w1]``; callers
        unpack it as ``dw0, dw1``.
    """
    params = [w0, w1]
    # Only the forward pass has to be recorded by the tape.
    with tf.GradientTape() as tape:
        batch_loss = loss_mse(X, Y, w0, w1)
    return tape.gradient(batch_loss, params)

## Training Loop
Iterate directly over the `tf.data.Dataset` returned by the `create_dataset` function.

In [10]:
EPOCHS = 250
BATCH_SIZE = 2
LEARNING_RATE = 0.02

MSG = "STEP {step} - loss: {loss}, w0: {w0}, w1: {w1}\n"

# Both parameters start at zero; the data was generated as Y = 2 * X + 10, so
# training should drive w0 towards 2 and w1 towards 10.
w0 = tf.Variable(0.0)
w1 = tf.Variable(0.0)

dataset = create_dataset(X, Y, epochs=EPOCHS, batch_size=BATCH_SIZE)

for step, (X_batch, Y_batch) in enumerate(dataset):
    # Mini-batch gradient descent step: w <- w - LEARNING_RATE * dL/dw.
    dw0, dw1 = compute_gradients(X_batch, Y_batch, w0, w1)
    w0.assign_sub(dw0 * LEARNING_RATE)
    w1.assign_sub(dw1 * LEARNING_RATE)

    if step % 100 == 0:
        # The logged loss is measured on the current batch, after the update.
        loss = loss_mse(X_batch, Y_batch, w0, w1)
        print(MSG.format(step=step, loss=loss, w0=w0.numpy(), w1=w1.numpy()))

# Judge convergence on the final parameters over the full data set. The `loss`
# logged inside the loop is a single-batch value from the last multiple-of-100
# step, and it would not exist at all if the loop never ran.
final_loss = loss_mse(X, Y, w0, w1)
assert final_loss < 0.0001
assert abs(w0 - 2) < 0.001
assert abs(w1 - 10) < 0.001

STEP 0 - loss: 109.76800537109375, w0: 0.23999999463558197, w1: 0.4399999976158142

STEP 100 - loss: 9.363959312438965, w0: 2.55655837059021, w1: 6.674341678619385

STEP 200 - loss: 1.393267273902893, w0: 2.2146825790405273, w1: 8.717182159423828

STEP 300 - loss: 0.20730558037757874, w0: 2.082810878753662, w1: 9.505172729492188

STEP 400 - loss: 0.03084510937333107, w0: 2.03194260597229, w1: 9.809128761291504

STEP 500 - loss: 0.004589457996189594, w0: 2.012321710586548, w1: 9.926374435424805

STEP 600 - loss: 0.0006827632314525545, w0: 2.0047526359558105, w1: 9.971602439880371

STEP 700 - loss: 0.00010164896957576275, w0: 2.0018346309661865, w1: 9.989042282104492

STEP 800 - loss: 1.5142451957217418e-05, w0: 2.000706911087036, w1: 9.995771408081055

STEP 900 - loss: 2.256260358990403e-06, w0: 2.0002737045288086, w1: 9.998367309570312

STEP 1000 - loss: 3.3405058275093324e-07, w0: 2.000105381011963, w1: 9.999371528625488

STEP 1100 - loss: 4.977664502803236e-08, w0: 2.000040054321289,

# Loading data from disk
## Loading csv files

### Use tf.data to read csv file
The tf.data API can read CSV files with `tf.data.experimental.make_csv_dataset`.

Step 1: define the feature names in a list, `CSV_COLUMNS`, and their default values in a list, `DEFAULTS`.


In [11]:
# Single table of (column name, default value), in column order. Both lists
# below are derived from it so they always stay positionally aligned.
_COLUMN_SPECS = [
    ('fare_amount', 0.0),
    ('pickup_datetime', 'na'),
    ('pickup_longitude', 0.0),
    ('pickup_latitude', 0.0),
    ('dropoff_longitude', 0.0),
    ('dropoff_latitude', 0.0),
    ('passenger_count', 0.0),
    ('key', 'na'),
]

# Column names passed to make_csv_dataset.
CSV_COLUMNS = [name for name, _ in _COLUMN_SPECS]

# Column singled out as the label; it is one of CSV_COLUMNS.
LABEL_COLUMN = 'fare_amount'

# One single-element list per column: float defaults for the numeric columns,
# the string 'na' for the text columns.
DEFAULTS = [[default] for _, default in _COLUMN_SPECS]

Step 2: wrap the call to `make_csv_dataset` in its own function that takes only the file pattern.

In [12]:
def create_dataset(pattern):
    """Create a CSV-backed dataset for the files matching ``pattern``.

    NOTE: this reuses the name of the in-memory
    ``create_dataset(X, Y, epochs, batch_size)`` defined earlier in the
    notebook and replaces it once this cell has run.

    Args:
        pattern: File pattern forwarded to ``make_csv_dataset``.

    Returns:
        The dataset built by ``tf.data.experimental.make_csv_dataset`` with a
        batch size of 1, ``CSV_COLUMNS`` as the column names and ``DEFAULTS``
        as the per-column defaults.
    """
    return tf.data.experimental.make_csv_dataset(
        file_pattern=pattern,
        batch_size=1,
        column_names=CSV_COLUMNS,
        column_defaults=DEFAULTS,
    )

In [16]:
# Build the CSV dataset for the 'dataset' file pattern. Printing the dataset
# object shows its per-column shapes and dtypes, not the rows themselves.
tempds = create_dataset('dataset')
print(tempds)

<PrefetchDataset shapes: OrderedDict([(fare_amount, (1,)), (pickup_datetime, (1,)), (pickup_longitude, (1,)), (pickup_latitude, (1,)), (dropoff_longitude, (1,)), (dropoff_latitude, (1,)), (passenger_count, (1,)), (key, (1,))]), types: OrderedDict([(fare_amount, tf.float32), (pickup_datetime, tf.string), (pickup_longitude, tf.float32), (pickup_latitude, tf.float32), (dropoff_longitude, tf.float32), (dropoff_latitude, tf.float32), (passenger_count, tf.float32), (key, tf.string)])>
