In [1]:
from __future__ import print_function

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.compat.v1 as tfv1

from sklearn.preprocessing import MinMaxScaler

import time
import os

from datetime import datetime, timedelta
from IPython.display import display

In [2]:
tfv1.disable_eager_execution()
tfv1.disable_v2_behavior()
tfv1.executing_eagerly()

Instructions for updating:
non-resource variables are not supported in the long term


False

In [3]:
df = pd.read_csv("universe_dataset.csv", usecols=["gross_amount", "txn_date"])
df.head()

Unnamed: 0,txn_date,gross_amount
0,2020-05-19,391.5
1,2020-06-19,365.0
2,2020-06-26,156.0
3,2020-07-16,75.0
4,2020-07-17,786.6


In [4]:
%%time

df['txn_date'] = pd.to_datetime(df['txn_date'])

df.sort_values('txn_date', inplace=True, ascending=True)
df = df.reset_index(drop=True)

print('Number of rows and columns after removing missing values:', df.shape)
print('The time series starts from: ', df['txn_date'].min())
print('The time series ends on: ', df['txn_date'].max())
print("\n")

Number of rows and columns after removing missing values: (18, 2)
The time series starts from:  2020-05-19 00:00:00
The time series ends on:  2020-09-03 00:00:00


CPU times: user 4.1 ms, sys: 9 µs, total: 4.11 ms
Wall time: 3.7 ms


In [5]:
# Split into training, validation and test datasets.
# Since it's timeseries we should do it by date.
test_cutoff_date = df['txn_date'].max() - timedelta(days=7)
val_cutoff_date = test_cutoff_date - timedelta(days=14)

df_test = df[df['txn_date'] > test_cutoff_date]
df_val = df[(df['txn_date'] > val_cutoff_date) & (df['txn_date'] <= test_cutoff_date)]
df_train = df[df['txn_date'] <= val_cutoff_date]

#check out the datasets
print('Test dates: {} to {}'.format(df_test['txn_date'].min(), df_test['txn_date'].max()))
print('Validation dates: {} to {}'.format(df_val['txn_date'].min(), df_val['txn_date'].max()))
print('Train dates: {} to {}'.format(df_train['txn_date'].min(), df_train['txn_date'].max()))

df_train.shape, df_val.shape, df_test.shape

Test dates: 2020-08-29 00:00:00 to 2020-09-03 00:00:00
Validation dates: 2020-08-18 00:00:00 to 2020-08-25 00:00:00
Train dates: 2020-05-19 00:00:00 to 2020-08-12 00:00:00


((8, 2), (4, 2), (6, 2))

In [6]:
def windowed_dataset(series, window_size, batch_size, shuffle_buffer):
    if isinstance(series, pd.DataFrame):
        series = series["gross_amount"].values
    
    series = tf.expand_dims(series, axis=-1)
       
    ds = tf.data.Dataset.from_tensor_slices(series)
    ds = ds.window(window_size + 1, shift=1, drop_remainder=True)
    ds = ds.flat_map(lambda w: w.batch(window_size + 1))
#     ds = ds.shuffle(shuffle_buffer)
    ds = ds.map(lambda w: (w[:-1], w[1:]))
    
    return ds.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

In [7]:
tf.keras.backend.clear_session()

# Variables
batch_size = 5
window_size=5
shuffle_buffer_size=1

In [8]:
# tf Graph input

X = tfv1.placeholder(tf.float32, [None, batch_size, 1])
y = tfv1.placeholder(tf.float32, [None, batch_size, 1])

In [9]:
def create_model():
    model = tf.keras.models.Sequential([
        tf.keras.layers.Conv1D(filters=60, kernel_size=5, strides=1, padding="causal", activation="relu", input_shape=[None, 1]),

        tf.keras.layers.LSTM(60, return_sequences=True),
        tf.keras.layers.LSTM(60, return_sequences=True),
        
        tf.keras.layers.Dense(30, activation="relu"),
        tf.keras.layers.Dense(10, activation="relu"),
        tf.keras.layers.Dense(1),

        tf.keras.layers.Lambda(lambda x: x * 400)])
        
    return model

In [10]:
# construct the model
logits = create_model()(X)

# Define loss and optimizer
loss_op = tf.keras.losses.mean_squared_error(y, logits)
optimizer = tfv1.train.AdamOptimizer()

train_op = optimizer.minimize(loss_op)

# Evaluation model 
correct_pred = tf.equal(y, logits)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# initialize the variable (i.e assign their default values)
init = tfv1.global_variables_initializer()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [14]:
sess_config = tfv1.ConfigProto(allow_soft_placement=True, gpu_options=tfv1.GPUOptions(per_process_gpu_memory_fraction=0.5, allow_growth=True))
        

train_set = windowed_dataset(df_train, window_size, batch_size, shuffle_buffer_size)
iterator = tfv1.data.make_initializable_iterator(train_set)

test_set = windowed_dataset(df_test, 3, batch_size, shuffle_buffer_size)
test_iterator = tfv1.data.make_initializable_iterator(test_set)

print_terminal = "Step {}, Minibatch Loss = {:.4f}, Training Loss {:.3f}"
    
# start training
with tfv1.Session(config=sess_config) as sess:
    # Run the initializer
    sess.run(init)
    sess.run(iterator.initializer)
    
    for step in range(1, 5):
        try:
            while True:
                x_batch_train, y_batch_train = sess.run(iterator.get_next())
                
                # Run optimization op (backprop)
                sess.run(train_op, feed_dict={X: x_batch_train, y: y_batch_train})
                
                loss, acc = sess.run([loss_op, accuracy], feed_dict={X: x_batch_train, y: y_batch_train})
                print(loss)
                print(acc)
#                 print(print_terminal.format(step, float(loss), float(acc)))
                
        except tf.errors.OutOfRangeError:
            pass
    
    print("Optimization Finished")
    
#     # Calculate accuracy 
#     sess.run(iterator.initializer)
    
#     while True:
#         for x_batch_test, y_batch_test in sess.run(test_iterator.get_next()):
#             print("Testing Accuracy: ", sess.run(accuracy, feed_dict={X: x_batch_test, y: y_batch_test}))

#     except tf.errors.OutOfRangeError:
#         pass       
        

[[134147.73    24400.607    5518.2944 623343.9      6858.451 ]
 [ 24749.979    4736.143  604131.5      5019.226  926342.56  ]
 [  5834.1943 606796.2      3941.216  913663.6    245980.1   ]]
0.0
Optimization Finished
