In [1]:
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# Read recipe inputs
# `json_prepared` is the handle to the DSS dataset named "Json_prepared".
json_prepared = dataiku.Dataset("Json_prepared")
# Load the dataset into memory; the resulting frame `df` is what the cells below slice and split.
df = json_prepared.get_dataframe()




In [2]:
import math

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import Sequence
from datetime import timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

import numpy as np
import pandas as pd
import time

import os

In [4]:
# Preview the first rows of the input frame to check which columns are available.
df.head()

Unnamed: 0,Timestamp,date,broker-102/,broker-104/,broker-106/,broker-108/,broker-110/,zook-102/,zook-104/,broker-102/data,broker-104/data,broker-106/data,broker-108/data,broker-110/data,zook-102/data,zook-104/data,broker-101/,broker-103/,broker-105/,broker-107/,broker-109/,zook-101/,zook-103/,zook-105/,broker-101/data,broker-103/data,broker-105/data,broker-107/data,broker-109/data,zook-101/data,zook-103/data,zook-105/data
0,1624601700,2021-06-25 06:15:00+00:00,16.318256,14.317784,15.499574,13.478442,10.100069,3.457606,3.497103,85.198139,74.491702,70.937154,73.32207,88.858424,0.278296,0.216338,15.759843,15.526538,10.510089,13.464054,9.834396,3.489473,3.470773,3.17612,81.194161,76.236908,73.175427,72.768512,90.265277,0.251977,0.236909,0.245222
1,1624602600,2021-06-25 06:30:00+00:00,16.201163,14.317455,15.596986,13.478452,10.100073,3.457596,3.498734,85.195519,74.540221,70.971025,73.390322,88.953916,0.278296,0.216338,15.798915,15.604688,10.588215,13.542199,9.893027,3.469877,3.470783,3.176225,81.216755,76.227287,73.233565,72.715046,90.343407,0.251977,0.236909,0.245222
2,1624603500,2021-06-25 06:45:00+00:00,16.240144,14.200271,15.479296,13.478466,10.021952,3.457639,3.498534,85.368338,74.687954,71.060131,73.450625,89.056802,0.278296,0.216338,15.875768,15.526571,10.480893,13.464078,9.971639,3.489372,3.470907,3.156653,81.287027,76.350551,73.217323,72.744789,90.583015,0.251977,0.236909,0.245222
3,1624604400,2021-06-25 07:00:00+00:00,16.166954,14.239343,15.479368,13.351567,10.002428,3.458865,3.498543,85.503684,74.622017,71.028285,73.530478,89.180835,0.278296,0.216338,15.875782,15.487523,10.519952,13.464083,9.971339,3.470134,3.470964,3.176167,81.417682,76.318796,73.396165,72.921757,90.711837,0.251977,0.236909,0.245222
4,1624605300,2021-06-25 07:15:00+00:00,16.240354,14.317498,15.597,13.405028,10.002437,3.497909,3.499302,85.652764,74.762854,71.062395,73.475029,89.232812,0.278296,0.216338,15.758903,15.45837,10.446685,13.542228,9.893218,3.470191,3.472156,3.176158,81.569016,76.359671,73.45635,72.810311,90.751683,0.251977,0.236909,0.245222


In [5]:

# Chronological train / validation / test split.
# A time series must be split by date rather than at random: the most recent
# 7 days become the test set, the 14 days before that the validation set, and
# everything earlier is used for training.
date_col = df['date']
test_cutoff_date = date_col.max() - timedelta(days=7)
val_cutoff_date = test_cutoff_date - timedelta(days=14)

after_test_cutoff = date_col > test_cutoff_date
after_val_cutoff = date_col > val_cutoff_date

df_test = df[after_test_cutoff]
df_val = df[after_val_cutoff & (date_col <= test_cutoff_date)]
df_train = df[date_col <= val_cutoff_date]

# Report the date range covered by each split as a sanity check.
for template, part in (('Test dates: {} to {}', df_test),
                       ('Validation dates: {} to {}', df_val),
                       ('Train dates: {} to {}', df_train)):
    print(template.format(part['date'].min(), part['date'].max()))

Test dates: 2021-07-29 13:15:00+00:00 to 2021-08-05 13:00:00+00:00
Validation dates: 2021-07-15 13:15:00+00:00 to 2021-07-29 13:00:00+00:00
Train dates: 2021-06-25 06:15:00+00:00 to 2021-07-15 13:00:00+00:00


In [6]:

# Goal of the model:
#  Predict the value of the target series at a specified time step in the future.
#   E.g. to predict the value ten time steps from now (ten minutes ahead for minute-level data),
#       we use the values at t-1, t-2, t-3, ..., t-history_length as inputs and the value at t+10 as the label.


def create_ts_files(dataset,
                    start_index,
                    end_index,
                    history_length,
                    step_size,
                    target_step,
                    num_rows_per_file,
                    data_folder):
    """Write lagged-history training rows for a 1-D series to pickle files.

    For every timestep ``j`` in ``[start_index + history_length, end_index)``
    one row is produced: the history values ``dataset[j-history_length .. j-1]``
    sampled every ``step_size`` positions (oldest first), followed by the
    label ``dataset[j + target_step]``. Rows are chunked into files named
    ``{data_folder}/ts_file{i}.pkl``, each a pickled DataFrame with columns
    ``x_lag<k>`` (``k`` = distance in rows from the label) and ``y``.

    Args:
        dataset: 1-D sequence that supports indexing with a list of integer
            positions (e.g. a NumPy array).
        start_index: First position of the series to use (>= 0). The first
            generated row is ``history_length`` positions after it.
        end_index: Exclusive upper bound for ``j``. ``None`` means
            ``len(dataset) - target_step`` so that every label exists.
        history_length: Number of positions looked back from ``j``.
        step_size: Sampling stride inside the history window (> 0).
        target_step: Offset of the label relative to ``j``.
        num_rows_per_file: Maximum number of rows stored per file (> 0).
        data_folder: Output directory; created when missing.

    Returns:
        The number of lagged input columns per row (the ``y`` column excluded).
    """
    assert step_size > 0
    assert start_index >= 0
    assert num_rows_per_file > 0

    os.makedirs(data_folder, exist_ok=True)

    # Lags are measured from the label position; largest (oldest) lag first so the
    # column order matches the ascending index order used when reading the data.
    time_lags = sorted(range(target_step + 1, target_step + history_length + 1, step_size), reverse=True)
    col_names = [f'x_lag{i}' for i in time_lags] + ['y']

    # The first usable timestep needs history_length earlier samples behind it.
    start_index = start_index + history_length
    if end_index is None:
        end_index = len(dataset) - target_step

    num_rows = len(range(start_index, end_index))
    num_files = math.ceil(num_rows / num_rows_per_file)

    # for each file.
    print(f'Creating {num_files} files.')
    for i in range(num_files):
        filename = f'{data_folder}/ts_file{i}.pkl'

        if i % 10 == 0:
            print(f'{filename}')

        # Row range covered by this file. The offset by start_index is essential:
        # starting at 0 would make j - 1, j - 2, ... negative, and negative positions
        # silently wrap around to the END of a NumPy array, producing corrupt rows
        # (and dropping the last valid timesteps).
        ind0 = start_index + i * num_rows_per_file
        ind1 = min(ind0 + num_rows_per_file, end_index)

        data_list = []
        # j is the current timestep. Needs j-history_length .. j-1 for the history
        # and j + target_step for the target.
        for j in range(ind0, ind1):
            indices = range(j - 1, j - history_length - 1, -step_size)
            data = dataset[sorted(indices) + [j + target_step]]
            data_list.append(data)

        df_ts = pd.DataFrame(data=data_list, columns=col_names)
        df_ts.to_pickle(filename)

    return len(col_names) - 1

In [7]:

%%time

global_active_power = df_train['Global_active_power'].values

# Scaled to work with Neural networks.
scaler = MinMaxScaler(feature_range=(0, 1))
global_active_power_scaled = scaler.fit_transform(global_active_power.reshape(-1, 1)).reshape(-1, )

history_length = 7*24*60  # The history length in minutes.
step_size = 10  # The sampling rate of the history. Eg. If step_size = 1, then values from every minute will be in the history.
                #                                       If step size = 10 then values every 10 minutes will be in the history.
target_step = 10  # The time step in the future to predict. Eg. If target_step = 0, then predict the next timestep after the end of the history period.
                  #                                             If target_step = 10 then predict 10 timesteps the next timestep (11 minutes after the end of history).

# The csv creation returns the number of rows and number of features. We need these values below.
num_timesteps = create_ts_files(global_active_power_scaled,
                                start_index=0,
                                end_index=None,
                                history_length=history_length,
                                step_size=step_size,
                                target_step=target_step,
                                num_rows_per_file=128*100,
                                data_folder='ts_data')

# I found that the easiest way to do time series with tensorflow is by creating pandas files with the lagged time steps (eg. x{t-1}, x{t-2}...) and 
# the value to predict y = x{t+n}. We tried doing it using TFRecords, but that API is not very intuitive and lacks working examples for time series.
# The resulting file using these parameters is over 17GB. If history_length is increased, or  step_size is decreased, it could get much bigger.
# Hard to fit into laptop memory, so need to use other means to load the data from the hard drive.

KeyError: 'Global_active_power'

In [0]:
# Compute recipe outputs from inputs
# TODO: Replace this part by your actual code that computes the output, as a Pandas dataframe
# NB: DSS also supports other kinds of APIs for reading and writing data. Please see doc.

# The input frame is bound to `df` in the first cell. The name `json_prepared_df` used here
# before is never defined in this notebook, so the cell raised a NameError.
predictions_df = df # For this sample code, simply pass the input through as the output


# Write recipe outputs
predictions = dataiku.Dataset("predictions")
predictions.write_with_schema(predictions_df)