# Preprocessing
---  

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
# from keras.preprocessing.sequence import TimeseriesGenerator
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tensorflow.python.keras.utils.data_utils import Sequence
# from tensorflow.keras.model.sequence import TimeseriesGenerator
from sklearn.decomposition import PCA

In [3]:
# Global matplotlib defaults: apply the ggplot theme first, then layer the
# notebook-specific overrides on top of it in a single rcParams update.
plt.style.use('ggplot')
plt.rcParams.update({
    'patch.force_edgecolor': True,  # always outline bars / patches
    'patch.edgecolor': 'black',
    'hist.bins': 'auto',            # default `bins` for histograms
})

In [4]:
# Load the post-wrangling training table (path is relative to the notebook's
# working directory).
train = pd.read_csv(filepath_or_buffer='wrangled_data/training_post_wrangle.csv')

In [5]:
# Model input columns, in the order they will appear along the feature axis.
features = [
    'date_block_num',
    'mean_item',
    'mode_item',
    'item_name',
    'item_category_id',
    'item_category_name',
    'shop_name',
]

# Kept as a one-element list so that selecting it from a DataFrame yields a
# 2-D, single-column frame rather than a Series.
target = ['item_cnt_month']

In [6]:
# Show the columns of the freshly loaded frame (before the date index is built).
train.columns

Index(['shop_id', 'item_id', 'year', 'month', 'item_cnt_month',
       'date_block_num', 'mean_item', 'mode_item', 'item_name',
       'item_category_id', 'item_category_name', 'shop_name'],
      dtype='object')

In [7]:
# Derive a 'YYYY-MM' string key from the year/month columns, drop the now
# redundant parts, and index the frame by (shop_id, item_id, date). Sorting the
# index groups the rows of each shop/item pair together in chronological order
# ('YYYY-MM' strings sort lexicographically in date order).
#
# The vectorised `.dt.strftime` accessor replaces a per-row
# `.apply(lambda x: x.strftime(...))`, and the method chain rebinds `train`
# instead of mutating it with `inplace=True`.
train = (
    train
    .assign(
        date=lambda df: pd.to_datetime(
            df[['year', 'month']].assign(DAY=1)  # to_datetime needs a day component
        ).dt.strftime('%Y-%m')
    )
    .drop(columns=['year', 'month'])
    .set_index(['shop_id', 'item_id', 'date'])
    .sort_index()
)

In [8]:
# Keep the (shop_id, item_id, date) index aside, then slice out the model
# inputs and the target. `target` is a one-element list, so `y` is a
# single-column DataFrame rather than a Series.
ind = train.index
X, y = train.loc[:, features], train.loc[:, target]

In [9]:
# Scale features and target to [0, 1] with *separate* scalers. Refitting a
# single scaler on `y` would discard the feature fit, leaving no way to apply
# the same feature scaling to new data or to invert it.
feature_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()

X = feature_scaler.fit_transform(X)
y = target_scaler.fit_transform(y)

# Backward compatibility: `scaler` has always ended this cell fitted on the
# target, so keep that name pointing at the target scaler (use it with
# `inverse_transform` to map predictions back to item counts).
scaler = target_scaler

# NOTE(review): both scalers are fitted on the full data set, i.e. before the
# train/tune split performed later in the notebook, so tune-set min/max values
# influence the training inputs. Consider fitting on the training rows only.

In [10]:
# Sanity check: with a 2-row window and batch size 1, display the first
# (inputs, target) pair the generator produces.
tf.keras.preprocessing.sequence.TimeseriesGenerator(
    data=X,
    targets=y,
    length=2,
    sampling_rate=1,
    batch_size=1,
)[0]

(array([[[0.        , 0.00215582, 0.        , 0.00135336, 0.48192771,
          0.48192771, 0.        ],
         [0.03030303, 0.00215582, 0.        , 0.00135336, 0.48192771,
          0.48192771, 0.        ]]]),
 array([[0.00967033]]))

In [11]:
# Positional 80/20 split: with shuffle=False the first 80% of rows become the
# training set and the last 20% the tuning set (random_state has no effect
# when shuffling is off).
# NOTE(review): rows are ordered by shop_id first, so this holds out the last
# shops rather than the last months -- confirm that is the intended validation.
X_train, X_tune, y_train, y_tune = train_test_split(
    X,
    y,
    shuffle=False,
    test_size=0.20,
    random_state=123,
)

In [12]:
# Window / batch geometry shared by both generators.
# NOTE(review): the factors 59 and 34 are not explained anywhere in view --
# presumably shop and month counts; confirm and name them as constants.
# NOTE(review): win_length evaluates to 68,204 time steps, so every sample is a
# (68204, num_features) window sliding over the flattened, index-sorted rows
# and therefore spanning many shop/item series. A batch of 1,156 such windows
# is several GB if the arrays are float64 -- verify this is intended.
num_features = len(features)
batch_size = 34 * 34
win_length = 59 * 34 * 34

# Build the training and tuning generators with identical settings; only the
# underlying arrays differ.
train_generator, tune_generator = (
    tf.keras.preprocessing.sequence.TimeseriesGenerator(
        data=inputs,
        targets=labels,
        length=win_length,
        sampling_rate=1,
        batch_size=batch_size,
    )
    for inputs, labels in ((X_train, y_train), (X_tune, y_tune))
)

In [13]:
# Stacked-LSTM regressor: two sequence-returning LSTM layers (each followed by
# a LeakyReLU), a final LSTM that collapses the time axis, and a single-unit
# dense head. Dropout is applied before and after the last LSTM.
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(
        128,
        input_shape=(win_length, num_features),
        return_sequences=True,
    ),
    tf.keras.layers.LeakyReLU(alpha=0.5),
    tf.keras.layers.LSTM(128, return_sequences=True),
    tf.keras.layers.LeakyReLU(alpha=0.5),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.LSTM(64, return_sequences=False),  # emit the last step only
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(1),  # one regression output per window
])

In [18]:
# (rows, columns) of the indexed training frame; the three index levels are not
# counted as columns.
train.shape

(14411682, 8)

In [14]:
# Print each layer's output shape and parameter count to confirm the wiring.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 68204, 128)        69632     
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 68204, 128)        0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 68204, 128)        131584    
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 68204, 128)        0         
_________________________________________________________________
dropout (Dropout)            (None, 68204, 128)        0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0

In [None]:
# Stop training once validation loss has failed to improve for 2 consecutive
# epochs.
# NOTE(review): with epochs=1 below this callback can never trigger; raise the
# epoch count for it to have any effect.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  patience=2,
                                                  mode='min')

# MSE is the optimisation objective; MAE is tracked as a metric.
model.compile(loss=tf.losses.MeanSquaredError(),
              optimizer=tf.optimizers.Adam(),
              metrics=[tf.metrics.MeanAbsoluteError()])

# `Model.fit` accepts generators / `Sequence` objects directly and replaces the
# deprecated `Model.fit_generator`. shuffle=False keeps the batches in their
# original order.
history = model.fit(train_generator, epochs=1,
                    validation_data=tune_generator,
                    shuffle=False,
                    callbacks=[early_stopping])

Instructions for updating:
Please use Model.fit, which supports generators.
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train for 9915 steps, validate for 2435 steps


In [None]:
# Loss and metric values on the tuning generator. `Model.evaluate` accepts
# generators directly and replaces the deprecated `Model.evaluate_generator`.
model.evaluate(tune_generator, verbose=0)

In [None]:
# Predict over the tuning generator. `Model.predict` accepts generators
# directly and replaces the deprecated `Model.predict_generator`.
# The values are in the scaled target space, not raw item counts.
predictions = model.predict(tune_generator)

# Number of predictions produced (one per window yielded by the generator).
predictions.shape[0]