In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
plt.style.use('ggplot')
plt.rc('patch', force_edgecolor=True,edgecolor='black')
plt.rc('hist', bins='auto')

In [3]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import tensorflow as tf
from keras import optimizers
from keras.utils import plot_model
from keras.models import Sequential, Model
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import Dense, LSTM, RepeatVector, TimeDistributed, Flatten, Dropout

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

Using TensorFlow backend.


In [None]:
train = pd.read_csv('wrangled_data/training_post_wrangle_small.csv')

test_full = pd.read_csv('wrangled_data/testing_post_wrangle_small.csv')

test_final = test_full.loc[:,['ID', 'shop_id', 'item_id']]

In [None]:
features = ['date_block_num', 'mean_item', 'mode_item', 'item_name', 'item_category_id', 'item_category_name', 'shop_name']

target = ['item_cnt_month']

values = ['item_cnt_month', 'mean_item', 'mode_item', 'item_name', 'item_category_id', 'item_category_name', 'shop_name']

In [9]:
train.columns

Index(['ID', 'shop_id', 'item_id', 'year', 'month', 'item_cnt_month',
       'date_block_num', 'mean_item', 'mode_item', 'item_name',
       'item_category_id', 'item_category_name', 'shop_name'],
      dtype='object')

In [18]:
train = train.set_index(['ID', 'shop_id', 'item_id', 'year', 'month']).sort_index(axis=0, level=['ID', 'year', 'month'])

In [19]:
train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,item_cnt_month,date_block_num,mean_item,mode_item,item_name,item_category_id,item_category_name,shop_name
ID,shop_id,item_id,year,month,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,5,5037,2013,1,0.0,0.0,0.382353,0.0,1195,19,11,3
0,5,5037,2013,2,0.0,1.0,0.382353,0.0,1195,19,11,3
0,5,5037,2013,3,0.0,2.0,0.382353,0.0,1195,19,11,3
0,5,5037,2013,4,0.0,3.0,0.382353,0.0,1195,19,11,3
0,5,5037,2013,5,0.0,4.0,0.382353,0.0,1195,19,11,3


In [49]:
train = train.pivot_table(index = ['ID','shop_id','item_id'], 
                  values = values, columns =['date_block_num'])

In [45]:
# np.expand_dims(train.values, axis=2).shape

In [46]:
# train.droplevel(['shop_id', 'item_id', 'year', 'month'])

In [47]:
# train.to_numpy()

In [48]:
# train.reset_index().set_index('ID').loc[0].to_numpy()

In [50]:
train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,item_category_id,item_category_id,item_category_id,item_category_id,item_category_id,item_category_id,item_category_id,item_category_id,item_category_id,item_category_id,...,shop_name,shop_name,shop_name,shop_name,shop_name,shop_name,shop_name,shop_name,shop_name,shop_name
Unnamed: 0_level_1,Unnamed: 1_level_1,date_block_num,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,24.0,25.0,26.0,27.0,28.0,29.0,30.0,31.0,32.0,33.0
ID,shop_id,item_id,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
0,5,5037,19,19,19,19,19,19,19,19,19,19,...,3,3,3,3,3,3,3,3,3,3
1,5,5320,55,55,55,55,55,55,55,55,55,55,...,3,3,3,3,3,3,3,3,3,3
2,5,5233,19,19,19,19,19,19,19,19,19,19,...,3,3,3,3,3,3,3,3,3,3
3,5,5232,23,23,23,23,23,23,23,23,23,23,...,3,3,3,3,3,3,3,3,3,3
4,5,5268,20,20,20,20,20,20,20,20,20,20,...,3,3,3,3,3,3,3,3,3,3


In [56]:
train.reset_index(drop=True, inplace=True)

In [57]:
train.head()

Unnamed: 0_level_0,item_category_id,item_category_id,item_category_id,item_category_id,item_category_id,item_category_id,item_category_id,item_category_id,item_category_id,item_category_id,...,shop_name,shop_name,shop_name,shop_name,shop_name,shop_name,shop_name,shop_name,shop_name,shop_name
date_block_num,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,24.0,25.0,26.0,27.0,28.0,29.0,30.0,31.0,32.0,33.0
0,19,19,19,19,19,19,19,19,19,19,...,3,3,3,3,3,3,3,3,3,3
1,55,55,55,55,55,55,55,55,55,55,...,3,3,3,3,3,3,3,3,3,3
2,19,19,19,19,19,19,19,19,19,19,...,3,3,3,3,3,3,3,3,3,3
3,23,23,23,23,23,23,23,23,23,23,...,3,3,3,3,3,3,3,3,3,3
4,20,20,20,20,20,20,20,20,20,20,...,3,3,3,3,3,3,3,3,3,3


In [83]:
print(train.columns[0], train.columns[34], train.columns[68], train.columns[34+68], 
     train.columns[2*68], train.columns[5*34], train.columns[6*34])

('item_category_id', 0.0) ('item_category_name', 0.0) ('item_cnt_month', 0.0) ('item_name', 0.0) ('mean_item', 0.0) ('mode_item', 0.0) ('shop_name', 0.0)


In [58]:
train.shape

(214200, 238)

In [91]:
# pd.DataFrame([train.iloc[:,33], train.iloc[:,2*33], train.iloc[:,3*33],
#              train.iloc[:,4*33], train.iloc[:,5*33], train.iloc[:,6*33],
#              train.iloc[:,7*33]]).T

Unnamed: 0,"(item_category_id, 33.0)","(item_category_name, 32.0)"
0,19,11
1,55,39
2,19,11
3,23,15
4,20,12
...,...,...
214195,55,39
214196,64,47
214197,55,39
214198,40,30


In [59]:
scaler = RobustScaler().fit(train)
dataset_scaled = scaler.transform(train)

In [62]:
# X we will keep all columns execpt the last one 
X_train = np.expand_dims(dataset_scaled[:,:-1],axis = 2)
# the last column is our label
y_train = dataset_scaled[:,-1:]

# for test we keep all the columns execpt the first one
X_test = np.expand_dims(dataset_scaled[:,1:],axis = 2)

# lets have a look on the shape 
print(X_train.shape,y_train.shape,X_test.shape)

(214200, 237, 1) (214200, 1) (214200, 237, 1)


In [63]:
model_lstm = tf.keras.Sequential()
model_lstm.add(tf.keras.layers.LSTM(128, input_shape = (X_train.shape[1], X_train.shape[2]), return_sequences=True))
model_lstm.add(tf.keras.layers.LeakyReLU(alpha=0.5))
model_lstm.add(tf.keras.layers.LSTM(128, return_sequences=True))
model_lstm.add(tf.keras.layers.LeakyReLU(alpha=0.5))
model_lstm.add(tf.keras.layers.Dropout(0.3))
model_lstm.add(tf.keras.layers.LSTM(64, return_sequences=False))
model_lstm.add(tf.keras.layers.Dropout(0.3))
model_lstm.add(tf.keras.layers.Dense(1))

model_lstm.compile(loss = 'mse',optimizer = 'adam', metrics = ['mean_squared_error']) #Starting learning rate
model_lstm.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 237, 128)          66560     
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 237, 128)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 237, 128)          131584    
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 237, 128)          0         
_________________________________________________________________
dropout (Dropout)            (None, 237, 128)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0

In [65]:
history_lstm = model_lstm.fit(X_train,y_train,batch_size = 4096,epochs = 5)

MemoryError: Unable to allocate 387. MiB for an array with shape (214200, 237, 1) and data type float64

In [None]:
# Plot the loss curves for training
plt.plot(history_lstm.history['loss'], color='b', label="Training loss")
plt.legend(loc='best', shadow=True)

In [None]:
# creating submission file 
submission_pfs = model_lstm.predict(X_test)
# we will keep every value between 0 and 20
submission_pfs = submission_pfs.clip(0,20)

In [None]:
fullset = np.concatenate((dataset_scaled, submission_pfs), 1)

In [None]:
fullset[:,1:].shape

In [None]:
submission_unscaled = scaler.inverse_transform(fullset[:,1:])

In [None]:
submission_unscaled[:,-1].shape

In [None]:
# creating dataframe with required columns 
submission = pd.DataFrame({'ID':test['ID'],'item_cnt_month':submission_unscaled[:,-1].ravel()})
# creating csv file from dataframe
submission.to_csv('submission_files/robustscale_feature_eng.csv',index = False)

In [None]:
submission.head(3)

In [None]:
submission.shape, test.shape