In [None]:
# Initial sequence settings.
# NOTE(review): both names are reassigned further down (timesteps = 14,
# startDay = 0) before any cell reads them, so the values set here never
# take effect.
timesteps = 28
startDay = 0

In [None]:
import pandas as pd
import numpy as np
import sklearn as skl
import matplotlib.pyplot as plt
import scipy as sc
import gc #importing garbage collector
import time
import sys
from scipy import signal
from itertools import chain


import warnings
# Silence every warning for the rest of the session. This also hides pandas
# deprecation and chained-assignment warnings, so re-enable warnings when
# debugging.
warnings.filterwarnings('ignore')

%matplotlib inline  

# Seed value for reproducibility.
# NOTE(review): no cell shown in this notebook passes SEED to a random generator.
SEED = 42
# Pandas - display more rows and columns
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Integer columns are narrowed to the first of int8/int16/int32/int64 whose
    range contains the column's minimum and maximum (bounds inclusive, so a
    column spanning exactly -128..127 becomes int8). Float columns are narrowed
    to float16 or float32 when the value range fits, otherwise float64.
    Non-numeric columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the resulting memory footprint and the reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # If no candidate matches (e.g. an empty column whose min/max are
            # NaN) the column keeps its current dtype.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Only the value *range* is checked here. float16 keeps roughly
            # three significant decimal digits, so values may be rounded.
            target = np.float64
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Read the three competition CSVs and downcast each one's numeric columns
# right after it is loaded.
raw_paths = (
    'm5-forecasting-accuracy/sales_train_evaluation.csv',
    'm5-forecasting-accuracy/sell_prices.csv',
    'm5-forecasting-accuracy/calendar.csv',
)
df_train, df_prices, df_days = (
    reduce_mem_usage(pd.read_csv(csv_path)) for csv_path in raw_paths
)

In [None]:
# Transpose so that each original column becomes a row.
# NOTE: not idempotent - re-running this cell flips the frame back.
df_train = df_train.transpose()

In [None]:
# Peek at the top-left 10x10 corner of the transposed frame.
df_train.iloc[:10,:10]

In [None]:
# (rows, columns) of the transposed sales frame.
df_train.shape

In [None]:
# Last ten rows of the calendar frame.
df_days.tail(10)

In [None]:
# (rows, columns) of the calendar frame.
df_days.shape

## Feature Engineering

In [None]:
# Flag rows whose `wday` is greater than 2 as working days.
# NOTE(review): assumes `wday` codes 1 and 2 are the weekend days - confirm
# against the calendar file.
# A single vectorised assignment replaces the chained
# `df[col].loc[mask] = 1` form, which writes to a temporary object and is
# silently ignored when pandas Copy-on-Write is active.
df_days['is_workday'] = (df_days['wday'] > 2).astype(np.int8)


In [None]:
# 1 when the day has a primary event name, 0 when `event_name_1` is missing.
df_days['is_event_day'] = df_days['event_name_1'].notnull().astype(np.int8)

In [None]:
# Parse the `date` column into datetime64 so the `.dt` accessors used below work.
df_days["date"] = pd.to_datetime(df_days['date'])

In [None]:
# ISO week number of each calendar day.
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` is the supported replacement and yields the same
# ISO week numbers.
df_days['week'] = df_days["date"].dt.isocalendar().week.astype(np.int8)


In [None]:
# Number of event days in the ISO week each row belongs to.
# The ISO week number must be paired with the ISO year, not the calendar
# `year` column: days around New Year can carry week 52/53 or week 1 of the
# neighbouring year, and grouping them by calendar year would merge them with
# an unrelated week twelve months away.
iso_calendar = df_days['date'].dt.isocalendar()
df_days['num_events_week'] = (
    df_days.groupby([iso_calendar['year'], iso_calendar['week']])['is_event_day']
    .transform('sum')
    .astype(np.int8)
)
del iso_calendar

In [None]:
# 1 when the row's week contains at least one event day, else 0.
df_days['is_event_week'] = (df_days['num_events_week'] > 0).astype(np.int8)

In [None]:
# Column dtypes, non-null counts and memory footprint of the calendar frame.
df_days.info()

In [None]:
# Index the calendar by date; the day-before / day-after flags below rely on
# shifting this DatetimeIndex.
# NOTE: not re-runnable - once `date` is the index the column no longer exists.
df_days = df_days.set_index('date')

In [None]:
# Dates that fall one day after an event day.
day_after_event = df_days.index[df_days['is_event_day'] == 1].shift(1, freq='D')
# Build the flag in one vectorised assignment. The previous chained form
# `df[col][mask] = 1` writes to a temporary object and is silently ignored
# when pandas Copy-on-Write is active.
df_days['is_event_day_after'] = df_days.index.isin(day_after_event).astype(np.int8)

del day_after_event

In [None]:
# Dates that fall one day before an event day.
day_before_event = df_days.index[df_days['is_event_day'] == 1].shift(-1, freq='D')
# Build the flag in one vectorised assignment. The previous chained form
# `df[col][mask] = 1` writes to a temporary object and is silently ignored
# when pandas Copy-on-Write is active.
df_days['is_event_day_before'] = df_days.index.isin(day_before_event).astype(np.int8)

del day_before_event

In [None]:
# One int8 flag per event category: 1 when either event-type column matches.
event_flag_columns = {
    "is_sport_event": "Sporting",
    "is_cultural_event": "Cultural",
    "is_national_event": "National",
    "is_religious_event": "Religious",
}
for flag_column, event_kind in event_flag_columns.items():
    matches_kind = (df_days["event_type_1"] == event_kind) | (df_days["event_type_2"] == event_kind)
    df_days.loc[:, flag_column] = matches_kind.astype("int8")

In [None]:
# Preview the week / month / year columns of the calendar.
df_days[['wm_yr_wk','month','year']].head()

In [None]:
# Force a garbage-collection pass after the feature-engineering cells.
gc.collect()

In [None]:
# Keep an independent copy of the transposed sales frame.
# NOTE(review): no later cell shown here reads df_train_full before it is
# deleted, so this copy only costs memory - confirm it is still needed.
df_train_full = df_train.copy()

## Generating Train and Test Data — Option A: Limited Features

In [None]:
# Effective settings for the rest of the notebook (these override the values
# assigned in the first cell).
# startDay: offset used below when slicing the day rows and event flags.
# timesteps: length of the sliding input window for each training sample.
startDay = 0
timesteps = 14

In [None]:
# Split the "day before an event" flag by position: rows [startDay, 1941) go
# with the training data, rows [1941, 1969) are kept for the forecast loop.
# NOTE(review): 1941 is presumably the number of day rows in df_train - confirm.
daysBeforeEventTest = df_days['is_event_day_before'].iloc[1941:1969]
daysBeforeEvent = df_days['is_event_day_before'].iloc[startDay:1941]
# Re-label the training flags with df_train's row labels (skipping its first
# six rows) so the two objects align when concatenated.
daysBeforeEvent.index = df_train.index[6:][startDay:1941]

In [None]:
# Append the event flag as one extra column, aligned on the row labels.
# Rows of df_train with no matching label in daysBeforeEvent get NaN there.
df_final = pd.concat([df_train, daysBeforeEvent], axis = 1)
df_final.columns

In [None]:
# Drop the first `startDay` rows by position (a no-op while startDay == 0).
# NOTE(review): for startDay > 0 this removes rows from the very top of the
# frame, i.e. the same leading rows that the scaling cell skips again with
# `.iloc[6:]` - verify the intended behaviour before changing startDay.
df_final = df_final[startDay:]

In [None]:
# Feature scaling
# Scale every column to the 0-1 range with a min-max scaler. The first six
# rows of df_final are skipped; the fitted scaler is reused later to transform
# the forecast inputs and to invert the predictions.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range = (0, 1))
dt_scaled = scaler.fit_transform(df_final.iloc[6:,:])

In [None]:
# (rows, columns) of the scaled array.
dt_scaled.shape

In [None]:
# Build sliding-window samples: each input is the `timesteps` rows preceding
# row `end`; the target is row `end` without its last column (the event flag
# appended to df_final).
n_targets = dt_scaled.shape[1] - 1
window_ends = range(timesteps, 1941 - startDay)
X_train = [dt_scaled[end - timesteps:end] for end in window_ends]
y_train = [dt_scaled[end][0:n_targets] for end in window_ends]


In [None]:
# Stack the per-sample lists into arrays for Keras.
X_train = np.array(X_train)
y_train = np.array(y_train)
print('Shape of X_train :'+str(X_train.shape))
# The second message used to be labelled "X_train" while printing y_train's shape.
print('Shape of y_train :'+str(y_train.shape))

In [None]:
# Force a garbage-collection pass after building the training arrays.
gc.collect()

## Submission file

In [None]:
# Seed window for forecasting: the last `timesteps` rows of df_final, scaled
# with the scaler that was fitted on the training rows.
inputs = scaler.transform(df_final[-timesteps:])


In [None]:
# Shape of the seed window taken from df_final (before scaling).
df_final[-timesteps:].shape

In [None]:
# Force a garbage-collection pass before the modeling section.
gc.collect()

## Modeling

In [None]:
from keras import backend as K

In [None]:
# RMSE loss built from Keras backend ops
def root_mean_squared_error(y_true, y_pred):
    """Return the root of the mean squared difference between the two tensors.

    The mean is taken over all elements, so the result is a single scalar.
    """
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

In [None]:
# Release the large frames and arrays that are no longer needed before the
# model is built. This also unbinds the names time, sys, signal and
# reduce_mem_usage.
# NOTE: not re-runnable - a second execution raises NameError because the
# names are already gone.
del  df_final, df_train_full, df_days, df_train, dt_scaled, time, sys, signal, reduce_mem_usage

In [None]:
# Reclaim the memory of the objects deleted above.
gc.collect()

In [None]:
# Importing the Keras libraries and packages
import tensorflow_probability as tfp
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout


In [None]:
# Importing the Keras libraries and packages
import tensorflow_probability as tfp
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout

# Layer widths of the three stacked LSTM layers
layer_1_units = 40
layer_2_units = 400
layer_3_units = 400

# Stacked LSTM regressor: every LSTM layer is followed by 20% dropout. The
# first two LSTM layers return full sequences so the next LSTM receives one
# vector per timestep; the last one returns only its final state.
model = Sequential([
    LSTM(
        units=layer_1_units,
        return_sequences=True,
        input_shape=(X_train.shape[1], X_train.shape[2]),
    ),
    Dropout(0.2),
    LSTM(units=layer_2_units, return_sequences=True),
    Dropout(0.2),
    LSTM(units=layer_3_units),
    Dropout(0.2),
    # Output layer: one unit per target column
    Dense(units=y_train.shape[1]),
])

# Compile with Adam and the custom RMSE loss
# alternative loss 'mse' or wrmsse
model.compile(optimizer='adam', loss=root_mean_squared_error)

In [None]:
# Plot the training loss per epoch.
# NOTE(review): `fit` is not defined by any cell shown in this notebook - it
# is presumably the History object returned by a `model.fit(X_train, y_train,
# ...)` call in a cell that is missing. On a fresh kernel this cell raises
# NameError and the model below predicts with untrained weights; restore the
# training cell before this one.
plt.plot(fit.history['loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train'], loc='upper left')
plt.show()

In [None]:
# List the layer objects of the model.
model.layers

In [None]:
# Force a garbage-collection pass before forecasting.
gc.collect()

## Modeling with Limited Features

In [None]:
# Recursive multi-step forecast: predict one day, append the prediction
# (plus that day's known event flag) to the input sequence, then slide the
# window forward by one day.
n_features = inputs.shape[1]   # target columns + the event-flag column
n_series = n_features - 1      # target columns only
forecast_horizon = 28          # number of days to forecast

X_test = np.array([inputs[0:timesteps]])   # shape (1, timesteps, n_features)
predictions = []

for j in range(timesteps, timesteps + forecast_horizon):
    window = X_test[0, j - timesteps:j].reshape(1, timesteps, n_features)
    predicted_volume = model.predict(window)
    # `.iloc` selects by position explicitly; plain `series[int]` on a
    # datetime-indexed Series relies on a deprecated positional fallback.
    event_flag = daysBeforeEventTest.iloc[j - timesteps]
    testInput = np.column_stack((np.array(predicted_volume), event_flag))
    X_test = np.append(X_test, testInput).reshape(1, j + 1, n_features)
    # Undo the min-max scaling and drop the event-flag column.
    predicted_volume = scaler.inverse_transform(testInput)[:, 0:n_series]
    predictions.append(predicted_volume)

In [None]:

# `submission` was previously used without ever being created. Build it from
# the forecast loop's output: `predictions` holds one (1, n_series) array per
# forecast day, so stacking them gives one row per day.
submission = pd.DataFrame(np.concatenate(predictions, axis=0))

# One row per series, one column per forecast day.
submission = submission.T

# Duplicate the block of rows so the frame is twice as long.
# NOTE(review): presumably the sample file lists every series twice
# (validation ids followed by evaluation ids) - confirm.
submission = pd.concat((submission, submission), ignore_index=True)

sample_submission = pd.read_csv('m5-forecasting-accuracy/sample_submission.csv')

# The id column is attached by row position, so a length mismatch would
# silently produce missing ids; fail loudly instead.
assert len(submission) == len(sample_submission), "prediction rows do not match sample_submission"

idColumn = sample_submission[["id"]]

submission[["id"]] = idColumn

# Move the id column to the front.
cols = list(submission.columns)
cols = cols[-1:] + cols[:-1]
submission = submission[cols]

# Final header: id, F1 ... F28
colsdeneme = ["id"] + [f"F{i}" for i in range (1,29)]

submission.columns = colsdeneme

submission.to_csv("submission_evaluation_newloss_nodrop.csv", index=False)