In [None]:
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

Inspired by https://www.kaggle.com/code/dimitreoliveira/deep-learning-for-time-series-forecasting/notebook

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras import optimizers
from keras.utils import plot_model
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, RepeatVector, TimeDistributed, Flatten
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# Load the TRAIN.csv file (path relative to the current working directory)
# into a pandas DataFrame.
df = pd.read_csv('TRAIN.csv')

### Convert the date columns into a single 'date' column with the associated 'sales'

In [None]:
# Reshape from wide to long: every column other than the three identifier
# columns becomes its own row, with the column header stored in 'date' and
# the cell value stored in 'sales'.
id_columns = ['Item code', 'Category', 'State']
df = pd.melt(frame=df, id_vars=id_columns, var_name='date', value_name='sales')

# Parse the former column headers (day-month-year strings) into datetimes.
df['date'] = pd.to_datetime(df['date'], format='%d-%m-%Y')

print(df.shape)

In [None]:
# Keep only the rows dated from 2021-07-01 through 2022-01-31 (both bounds inclusive).
# .copy() detaches the result from the unfiltered frame, so later column
# assignments operate on an independent DataFrame rather than on a slice
# (which would trigger pandas' SettingWithCopyWarning).
df = df[(df['date'] >= '2021-07-01') & (df['date'] <= '2022-01-31')].copy()
print(df.shape)

### Encode the Item code, State and Category columns

In [None]:
# One fitted LabelEncoder per categorical column, kept so that the integer
# codes can be translated back to the original labels later on.
le_dict = {}

for col in ['Item code', 'State', 'Category']:
    le_dict[col] = LabelEncoder()
    # Replace the column's values with the integer codes produced by its encoder.
    df[col] = le_dict[col].fit_transform(df[col])

# Show the label -> integer code mapping learned for every encoded column.
for col in le_dict:
    le = le_dict[col]
    print(f"For column {col}:")
    mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    print(mapping)
    print("\n")

# df.head()

## EDA

In [None]:
# Print the distinct values of each identifier column, preceded by a header line.
unique_value_headers = {
    'Item code': "Unique values of the 'Item code' column:",
    'Category': "Unique values of the 'Category' column:",
    'State': "Unique values of the 'State' column:",
}

for column_name, header in unique_value_headers.items():
    print(header)
    print(df[column_name].unique())

In [None]:
# Visualise the sales values against their dates as an interactive plotly line chart.
fig = px.line(data_frame=df, x='date', y='sales', title='Sales over time')
fig.show()


In [None]:
# Report the earliest and latest calendar dates present in the 'date' column.
first_date = df['date'].min().date()
last_date = df['date'].max().date()
print('Min date from train set: %s' % first_date)
print('Max date from train set: %s' % last_date)

In [None]:
# Share of rows whose sales value is exactly zero, expressed as a percentage.
zero_sales_count = (df['sales'] == 0).sum()
zero_sales_pct = zero_sales_count / len(df) * 100
print('Percentage of entries with sales = 0: %s' % zero_sales_pct)

In [None]:
# Scatter plot of sales against the (label-encoded) Item code.
fig = px.scatter(data_frame=df, x='Item code', y='sales', title='Item code vs sales')
fig.show()

In [None]:
# Scatter plot of sales against the (label-encoded) Category.
fig = px.scatter(data_frame=df, x='Category', y='sales', title='Category vs sales')
fig.show()

### Convert data to training time series format

In [None]:
# Collapse the data to one row per (Item code, Category, State, date)
# combination, holding the mean of 'sales' over the rows in that combination.
# The result has exactly the columns listed in group_keys plus 'sales'.
group_keys = ['Item code', 'Category', 'State', 'date']
df = (
    df.sort_values('date')
      .groupby(group_keys, as_index=False)['sales']
      .mean()
)

In [None]:
def series_to_supervised(data, window=1, lag=1, dropnan=True):
    """Turn a DataFrame into a sliding-window (supervised learning) layout.

    For every row t of ``data`` the result holds, side by side:
      * the rows t-window ... t-1 (columns named ``<col>(t-i)``),
      * the row t itself (columns named ``<col>(t)``),
      * the row t+lag (columns named ``<col>(t+<lag>)``).

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; rows are shifted purely by position.
    window : int
        Number of past rows to attach to each row.
    lag : int
        How many rows ahead the target row lies.
    dropnan : bool
        When True, rows containing any NaN are removed; this includes the
        rows at the start and end for which the shifted values do not exist.

    Returns
    -------
    pandas.DataFrame
        The concatenated frame with the renamed columns.
    """
    frames = []
    names = []

    # Past rows, oldest first: (t-window), ..., (t-1).
    for i in range(window, 0, -1):
        frames.append(data.shift(i))
        names.extend(('%s(t-%d)' % (col, i)) for col in data.columns)

    # The current row (t).
    frames.append(data)
    names.extend(('%s(t)' % (col)) for col in data.columns)

    # The target row (t+lag); a negative shift pulls future rows upwards.
    frames.append(data.shift(-lag))
    names.extend(('%s(t+%d)' % (col, lag)) for col in data.columns)

    # Stitch the pieces together column-wise and apply the new column names.
    supervised = pd.concat(frames, axis=1)
    supervised.columns = names

    if not dropnan:
        return supervised
    return supervised.dropna()

### Convert the time series into lagged-feature rows whose target is the sales value 90 days ahead

In [None]:
# Each sample uses the current row plus the 59 rows before it as inputs,
# and the row 90 positions ahead as the target.
window = 59
lag = 90

# The 'date' column is left out before windowing; only the identifier and
# sales columns are shifted.
series = series_to_supervised(df.drop(columns='date'), window=window, lag=lag)
print(series.shape)
series.head()

### Avoid windows that span different products, so that one product's history is never used to predict another product's sales

In [None]:
# Column names of the oldest step in each window; used as the reference the
# current and target steps are compared against.
last_Item_code = 'Item code(t-%d)' % window
last_category = 'Category(t-%d)' % window
last_state = 'State(t-%d)' % window

# A row is kept only when its current step (t) and its target step (t+lag)
# carry the same Item code, Category and State as the oldest step.
same_group = (
    (series['Item code(t)'] == series[last_Item_code])
    & (series['Item code(t+%d)'%lag] == series[last_Item_code])
    & (series['Category(t)'] == series[last_category])
    & (series['Category(t+%d)'%lag] == series[last_category])
    & (series['State(t)'] == series[last_state])
    & (series['State(t+%d)'%lag] == series[last_state])
)
series = series[same_group]

print(series.shape)
series.head()

### Drop the Item code, Category and State columns, as they are not needed to model the time series

In [None]:
# Collect every identifier column (target step, all past steps, current step)
# and remove them in a single drop, leaving only the sales columns.
id_names = ['Item code', 'Category', 'State']
columns_to_drop = [('%s(t+%d)' % (col, lag)) for col in id_names]
columns_to_drop += [
    ('%s(t-%d)' % (col, i))
    for i in range(window, 0, -1)
    for col in id_names
]
columns_to_drop += ['Item code(t)', 'Category(t)', 'State(t)']
series = series.drop(columns=columns_to_drop)
print(series.shape)
series.head()

In [None]:
# Separate the target column (sales at t+lag) from the input columns.
labels_col = 'sales(t+%d)' % lag
labels = series[labels_col]
series = series.drop(columns=labels_col)

# 70/30 train/validation split with a fixed seed.
# NOTE(review): train_test_split shuffles rows by default and neighbouring rows
# here are overlapping windows, so validation samples may share most of their
# inputs with training samples — confirm whether a chronological split is wanted.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    series,
    labels.values,
    test_size=0.3,
    random_state=42,
)
for split_label, split_frame in (('Train set shape', X_train), ('Validation set shape', X_valid)):
    print(split_label, split_frame.shape)
X_train.head()

In [None]:
# Append a trailing axis of length 1 so each array has the 3-D layout
# (rows, columns, 1) that is fed to the LSTM input layer.
X_train_series = X_train.values[:, :, np.newaxis]
X_valid_series = X_valid.values[:, :, np.newaxis]
for split_label, split_array in (('Train set shape', X_train_series), ('Validation set shape', X_valid_series)):
    print(split_label, split_array.shape)

In [None]:
# Training hyperparameters.
# Number of epochs passed to fit().
epochs = 10
# Mini-batch size for training (only takes effect when passed to fit() as batch_size).
batch = 256
# Learning rate handed to the Adam optimizer on the next line.
lr = 0.0003
# Optimizer instance used when the model is compiled.
adam = optimizers.Adam(lr)

In [None]:
# Input layout per sample: (number of columns per row, 1), taken from the
# reshaped training array.
n_timesteps, n_features = X_train_series.shape[1:]

# A single 50-unit LSTM layer followed by one linear output unit that
# predicts the target sales value; trained with mean squared error.
model_lstm = Sequential([
    LSTM(50, activation='relu', input_shape=(n_timesteps, n_features)),
    Dense(1),
])
model_lstm.compile(loss='mse', optimizer=adam)
model_lstm.summary()

In [None]:
from keras.callbacks import ModelCheckpoint

# Checkpoint callback: after each epoch, write the model to `filepath` only
# when the validation loss is lower than the best value seen so far.
filepath = "best_model.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# Fit the model. batch_size is passed explicitly so the configured `batch`
# value is actually used; without it fit() falls back to its own default and
# the setting has no effect.
lstm_history = model_lstm.fit(
    X_train_series,
    Y_train,
    validation_data=(X_valid_series, Y_valid),
    epochs=epochs,
    batch_size=batch,
    verbose=2,
    callbacks=[checkpoint],
)

# Plot training and validation loss per epoch. The legend and axis labels are
# needed to tell the two curves apart and to make the figure readable on its own.
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(lstm_history.history['loss'], label='Train')
loss_ax.plot(lstm_history.history['val_loss'], label='Validation')
loss_ax.set_title('Model loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss (MSE)')
loss_ax.legend()
plt.show()

# save the model to disk
# model_lstm.save('model_lstm.h5')

In [None]:
# Load the model stored in 'best_model.hdf5' from disk and rebind model_lstm
# to it, so the cells below evaluate the saved model rather than the
# in-memory one left over from training.
from keras.models import load_model
model_lstm = load_model('best_model.hdf5')

In [None]:
# Predict on both splits with the loaded model.
lstm_train_pred = model_lstm.predict(X_train_series)
lstm_valid_pred = model_lstm.predict(X_valid_series)

# mean_squared_error returns the MSE; take the square root so that the printed
# value really is the RMSE announced by the label (same units as 'sales').
print('Train rmse:', np.sqrt(mean_squared_error(Y_train, lstm_train_pred)))
print('Validation rmse:', np.sqrt(mean_squared_error(Y_valid, lstm_valid_pred)))

In [None]:
def wmape(y_true, y_pred):
    """Weighted mean absolute percentage error, in percent.

    Computed as ``100 * sum(|y_true - y_pred|) / sum(|y_true|)``.

    Parameters
    ----------
    y_true : array-like
        Actual values. Flattened to 1-D before use.
    y_pred : array-like
        Predicted values. Flattened to 1-D before use, so a column vector of
        shape ``(n, 1)`` (as returned by a single-output model's ``predict``)
        is paired element by element with a 1-D ``y_true`` instead of being
        broadcast against it.

    Returns
    -------
    float
        The WMAPE as a plain Python float. ``nan`` is returned when the sum of
        the absolute actual values is zero (including empty input), because
        the metric is undefined in that case.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    # Refuse mismatched inputs rather than silently truncating or broadcasting.
    if actual.shape != predicted.shape:
        raise ValueError(
            'y_true and y_pred must have the same number of elements, got %d and %d'
            % (actual.size, predicted.size)
        )

    total_actual = np.abs(actual).sum()
    if total_actual == 0:
        # Division by zero: the metric has no meaningful value.
        return float('nan')

    total_error = np.abs(actual - predicted).sum()
    return float(100 * total_error / total_actual)

# Evaluate WMAPE on both splits, then report the two values.
train_wmape = wmape(Y_train, lstm_train_pred)
valid_wmape = wmape(Y_valid, lstm_valid_pred)
print('Train WMAPE:', train_wmape)
print('Validation WMAPE:', valid_wmape)