In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.metrics import mean_squared_error
import pickle

In [2]:
train = pd.read_csv("../../Data/Kaggle/StoreSales/new_train.csv")
test = pd.read_csv("../../Data/Kaggle/StoreSales/new_test.csv")

In [3]:
train.head()

Unnamed: 0,store_nbr,family,sales,onpromotion,typeholiday,dcoilwtico,city,state,typestores,cluster,day_of_week,day,month,year
0,1,AUTOMOTIVE,0.0,0,Holiday,93.14,Quito,Pichincha,D,13,2,1,1,2013
1,1,BABY CARE,0.0,0,Holiday,93.14,Quito,Pichincha,D,13,2,1,1,2013
2,1,BEAUTY,0.0,0,Holiday,93.14,Quito,Pichincha,D,13,2,1,1,2013
3,1,BEVERAGES,0.0,0,Holiday,93.14,Quito,Pichincha,D,13,2,1,1,2013
4,1,BOOKS,0.0,0,Holiday,93.14,Quito,Pichincha,D,13,2,1,1,2013


In [4]:
test.head()

Unnamed: 0,store_nbr,family,onpromotion,typeholiday,dcoilwtico,city,state,typestores,cluster,day_of_week,day,month,year
0,1,AUTOMOTIVE,0,NDay,46.8,Quito,Pichincha,D,13,3,16,8,2017
1,1,BABY CARE,0,NDay,46.8,Quito,Pichincha,D,13,3,16,8,2017
2,1,BEAUTY,2,NDay,46.8,Quito,Pichincha,D,13,3,16,8,2017
3,1,BEVERAGES,20,NDay,46.8,Quito,Pichincha,D,13,3,16,8,2017
4,1,BOOKS,0,NDay,46.8,Quito,Pichincha,D,13,3,16,8,2017


In [5]:
with open('pickle/family_encoder.pkl', 'rb') as file:
    family_encoder = pickle.load(file)

with open('pickle/typeholiday_encoder.pkl', 'rb') as file:
    typeholiday_encoder = pickle.load(file)

with open('pickle/city_encoder.pkl', 'rb') as file:
    city_encoder = pickle.load(file)

with open('pickle/state_encoder.pkl', 'rb') as file:
    state_encoder = pickle.load(file)

with open('pickle/typestores_encoder.pkl', 'rb') as file:
    typestores_encoder = pickle.load(file)

In [6]:
train['family'] = family_encoder.transform(train['family'])
train['typeholiday'] = typeholiday_encoder.transform(train['typeholiday'])
train['city'] = city_encoder.transform(train['city'])
train['state'] = state_encoder.transform(train['state'])
train['typestores'] = typestores_encoder.transform(train['typestores'])

In [7]:
train.head()

Unnamed: 0,store_nbr,family,sales,onpromotion,typeholiday,dcoilwtico,city,state,typestores,cluster,day_of_week,day,month,year,family_encoded,typeholiday_encoded,city_encoded,state_encoded,typestores_encoded
0,1,AUTOMOTIVE,0.0,0,Holiday,93.14,Quito,Pichincha,D,13,2,1,1,2013,0,3,18,12,3
1,1,BABY CARE,0.0,0,Holiday,93.14,Quito,Pichincha,D,13,2,1,1,2013,1,3,18,12,3
2,1,BEAUTY,0.0,0,Holiday,93.14,Quito,Pichincha,D,13,2,1,1,2013,2,3,18,12,3
3,1,BEVERAGES,0.0,0,Holiday,93.14,Quito,Pichincha,D,13,2,1,1,2013,3,3,18,12,3
4,1,BOOKS,0.0,0,Holiday,93.14,Quito,Pichincha,D,13,2,1,1,2013,4,3,18,12,3


In [8]:
# for column in train.columns:
#     unique_values = train[column].unique()
#     print(f"Unique values for {column}:", unique_values, "\n")

In [9]:
train = train.drop(columns=['family', 'typeholiday', 'city', 'state', 'typestores'])

In [10]:
train.head()

Unnamed: 0,store_nbr,sales,onpromotion,dcoilwtico,cluster,day_of_week,day,month,year,family_encoded,typeholiday_encoded,city_encoded,state_encoded,typestores_encoded
0,1,0.0,0,93.14,13,2,1,1,2013,0,3,18,12,3
1,1,0.0,0,93.14,13,2,1,1,2013,1,3,18,12,3
2,1,0.0,0,93.14,13,2,1,1,2013,2,3,18,12,3
3,1,0.0,0,93.14,13,2,1,1,2013,3,3,18,12,3
4,1,0.0,0,93.14,13,2,1,1,2013,4,3,18,12,3


In [11]:
features = train.drop('sales', axis=1)
target = train['sales']

In [12]:
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(features)

In [13]:
sequence_length = 10

In [14]:
sequences = []
target_values = []

In [15]:
for i in range(len(scaled_features) - sequence_length):
    seq = scaled_features[i:i+sequence_length]
    label = target.iloc[i+sequence_length]
    sequences.append(seq)
    target_values.append(label)

In [16]:
sequences = np.array(sequences)
target_values = np.array(target_values)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(sequences, target_values, test_size=0.2, random_state=42)

In [18]:
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2])
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2])

In [19]:
model = Sequential()
model.add(LSTM(units=50, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(units=1))

model.compile(optimizer='adam', loss='mean_squared_error')

In [20]:
model.fit(X_train, y_train, epochs=50, batch_size=5000, validation_data=(X_test, y_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x16795add0>

In [21]:
loss = model.evaluate(X_test, y_test)
print(f'Mean Squared Error on Test Set: {loss}')

Mean Squared Error on Test Set: 827810.375


In [22]:
y_pred = model.predict(X_test)



In [25]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

In [26]:
print(f'RMSE: {rmse}')

RMSE: 909.8411243007067


In [27]:
# from joblib import dump, load

# dump(model, 'joblib/M11.joblib')

['joblib/M11.joblib']

In [28]:
test['family_encoded'] = family_encoder.transform(test['family'])
test['typeholiday_encoded'] = typeholiday_encoder.transform(test['typeholiday'])
test['city_encoded'] = city_encoder.transform(test['city'])
test['state_encoded'] = state_encoder.transform(test['state'])
test['typestores_encoded'] = typestores_encoder.transform(test['typestores'])

In [29]:
test.head()

Unnamed: 0,store_nbr,family,onpromotion,typeholiday,dcoilwtico,city,state,typestores,cluster,day_of_week,day,month,year,family_encoded,typeholiday_encoded,city_encoded,state_encoded,typestores_encoded
0,1,AUTOMOTIVE,0,NDay,46.8,Quito,Pichincha,D,13,3,16,8,2017,0,4,18,12,3
1,1,BABY CARE,0,NDay,46.8,Quito,Pichincha,D,13,3,16,8,2017,1,4,18,12,3
2,1,BEAUTY,2,NDay,46.8,Quito,Pichincha,D,13,3,16,8,2017,2,4,18,12,3
3,1,BEVERAGES,20,NDay,46.8,Quito,Pichincha,D,13,3,16,8,2017,3,4,18,12,3
4,1,BOOKS,0,NDay,46.8,Quito,Pichincha,D,13,3,16,8,2017,4,4,18,12,3


In [30]:
test = test.drop(columns=['family', 'typeholiday', 'city', 'state', 'typestores'])

In [31]:
test.head()

Unnamed: 0,store_nbr,onpromotion,dcoilwtico,cluster,day_of_week,day,month,year,family_encoded,typeholiday_encoded,city_encoded,state_encoded,typestores_encoded
0,1,0,46.8,13,3,16,8,2017,0,4,18,12,3
1,1,0,46.8,13,3,16,8,2017,1,4,18,12,3
2,1,2,46.8,13,3,16,8,2017,2,4,18,12,3
3,1,20,46.8,13,3,16,8,2017,3,4,18,12,3
4,1,0,46.8,13,3,16,8,2017,4,4,18,12,3


In [32]:
test_features = scaler.transform(test)

In [33]:
test_sequences = []

In [34]:
for i in range(len(test_features) - sequence_length):
    seq = test_features[i:i+sequence_length]
    test_sequences.append(seq)

In [35]:
test_sequences = np.array(test_sequences)

In [36]:
test_predictions = model.predict(test_sequences)



In [37]:
print(test_predictions)

[[ 161.78653  ]
 [ 158.57321  ]
 [1105.7389   ]
 ...
 [1090.6938   ]
 [  -2.1627312]
 [  15.950512 ]]
