In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the pre-processed dataset; the first CSV column becomes the index
# (the rendered output shows it is named `order_purchase_timestamp`).
# NOTE(review): later cells split the rows by position, so the file is presumably
# already sorted chronologically — confirm, since nothing here enforces the order.
df = pd.read_csv("./dataset_pre_processed.csv", index_col=0)
df

Unnamed: 0_level_0,time_to_delivery_from_shipment,time_to_delivery_from_order_approved,time_to_delivery_from_order_purchased,freight_value,shipment_distance
order_purchase_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-05 11:56:06,5.021111,6.044306,6.054155,8.72,11.609337
2017-01-05 12:01:20,5.969502,10.502326,12.151134,8.72,11.609337
2017-01-05 12:06:36,5.020463,9.513889,11.165810,8.72,13.076936
2017-01-05 12:09:08,5.005278,9.499549,11.148854,8.72,13.076936
2017-01-05 12:11:23,5.988623,10.515544,12.163275,8.72,13.076936
...,...,...,...,...,...
2018-08-29 12:25:59,1.382257,1.425810,1.432269,9.20,18.854775
2018-08-29 14:18:23,1.038831,1.079028,1.087870,16.43,10.343719
2018-08-29 14:18:28,0.917720,1.098704,1.106979,15.39,431.976676
2018-08-29 14:52:00,0.858322,1.063623,1.072905,8.33,10.312870


In [3]:
!pip install scikit-learn



In [4]:
from sklearn.preprocessing import StandardScaler
import math

In [5]:
def split_train_test(data, test_size=0.2):
    """Split ``data`` by position into a leading train part and a trailing test part.

    No shuffling takes place: the first ``ceil(n_rows * (1 - test_size))`` rows
    form the train part and every remaining row forms the test part.

    Parameters
    ----------
    data : object exposing ``.shape`` and positional slicing (e.g. DataFrame, ndarray)
        Rows to split.
    test_size : float, default 0.2
        Fraction of rows assigned to the test part.

    Returns
    -------
    tuple
        ``(train, test)`` slices of ``data``.
    """
    n_rows = data.shape[0]
    boundary = int(math.ceil(n_rows * (1 - test_size)))
    return data[:boundary], data[boundary:]

In [6]:
# Keep only the three columns used downstream (target first, then the two
# covariates) and split them by row position into train and test parts.
data = df.loc[:, ["time_to_delivery_from_order_purchased", "freight_value", "shipment_distance"]]

train, test = split_train_test(data)
print(train.shape, test.shape, sep="\n")

(73980, 3)
(18495, 3)


In [7]:
def scale_data(data):
    """Standardise ``data`` with a freshly fitted ``StandardScaler``.

    Returns
    -------
    tuple
        ``(scaled_data, scaler)`` — the transformed values and the fitted
        scaler, so the same transform (or its inverse) can be applied later.
    """
    fitted_scaler = StandardScaler()
    return fitted_scaler.fit_transform(data), fitted_scaler

In [8]:
# Fit one scaler on all three model columns and a second one on the target
# column alone; the target-only scaler is the one able to invert a single
# prediction column back to the original units.
X_train_scaled, X_train_scaler = scale_data(train)
X_train_y_scaled, X_train_y_scaler = scale_data(train[["time_to_delivery_from_order_purchased"]])
print(X_train_scaled.shape, X_train_y_scaled.shape, sep="\n")


(73980, 3)
(73980, 1)


In [9]:
# Build sliding windows over the scaled training rows: sample k holds the `hops`
# rows that precede row `hops + k`, and its target is the scaled delivery time
# of row `hops + k` itself.
# NOTE(review): each window includes the delivery times of the preceding orders,
# which may not be known yet when the target order is purchased — confirm this
# is intended and not look-ahead leakage.
hops = 100
n_records, n_cols = X_train_scaled.shape

X_train = np.array([X_train_scaled[end - hops:end] for end in range(hops, n_records)])
y_train = np.array(X_train_y_scaled[hops:n_records, 0])

In [10]:
# Sanity-check the windowed arrays: X_train is (samples, hops, features),
# y_train holds one target per sample.
print(X_train.shape)
print(y_train.shape)

(73880, 100, 3)
(73880,)


In [11]:
# Keep the training tensor's shape under its own name and display it.
X_train_shape = X_train.shape
X_train_shape

(73880, 100, 3)

In [12]:
#!pip install -U tensorflow
!pip install attention

Collecting attention
  Downloading attention-5.0.0-py3-none-any.whl (9.0 kB)
Installing collected packages: attention
Successfully installed attention-5.0.0


In [13]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.callbacks import EarlyStopping
from attention import Attention

In [14]:
# Network: LSTM over each (hops, features) window, returning the full sequence,
# followed by an Attention layer and a single-unit Dense regression head.
model = Sequential(
    [
        LSTM(units=64, return_sequences=True, input_shape=X_train.shape[1:]),
        Attention(units=32),
        Dense(1),
    ]
)
model.compile(optimizer="adam", loss="mean_squared_error")
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 100, 64)           17408     
                                                                 
 attention (Attention)       (None, 32)                8192      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 25633 (100.13 KB)
Trainable params: 25633 (100.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [15]:
# Stop once the training loss has not improved for 3 consecutive epochs.
# NOTE(review): no validation data is passed, so early stopping watches the
# training loss only — consider validation_split=... with monitor="val_loss".
callback = EarlyStopping(monitor='loss', patience=3)
# `workers` / `use_multiprocessing` were removed: Keras documents them as used
# for generator / keras.utils.Sequence input only (the inputs here are NumPy
# arrays), and newer Keras releases no longer accept them in fit().
model.fit(X_train, y_train, epochs=1000, batch_size=32, callbacks=[callback])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<keras.src.callbacks.History at 0x1bf506d4810>

In [16]:
# Prepend the last `hops` training rows to the test rows so that the very first
# test row already has a full look-back window available.
df_full = pd.concat([train.iloc[-hops:], test], axis=0)
print(df_full.shape)
df_full.head()

(18595, 3)


Unnamed: 0_level_0,time_to_delivery_from_order_purchased,freight_value,shipment_distance
order_purchase_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-05-26 10:08:31,9.385498,16.43,81.794361
2018-05-26 10:10:47,10.244155,16.43,433.987729
2018-05-26 10:35:01,19.546725,16.43,1341.069643
2018-05-26 10:54:41,18.238958,9.77,646.201
2018-05-26 11:33:28,9.163472,7.39,74.111753


In [17]:
# Scale with the scaler that was fitted on the training data. The input is
# rebuilt from `train`/`test` instead of being read back from `df_full`, so
# re-running this cell cannot scale already-scaled values a second time.
df_full = X_train_scaler.transform(pd.concat((train.iloc[-hops:], test), axis=0))
df_full.shape

(18595, 3)

In [18]:
# Window the scaled test rows exactly as the training rows were windowed.
# `hops` is reused from the training-window cell instead of being hard-coded a
# second time, so the two window lengths cannot drift apart.
n_records = df_full.shape[0]
X_test = np.array([df_full[i - hops: i] for i in range(hops, n_records)])
# Fail early if the windows do not have the (time steps, features) layout of the
# training tensor.
assert X_test.shape[1:] == X_train.shape[1:], (X_test.shape, X_train.shape)
print(X_test.shape)

(18495, 100, 3)


In [19]:
# Predict on the test windows. The model was fitted on the scaled target, so
# these values are still in scaled units.
y_pred = model.predict(X_test)
y_pred




array([[ 1.1449125 ],
       [-0.1370905 ],
       [ 0.20610252],
       ...,
       [-0.2165668 ],
       [-0.42714363],
       [-0.39366072]], dtype=float32)

In [20]:
# Map the predictions back to the original target units with the target-only
# scaler. The scaled predictions are recomputed here rather than read from
# `y_pred`, so running this cell twice cannot apply the inverse transform to
# values that were already un-scaled.
y_pred = X_train_y_scaler.inverse_transform(model.predict(X_test, verbose=0))
y_pred

array([[18.586195],
       [10.851468],
       [12.92206 ],
       ...,
       [10.371963],
       [ 9.101486],
       [ 9.303499]], dtype=float32)

In [21]:
# Expect one prediction per test row, as a single column.
y_pred.shape

(18495, 1)

In [22]:
# Flatten the (n, 1) prediction array into a one-column frame named "preds".
df_preds = pd.DataFrame(data=y_pred.reshape(-1), columns=["preds"])
df_preds

Unnamed: 0,preds
0,18.586195
1,10.851468
2,12.922060
3,12.723883
4,15.003242
...,...
18490,10.478419
18491,10.299925
18492,10.371963
18493,9.101486


In [23]:
# Put actual values and predictions side by side. The timestamp index of `test`
# is dropped so both frames carry the same 0..n-1 index and align row by row.
test_final = pd.concat([test.reset_index(drop=True), df_preds], axis=1)
test_final

Unnamed: 0,time_to_delivery_from_order_purchased,freight_value,shipment_distance,preds
0,19.087535,16.43,433.987729,18.586195
1,17.323530,27.76,807.707149,10.851468
2,6.146944,12.84,320.865049,12.922060
3,10.943565,23.50,602.004596,12.723883
4,11.407627,16.43,209.575788,15.003242
...,...,...,...,...
18490,1.432269,9.20,18.854775,10.478419
18491,1.087870,16.43,10.343719,10.299925
18492,1.106979,15.39,431.976676,10.371963
18493,1.072905,8.33,10.312870,9.101486


In [24]:
# Actual and predicted delivery times taken from the combined frame.
# NOTE: this rebinds `y_pred` from the NumPy array of the earlier cells to a
# pandas Series.
y_true = test_final["time_to_delivery_from_order_purchased"]
y_pred = test_final["preds"]

In [25]:
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, mean_absolute_error

# Error of the predictions against the actual values on the test rows.
# scikit-learn reports MAPE as a fraction, not a percentage (1.0 == 100%).
mape = mean_absolute_percentage_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
print(f"MAPE: {mape}", f"MSE: {mse}", f"MAE: {mae}", sep="\n")

MAPE: 0.8200001066094984
MSE: 32.12832605846582
MAE: 4.30905814086687


In [26]:
# Disabled plotting draft, kept for reference.
# NOTE(review): the line trace uses the same column for x and y; presumably the
# row index was meant for x — confirm before re-enabling. The prediction column
# is named "preds" in test_final (corrected below from "pred").
# import plotly.express as px
# fig = px.line(test_final, x=test_final["time_to_delivery_from_order_purchased"], y=test_final["time_to_delivery_from_order_purchased"])
# fig.add_scatter(x=test_final["time_to_delivery_from_order_purchased"], y=test_final["preds"])
# fig.show()