In [86]:
import pandas as pd
import torch
from torch import nn
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

In [64]:
RANDOM_SEED = 42
random_generator = torch.Generator().manual_seed(RANDOM_SEED)

In [65]:
device = "mps" if torch.backends.mps.is_available() else "cpu"

device

'mps'

In [66]:
df = pd.read_parquet('../data/processed/1_train_processed.parquet')

df

Unnamed: 0,id,sales,onpromotion,transactions,dcoilwtico,pay_day,store_nbr_1,store_nbr_2,store_nbr_3,store_nbr_4,...,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11
0,0,0.000,0.000000,0.202538,0.492910,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,0.000,0.000000,0.202538,0.492910,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2,0.000,0.000000,0.202538,0.492910,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,3,0.000,0.000000,0.202538,0.492910,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,4,0.000,0.000000,0.202538,0.492910,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3054343,3000883,438.133,0.000000,0.257362,0.253228,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3054344,3000884,154.553,0.001350,0.257362,0.253228,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3054345,3000885,2419.729,0.199730,0.257362,0.253228,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3054346,3000886,121.000,0.010796,0.257362,0.253228,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3054348 entries, 0 to 3054347
Columns: 191 entries, id to month_11
dtypes: bool(186), float64(4), uint32(1)
memory usage: 646.7 MB


In [68]:
df = df.astype('float32')

In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3054348 entries, 0 to 3054347
Columns: 191 entries, id to month_11
dtypes: float32(191)
memory usage: 2.2 GB


In [70]:
X = df.drop(columns=['id', 'sales'])
y = df['sales']

X.shape, y.shape

((3054348, 189), (3054348,))

In [71]:
X_tensor = torch.tensor(X.to_numpy(), dtype=torch.float32)
y_tensor = torch.tensor(y.to_numpy(), dtype=torch.float32)

X_tensor.shape, y_tensor.shape, X_tensor.device, y_tensor.device

(torch.Size([3054348, 189]),
 torch.Size([3054348]),
 device(type='cpu'),
 device(type='cpu'))

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=RANDOM_SEED)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

(torch.Size([2443478, 189]),
 torch.Size([610870, 189]),
 torch.Size([2443478]),
 torch.Size([610870]))

In [73]:
X_train.shape[1]

189

In [74]:
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

BATCH_SIZE = 1280

train_dataloader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, generator=random_generator)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False, generator=random_generator)

train_dataloader, test_dataloader

(<torch.utils.data.dataloader.DataLoader at 0x3d472b0e0>,
 <torch.utils.data.dataloader.DataLoader at 0x3d472af60>)

In [75]:
class SalesPredictor(nn.Module):
    def __init__(self):
        super(SalesPredictor, self).__init__()
        self.layer_stack = nn.Sequential(
            nn.Linear(X_train.shape[1], 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )
    def forward(self, x):
        return self.layer_stack(x)

In [76]:
torch.manual_seed(RANDOM_SEED)
model = SalesPredictor()
model, next(model.parameters()).device

(SalesPredictor(
   (layer_stack): Sequential(
     (0): Linear(in_features=189, out_features=128, bias=True)
     (1): ReLU()
     (2): Dropout(p=0.3, inplace=False)
     (3): Linear(in_features=128, out_features=64, bias=True)
     (4): ReLU()
     (5): Dropout(p=0.3, inplace=False)
     (6): Linear(in_features=64, out_features=32, bias=True)
     (7): ReLU()
     (8): Linear(in_features=32, out_features=1, bias=True)
   )
 ),
 device(type='cpu'))

In [77]:
LEARNING_RATE = 0.001

In [78]:
loss_fn = nn.L1Loss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [79]:
for X, y in train_dataloader:
    y_pred = model(X).squeeze()
    print(f"{y_pred.shape = }, {y.shape = }")
    break

y_pred.shape = torch.Size([1280]), y.shape = torch.Size([1280])


In [83]:
torch.manual_seed(RANDOM_SEED)
EPOCHS = 40
model = model.to(device)

for epoch in range(EPOCHS):
    
    train_loss = 0  
    model.train()
    for X, y in train_dataloader:
        X, y = X.to(device), y.to(device)
        y_pred = model(X).squeeze()
        loss = loss_fn(y_pred, y)
        train_loss += loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    train_loss /= len(train_dataloader)

    test_loss = 0
    model.eval()
    with torch.inference_mode():
        for X, y in test_dataloader:
            X, y = X.to(device), y.to(device)
            test_pred = model(X).squeeze()
            test_loss += loss_fn(test_pred, y)
        test_loss /= len(test_dataloader)

    if (epoch+1) % 1 == 0:
        print(f"Epoch = {epoch+1}/{EPOCHS}, {train_loss = :.2f}, {test_loss = :.2f}")

Epoch = 1/40, train_loss = 92.41, test_loss = 77.27
Epoch = 2/40, train_loss = 91.80, test_loss = 77.12
Epoch = 3/40, train_loss = 91.64, test_loss = 76.15
Epoch = 4/40, train_loss = 91.35, test_loss = 75.33
Epoch = 5/40, train_loss = 90.91, test_loss = 75.51
Epoch = 6/40, train_loss = 90.79, test_loss = 74.83
Epoch = 7/40, train_loss = 90.58, test_loss = 74.78
Epoch = 8/40, train_loss = 90.31, test_loss = 74.36
Epoch = 9/40, train_loss = 90.09, test_loss = 74.39
Epoch = 10/40, train_loss = 89.70, test_loss = 73.89
Epoch = 11/40, train_loss = 89.71, test_loss = 73.37
Epoch = 12/40, train_loss = 89.40, test_loss = 74.42
Epoch = 13/40, train_loss = 89.22, test_loss = 74.81
Epoch = 14/40, train_loss = 89.03, test_loss = 73.62
Epoch = 15/40, train_loss = 88.86, test_loss = 73.14
Epoch = 16/40, train_loss = 88.94, test_loss = 73.22
Epoch = 17/40, train_loss = 88.56, test_loss = 72.98
Epoch = 18/40, train_loss = 88.45, test_loss = 73.09
Epoch = 19/40, train_loss = 88.32, test_loss = 73.44
Ep

In [85]:
torch.save(obj=model.state_dict(), f='../models/sales_predictor_model.pth')

In [121]:
df_test = pd.read_parquet('../data/processed/1_test_processed.parquet').astype('float32')

df_test

Unnamed: 0,id,onpromotion,transactions,dcoilwtico,pay_day,store_nbr_1,store_nbr_2,store_nbr_3,store_nbr_4,store_nbr_5,...,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11
0,3000888.0,0.000000,0.202538,0.244108,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,3000889.0,0.000000,0.202538,0.244108,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,3000890.0,0.002699,0.202538,0.244108,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,3000891.0,0.026991,0.202538,0.244108,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,3000892.0,0.000000,0.202538,0.244108,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28507,3029395.0,0.001350,0.202538,0.249556,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
28508,3029396.0,0.000000,0.202538,0.249556,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
28509,3029397.0,0.001350,0.202538,0.249556,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
28510,3029398.0,0.012146,0.202538,0.249556,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [122]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28512 entries, 0 to 28511
Columns: 190 entries, id to month_11
dtypes: float32(190)
memory usage: 20.7 MB


In [123]:
X_test_tensor = torch.tensor(df_test.drop(columns=['id']).to_numpy(), dtype=torch.float32)

X_test_tensor.shape, X_test_tensor.device

(torch.Size([28512, 189]), device(type='cpu'))

In [124]:
model.eval()
with torch.inference_mode():
    X_test_tensor = X_test_tensor.to(device)
    y_test_pred = model(X_test_tensor)

y_test_pred

tensor([[ 6.4211e+00],
        [-1.2671e-02],
        [ 6.4369e+00],
        ...,
        [ 9.8462e+02],
        [ 2.7503e+00],
        [ 1.4849e+01]], device='mps:0')

In [125]:
y_test_pred = y_test_pred.cpu().numpy()

y_test_pred

array([[ 6.4211259e+00],
       [-1.2670755e-02],
       [ 6.4369221e+00],
       ...,
       [ 9.8461615e+02],
       [ 2.7503381e+00],
       [ 1.4848846e+01]], dtype=float32)

In [126]:
df_test['sales'] = y_test_pred

df_test

Unnamed: 0,id,onpromotion,transactions,dcoilwtico,pay_day,store_nbr_1,store_nbr_2,store_nbr_3,store_nbr_4,store_nbr_5,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,sales
0,3000888.0,0.000000,0.202538,0.244108,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,6.421126
1,3000889.0,0.000000,0.202538,0.244108,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.012671
2,3000890.0,0.002699,0.202538,0.244108,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,6.436922
3,3000891.0,0.026991,0.202538,0.244108,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2160.407715
4,3000892.0,0.000000,0.202538,0.244108,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.015316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28507,3029395.0,0.001350,0.202538,0.249556,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,344.585358
28508,3029396.0,0.000000,0.202538,0.249556,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,62.938530
28509,3029397.0,0.001350,0.202538,0.249556,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,984.616150
28510,3029398.0,0.012146,0.202538,0.249556,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.750338


In [127]:
df_test.sample(10, random_state=RANDOM_SEED)

Unnamed: 0,id,onpromotion,transactions,dcoilwtico,pay_day,store_nbr_1,store_nbr_2,store_nbr_3,store_nbr_4,store_nbr_5,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,sales
12655,3013543.0,0.00135,0.202538,0.26365,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,14.983999
17039,3017927.0,0.0,0.202538,0.254175,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,174.559296
2281,3003169.0,0.0,0.202538,0.247305,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.017891
3378,3004266.0,0.037787,0.202538,0.247305,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4343.085449
10463,3011351.0,0.002699,0.202538,0.251096,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.021771
8442,3009330.0,0.0,0.202538,0.49291,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.708694
4405,3005293.0,0.0,0.202538,0.265309,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,7.840516
17604,3018492.0,0.00135,0.202538,0.254175,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.036439
14488,3015376.0,0.0,0.202538,0.249319,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.018293
16253,3017141.0,0.0,0.202538,0.254175,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.012409


In [128]:
df_test['sales'] = df_test['sales'].apply(lambda x: 0 if x < 0 else x)

df_test.sample(10, random_state=RANDOM_SEED)

Unnamed: 0,id,onpromotion,transactions,dcoilwtico,pay_day,store_nbr_1,store_nbr_2,store_nbr_3,store_nbr_4,store_nbr_5,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,sales
12655,3013543.0,0.00135,0.202538,0.26365,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,14.983999
17039,3017927.0,0.0,0.202538,0.254175,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,174.559296
2281,3003169.0,0.0,0.202538,0.247305,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3378,3004266.0,0.037787,0.202538,0.247305,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4343.085449
10463,3011351.0,0.002699,0.202538,0.251096,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8442,3009330.0,0.0,0.202538,0.49291,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.708694
4405,3005293.0,0.0,0.202538,0.265309,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,7.840516
17604,3018492.0,0.00135,0.202538,0.254175,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
14488,3015376.0,0.0,0.202538,0.249319,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
16253,3017141.0,0.0,0.202538,0.254175,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [129]:
df_test['sales'].min(), df_test['sales'].max()

(np.float64(0.0), np.float64(11135.0498046875))

In [130]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28512 entries, 0 to 28511
Columns: 191 entries, id to sales
dtypes: float32(190), float64(1)
memory usage: 20.9 MB


In [131]:
df_submission = df_test[['id', 'sales']].copy()

df_submission['sales'] = df_submission['sales'].astype('uint16')
df_submission['id'] = df_submission['id'].astype('uint32')

df_submission

Unnamed: 0,id,sales
0,3000888,6
1,3000889,0
2,3000890,6
3,3000891,2160
4,3000892,0
...,...,...
28507,3029395,344
28508,3029396,62
28509,3029397,984
28510,3029398,2


In [132]:
df_submission.sample(10, random_state=RANDOM_SEED)

Unnamed: 0,id,sales
12655,3013543,14
17039,3017927,174
2281,3003169,0
3378,3004266,4343
10463,3011351,0
8442,3009330,0
4405,3005293,7
17604,3018492,0
14488,3015376,0
16253,3017141,0


In [133]:
df_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28512 entries, 0 to 28511
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      28512 non-null  uint32
 1   sales   28512 non-null  uint16
dtypes: uint16(1), uint32(1)
memory usage: 167.2 KB


In [134]:
df_submission.to_csv('../data/processed/submission_3.csv', index=False)