In [102]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.model_selection import train_test_split

In [103]:
# Single seed shared by every seeded operation in the notebook.
RANDOM_SEED = 42

# Dedicated torch generator; manual_seed() seeds it in place.
random_generator = torch.Generator()
random_generator.manual_seed(RANDOM_SEED)

In [104]:
# Prefer an accelerator when one is present: Apple MPS first (the original
# behaviour), then CUDA, and fall back to the CPU otherwise.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

device

'mps'

In [105]:
# Load the preprocessed training table from the project's processed-data folder.
df = pd.read_parquet(path='../data/processed/1_train_processed.parquet')

df

Unnamed: 0,id,sales,onpromotion,transactions,dcoilwtico,pay_day,store_nbr_1,store_nbr_2,store_nbr_3,store_nbr_4,...,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11
0,0,0.000,0.000000,0.202538,0.492910,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,0.000,0.000000,0.202538,0.492910,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2,0.000,0.000000,0.202538,0.492910,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,3,0.000,0.000000,0.202538,0.492910,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,4,0.000,0.000000,0.202538,0.492910,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3054343,3000883,438.133,0.000000,0.257362,0.253228,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3054344,3000884,154.553,0.001350,0.257362,0.253228,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3054345,3000885,2419.729,0.199730,0.257362,0.253228,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3054346,3000886,121.000,0.010796,0.257362,0.253228,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [106]:
# Binary target: 1.0 where the row recorded no sales, 0.0 otherwise.
df['is_zero'] = df['sales'].eq(0).astype('float32')

df

Unnamed: 0,id,sales,onpromotion,transactions,dcoilwtico,pay_day,store_nbr_1,store_nbr_2,store_nbr_3,store_nbr_4,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,is_zero
0,0,0.000,0.000000,0.202538,0.492910,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,1.0
1,1,0.000,0.000000,0.202538,0.492910,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,1.0
2,2,0.000,0.000000,0.202538,0.492910,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,1.0
3,3,0.000,0.000000,0.202538,0.492910,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,1.0
4,4,0.000,0.000000,0.202538,0.492910,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3054343,3000883,438.133,0.000000,0.257362,0.253228,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,0.0
3054344,3000884,154.553,0.001350,0.257362,0.253228,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,0.0
3054345,3000885,2419.729,0.199730,0.257362,0.253228,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,0.0
3054346,3000886,121.000,0.010796,0.257362,0.253228,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,0.0


In [107]:
# Row/column count of the loaded frame after adding the `is_zero` column.
df.shape

(3054348, 192)

In [108]:
# Summarise dtypes and memory footprint before the float32 cast further down.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3054348 entries, 0 to 3054347
Columns: 192 entries, id to is_zero
dtypes: bool(186), float32(1), float64(4), uint32(1)
memory usage: 658.3 MB


In [109]:
# df = df.head(1_000_000)

In [110]:
# Cast every column (bool flags, uint32 id, float64 features) to float32 so the
# frame can be turned into float32 tensors directly. The recorded output shows
# this raises memory use from about 658 MB to about 2.2 GB.
df = df.astype(dtype='float32')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3054348 entries, 0 to 3054347
Columns: 192 entries, id to is_zero
dtypes: float32(192)
memory usage: 2.2 GB


In [111]:
# Features: everything except the row id and the two target columns.
X = df.drop(['id', 'sales', 'is_zero'], axis=1)

# Targets: the zero-sales indicator and the raw sales value.
y_is_zero, y_sales = df['is_zero'], df['sales']

X.shape, y_is_zero.shape, y_sales.shape

((3054348, 189), (3054348,), (3054348,))

In [112]:
# X_tensor = torch.tensor(X.to_numpy(), dtype=torch.float32, device=device)
# y_is_zero_tensor = torch.tensor(y_is_zero.to_numpy(), dtype=torch.float32, device=device)
# y_sales_tensor = torch.tensor(y_sales.to_numpy(), dtype=torch.float32, device=device)

# X_tensor.shape, y_is_zero_tensor.shape, y_sales_tensor.shape

In [113]:
class SalesPredictor(nn.Module):
    """Feed-forward network that predicts one sales value per feature row.

    A shared trunk (in_features -> 128 -> 64, ReLU activations, dropout after
    the first layer) feeds a regression head (64 -> 32 -> 1).

    A sigmoid classification head for `is_zero` (64 -> 32 -> 1) was prototyped
    alongside the regression head but is currently disabled; `forward` returns
    only the regression output.

    Args:
        in_features: Number of input features. When ``None`` (the default) the
            width is read from the notebook-level feature frame ``X``, which
            matches the original behaviour.
        dropout: Drop probability of the trunk's dropout layer.
    """

    def __init__(self, in_features=None, dropout=0.3):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: size the input layer from the
            # global feature frame so `SalesPredictor()` keeps working.
            in_features = X.shape[1]

        self.shared_layers = nn.Sequential(
            nn.Linear(in_features, 128),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        self.regression_layers = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        """Return the regression output for `x`.

        The final layer is ``Linear(32, 1)``, so the last dimension of the
        result has size 1.
        """
        shared = self.shared_layers(x)
        sales = self.regression_layers(shared)
        return sales

In [114]:
# Hold out 20% of the rows for testing; all three objects are split with the
# same row assignment, and the split is seeded.
# NOTE(review): train_test_split shuffles rows by default. The month_* columns
# suggest the data may be time-ordered, in which case a random split could leak
# later rows into training. Confirm whether a chronological split is intended.
(
    X_train, X_test,
    y_is_zero_train, y_is_zero_test,
    y_sales_train, y_sales_test,
) = train_test_split(X, y_is_zero, y_sales, test_size=0.2, random_state=RANDOM_SEED)

X_train.shape, X_test.shape, y_is_zero_train.shape, y_is_zero_test.shape, y_sales_train.shape, y_sales_test.shape

((2443478, 189), (610870, 189), (2443478,), (610870,), (2443478,), (610870,))

In [115]:
def _as_device_tensor(frame):
    """Copy a pandas object into a float32 tensor placed on `device`."""
    return torch.tensor(frame.to_numpy(), dtype=torch.float32, device=device)


# Features
X_train_tensor = _as_device_tensor(X_train)
X_test_tensor = _as_device_tensor(X_test)
# Zero-sales indicator target
y_is_zero_train_tensor = _as_device_tensor(y_is_zero_train)
y_is_zero_test_tensor = _as_device_tensor(y_is_zero_test)
# Sales regression target
y_sales_train_tensor = _as_device_tensor(y_sales_train)
y_sales_test_tensor = _as_device_tensor(y_sales_test)

print(f"""{X_train_tensor.shape = }, {X_test_tensor.shape = }, {y_is_zero_train_tensor.shape = }, 
      {y_is_zero_test_tensor.shape = }, {y_sales_train_tensor.shape = }, {y_sales_test_tensor.shape = }""")

X_train_tensor.shape = torch.Size([2443478, 189]), X_test_tensor.shape = torch.Size([610870, 189]), y_is_zero_train_tensor.shape = torch.Size([2443478]), 
      y_is_zero_test_tensor.shape = torch.Size([610870]), y_sales_train_tensor.shape = torch.Size([2443478]), y_sales_test_tensor.shape = torch.Size([610870])


In [116]:
# ds = TensorDataset(X_tensor, y_is_zero_tensor, y_sales_tensor)

# train_size = int(0.8 * len(ds))
# test_size = int(len(ds) - train_size)
# train_ds, test_ds = random_split(dataset=ds, lengths=[train_size, test_size], generator=random_generator)

# train_ds, test_ds

In [117]:
# BATCH_SIZE = 128

In [118]:
# train_dataloader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, generator=random_generator)
# test_dataloader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, generator=random_generator)

In [119]:
# f"number of batches = {X.shape[0] / BATCH_SIZE}"

In [120]:
# Seed before construction so the initial weights are reproducible, then place
# the module on the selected device (Module.to returns the module itself).
torch.manual_seed(RANDOM_SEED)
model = SalesPredictor().to(device)

model, model.state_dict(), next(model.parameters()).device

(SalesPredictor(
   (shared_layers): Sequential(
     (0): Linear(in_features=189, out_features=128, bias=True)
     (1): ReLU()
     (2): Dropout(p=0.3, inplace=False)
     (3): Linear(in_features=128, out_features=64, bias=True)
     (4): ReLU()
   )
   (regression_layers): Sequential(
     (0): Linear(in_features=64, out_features=32, bias=True)
     (1): ReLU()
     (2): Linear(in_features=32, out_features=1, bias=True)
   )
 ),
 OrderedDict([('shared_layers.0.weight',
               tensor([[ 0.0556,  0.0604, -0.0170,  ...,  0.0272, -0.0128, -0.0193],
                       [ 0.0078, -0.0128, -0.0217,  ..., -0.0351,  0.0132,  0.0396],
                       [ 0.0603, -0.0668,  0.0486,  ..., -0.0510, -0.0452, -0.0641],
                       ...,
                       [-0.0552, -0.0163,  0.0682,  ...,  0.0524,  0.0248, -0.0716],
                       [-0.0238, -0.0373, -0.0655,  ...,  0.0004, -0.0089, -0.0017],
                       [-0.0616, -0.0674, -0.0266,  ...,  0.0363,  0.0

In [121]:
# Step size handed to the optimizer.
LEARNING_RATE = 1e-2

In [122]:
# Binary cross-entropy for the is_zero classification target; the training
# loop visible in this notebook section does not use it.
loss_clf_fn = nn.BCELoss()

# Mean absolute error for the sales regression target.
loss_reg_fn = nn.L1Loss()

# Adam over every model parameter.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [123]:
def accuracy_fn(true_labels, pred_labels):
    """Return the percentage of predictions that equal their true label.

    Both tensors are flattened before comparison. Without this, comparing a
    ``(N,)`` target with a ``(N, 1)`` prediction would broadcast to ``(N, N)``
    and over-count matches.

    Args:
        true_labels: Tensor of ground-truth labels.
        pred_labels: Tensor of predicted labels with the same number of
            elements as `true_labels`.

    Returns:
        Accuracy as a float in percent. Empty input yields 0.0 instead of
        raising ZeroDivisionError.
    """
    true_flat = true_labels.reshape(-1)
    pred_flat = pred_labels.reshape(-1)

    total = len(pred_flat)
    if total == 0:
        return 0.0

    correct = torch.eq(true_flat, pred_flat).sum().item()
    return (correct / total) * 100

In [124]:
# is_zero_pred, sales_pred = model(X_train_tensor)
# is_zero_pred = torch.round(is_zero_pred).squeeze()

# is_zero_pred.shape, y_is_zero_train_tensor.shape

In [125]:
# Shape of the regression target tensor that the training loss is computed against.
y_sales_train_tensor.shape

torch.Size([2443478])

In [126]:
# is_zero_pred, sales_pred = model(X_train_tensor)

# is_zero_pred[0]

In [127]:
# torch.round(is_zero_pred[0])

In [128]:
torch.manual_seed(RANDOM_SEED)
EPOCHS = 100

# Full-batch training of the regression output: each epoch is one forward and
# backward pass over the entire training tensor.
# A joint setup (BCE on an is_zero head plus a regression loss restricted to
# rows with non-zero sales) was prototyped here and is currently disabled.
for epoch in range(EPOCHS):
    model.train()

    # squeeze() drops size-1 dimensions, turning the (N, 1) model output into
    # (N,) so it lines up with the 1-D sales target.
    sales_pred = model(X_train_tensor).squeeze()
    loss = loss_reg_fn(sales_pred, y_sales_train_tensor)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report the training loss on every 10th epoch.
    if epoch % 10 == 9:
        print(f"{loss = :.2f}")

loss = 355.65
loss = 337.10
loss = 285.77
loss = 229.60
loss = 169.29
loss = 142.67
loss = 133.67
loss = 128.13
loss = 124.38
loss = 121.36
