In [1]:
import torch
import torch.nn as nn
import logging
import pandas as pd
from pathlib import Path
from fastai.tabular.core import add_datepart
from torch.utils.data import DataLoader, TensorDataset

from dataset import read_dataset
from bundling_content_based_filtering import encode_description

data_dir = Path("../data/data.csv")

logging.basicConfig(
    level=logging.INFO,  # Set the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

data = read_dataset(path=data_dir)
data = data.drop(["Quantity", "InvoiceNo", "Country", "CustomerID"], axis=1)
data["InvoiceDate"] = pd.to_datetime(data["InvoiceDate"])
data["Description"] = data["Description"].fillna("")
# data = add_datepart(data, "InvoiceDate")
data = encode_description(data)

bool_columns = data.columns[data.dtypes == "bool"]
data[bool_columns] = data[bool_columns].astype(int)

2023-10-04 15:29:59,603 - root - INFO - Encoding textual information
100%|██████████| 3891/3891 [01:28<00:00, 44.15it/s]


In [7]:
X = torch.tensor(data.iloc[:, 3:].values, dtype=torch.float32)
y = torch.tensor(data['UnitPrice'].values, dtype=torch.float32)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64)

In [9]:
class RegressionModel(nn.Module):
    def __init__(self, input_size):
        super(RegressionModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        return self.fc4(x)

In [16]:
from tqdm import tqdm


model = RegressionModel(input_size=X.shape[1])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

num_epochs = 50
for epoch in tqdm(range(num_epochs)):
    model.train()
    for batch in train_dataloader:
        X_batch, y_batch = batch
        y_pred = model(X_batch)
        loss = criterion(y_pred.squeeze(), y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

100%|██████████| 50/50 [00:12<00:00,  4.07it/s]


In [17]:
model.eval()
with torch.no_grad():
    test_loss = 0.0
    for batch in test_dataloader:
        X_batch, y_batch = batch
        y_pred = model(X_batch)
        test_loss += criterion(y_pred.squeeze(), y_batch)

print(f"Test MSE Loss: {test_loss / len(test_dataloader):.4f}")

Test MSE Loss: 0.3961
