In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import time
import json

# Pick the compute device: the GPU when CUDA is available, the CPU otherwise
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Dataset that uses only the tabular features
class TabularOnlyDataset(Dataset):
    """Tabular samples loaded from a JSON file of records.

    The file is read into a DataFrame and the categorical columns
    ``fuel_type``, ``gearbox``, ``model`` and ``car_type_main`` are one-hot
    encoded (first level of each dropped). Every remaining column except
    ``price`` and ``img_local`` is a feature; ``price`` is the target.

    The encoding is derived from this file alone, so two files with different
    category sets produce different feature columns.

    Attributes:
        data: The one-hot encoded DataFrame. The feature/target tensors are
            built once at construction time; later changes to ``data`` are
            not reflected in the samples.
    """

    _CATEGORICAL_COLUMNS = ['fuel_type', 'gearbox', 'model', 'car_type_main']
    _NON_FEATURE_COLUMNS = ['price', 'img_local']

    def __init__(self, json_file):
        """Load ``json_file`` and precompute the feature and price tensors.

        Args:
            json_file: Path to a JSON file that ``pandas.DataFrame`` can
                build a table from (e.g. a list of record objects).

        Raises:
            KeyError: If an expected column is missing.
            ValueError: If a feature or price value cannot be converted to
                float.
        """
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(json_file, 'r', encoding='utf-8') as f:
            self.data = pd.DataFrame(json.load(f))

        # One-hot encode the categorical variables
        self.data = pd.get_dummies(
            self.data, columns=self._CATEGORICAL_COLUMNS, drop_first=True
        )

        # Convert once here instead of per sample: dropping columns inside
        # __getitem__ copies the whole frame for every item that is fetched.
        feature_frame = self.data.drop(columns=self._NON_FEATURE_COLUMNS)
        self._features = torch.tensor(
            feature_frame.to_numpy(dtype=float), dtype=torch.float32
        )
        self._prices = torch.tensor(
            self.data['price'].to_numpy(dtype=float), dtype=torch.float32
        )

    def __len__(self):
        """Return the number of samples (rows)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of float32 tensors.

        Returns:
            ``{'tabular': 1-D feature tensor, 'price': 0-D target tensor}``.
        """
        return {
            'tabular': self._features[idx],
            'price': self._prices[idx]
        }

# Model that uses only the tabular features (MLP)
class TabularOnlyModel(nn.Module):
    """Three-layer perceptron: input -> 64 -> 32 -> 1 with ReLU in between.

    Args:
        tabular_input_dim: Size of the last dimension of the input tensor.
    """

    def __init__(self, tabular_input_dim):
        super().__init__()
        hidden_wide, hidden_narrow = 64, 32
        stack = [
            nn.Linear(tabular_input_dim, hidden_wide),
            nn.ReLU(),
            nn.Linear(hidden_wide, hidden_narrow),
            nn.ReLU(),
            nn.Linear(hidden_narrow, 1),
        ]
        self.mlp = nn.Sequential(*stack)

    def forward(self, tabular):
        """Run ``tabular`` through the MLP; the last dimension becomes 1."""
        prediction = self.mlp(tabular)
        return prediction

# Training and test data
train_json = 'train_data_with_car_type.json'
test_json = 'test_data_with_car_type.json'

train_dataset = TabularOnlyDataset(json_file=train_json)
test_dataset = TabularOnlyDataset(json_file=test_json)

# Each split is one-hot encoded from its own file, so a category that occurs
# in only one of them produces different feature columns. Fail here, before
# training, rather than crash or silently mis-align features at evaluation.
train_columns = list(train_dataset.data.columns)
test_columns = list(test_dataset.data.columns)
if train_columns != test_columns:
    only_in_train = sorted(str(c) for c in set(train_columns) - set(test_columns))
    only_in_test = sorted(str(c) for c in set(test_columns) - set(train_columns))
    raise ValueError(
        "Train and test columns differ (or are ordered differently) after "
        f"one-hot encoding; only in train: {only_in_train}; "
        f"only in test: {only_in_test}"
    )

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# The model's input width is the length of one feature vector
tabular_input_dim = train_dataset[0]['tabular'].shape[0]

# Model initialisation
model = TabularOnlyModel(tabular_input_dim).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Model training
num_epochs = 30
for epoch in range(num_epochs):
    start_time = time.time()

    model.train()
    running_loss = 0.0
    seen_samples = 0
    for batch in train_dataloader:
        tabular_data = batch['tabular'].to(device)
        prices = batch['price'].to(device)

        optimizer.zero_grad()
        # squeeze(-1) removes only the output dimension; a bare squeeze()
        # would also collapse a final batch of size 1 to a 0-dim tensor and
        # make the loss broadcast against a shape-[1] target.
        outputs = model(tabular_data).squeeze(-1)
        loss = criterion(outputs, prices)
        loss.backward()
        optimizer.step()

        # Weight each batch mean by its size so that a shorter last batch
        # does not count as much as a full one in the epoch average.
        samples_in_batch = prices.size(0)
        running_loss += loss.item() * samples_in_batch
        seen_samples += samples_in_batch

    avg_loss = running_loss / seen_samples
    epoch_duration = time.time() - start_time
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Duration: {epoch_duration:.2f} seconds")

# Model evaluation
def evaluate_tabular_model(model, dataloader):
    """Compute and print the mean squared error of ``model`` over ``dataloader``.

    The error is averaged over samples, not over batches, so a shorter final
    batch does not skew the result. Batches are moved to the device that holds
    the model's parameters (CPU if the model has none).

    Args:
        model: Module mapping a ``(batch, features)`` tensor to a
            ``(batch, 1)`` prediction tensor.
        dataloader: Iterable of dicts with a ``'tabular'`` feature tensor and
            a ``'price'`` target tensor of shape ``(batch,)``.

    Returns:
        The mean squared error as a Python float.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    eval_device = first_param.device if first_param is not None else torch.device('cpu')

    squared_error_sum = 0.0
    sample_count = 0
    with torch.no_grad():
        for batch in dataloader:
            tabular_data = batch['tabular'].to(eval_device)
            prices = batch['price'].to(eval_device)

            # squeeze(-1) keeps the batch dimension even for a batch of one.
            outputs = model(tabular_data).squeeze(-1)
            squared_error_sum += torch.sum((outputs - prices) ** 2).item()
            sample_count += prices.numel()

    if sample_count == 0:
        raise ValueError("dataloader yielded no samples to evaluate")

    avg_loss = squared_error_sum / sample_count
    print(f"Tabular Model Test Loss (MSE): {avg_loss:.4f}")
    return avg_loss



In [None]:
# Evaluate the trained model on the test split; the returned loss is the cell's output
evaluate_tabular_model(model, test_dataloader)