In [36]:
import pandas as pd


# Read the engineered plate features and discard the leftover
# 'Unnamed: 0' column in a single chained expression.
df = pd.read_csv('feature_extracted_plate.csv').drop(columns='Unnamed: 0')
df.head()

Unnamed: 0,plate,date,price,price_Box-Cox,price_Yeo-Johnson,price_Quantile,price_log,plate_length,region,registration_code,...,series_part_2_YC,series_part_2_YE,series_part_2_YH,series_part_2_YK,series_part_2_YM,series_part_2_YO,series_part_2_YP,series_part_2_YT,series_part_2_YX,series_part_2_YY
0,X059CP797,2024-12-26 00:00:00,65000,-0.903094,-0.903096,-0.817902,11.082158,9,797,59,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Y800MH790,2024-07-12 21:31:37,100000,-0.440378,-0.44038,-0.370902,11.512935,9,790,800,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A212TX77,2024-04-18 00:00:00,290000,0.532677,0.532678,0.468203,12.57764,8,77,212,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,P001AY199,2025-01-03 00:27:15,680000,1.196486,1.163831,1.149742,13.42985,9,199,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,B400BB750,2022-04-09 00:00:00,50000,-1.20701,-1.207011,-1.184447,10.819798,9,750,400,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
# The twelve letters that occur in plate series, in index order.
char_plate = list('ABCEHKMOPTXY')
# Letter -> position in `char_plate`; used to turn a series into numbers.
char2idx = dict(zip(char_plate, range(len(char_plate))))
char2idx

{'A': 0,
 'B': 1,
 'C': 2,
 'E': 3,
 'H': 4,
 'K': 5,
 'M': 6,
 'O': 7,
 'P': 8,
 'T': 9,
 'X': 10,
 'Y': 11}

# Load the pretrained autoencoder and reuse its encoder as a frozen feature extractor

In [6]:
import torch
import torch.nn as nn


class Autoencoder(nn.Module):
    """Fully connected autoencoder mapping 3 input features to a 128-d code and back.

    The position of every layer inside the two ``nn.Sequential`` containers
    determines its ``state_dict`` key (``encoder.0``, ``encoder.1``,
    ``encoder.2``, ``encoder.4``, ...), so the layer order must not change if
    previously saved weights are to remain loadable.

    Args:
        dropout_rate: Accepted for interface compatibility; no layer in this
            class uses it.
    """

    def __init__(self, dropout_rate=0.5):
        super().__init__()

        # 3 -> 8 -> 16 -> 32 -> 64 -> 128. The first two linear layers are
        # applied back to back with no activation between them.
        encoder_layers = [
            nn.Linear(3, 8),
            nn.Linear(8, 16),
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # 128 -> 64 -> 32 -> 16 -> 8 -> 3. The last two linear layers have no
        # activation between or after them, so the reconstruction is unbounded.
        decoder_layers = [
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.Linear(8, 3),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` and return its reconstruction from the decoder."""
        code = self.encoder(x)
        reconstruction = self.decoder(code)
        return reconstruction

autoencoder = Autoencoder()
# Best Model:
# map_location="cpu" lets the checkpoint load on a machine without a GPU even
# if it was saved from one; the model built above lives on the CPU either way.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.
autoencoder.load_state_dict(
    torch.load("2_0.00014334209845401347.pth", map_location="cpu")
)
# Switch to inference mode; as the last expression this also displays the
# module structure.
autoencoder.eval()

Autoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=3, out_features=8, bias=True)
    (1): Linear(in_features=8, out_features=16, bias=True)
    (2): Linear(in_features=16, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=64, bias=True)
    (5): ReLU()
    (6): Linear(in_features=64, out_features=128, bias=True)
    (7): ReLU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=16, bias=True)
    (5): ReLU()
    (6): Linear(in_features=16, out_features=8, bias=True)
    (7): Linear(in_features=8, out_features=3, bias=True)
  )
)

In [8]:
# Take the encoder half of the autoencoder and freeze every parameter in it
# (requires_grad_ returns the module itself, so this stays a single assignment).
pretrained_encoder = autoencoder.encoder.requires_grad_(False)

In [296]:
import torch
import torch.nn as nn

class BaseModel(nn.Module):
    """Price regressor built on top of a pretrained series encoder.

    The encoder output (128 features) is squeezed to a single value by
    ``ffn_encoder``; that value is concatenated with the region and the
    registration code and passed through ``ffn_base`` to produce one output
    per sample.

    Args:
        encoder: Module mapping the series tensor to 128 features. It is
            always evaluated under ``torch.no_grad()``.
    """

    @staticmethod
    def _mlp(widths, activate_last):
        """Stack ``Linear`` layers for consecutive ``widths`` with ReLU in between.

        When ``activate_last`` is true a ReLU also follows the final layer.
        """
        layers = []
        last = len(widths) - 2
        for position, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            layers.append(nn.Linear(fan_in, fan_out))
            if activate_last or position < last:
                layers.append(nn.ReLU())
        return nn.Sequential(*layers)

    def __init__(self, encoder):
        super().__init__()

        # Our pretrained model:
        self.encoder = encoder

        # 128 -> 1 head on the encoder features (ReLU after every layer,
        # including the last one).
        self.ffn_encoder = self._mlp((128, 64, 32, 16, 8, 4, 1), activate_last=True)

        # 3 -> 128 -> 1 network over [series value, region, registration];
        # the final layer is linear, so the output is unbounded.
        self.ffn_base = self._mlp(
            (3, 8, 16, 32, 64, 128, 64, 32, 16, 8, 4, 1),
            activate_last=False,
        )

    def forward(self, series, region, registration):
        """Return one prediction per row for the given batch of plate features."""
        # The encoder is used as a fixed feature extractor: no graph is built.
        with torch.no_grad():
            latent = self.encoder(series)

        series_value = self.ffn_encoder(latent)

        # Concatenate along the feature axis: 1 + region + registration columns.
        features = torch.cat([series_value, region, registration], dim=1)
        return self.ffn_base(features)

In [298]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.model_selection import KFold


class CarPlateDataset(Dataset):
    """Training dataset yielding (series, region, registration, price) tensors.

    Args:
        data: DataFrame containing at least the columns ``series``,
            ``region``, ``registration_code`` and ``price``.

    Each item is a tuple of four ``float32`` tensors: the series letters
    mapped through ``char2idx`` (one value per letter), and three
    single-element tensors for region, registration code and price.
    """

    def __init__(self, data):
        self.data = data.loc[:,['series', 'region', 'registration_code', 'price']]

        # Pull the columns out once. Building a row with `.iloc[idx]` for every
        # field of every sample dominates the cost of an epoch; plain
        # list/array lookups return the same values far more cheaply.
        self._series = self.data['series'].tolist()
        self._region = self.data['region'].to_numpy()
        self._registration = self.data['registration_code'].to_numpy()
        self._price = self.data['price'].to_numpy()

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        # `idx` is positional (same meaning as the former `.iloc[idx]`).
        series = self._series[idx]
        region = self._region[idx]
        registration = self._registration[idx]
        price = self._price[idx]

        # A letter missing from `char2idx` raises KeyError here.
        series_tensor = torch.tensor([char2idx[char] for char in series], dtype=torch.float32)
        region_tensor = torch.tensor([region], dtype=torch.float32)
        registration_tensor = torch.tensor([registration], dtype=torch.float32)
        price_tensor = torch.tensor([price], dtype=torch.float32)

        return series_tensor, region_tensor, registration_tensor, price_tensor

# Loader and cross-validation settings.
k_folds = 3
batch_size = 62

dataset = CarPlateDataset(df)

# Shuffled K-fold split with a fixed seed so the folds are reproducible.
kf = KFold(k_folds, shuffle=True, random_state=42)

In [300]:
class SMAPELoss(nn.Module):
    """Symmetric mean absolute percentage error, expressed in percent.

    For each element the loss is ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``;
    elements where both values are zero contribute 0. The mean over all
    elements is multiplied by 100, so the result lies in ``[0, 200]``.
    """

    def __init__(self):
        super().__init__()

    def forward(self, y_pred, y_true):
        """Return the scalar SMAPE (in percent) between ``y_pred`` and ``y_true``."""
        # Compute the denominator: (|y_true| + |y_pred|) / 2.0
        denominator = (torch.abs(y_true) + torch.abs(y_pred)) / 2.0

        # Make the denominator safe BEFORE dividing. Masking the quotient
        # afterwards with torch.where hides the 0/0 only in the forward pass:
        # the division is still evaluated, and its backward pass yields NaN
        # gradients for those elements.
        safe_denominator = torch.where(denominator == 0,
                                       torch.ones_like(denominator),
                                       denominator)

        # Where the denominator was zero both inputs are zero, so the
        # numerator is zero too and the element contributes exactly 0.
        diff = torch.abs(y_true - y_pred) / safe_denominator

        # Calculate mean SMAPE and scale by 100
        loss = torch.mean(diff) * 100.0
        return loss

In [314]:
import torch.optim as optim

# Training settings are declared in this cell so it runs top to bottom on a
# fresh kernel instead of depending on names created by another cell.
num_epochs = 100
criterion = SMAPELoss()

for fold, (train_idx, val_idx) in enumerate(kf.split(dataset)):
    print(f'\n=== Fold {fold+1} ===')
    train_subset = Subset(dataset, train_idx)
    val_subset = Subset(dataset, val_idx)

    train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)

    # Start every fold from a freshly initialised model. Carrying the weights
    # over would mean the model has already been trained on rows that end up
    # in this fold's validation split, making the validation loss optimistic.
    # After the loop `model` refers to the model of the last fold.
    model = BaseModel(pretrained_encoder)
    optimizer = optim.Adam(model.parameters(), lr=0.0005)

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for series, region, registration, price in train_loader:
            targets = price  # price is in original scale

            # The raw model output is treated as log(price) ...
            outputs = model(series, region, registration)

            # ... and exponentiated, so predictions are strictly positive.
            preds = torch.exp(outputs)

            # Loss is computed between original-scale predictions and targets.
            loss = criterion(preds, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight the batch mean by the batch size so the epoch figure is a
            # per-sample average even when the last batch is smaller.
            train_loss += loss.item() * series.size(0)

        train_loss /= len(train_subset)

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for series, region, registration, price in val_loader:
                targets = price
                log_preds = model(series, region, registration)
                preds = torch.exp(log_preds)

                loss = criterion(preds, targets)
                val_loss += loss.item() * series.size(0)

        val_loss /= len(val_subset)

        print(f"Epoch [{epoch+1}/{num_epochs}] Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")



=== Fold 1 ===
Epoch [1/100] Train Loss: 154.3416, Val Loss: 134.1431
Epoch [2/100] Train Loss: 134.7025, Val Loss: 134.3810
Epoch [3/100] Train Loss: 133.3387, Val Loss: 134.5887
Epoch [4/100] Train Loss: 132.8149, Val Loss: 136.8652
Epoch [5/100] Train Loss: 133.2413, Val Loss: 132.3155
Epoch [6/100] Train Loss: 132.1683, Val Loss: 132.1595
Epoch [7/100] Train Loss: 131.6486, Val Loss: 130.5738
Epoch [8/100] Train Loss: 130.9788, Val Loss: 130.3078
Epoch [9/100] Train Loss: 130.5050, Val Loss: 131.6272
Epoch [10/100] Train Loss: 130.1069, Val Loss: 129.1567
Epoch [11/100] Train Loss: 129.3267, Val Loss: 127.8862
Epoch [12/100] Train Loss: 127.8502, Val Loss: 126.6739
Epoch [13/100] Train Loss: 126.3250, Val Loss: 145.7111
Epoch [14/100] Train Loss: 121.1242, Val Loss: 108.8968
Epoch [15/100] Train Loss: 91.5599, Val Loss: 71.7503
Epoch [16/100] Train Loss: 75.1386, Val Loss: 69.7966
Epoch [17/100] Train Loss: 72.9673, Val Loss: 71.8166
Epoch [18/100] Train Loss: 71.6896, Val Loss: 7

KeyboardInterrupt: 

In [310]:
import torch.optim as optim
from torch.utils.data import DataLoader, Subset

num_epochs = 100

criterion = SMAPELoss()

for fold, (train_idx, val_idx) in enumerate(kf.split(dataset)):
    print(f'\n=== Fold {fold+1} ===')
    train_subset = Subset(dataset, train_idx)
    val_subset = Subset(dataset, val_idx)

    train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)

    # Start every fold from a freshly initialised model. Re-using one model
    # across folds means it has already been trained on rows that end up in
    # this fold's validation split, which makes the validation loss optimistic.
    # After the loop `model` refers to the model of the last fold.
    model = BaseModel(pretrained_encoder)
    optimizer = optim.Adam(model.parameters(), lr=0.0005)

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for series, region, registration, price in train_loader:
            targets = price
            # NOTE(review): the raw output is used as the price. The final
            # layer is linear, so outputs can be negative; for a negative
            # prediction and a positive target SMAPE is exactly 200 with zero
            # gradient, which matches the constant 200.0000 this cell printed.
            # Consider constraining the output to be positive, together with
            # the inference code.
            outputs = model(series, region, registration)

            loss = criterion(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight the batch mean by the batch size so the epoch figure is a
            # per-sample average even when the last batch is smaller.
            train_loss += loss.item() * series.size(0)

        train_loss /= len(train_subset)

        model.eval()
        val_loss = 0.0

        with torch.no_grad():
            for series, region, registration, price in val_loader:
                targets = price
                outputs = model(series, region, registration)

                loss = criterion(outputs, targets)
                val_loss += loss.item() * series.size(0)

        val_loss /= len(val_subset)

        print(f"Epoch [{epoch+1}/{num_epochs}] Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")



=== Fold 1 ===
Epoch [1/100] Train Loss: 200.0000, Val Loss: 200.0000
Epoch [2/100] Train Loss: 200.0000, Val Loss: 200.0000


KeyboardInterrupt: 

In [199]:
# Persist only the weights (state dict) of the current model.
torch.save(obj=model.state_dict(), f='base_best.pth')

In [247]:
import pandas as pd


# Load the test plates and derive the model inputs from the plate string:
#   series            - characters 0, 4 and 5 of the plate
#   region            - trailing 2 characters for 8-character plates, else trailing 3
#   registration_code - characters 1..3 as an integer
df_test = (
    pd.read_csv('test.csv')
    .drop(columns='date')
    .assign(
        series=lambda frame: frame['plate'].apply(lambda plate: plate[0] + plate[4:6]),
        region=lambda frame: frame['plate'].apply(
            lambda plate: int(plate[-2:]) if len(plate) == 8 else int(plate[-3:])
        ),
        registration_code=lambda frame: frame['plate'].apply(lambda plate: int(plate[1:4])),
    )
)

df_test.head()

Unnamed: 0,id,plate,price,series,region,registration_code
0,51636,P700TT790,,PTT,790,700
1,51637,M081TX797,,MTX,797,81
2,51638,T333HX777,,THX,777,333
3,51639,H744BH977,,HBH,977,744
4,51640,X066EM777,,XEM,777,66


In [251]:
class TestCarPlateDataset(Dataset):
    """Inference dataset yielding (series, region, registration) tensors.

    Args:
        data: DataFrame containing at least the columns ``series``,
            ``region`` and ``registration_code``.

    Attributes:
        data: The three feature columns of ``data``.
        original_indices: Index of the DataFrame that was passed in.
    """

    def __init__(self, data):
        # Keep the incoming index so rows can be traced back to their source.
        self.original_indices = data.index
        self.data = data.loc[:, ['series', 'region', 'registration_code']]

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        record = self.data.iloc[idx]  # positional lookup

        def as_float(values):
            return torch.tensor(values, dtype=torch.float32)

        # Series letters -> indices; a letter missing from `char2idx` raises KeyError.
        letter_ids = [char2idx[letter] for letter in record['series']]

        return (
            as_float(letter_ids),
            as_float([record['region']]),
            as_float([record['registration_code']]),
        )


# Wrap the test frame; order is preserved (no shuffling) so predictions line
# up with the rows of `df_test`.
test_dataset = TestCarPlateDataset(data=df_test)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [253]:
# Inference over the test loader: the raw model output of each batch is
# collected as-is, without gradient tracking.
model.eval()
predictions = []

with torch.no_grad():
    for series, region, registration in test_loader:
        predictions.append(model(series, region, registration).cpu())

# Join the per-batch outputs and flatten them into a 1-D numpy array.
preds = torch.cat(predictions).flatten().numpy()

In [255]:
# Display the flattened prediction array as a quick visual sanity check.
preds

array([ 81451.48 ,  70880.766,  66767.32 , ...,  68909.445, 198048.75 ,
        74699.76 ], dtype=float32)

In [276]:
# Two-column submission frame: the test ids alongside the predicted prices.
submission_df = df_test[['id']].assign(price=preds)

submission_df

Unnamed: 0,id,price
0,51636,81451.476562
1,51637,70880.765625
2,51638,66767.320312
3,51639,76502.976562
4,51640,70673.804688
...,...,...
7690,59326,107165.117188
7691,59327,70551.203125
7692,59328,68909.445312
7693,59329,198048.750000


In [278]:
# Write the submission without the DataFrame index column.
submission_df.to_csv(path_or_buf='test_predictions.csv', index=False)