In [15]:
import pandas as pd


df = pd.read_csv('feature_extracted_plate.csv')
df = df.drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,plate,date,price,price_Box-Cox,price_Yeo-Johnson,price_Quantile,price_log,plate_length,region,registration_code,...,series_part_2_YC,series_part_2_YE,series_part_2_YH,series_part_2_YK,series_part_2_YM,series_part_2_YO,series_part_2_YP,series_part_2_YT,series_part_2_YX,series_part_2_YY
0,X059CP797,2024-12-26 00:00:00,65000,-0.903094,-0.903096,-0.817902,11.082158,9,797,59,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Y800MH790,2024-07-12 21:31:37,100000,-0.440378,-0.44038,-0.370902,11.512935,9,790,800,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A212TX77,2024-04-18 00:00:00,290000,0.532677,0.532678,0.468203,12.57764,8,77,212,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,P001AY199,2025-01-03 00:27:15,680000,1.196486,1.163831,1.149742,13.42985,9,199,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,B400BB750,2022-04-09 00:00:00,50000,-1.20701,-1.207011,-1.184447,10.819798,9,750,400,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [220]:
unique_series_sorted = sorted(df['series'].unique())
series2idx = {series: idx for idx, series in enumerate(unique_series_sorted)}

In [192]:
len(unique_series_sorted)

1728

In [17]:
df.columns[:48]

Index(['plate', 'date', 'price', 'price_Box-Cox', 'price_Yeo-Johnson',
       'price_Quantile', 'price_log', 'plate_length', 'region',
       'registration_code', 'series_part_1', 'series_part_2', 'series',
       'region_name', 'region_id_normal', 'region_id_Box-Cox',
       'region_id_Yeo-Johnson', 'region_id_Quantile', 'region_id_log',
       'region_avg_price', 'region_avg_Box-Cox', 'region_avg_Yeo-Johnson',
       'region_avg_Quantile', 'region_avg_log', 'digit_symmetry',
       'registration_symmetry', 'region_symmetry', 'digits_frequency',
       'region_frequency', 'registration_frequency', 'series_symmetry', 'year',
       'month', 'day', 'hour', 'day_of_week', 'is_prestigious_number',
       'is_prestigious_letter', 'series_part_1_A', 'series_part_1_B',
       'series_part_1_C', 'series_part_1_E', 'series_part_1_H',
       'series_part_1_K', 'series_part_1_M', 'series_part_1_O',
       'series_part_1_P', 'series_part_1_T'],
      dtype='object')

# Embedding layer training

In [388]:

class Autoencoder(nn.Module):
    def __init__(self, input_dim=1, encoder_dims=[8, 16, 32, 64, 128]):
        super(Autoencoder, self).__init__()

        # Build encoder
        encoder_layers = []
        prev_dim = input_dim
        for hidden_dim in encoder_dims:
            encoder_layers.append(nn.Linear(prev_dim, hidden_dim))
            encoder_layers.append(nn.ReLU())
            prev_dim = hidden_dim
        self.encoder = nn.Sequential(*encoder_layers)

        # Build decoder: mirror the encoder
        decoder_dims = encoder_dims[::-1] + [input_dim]  # [128, 64, ..., 1]
        decoder_layers = []
        prev_dim = decoder_dims[0]
        for hidden_dim in decoder_dims[1:]:
            decoder_layers.append(nn.Linear(prev_dim, hidden_dim))
            if hidden_dim != input_dim:  # no activation on the final output
                decoder_layers.append(nn.ReLU())
            prev_dim = hidden_dim
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


In [398]:
import torch
from torch.utils.data import Dataset, DataLoader

class SeriesDataset(Dataset):
    def __init__(self, series_tokens, series2idx):
        self.data = [series2idx[token] for token in series_tokens]
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        token_idx = self.data[idx]
        return torch.tensor([token_idx], dtype=torch.float32)  # shape [1]

dataset = SeriesDataset(unique_series_sorted, series2idx)

# Use DataLoader to batch data:
batch_size = 128
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Check an example batch:
for batch_inputs, batch_targets in data_loader:
    print("Batch inputs:", batch_inputs)
    print("Batch targets:", batch_targets)
    break  # just to check the first batch

Batch inputs: tensor([ 166,  239, 1140, 1581,  380, 1119, 1112,  514, 1661,  426,  128,  121,
         332, 1066,  858, 1318,  958, 1568,  647,  446,    1,  644,  703, 1598,
         587,  470,  775,  154, 1463,  107,  932,  655,  460,   27,  767, 1648,
        1240, 1658, 1002,  915,  897,  990,  264,  371, 1589, 1222, 1405, 1064,
        1185, 1212, 1169,  996,  520,  452,  627, 1106, 1221,  769,  795,  227,
        1408, 1652,  493, 1299])
Batch targets: tensor([ 166,  239, 1140, 1581,  380, 1119, 1112,  514, 1661,  426,  128,  121,
         332, 1066,  858, 1318,  958, 1568,  647,  446,    1,  644,  703, 1598,
         587,  470,  775,  154, 1463,  107,  932,  655,  460,   27,  767, 1648,
        1240, 1658, 1002,  915,  897,  990,  264,  371, 1589, 1222, 1405, 1064,
        1185, 1212, 1169,  996,  520,  452,  627, 1106, 1221,  769,  795,  227,
        1408, 1652,  493, 1299])


In [420]:
import torch.optim as optim

model = Autoencoder()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

num_epochs = 500

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    for inputs in dataloader:
        targets = inputs.clone()  # targets are the same as inputs

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)
    
    epoch_loss = running_loss / len(dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

Epoch [1/500], Loss: 993880.3843
Epoch [2/500], Loss: 993088.2454
Epoch [3/500], Loss: 992132.8519
Epoch [4/500], Loss: 990948.2315
Epoch [5/500], Loss: 989200.0787
Epoch [6/500], Loss: 985941.0648
Epoch [7/500], Loss: 980126.9352
Epoch [8/500], Loss: 970489.3565
Epoch [9/500], Loss: 954254.2222
Epoch [10/500], Loss: 926420.2384
Epoch [11/500], Loss: 880802.3125
Epoch [12/500], Loss: 807578.2546
Epoch [13/500], Loss: 696668.2060
Epoch [14/500], Loss: 538160.8449
Epoch [15/500], Loss: 342801.1354
Epoch [16/500], Loss: 148452.6209
Epoch [17/500], Loss: 24664.6287
Epoch [18/500], Loss: 925.0712
Epoch [19/500], Loss: 1643.8867
Epoch [20/500], Loss: 108.8658
Epoch [21/500], Loss: 55.6894
Epoch [22/500], Loss: 14.7656
Epoch [23/500], Loss: 1.6954
Epoch [24/500], Loss: 1.2629
Epoch [25/500], Loss: 0.5797
Epoch [26/500], Loss: 0.4783
Epoch [27/500], Loss: 0.4526
Epoch [28/500], Loss: 0.4480
Epoch [29/500], Loss: 0.4366
Epoch [30/500], Loss: 0.4332
Epoch [31/500], Loss: 0.4398
Epoch [32/500], L

In [416]:
model.eval()
wrong_predictions = []

idx2series = {v: k for k, v in series2idx.items()}

with torch.no_grad():
    for token in unique_series_sorted:
        true_idx = series2idx[token]
        input_tensor = torch.tensor([[true_idx]], dtype=torch.float32)  # shape: [1, 1]

        output = model(input_tensor)
        predicted_idx = int(torch.round(output).item())  # round to nearest index

        # Clamp to valid index range
        predicted_idx = max(0, min(predicted_idx, len(idx2series) - 1))

        if predicted_idx != true_idx:
            predicted_token = idx2series[predicted_idx]
            wrong_predictions.append((token, predicted_token))

# Show results
print(f"Total wrong predictions: {len(wrong_predictions)} / {len(unique_series_sorted)}\n")
for true_token, predicted_token in wrong_predictions:
    print(f"{true_token} ➜ {predicted_token}")


Total wrong predictions: 0 / 1728



In [446]:

with torch.no_grad():
    input_tensor = torch.tensor([[series2idx['AAC']]], dtype=torch.float32)
    
    out = model(input_tensor)
    print(torch.round(out).item())

2.0


In [448]:
out

tensor([[1.9967]])