In [42]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from sklearn.model_selection import train_test_split
import numpy as np
import random
import matplotlib.pyplot as plt
from tqdm import tqdm

from torchinfo import summary

from data_model import create_dataloaders
from utils import accuracy, set_seeds
from LSTM_model import ArtikelLSTM

from Transformer_model import ArtikelTransformer

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


# Grammatical gender -> class index.
gender_to_idx = {
    gender: idx for idx, gender in enumerate(("masculine", "feminine", "neutral"))
}

# German definite article -> class index (der=0, die=1, das=2).
artikel_to_idx = dict(zip(("der", "die", "das"), range(3)))

In [43]:
# Seed the random number generators before the data is split and the model is
# created, so that reruns are comparable.
# NOTE(review): which RNGs get seeded depends on utils.set_seeds, which is not
# shown in this notebook — confirm it covers random, numpy and torch.
set_seeds(seed=42)

In [44]:
# Build the train and test loaders from the word list file.
# NOTE(review): the exact meaning of data_fraction / test_size is defined in
# data_model.create_dataloaders (not shown) — presumably the fraction of the
# file to use and the held-out share; confirm.
train_dataloader, test_dataloader = create_dataloaders(
    file_path="words_big.txt",
    data_fraction=0.2,
    test_size=0.2,
    batch_size=64,
)

Words with der: 22485, words with die: 28411, words with das: 14364


In [45]:
# Initialize the model
# NOTE(review): char_to_idx is neither defined nor imported in the cells shown
# here — confirm it exists on a fresh kernel (Restart & Run All).
vocab_size = len(char_to_idx)

# embedding_dim, hidden_dim, num_heads and num_layers are only referenced by the
# commented-out ArtikelTransformer alternative below; num_filters is not
# referenced in the visible cells. The ArtikelLSTM call passes its own literals.
embedding_dim = 8
hidden_dim = 16
num_filters = 16
num_heads = 8
num_layers = 4

model = ArtikelLSTM(
    vocab_size,
    embedding_dim=32,
    hidden_dim=8,
    dropout=0.8,
    num_layers=1,
)
#model = ArtikelTransformer(vocab_size, embedding_dim, num_heads, hidden_dim, num_layers).to(device)
model.to(device)

# Per-layer summary for a dummy integer input of shape (1, 10).
summary(
    model,
    input_size=(1, 10),
    dtypes=[torch.long],
    col_names=["input_size", "output_size", "num_params", "trainable"],
    col_width=20,
    row_settings=["var_names"],
)

Layer (type (var_name))                  Input Shape          Output Shape         Param #              Trainable
ArtikelLSTM (ArtikelLSTM)                [1, 10]              [1, 3]               --                   True
├─Embedding (embedding)                  [1, 10]              [1, 10, 32]          992                  True
├─LSTM (lstm)                            [1, 10, 32]          [1, 10, 16]          2,688                True
├─Sequential (indices_fc)                [1, 16]              [1, 3]               --                   True
│    └─Linear (0)                        [1, 16]              [1, 3]               51                   True
Total params: 3,731
Trainable params: 3,731
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.03
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.01
Estimated Total Size (MB): 0.02

In [46]:
# Adam optimiser over every parameter of the model.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

# Per-epoch history; one value is appended to each list per training epoch.
train_losses, test_losses, test_accuracies = [], [], []

In [None]:
# Training loop
#
# Every epoch runs one optimisation pass over train_dataloader followed by one
# evaluation pass over test_dataloader. The per-epoch mean losses are appended
# to train_losses / test_losses as plain Python floats, and the mean of the
# per-batch accuracies is appended to test_accuracies.
num_epochs = 5
for epoch in range(num_epochs):
    # --- training pass ------------------------------------------------------
    model.train()
    train_loss = 0.0
    for words, indices in train_dataloader:
        optimizer.zero_grad()

        # Forward pass
        indices_output = model(words.to(device))

        # Calculate loss
        loss = F.cross_entropy(indices_output, indices.to(device))
        # Accumulate a plain float: summing the tensor itself would keep every
        # batch's autograd graph alive until the end of the epoch.
        train_loss += loss.item()

        loss.backward()
        optimizer.step()

    train_loss /= len(train_dataloader)
    train_losses.append(train_loss)

    # --- evaluation pass ----------------------------------------------------
    # inference_mode() only turns off gradient tracking; eval() is what
    # disables train-only behaviour such as dropout.
    model.eval()
    test_loss = 0.0
    test_accuracy = 0
    with torch.inference_mode():
        for words, indices in test_dataloader:
            # Forward pass
            indices_output = model(words.to(device))

            # Calculate loss
            test_loss += F.cross_entropy(indices_output, indices.to(device)).item()
            # accuracy() comes from utils (not shown); it is given the true
            # labels and the arg-max predictions, both on the CPU.
            test_accuracy += accuracy(indices, indices_output.argmax(dim=1).cpu())

    test_loss /= len(test_dataloader)
    test_losses.append(test_loss)

    test_accuracy /= len(test_dataloader)
    test_accuracies.append(test_accuracy)

    print(f"Epoch {epoch+1}, Train_loss: {train_loss:.4f}, Test_loss: {test_loss:.4f}, Accuracy: {test_accuracy:.4f}")

In [None]:
# Loss curves: one point per completed epoch for the test and the train split.
fig, ax = plt.subplots()
ax.plot(test_losses, label="test")
ax.plot(train_losses, label="train")
ax.set_xlabel("Epoch")
ax.set_ylabel("Cross-entropy loss")
ax.set_title("Loss per epoch")
ax.legend()
plt.show()

In [None]:
# Test accuracy per completed epoch. Only the test split is plotted because no
# train_accuracies list is collected anywhere in the visible cells.
fig, ax = plt.subplots()
ax.plot(test_accuracies, label="test")
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy")
ax.set_title("Test accuracy per epoch")
ax.legend()
plt.show()

In [None]:
def predict(model, word, char_to_idx):
    """Predict the German definite article for a single word.

    Args:
        model: Classifier that is called as ``model(tensor)`` with a
            ``(1, len(word))`` tensor of character indices; the prediction is
            the arg-max of its output along dim 1. The model is switched to
            eval mode and left in that mode.
        word: Word to classify. It is lower-cased before the lookup.
        char_to_idx: Mapping from a single character to its integer index.

    Returns:
        The key of the module-level ``artikel_to_idx`` mapping whose value
        equals the predicted class index.

    Raises:
        ValueError: If ``word`` is empty.
        KeyError: If ``word`` contains characters missing from ``char_to_idx``.
    """
    chars = word.lower()
    if not chars:
        raise ValueError("predict() needs a non-empty word")

    # Report all unsupported characters at once instead of a bare KeyError on
    # the first one.
    unknown = sorted({char for char in chars if char not in char_to_idx})
    if unknown:
        raise KeyError(f"characters not in char_to_idx: {unknown!r} (word={word!r})")

    # Put the input on the device the model actually lives on; fall back to the
    # notebook-level ``device`` only for a model without parameters.
    first_param = next(iter(model.parameters()), None)
    target_device = first_param.device if first_param is not None else device

    model.eval()
    word_indices = [char_to_idx[char] for char in chars]
    # unsqueeze(0) adds the batch dimension: (len(word),) -> (1, len(word)).
    word_tensor = torch.tensor(word_indices, dtype=torch.long).unsqueeze(0).to(target_device)

    with torch.no_grad():
        output = model(word_tensor)
        idx = torch.argmax(output, dim=1).item()

    # Invert the article -> index mapping to translate the class index back.
    idx_to_artikel = {v: k for k, v in artikel_to_idx.items()}

    return idx_to_artikel[idx]

# Example usage: print the predicted article for a single word.
# NOTE(review): char_to_idx is not defined or imported in the visible cells —
# confirm where it comes from so this runs on a fresh kernel.
print(predict(model, "Popo", char_to_idx))