In [22]:
import torch
import torch.nn as nn
import configs.common as cc
import torch.nn.functional as F


class MultiLabelSequenceClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
        
    def forward(self, x):
        # x: [batch_size, seq_len] - token IDs
        embeds = self.embedding(x)                    # [batch_size, seq_len, embed_dim]
        out, _ = self.lstm(embeds)                    # [batch_size, seq_len, hidden_dim]
        last_hidden = out[:, -1, :]                   # [batch_size, hidden_dim]
        logits = self.fc(last_hidden)                 # [batch_size, num_classes]
        probs = torch.sigmoid(logits)
        probs = F.softmax(probs).to('cuda')                 # [batch_size, num_classes], probabilities per class
        return probs

# Usage Example
vocab_size = cc.vocab_size
embed_dim = 512
hidden_dim = 512
num_classes = cc.metadata_vocab_size  # Set to your number of possible labels

model = MultiLabelSequenceClassifier(vocab_size, embed_dim, hidden_dim, num_classes).to("cuda")
criterion = nn.BCELoss()  # or nn.BCEWithLogitsLoss (if you remove the sigmoid in the forward method)

def get_set(tensor):
    return [torch.unique(row) for row in tensor]

def make_meta_target(tensor):
    target = torch.zeros(cc.metadata_vocab_size)
    target[tensor] = 1
    return target

def get_all_targets(tensor):
    meta_unique = get_set(tensor)
    return torch.stack([make_meta_target(m) for m in meta_unique])

In [2]:
import sys
sys.path.append('..')
import processing
import configs.common as cc

loader = processing.DatasetLoader('E:/GitHub/dataset/np_dataset')
train_dataloader, test_dataloader = loader.get_dataloaders()
# random_sample = loader.get_random_sample('train')
# random_sample
for src, trg, meta in train_dataloader:
    break
# src, trg, meta = random_sample

In [25]:
import configs.paths as paths
from datetime import datetime
import os
import json

def save_model(model, loss):
    now = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    pretrained_path = paths.config.paths.pretrained

    save_path = f'{pretrained_path}/classifier/loss_{loss:.2f}_time_{now}.pth'

    if not os.path.exists(os.path.dirname(save_path)):
        os.makedirs(os.path.dirname(save_path))
    torch.save(model.state_dict(), save_path)


dataset_path = paths.config.paths.np_dataset
loader = processing.DatasetLoader(dataset_path)
train_dataloader, test_dataloader = loader.get_dataloaders()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=cc.config.values.learning_rate)

# Logging setup
log_data = []
log_file_path = f'training_log_classifier.json'

# Training loop
num_epochs = cc.config.values.epochs
print('Training started!')
log_data.append({'timestamp': str(datetime.now()), 'message': 'Training started!'})

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_idx, (src, trg, meta) in enumerate(train_dataloader):
        output = model(src).to('cuda')
        trg = get_all_targets(meta).to('cuda')

        loss = criterion(output, trg)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if (batch_idx + 1) % cc.config.values.eval_interval == 0:
            msg = f'Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_dataloader)}], Loss: {loss.item():.4f}'
            print(msg)
            log_data.append({'timestamp': str(datetime.now()), 'message': msg})

    avg_loss = total_loss / len(train_dataloader)
    msg = f'Epoch [{epoch+1}/{num_epochs}], Average Loss: {avg_loss:.4f}'
    print(msg)
    log_data.append({'timestamp': str(datetime.now()), 'message': msg})

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for src, trg, meta in test_dataloader:
            output = model(src).to('cuda')
            trg = get_all_targets(meta).to('cuda')
            val_loss += criterion(output, trg).item()

    avg_val_loss = val_loss / len(test_dataloader)
    msg = f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {avg_val_loss:.4f}'
    print(msg)
    log_data.append({'timestamp': str(datetime.now()), 'message': msg})

    if (epoch + 1) % cc.config.values.save_interval == 0:
        save_model(model, avg_val_loss)
        with open(log_file_path, 'w') as f:
            json.dump(log_data, f, indent=2)

print("Training complete!")
log_data.append({'timestamp': str(datetime.now()), 'message': 'Training complete!'})

save_model(model, avg_val_loss)

# Final log save
with open(log_file_path, 'w') as f:
    json.dump(log_data, f, indent=2)

Training started!


  probs = F.softmax(probs).to('cuda')                 # [batch_size, num_classes], probabilities per class


Epoch [1/1000], Step [10/362], Loss: 26.4427
Epoch [1/1000], Step [20/362], Loss: 25.1842
Epoch [1/1000], Step [30/362], Loss: 27.7017
Epoch [1/1000], Step [40/362], Loss: 26.4419
Epoch [1/1000], Step [50/362], Loss: 27.7016
Epoch [1/1000], Step [60/362], Loss: 27.0702
Epoch [1/1000], Step [70/362], Loss: 25.1801
Epoch [1/1000], Step [80/362], Loss: 27.0653
Epoch [1/1000], Step [90/362], Loss: 27.0577
Epoch [1/1000], Step [100/362], Loss: 25.1678
Epoch [1/1000], Step [110/362], Loss: 26.4242
Epoch [1/1000], Step [120/362], Loss: 25.7920
Epoch [1/1000], Step [130/362], Loss: 25.7904
Epoch [1/1000], Step [140/362], Loss: 25.1573
Epoch [1/1000], Step [150/362], Loss: 26.4168
Epoch [1/1000], Step [160/362], Loss: 26.4175
Epoch [1/1000], Step [170/362], Loss: 26.4197
Epoch [1/1000], Step [180/362], Loss: 27.0444
Epoch [1/1000], Step [190/362], Loss: 25.7925
Epoch [1/1000], Step [200/362], Loss: 26.4129
Epoch [1/1000], Step [210/362], Loss: 25.1540
Epoch [1/1000], Step [220/362], Loss: 26.41

KeyboardInterrupt: 

In [2]:
import argparse
import torch
import configs.mamba as cm
import configs.xlstm as cx
import configs.transformer as ct
import configs.common as cc
import models
import math
import processing
from types import SimpleNamespace
import os
from datetime import datetime
import configs.paths as paths
import torch.nn.functional as F
import torch
import json
def get_transformer_dict():
    config = ct.config.model_values
    config.vocab_size = cc.vocab_size
    config.metadata_vocab_size = cc.metadata_vocab_size
    config = SimpleNamespace(**vars(config), **vars(cc.config.values))
    return config
def new_model(type):
    if type == "mamba":
        model = models.mamba.Mamba(512, 12)
    elif type == "xlstm":
        from models.xlstm import xLSTM
        model = xLSTM()
    elif type == "transformer":
        transformer_dict = get_transformer_dict()
        model = models.transformer.Transformer(transformer_dict)
    return model

def load_model(type, name):
    model = new_model(type)
    pretrained = paths.config.paths.pretrained
    model.load_state_dict(torch.load(f'{pretrained}/{type}/{name}'))
    return model

In [5]:
model = new_model('transformer')
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params:,}")

Total parameters: 16,204,002


In [None]:
output = model(src, meta)
output.shape

tensor([   45, 16585, 16665,  ..., 16739, 16750, 16960])