In [26]:
from deeplotx import SoftmaxRegression, LongformerEncoder
from util import NUM_CLASSES
lf_encoder = LongformerEncoder(model_name_or_path='severinsimmler/xlm-roberta-longformer-base-16384')

[DEBUG] 2025-08-03 13:47:19,404 deeplotx.embedding : LongformerEncoder initialized on device: cuda.


In [27]:
import torch
from deeplotx.util import sha256
from vortezwohl.cache import LRUCache

CACHE = LRUCache(capacity=16384)

def encode(text: str) -> torch.Tensor:
    key = sha256(text)
    if key in CACHE:
        return CACHE[key]
    emb = lf_encoder.encode(text, cls_only=False).mean(dim=-2, dtype=model.dtype)
    CACHE[key] = emb
    return emb

In [28]:
import json
from random import shuffle

with open('./data/datasets.json', 'r', encoding='utf-8') as f:
    datasets = json.load(f)
    
train_dataset = datasets['train']
valid_dataset = datasets['valid']
test_dataset = datasets['test']
    
shuffle(train_dataset)
shuffle(valid_dataset)
shuffle(test_dataset)
print('Dataset shuffled', list(zip(train_dataset[:23], train_dataset[:23])))

Dataset shuffled [(['Темпельхоф-Шёнеберг', 5], ['Темпельхоф-Шёнеберг', 5]), (['Муркока', 1], ['Муркока', 1]), (['Kuduro', 8], ['Kuduro', 8]), (['Ататюрка', 1], ['Ататюрка', 1]), (['лыжных', 0], ['лыжных', 0]), (['Рейхштадт', 5], ['Рейхштадт', 5]), (['HaRabbanim', 4], ['HaRabbanim', 4]), (['Pakistaans', 3], ['Pakistaans', 3]), (['Vriendschapsspelen', 7], ['Vriendschapsspelen', 7]), (['Juicy', 1], ['Juicy', 1]), (['afhakken', 0], ['afhakken', 0]), (['Erkin', 2], ['Erkin', 2]), (['Etolia', 5], ['Etolia', 5]), (['herrliche', 8], ['herrliche', 8]), (['Lobuche', 5], ['Lobuche', 5]), (['frames', 0], ['frames', 0]), (['Ionico', 6], ['Ionico', 6]), (['судом', 4], ['судом', 4]), (['заключила', 0], ['заключила', 0]), (['Echinochloa', 5], ['Echinochloa', 5]), (['Монтабер', 2], ['Монтабер', 2]), (['Ipatovo', 5], ['Ipatovo', 5]), (['Whittemore', 2], ['Whittemore', 2])]


## 训练

In [29]:
model = SoftmaxRegression(input_dim=768, output_dim=NUM_CLASSES, num_heads=4, num_layers=3, expansion_factor=1.25, bias=True, dropout_rate=0.2, head_layers=2)
print(model)

Model_Name: SoftmaxRegression
In_Features: 768
Out_Features: 9
Device: cuda
Dtype: torch.float32
Total_Parameters: 49642017
Trainable_Parameters: 49642017
NonTrainable_Parameters: 0
-------------------------------
SoftmaxRegression(
  (multi_head_ffn_layers): ModuleList(
    (0-2): 3 x MultiHeadFeedForward(
      (expand_proj): Linear(in_features=768, out_features=3072, bias=True)
      (ffn_heads): ModuleList(
        (0-3): 4 x FeedForward(
          (ffn_layers): ModuleList(
            (0-1): 2 x FeedForwardUnit(
              (up_proj): Linear(in_features=768, out_features=960, bias=True)
              (down_proj): Linear(in_features=960, out_features=768, bias=True)
              (parametric_relu): PReLU(num_parameters=1)
              (layer_norm): LayerNorm((768,), eps=1e-09, elementwise_affine=True)
            )
          )
        )
      )
      (out_proj): Linear(in_features=3072, out_features=768, bias=True)
    )
  )
  (out_proj): Linear(in_features=768, out_features=9, 

In [30]:
# train
from torch import nn, optim
from torch.utils.tensorboard import SummaryWriter

In [31]:
train_step = 0
valid_step = 0
writer = SummaryWriter()

acc_train_loss = 0.
acc_valid_loss = 0.
eval_interval = 2000
log_interval = 200
valid_log_interval = 50

In [None]:
from random import randint
from util import one_hot

elastic_net_param = {
    'alpha': 2e-4,
    'rho': 0.2
}
learning_rate = 2e-6
num_epochs = 1500
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
for epoch in range(num_epochs):
    model.train()
    for i, (_token, _label) in enumerate(train_dataset):
        _one_hot_label = one_hot(_label).to(model.dtype).to(model.device)
        outputs = model.forward(encode(_token))
        loss = loss_function(outputs, _one_hot_label) + model.elastic_net(alpha=elastic_net_param['alpha'], rho=elastic_net_param['rho'])
        acc_train_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if train_step % log_interval == 0 and train_step > 0:
            writer.add_scalar('train/loss', acc_train_loss / log_interval, train_step)
            print(f'- Train Step {train_step} Loss {acc_train_loss / log_interval} \\'
                  f'\nToken={_token}'
                  f'\nPred={outputs.tolist()}'
                  f'\nLabel={_one_hot_label.tolist()}', flush=True)
            acc_train_loss = 0.
        train_step += 1
        if train_step % eval_interval == 0:
            model.eval()
            rand_idx = randint(0, len(valid_dataset) - 501)
            with torch.no_grad():
                for _i, (__token, __label) in enumerate(valid_dataset[rand_idx: rand_idx + 500]):
                    _one_hot_label = one_hot(__label).to(model.dtype).to(model.device)
                    outputs = model.forward(encode(__token))
                    loss = loss_function(outputs, _one_hot_label)
                    acc_valid_loss += loss.item()
                    if valid_step % valid_log_interval == 0 and valid_step > 0:
                        writer.add_scalar('valid/loss', acc_valid_loss / valid_log_interval, valid_step)
                        print(f'- Valid Step {valid_step} Loss {acc_valid_loss / valid_log_interval} \\'
                              f'\nToken={__token}'
                              f'\nPred={outputs.tolist()}'
                              f'\nLabel={_one_hot_label.tolist()}', flush=True)
                        acc_valid_loss = 0.
                    valid_step += 1
            model.train()

- Train Step 200 Loss 37.698599624633786 \
Token=Gulda
Pred=[0.05101672187447548, 0.08279963582754135, 0.4351254999637604, 0.029791289940476418, 0.022936778143048286, 0.29855313897132874, 0.027337374165654182, 0.023303652182221413, 0.0291359294205904]
Label=[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
- Train Step 400 Loss 37.1756880569458 \
Token=Kutxa
Pred=[0.02882845140993595, 0.0431528277695179, 0.6448128819465637, 0.010775791481137276, 0.008547043427824974, 0.23365026712417603, 0.00929773785173893, 0.009288311935961246, 0.011646737344563007]
Label=[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]
- Train Step 600 Loss 36.84374032974243 \
Token=Stapleton
Pred=[0.01820603385567665, 0.022840633988380432, 0.4889258146286011, 0.007055042777210474, 0.005878622177988291, 0.4381110668182373, 0.006484708748757839, 0.004986331798136234, 0.007511668838560581]
Label=[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
- Train Step 800 Loss 36.55880084991455 \
Token=Assen
Pred=[0.013761444948613644, 0.0

In [None]:
model.eval()
test_tokens = ['Mike', 'John', 'Smith', 'London', 'NYC', 'HongKong', 'China', 'South Africa', 'Korea']
with torch.no_grad():
    for _tok in test_tokens:
        _dist = model.forward(encode(_tok))
        print(f'Token={_tok}, Class={torch.argmax(_dist)}', flush=True)
model.train()
'Test finished.'