In [31]:
import lzma
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
import pandas as pd
from pathlib import Path
from collections import Counter

In [32]:
def load_train_data(path):
    """Load the xz-compressed training TSV as a list of ``(sentence, tags)`` pairs.

    The file has no header row; the first tab-separated column is the tag
    string and the second is the sentence.

    Args:
        path: Location of the ``.tsv.xz`` file.

    Returns:
        list[tuple[str, str]]: One ``(sentence, tag_string)`` pair per row,
        in file order.
    """
    import csv  # local import: only the QUOTE_NONE constant is needed

    with lzma.open(path, mode='rt', encoding='utf-8') as f:
        df = pd.read_csv(
            f,
            sep='\t',
            header=None,
            names=["tag", "sentence"],
            # Keep every cell as text so downstream .strip()/.split() calls work.
            dtype=str,
            # Natural-language text contains literal double quotes; with the
            # default quoting an opening quote would swallow tabs and newlines
            # until a matching quote (or EOF) is found.
            quoting=csv.QUOTE_NONE,
            # Do not turn cells such as "NA" or "null" into float NaN.
            keep_default_na=False,
        )
    return list(zip(df["sentence"], df["tag"]))

def _read_lines(path):
    """Return the lines of a UTF-8 text file, splitting on newlines only."""
    text = Path(path).read_text(encoding='utf-8')
    # str.splitlines() would also break on \x0b, \x0c, \x1c-\x1e, \x85, \u2028
    # and \u2029, which can occur inside a sentence and would shift every
    # following line out of alignment with the other file.
    lines = text.split('\n')
    if lines and lines[-1] == '':
        # A trailing newline (or an empty file) yields one spurious empty entry.
        lines.pop()
    return lines


def load_eval_data(in_path, expected_path):
    """Load an evaluation split as a list of ``(sentence, tags)`` pairs.

    Args:
        in_path: Text file with one sentence per line.
        expected_path: Text file with one tag string per line, parallel to
            ``in_path``.

    Returns:
        list[tuple[str, str]]: ``(sentence, tag_string)`` pairs in file order.

    Raises:
        ValueError: If the two files do not have the same number of lines.
    """
    sentences = _read_lines(in_path)
    tags = _read_lines(expected_path)
    if len(sentences) != len(tags):
        # zip() would silently truncate to the shorter file and hide the problem.
        raise ValueError(
            f"Line count mismatch: {in_path} has {len(sentences)} lines, "
            f"{expected_path} has {len(tags)}"
        )
    return list(zip(sentences, tags))

# Input locations, given relative to the current working directory.
TRAIN_PATH = "train/train.tsv.xz"
DEV_IN_PATH = "dev-0/in.tsv"
DEV_EXPECTED_PATH = "dev-0/expected.tsv"

# Raw (sentence, tag-string) pairs for the training and validation splits.
train = load_train_data(TRAIN_PATH)
val = load_eval_data(DEV_IN_PATH, DEV_EXPECTED_PATH)

In [33]:
def tokenize_and_align(data):
    """Split sentences and tag strings on whitespace and pair them token by token.

    Args:
        data: Iterable of ``(sentence, tag_string)`` pairs.

    Returns:
        list[list[tuple[str, str]]]: One list of ``(word, tag)`` tuples per
        kept example. Examples whose word count differs from their tag count
        are dropped.
    """
    split_pairs = (
        (sentence.strip().split(), tag_seq.strip().split())
        for sentence, tag_seq in data
    )
    return [
        list(zip(words, tags))
        for words, tags in split_pairs
        if len(words) == len(tags)  # skip examples that cannot be aligned
    ]

# Turn both raw splits into lists of aligned (word, tag) sequences.
train_data, val_data = (tokenize_and_align(split) for split in (train, val))

# NOTE(review): "Traning" below is a typo for "Training"; the string is left
# unchanged here so it keeps matching the recorded cell output.
print(f"\nTraning set: {len(train_data)} examples")
print(f"Validation set: {len(val_data)} examples")


Traning set: 945 examples
Validation set: 215 examples


In [34]:
def build_vocab(dataset):
    """Build word and tag lookup tables from tokenised sentences.

    Args:
        dataset: Iterable of sentences, each a sequence of ``(word, tag)`` pairs.

    Returns:
        tuple: ``(word2idx, tag2idx, idx2tag)`` where

        * ``word2idx`` maps lower-cased words to ids starting at 2, most
          frequent word first, plus ``"<PAD>" -> 0`` and ``"<UNK>" -> 1``;
        * ``tag2idx`` maps tags to ids in order of first appearance;
        * ``idx2tag`` is the inverse of ``tag2idx``.
    """
    pairs = [pair for sentence in dataset for pair in sentence]
    word_freq = Counter(word.lower() for word, _ in pairs)
    tag_freq = Counter(tag for _, tag in pairs)

    # Ids 0 and 1 are reserved for the special tokens added below.
    word2idx = {}
    for position, (word, _) in enumerate(word_freq.most_common(), start=2):
        word2idx[word] = position
    word2idx.update({"<PAD>": 0, "<UNK>": 1})

    tag2idx = {}
    idx2tag = {}
    for position, tag in enumerate(tag_freq):
        tag2idx[tag] = position
        idx2tag[position] = tag
    return word2idx, tag2idx, idx2tag

# Build the word and tag lookup tables from the training split only.
word2idx, tag2idx, idx2tag = build_vocab(train_data)

In [35]:
class NERDataset(Dataset):
    """Fixed-length view over tokenised sentences for sequence tagging.

    Each item is a pair of 1-D tensors of length ``max_len``: word ids and
    tag ids. Shorter sentences are right-padded, longer ones are cut off
    after ``max_len`` tokens.
    """

    def __init__(self, data, word2idx, tag2idx, max_len=50):
        """Store the sentences and the lookup tables.

        Args:
            data: Sequence of sentences, each a list of ``(word, tag)`` pairs.
            word2idx: Word-to-id mapping containing ``"<PAD>"`` and ``"<UNK>"``.
            tag2idx: Tag-to-id mapping.
            max_len: Length every returned sequence is padded or truncated to.
        """
        self.data, self.max_len = data, max_len
        self.word2idx, self.tag2idx = word2idx, tag2idx

    def __len__(self):
        """Return the number of sentences."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(word_ids, tag_ids)`` tensors for sentence ``idx``."""
        pairs = self.data[idx]
        limit = self.max_len

        # Words are lower-cased before lookup; unknown words fall back to <UNK>.
        token_ids = [
            self.word2idx.get(word.lower(), self.word2idx["<UNK>"])
            for word, _ in pairs
        ]
        tag_ids = [self.tag2idx[tag] for _, tag in pairs]

        # Pad generously, then cut to the target length.
        # Label padding is -100. (Original author's remark, translated from
        # Polish: someone should make this a built-in PyTorch method, this is
        # excessive.)
        padded_tokens = (token_ids + [self.word2idx["<PAD>"]] * limit)[:limit]
        padded_tags = (tag_ids + [-100] * limit)[:limit]
        return torch.tensor(padded_tokens), torch.tensor(padded_tags)

In [36]:
# Same batch size for both loaders.
BATCH_SIZE = 32

# Wrap both splits with the vocabulary built from the training data.
train_dataset, val_dataset = (
    NERDataset(split, word2idx, tag2idx) for split in (train_data, val_data)
)

# Only the training loader shuffles.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [37]:
class BiLSTM_NER(nn.Module):
    """Embedding -> bidirectional LSTM -> per-token linear tag classifier."""

    def __init__(self, vocab_size, tagset_size, emb_dim=100, hidden_dim=128):
        """Create the layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            tagset_size: Number of output tags.
            emb_dim: Embedding size.
            hidden_dim: Requested combined size of both LSTM directions. Each
                direction gets ``hidden_dim // 2`` units, so for an odd value
                the combined size is ``hidden_dim - 1``.
        """
        super().__init__()
        per_direction = hidden_dim // 2
        # Index 0 is registered as the padding index.
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, per_direction, batch_first=True, bidirectional=True)
        # Size the classifier from the real LSTM output width; using
        # ``hidden_dim`` directly fails in forward() when hidden_dim is odd.
        self.fc = nn.Linear(2 * per_direction, tagset_size)

    def forward(self, x):
        """Return per-token tag scores for a batch of word-id sequences.

        Args:
            x: Integer tensor of word ids, batch dimension first.

        Returns:
            Tensor with ``tagset_size`` scores in the last dimension for every
            input position.
        """
        emb = self.embedding(x)
        lstm_out, _ = self.lstm(emb)
        out = self.fc(lstm_out)
        return out

In [38]:
# Seed PyTorch's global RNG before the model is created so that weight
# initialisation is repeatable across "Restart & Run All". A DataLoader with
# shuffle=True and no explicit generator also draws its shuffle seed from this
# global RNG. Bit-identical results on CUDA are still not guaranteed.
SEED = 42
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BiLSTM_NER(len(word2idx), len(tag2idx)).to(device)
# ignore_index=-100 matches the label padding value, so padded positions do
# not contribute to the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [39]:
epoch_number = 20

for epoch in range(epoch_number):
    model.train()
    total_loss = 0  # running sum of the per-batch loss values for this epoch
    for x_batch, y_batch in train_loader:
        x_batch = x_batch.to(device)
        # Flatten the targets to one label per token position.
        y_batch = y_batch.to(device).view(-1)

        optimizer.zero_grad()
        outputs = model(x_batch)
        # Flatten the scores the same way: one row of tag scores per position.
        outputs = outputs.view(-1, outputs.size(-1))

        loss = loss_fn(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1} | Loss: {total_loss:.4f}")

Epoch 1 | Loss: 46.2019
Epoch 2 | Loss: 25.6757
Epoch 3 | Loss: 23.1771
Epoch 4 | Loss: 21.2837
Epoch 5 | Loss: 19.3272
Epoch 6 | Loss: 17.4752
Epoch 7 | Loss: 15.6517
Epoch 8 | Loss: 13.9306
Epoch 9 | Loss: 12.2374
Epoch 10 | Loss: 10.6850
Epoch 11 | Loss: 9.2959
Epoch 12 | Loss: 8.0934
Epoch 13 | Loss: 7.0023
Epoch 14 | Loss: 6.0746
Epoch 15 | Loss: 5.2888
Epoch 16 | Loss: 4.5763
Epoch 17 | Loss: 3.9698
Epoch 18 | Loss: 3.4438
Epoch 19 | Loss: 2.9841
Epoch 20 | Loss: 2.5669


In [40]:
model.eval()
all_preds = []
all_labels = []

# Collect predicted and gold tag ids for every non-padded validation token.
with torch.no_grad():
    for x_batch, y_batch in val_loader:
        x_batch = x_batch.to(device)
        outputs = model(x_batch)
        preds = outputs.argmax(dim=-1).cpu().numpy()
        labels = y_batch.numpy()

        # Boolean-mask indexing walks the batch row by row, so the flattened
        # order is the same as iterating sequences and then positions.
        keep = labels != -100
        all_preds.extend(preds[keep])
        all_labels.extend(labels[keep])

In [41]:
# Token-level macro-averaged F1 over the collected validation labels.
f1 = f1_score(y_true=all_labels, y_pred=all_preds, average="macro")
print(f"Validation F1 (macro): {f1:.4f}")

Validation F1 (macro): 0.6243
