In [20]:
# Raw-file URL of `esp.testa` from a GitHub-hosted copy of the CoNLL-2002 Spanish corpus.
url = "https://raw.githubusercontent.com/teropa/nlp/master/resources/corpora/conll2002/esp.testa"

In [21]:
import requests

# Fetch the corpus file. A timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops an HTTP error page from being treated as data.
response = requests.get(url, timeout=30)
response.raise_for_status()
content = response.text

# Peek at the first 300 characters to confirm the "token POS tag" line layout.
print(content[:300])

Sao NC B-LOC
Paulo VMI I-LOC
( Fpa O
Brasil NC B-LOC
) Fpt O
, Fc O
23 Z O
may NC O
( Fpa O
EFECOM NP B-ORG
) Fpt O
. Fp O

- Fg O

La DA O
multinacional NC O
española AQ O
Telefónica AQ B-ORG
ha VAI O
impuesto VMP O
un DI O
récord NC O
mundial AQ O
al SP O
poner VMN O
en SP O
servicio NC O
tres DN 


In [22]:
import requests

url = "https://raw.githubusercontent.com/teropa/nlp/master/resources/corpora/conll2002/esp.testa"

# Download with a timeout and an explicit status check so that a failed request
# raises here instead of writing an error page to disk.
response = requests.get(url, timeout=30)
response.raise_for_status()
content = response.text

# Keep a local UTF-8 copy of the file next to the notebook.
with open("esp.testa", "w", encoding="utf-8") as f:
    f.write(content)

print("Dosya kaydedildi!")

Dosya kaydedildi!


In [23]:
!pip install transformers==4.46.3 tokenizers==0.20.3 datasets seqeval accelerate -q

In [24]:
import requests

url_train = "https://raw.githubusercontent.com/teropa/nlp/master/resources/corpora/conll2002/esp.train"
url_val   = "https://raw.githubusercontent.com/teropa/nlp/master/resources/corpora/conll2002/esp.testa"
url_test  = "https://raw.githubusercontent.com/teropa/nlp/master/resources/corpora/conll2002/esp.testb"


def _download_text(url, timeout=30):
    """Return the body of `url` as text.

    Raises `requests.HTTPError` on a non-2xx response and a `requests` timeout
    exception if the server does not answer within `timeout` seconds, so a failed
    download cannot silently end up being parsed as corpus data.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


raw_train = _download_text(url_train)
raw_val   = _download_text(url_val)
raw_test  = _download_text(url_test)

# Peek at the first 300 characters of the training split.
print(raw_train[:300])

Melbourne NP B-LOC
( Fpa O
Australia NP B-LOC
) Fpt O
, Fc O
25 Z O
may NC O
( Fpa O
EFE NC B-ORG
) Fpt O
. Fp O

- Fg O

El DA O
Abogado NC B-PER
General AQ I-PER
del SP I-PER
Estado NC I-PER
, Fc O
Daryl VMI B-PER
Williams NC I-PER
, Fc O
subrayó VMI O
hoy RG O
la DA O
necesidad NC O
de SP O
tomar


In [25]:
def parse_conll(raw_text):
    """Parse CoNLL-style column text into sentences.

    Each non-blank line holds whitespace-separated columns; the first column is
    taken as the token and the last column as its label. Blank lines separate
    sentences, and `-DOCSTART-` document markers are treated as boundaries
    rather than as tokens.

    Args:
        raw_text: Full contents of a CoNLL-formatted file as a single string.

    Returns:
        A list of `(tokens, labels)` tuples, one per sentence, where `tokens`
        and `labels` are equal-length lists of strings.

    Raises:
        ValueError: If a non-blank line has fewer than two columns. Such a line
            has no label, and using the token as its own label would silently
            corrupt the label inventory.
    """
    sentences = []
    tokens = []
    labels = []

    for line_no, line in enumerate(raw_text.splitlines(), start=1):
        line = line.strip()

        # Blank lines and document markers both close the current sentence.
        if not line or line.startswith("-DOCSTART-"):
            if tokens:
                sentences.append((tokens, labels))
                tokens = []
                labels = []
            continue

        parts = line.split()
        if len(parts) < 2:
            raise ValueError(
                f"Malformed CoNLL line {line_no}: expected a token and a label, got {line!r}"
            )

        tokens.append(parts[0])
        labels.append(parts[-1])

    # Flush the last sentence when the text does not end with a blank line.
    if tokens:
        sentences.append((tokens, labels))

    return sentences

# Parse every raw split into a list of (tokens, labels) sentences.
train_sentences, val_sentences, test_sentences = (
    parse_conll(raw_split) for raw_split in (raw_train, raw_val, raw_test)
)

# Sentence counts per split, shown as the cell output.
tuple(len(split) for split in (train_sentences, val_sentences, test_sentences))

(8323, 1915, 1517)

In [26]:
# Collect every distinct tag that appears in any of the three splits.
label_set = {
    tag
    for split in (train_sentences, val_sentences, test_sentences)
    for _, sentence_tags in split
    for tag in sentence_tags
}

# Sort so that the label -> id mapping built from this list is deterministic.
labels_list = sorted(label_set)
labels_list

['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']

In [27]:
# Tag string -> integer class id, following the sorted order of labels_list.
label2id = dict(zip(labels_list, range(len(labels_list))))
# Inverse mapping: integer class id -> tag string.
id2label = dict(enumerate(labels_list))

In [28]:
from transformers import AutoTokenizer

# Checkpoint identifier; the same name is reused below to load the token-classification model.
model_name = "dccuchile/bert-base-spanish-wwm-cased"
# Tokenizer belonging to that checkpoint.
# NOTE(review): tokenize_and_align calls enc.word_ids(), which needs a "fast" tokenizer —
# presumably AutoTokenizer returns one for this checkpoint; verify.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [29]:
def tokenize_and_align(tokens, labels):
    """Tokenize one pre-split sentence and align its word-level labels to sub-tokens.

    Only the first sub-token of each word receives the word's label id (looked up
    in the notebook-level `label2id`). Positions with no word id, and every later
    sub-token of a word, receive -100.

    Args:
        tokens: List of word strings for one sentence.
        labels: List of tag strings, one per entry in `tokens`.

    Returns:
        The tokenizer's encoding with an extra "labels" entry holding one label
        id (or -100) per sub-token position.
    """
    enc = tokenizer(tokens, is_split_into_words=True, truncation=True, return_offsets_mapping=True)

    word_ids = enc.word_ids()
    # Pair each position with the word id of the position before it (None for the first).
    preceding_ids = [None] + word_ids[:-1]

    enc["labels"] = [
        label2id[labels[current]] if current is not None and current != preceding else -100
        for current, preceding in zip(word_ids, preceding_ids)
    ]
    return enc

In [30]:
import torch
from torch.utils.data import Dataset

class NERDataset(Dataset):
    """Map-style dataset over parsed `(tokens, labels)` sentences.

    Sentences are tokenized lazily: each `__getitem__` call runs
    `tokenize_and_align` on the requested sentence and converts every field of
    the resulting encoding to a tensor.
    """

    def __init__(self, data):
        # Sequence of (tokens, labels) pairs, kept as given.
        self.data = data

    def __len__(self):
        """Return the number of sentences."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded sentence at `idx` as a dict of tensors."""
        words, tags = self.data[idx]
        encoded = tokenize_and_align(words, tags)

        tensors = {}
        for field, values in encoded.items():
            tensors[field] = torch.tensor(values)
        return tensors

# Wrap each parsed split in a dataset; tokenization happens on access.
train_dataset, val_dataset, test_dataset = (
    NERDataset(split) for split in (train_sentences, val_sentences, test_sentences)
)

In [None]:
def collate_fn(batch):
    """Pad a list of encoded sentences into one batch of equal-length tensors.

    Args:
        batch: List of dicts that each hold 1-D tensors under "input_ids",
            "attention_mask" and "labels".

    Returns:
        A dict with those three keys. Each value stacks the per-sentence tensors
        batch-first, right-padded to the longest sentence in the batch with the
        tokenizer's pad id, 0 and -100 respectively. Any other keys present in
        the items are dropped.
    """
    pad = torch.nn.utils.rnn.pad_sequence

    # Fill value per field; insertion order fixes the key order of the result.
    fill_values = {
        "input_ids": tokenizer.pad_token_id,
        "attention_mask": 0,
        "labels": -100,
    }

    return {
        field: pad([example[field] for example in batch], batch_first=True, padding_value=fill)
        for field, fill in fill_values.items()
    }

In [32]:
from torch.utils.data import DataLoader

batch_size = 16

# Only the training split is shuffled; validation and test keep the file order.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

In [33]:
from transformers import AutoModelForTokenClassification

# Load the pretrained checkpoint with a token-classification head sized to the tag set.
# The id2label / label2id mappings are passed along so they are stored in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(labels_list),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# Number of mini-batches whose gradients are accumulated before each optimizer step
# in the training loop.
gradient_accumulation_steps = 2

In [35]:
# AdamW over all model parameters with a learning rate of 3e-5.
# NOTE(review): the following cell creates an identical optimizer after moving the model
# to the device, and gradient_accumulation_steps is already set in the previous cell —
# this cell looks redundant; confirm before removing.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
gradient_accumulation_steps = 2

In [36]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Module.to() moves the parameters in place and returns the same module.
model = model.to(device)

# Build the optimizer once the model sits on its final device.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=3e-5)
num_epochs = 5

In [None]:
# Fine-tune with gradient accumulation: gradients from `gradient_accumulation_steps`
# consecutive mini-batches are summed before each optimizer update.

# Start from clean gradients in case this cell was interrupted and re-run.
optimizer.zero_grad()
num_batches = len(train_loader)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for step, batch in enumerate(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        # Scale so the accumulated gradient is the mean over the accumulated batches.
        loss = outputs.loss / gradient_accumulation_steps
        loss.backward()

        # Update at every accumulation boundary AND on the last batch of the epoch.
        # Without the second condition, a batch count that is not a multiple of
        # gradient_accumulation_steps leaves the final gradients unapplied and
        # leaks them into the first update of the next epoch.
        at_boundary = (step + 1) % gradient_accumulation_steps == 0
        is_last_batch = (step + 1) == num_batches
        if at_boundary or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

        # Undo the scaling so the reported value is the plain per-batch loss.
        total_loss += loss.item() * gradient_accumulation_steps

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}")

Epoch 1/5, Loss: 0.1385
Epoch 2/5, Loss: 0.0418
Epoch 3/5, Loss: 0.0268
Epoch 4/5, Loss: 0.0192
Epoch 5/5, Loss: 0.0133


# Plain training loop: one optimizer update per mini-batch, no gradient accumulation.
# NOTE(review): this block has no cell prompt in the export and repeats the training
# loop above without accumulation. If it is executed after that loop, it keeps training
# the already fine-tuned model for another `num_epochs` epochs — confirm whether it is
# a leftover that should be removed.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        # Move every tensor of the batch to the training device.
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    # Mean per-batch loss over the epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}")

In [38]:
from seqeval.metrics import classification_report

# Run the fine-tuned model over the test split and collect, per sentence, the
# predicted and gold tag sequences for the positions that carry a real label.
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    # Iterate over the loader alone: zipping it with `test_sentences` paired whole
    # batches with single sentences and only worked because the loader is shorter.
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        logits = model(**batch).logits
        preds = logits.argmax(dim=-1).cpu().numpy()
        labs  = batch["labels"].cpu().numpy()

        for p_seq, l_seq in zip(preds, labs):
            # -100 marks padding and unlabeled sub-token positions; skip them.
            labeled = l_seq != -100
            all_preds.append([id2label[int(p)] for p in p_seq[labeled]])
            all_labels.append([id2label[int(l)] for l in l_seq[labeled]])

print(classification_report(all_labels, all_preds))

              precision    recall  f1-score   support

         LOC       0.87      0.86      0.86      1084
        MISC       0.68      0.71      0.69       340
         ORG       0.85      0.90      0.88      1400
         PER       0.96      0.98      0.97       735

   micro avg       0.86      0.88      0.87      3559
   macro avg       0.84      0.86      0.85      3559
weighted avg       0.86      0.88      0.87      3559



In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from seqeval.metrics import classification_report, f1_score as seq_f1, precision_score as seq_precision, recall_score as seq_recall

# Entity-level scores from seqeval, computed on the tag sequences gathered during evaluation.
print("\n===== ENTITY LEVEL REPORT (seqeval) =====")
print(classification_report(all_labels, all_preds))

print("\n===== OVERALL MICRO METRICS =====")
print(f"Micro Precision: {seq_precision(all_labels, all_preds):.4f}")
print(f"Micro Recall:    {seq_recall(all_labels, all_preds):.4f}")
print(f"Micro F1-score:  {seq_f1(all_labels, all_preds):.4f}")

print("\n===== OVERALL MACRO METRICS =====")
print(f"Macro F1-score:  {seq_f1(all_labels, all_preds, average='macro'):.4f}")

print("\n===== OVERALL WEIGHTED METRICS =====")
print(f"Weighted F1-score: {seq_f1(all_labels, all_preds, average='weighted'):.4f}")


# Token-level accuracy: one True/False per labeled position, flattened over all sentences.
token_hits = [
    gold == guess
    for gold_seq, guess_seq in zip(all_labels, all_preds)
    for gold, guess in zip(gold_seq, guess_seq)
]
total_correct = sum(token_hits)
total_tokens = len(token_hits)

accuracy = total_correct / total_tokens

print("\n===== ACCURACY (NER için anlamlı değildir ama ekledim) =====")
print(f"Token Accuracy: {accuracy:.4f}")


===== ENTITY LEVEL REPORT (seqeval) =====
              precision    recall  f1-score   support

         LOC       0.87      0.86      0.86      1084
        MISC       0.68      0.71      0.69       340
         ORG       0.85      0.90      0.88      1400
         PER       0.96      0.98      0.97       735

   micro avg       0.86      0.88      0.87      3559
   macro avg       0.84      0.86      0.85      3559
weighted avg       0.86      0.88      0.87      3559


===== OVERALL MICRO METRICS =====
Micro Precision: 0.8634
Micro Recall:    0.8842
Micro F1-score:  0.8737

===== OVERALL MACRO METRICS =====
Macro F1-score:  0.8501

===== OVERALL WEIGHTED METRICS =====
Weighted F1-score: 0.8738

===== ACCURACY (NER için anlamlı değildir ama ekledim) =====
Token Accuracy: 0.9849
