In [None]:
# The original data is no longer supported. It takes a lot more efforts to reconstruct the data.
import requests
import zipfile
import io
from datasets import Dataset, DatasetDict, ClassLabel, Sequence, Features, Value

# Download from DeepAI
url = "https://data.deepai.org/conll2003.zip"
response = requests.get(url)
response.raise_for_status()

# Extract zip in memory
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    print("Files in zip:", z.namelist())
    train_text = z.read("train.txt").decode("utf-8")
    valid_text = z.read("valid.txt").decode("utf-8")
    test_text = z.read("test.txt").decode("utf-8")

def parse_conll(text):
    """Parse CoNLL format into all fields: tokens, pos_tags, chunk_tags, ner_tags."""
    sentences = []
    tokens, pos_tags, chunk_tags, ner_tags = [], [], [], []
    idx = 0
    
    for line in text.strip().split("\n"):
        if line.startswith("-DOCSTART-") or line == "":
            if tokens:
                sentences.append({
                    "id": str(idx),
                    "tokens": tokens,
                    "pos_tags": pos_tags,
                    "chunk_tags": chunk_tags,
                    "ner_tags": ner_tags,
                })
                tokens, pos_tags, chunk_tags, ner_tags = [], [], [], []
                idx += 1
        else:
            parts = line.split()
            if len(parts) >= 4:
                tokens.append(parts[0])
                pos_tags.append(parts[1])
                chunk_tags.append(parts[2])
                ner_tags.append(parts[3])
    
    if tokens:
        sentences.append({
            "id": str(idx),
            "tokens": tokens,
            "pos_tags": pos_tags,
            "chunk_tags": chunk_tags,
            "ner_tags": ner_tags,
        })
    
    return sentences

# Parse all splits
splits = {
    "train": parse_conll(train_text),
    "validation": parse_conll(valid_text),
    "test": parse_conll(test_text),
}

# Define label mappings (matching HF's eriktks/conll2003)
ner_labels = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"]

pos_labels = ['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 
              'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 
              'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 
              'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 
              'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']

chunk_labels = ['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 
                'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 
                'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP']

ner_label2id = {label: i for i, label in enumerate(ner_labels)}
pos_label2id = {label: i for i, label in enumerate(pos_labels)}
chunk_label2id = {label: i for i, label in enumerate(chunk_labels)}

# Convert string tags to IDs
for split in splits:
    for item in splits[split]:
        item["ner_tags"] = [ner_label2id.get(tag, 0) for tag in item["ner_tags"]]
        item["pos_tags"] = [pos_label2id.get(tag, 0) for tag in item["pos_tags"]]
        item["chunk_tags"] = [chunk_label2id.get(tag, 0) for tag in item["chunk_tags"]]

# Create HF Dataset with proper features
features = Features({
    "id": Value("string"),
    "tokens": Sequence(Value("string")),
    "pos_tags": Sequence(ClassLabel(names=pos_labels)),
    "chunk_tags": Sequence(ClassLabel(names=chunk_labels)),
    "ner_tags": Sequence(ClassLabel(names=ner_labels)),
})

dataset = DatasetDict({
    split: Dataset.from_list(data, features=features)
    for split, data in splits.items()
})

print(dataset)
print(dataset["train"][0])

# Save locally for future use
dataset.save_to_disk("../data/conll2003")

Files in zip: ['metadata', 'test.txt', 'train.txt', 'valid.txt']
DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})
{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}


Saving the dataset (0/1 shards):   0%|          | 0/14041 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3250 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3453 [00:00<?, ? examples/s]

In [23]:
dataset["train"][0]["ner_tags"]

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [25]:
ner_feature = dataset["train"].features["ner_tags"]
ner_feature

List(ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']))

In [30]:
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [35]:
words = dataset["train"][4]["tokens"]
labels = dataset["train"][4]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_len = max(len(word), len(full_label))
    line1 += word.ljust(max_len + 1)
    line2 += full_label.ljust(max_len + 1)
print(line1)
print(line2)

Germany 's representative to the European Union 's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer . 
B-LOC   O  O              O  O   B-ORG    I-ORG O  O          O         B-PER  I-PER     O    O  O         O         O      O   O         O    O         O     O    B-LOC   O     O   O          O      O   O       O 


In [36]:
from transformers import AutoTokenizer
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [37]:
inputs = tokenizer(dataset["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [38]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [41]:
# Define a helper function to align labels with tokenized inputs
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_ids != current_word:
            # Start of a new word
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX, change it to I-XXX for subsequent tokens
            if label % 2 == 1:  # Odd index indicates B-XXX
                label += 1
            new_labels.append(label)
    return new_labels

In [42]:
labels = dataset["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
aligned_labels = align_labels_with_tokens(labels, word_ids)
print(aligned_labels)

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


In [43]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [44]:
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [45]:
# Finetuning the model with the trainer API
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)

In [46]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])

In [47]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]
[-100, 1, 2, -100]


In [49]:
import evaluate
metric = evaluate.load("seqeval")

In [50]:
labels = dataset["train"][0]["ner_tags"]
labels = [label_names[l] for l in labels]
labels

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [52]:
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

{'MISC': {'precision': np.float64(1.0),
  'recall': np.float64(0.5),
  'f1': np.float64(0.6666666666666666),
  'number': np.int64(2)},
 'ORG': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(1)},
 'overall_precision': np.float64(1.0),
 'overall_recall': np.float64(0.6666666666666666),
 'overall_f1': np.float64(0.8),
 'overall_accuracy': 0.8888888888888888}

In [53]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [54]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [55]:
# Set up the model
from transformers import AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [56]:
model.config.num_labels

9

In [62]:

# Authenticate Hugging Face Hub
import os
from dotenv import load_dotenv

load_dotenv()

tokenizer.push_to_hub(
    "tensor-polinomics/tokenizer_python_52k",
    token=os.getenv("HF_TOKEN_WRITE")  # Bypasses all cached credentials
)

CommitInfo(commit_url='https://huggingface.co/tensor-polinomics/tokenizer_python_52k/commit/02c25425bb0e3d7b448581ad4472a23b76373de1', commit_message='Upload tokenizer', commit_description='', oid='02c25425bb0e3d7b448581ad4472a23b76373de1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tensor-polinomics/tokenizer_python_52k', endpoint='https://huggingface.co', repo_type='model', repo_id='tensor-polinomics/tokenizer_python_52k'), pr_revision=None, pr_num=None)

In [None]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    output_dir="../data/bert-finetuned-ner",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)

trainer.train()

[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: You chose "Don't visualize my results"




Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0818,0.065672,0.922775,0.929481,0.926116,0.98231
2,0.0383,0.067842,0.940368,0.93944,0.939904,0.985077
3,0.0247,0.066005,0.941729,0.943926,0.942826,0.985798




TrainOutput(global_step=5268, training_loss=0.0723088124346896, metrics={'train_runtime': 10521.092, 'train_samples_per_second': 4.004, 'train_steps_per_second': 0.501, 'total_flos': 920771584279074.0, 'train_loss': 0.0723088124346896, 'epoch': 3.0})

In [68]:
# Save model and tokenizer to disk
trainer.save_model("../data/models/bert-finetuned-ner")
tokenizer.save_pretrained("../data/models/bert-finetuned-ner")

('../data/models/bert-finetuned-ner/tokenizer_config.json',
 '../data/models/bert-finetuned-ner/special_tokens_map.json',
 '../data/models/bert-finetuned-ner/vocab.txt',
 '../data/models/bert-finetuned-ner/added_tokens.json',
 '../data/models/bert-finetuned-ner/tokenizer.json')

In [71]:
from huggingface_hub import whoami

info = whoami()
print(info)  # Check if 'auth' shows write access

{'type': 'user', 'id': '689fea6e1e6dcf030e6e0b99', 'name': 'tensor-polinomics', 'fullname': 'Emma Luo', 'isPro': False, 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/no-auth/gKiHqz6Pl_ACJRxiiU_Dk.png', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'hf_llm', 'role': 'fineGrained', 'createdAt': '2025-12-06T16:39:22.210Z', 'fineGrained': {'canReadGatedRepos': True, 'global': [], 'scoped': [{'entity': {'_id': '689fea6e1e6dcf030e6e0b99', 'type': 'user', 'name': 'tensor-polinomics'}, 'permissions': ['repo.content.read', 'inference.serverless.write']}]}}}}


In [72]:
# Save the most recent version to the Hub
import os
from dotenv import load_dotenv

load_dotenv()

trainer.push_to_hub(
    commit_message="Training complete",
    token=os.getenv("HF_TOKEN_WRITE")  # Bypasses all cached credentials
)

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/tensor-polinomics/bert-finetuned-ner/commit/d7ccd15715bdd55b6f2a357e1de8eabd8eb33adb', commit_message='Training complete', commit_description='', oid='d7ccd15715bdd55b6f2a357e1de8eabd8eb33adb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tensor-polinomics/bert-finetuned-ner', endpoint='https://huggingface.co', repo_type='model', repo_id='tensor-polinomics/bert-finetuned-ner'), pr_revision=None, pr_num=None)

# Section 2. A Custom Training Loop

In [None]:
# Build the dataloaders manually (instead of using Trainer)
from torch.utils.data import DataLoader
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=8,
    collate_fn=data_collator, # define earlier
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator, # define earlier
)

In [None]:
# Reinstantiate the model
from transformers import AutoModelForTokenClassification
model_custom = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

# Optimizer
from torch.optim import AdamW
optimizer = AdamW(model_custom.parameters(), lr=2e-5)

# Add Accelerator for easy device management
from accelerate import Accelerator
accelerator = Accelerator()
model_custom, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model_custom, optimizer, train_dataloader, eval_dataloader
)

# Set up learning rate scheduler
from transformers import get_scheduler
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Define a helper function to facilitate evaluation
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().numpy()
    labels = labels.detach().cpu().clone().numpy()
    
    # Remove ignored index (special tokens) and covert to labels
    true_labels = [
        [label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_predictions, true_labels

# Build the training loop:
# 1) training itself with train_dataloader
# 2) eval with accelerator.pad_across_processes
# 3) save and upload with repo_push_to_hub
from tqdm.auto import tqdm
import torch

progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        
    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)
        
        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]
        
        # padding
        predictions = accelerator.pad_across_processes(
            predictions, dim=1, pad_index=-100
        )
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
        
        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)
        
        true_predictions, true_labels = postprocess(
            predictions_gathered, labels_gathered
        )
        metric.add_batch(
            predictions=true_predictions,
            references=true_labels,
        )
        
    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"] 
            for key in ["precision", "recall", "f1", "accuracy"]
        }
    )
    
# Save and upload
import os
from dotenv import load_dotenv
from huggingface_hub import HfApi, create_repo

load_dotenv()
token = os.getenv("HF_TOKEN_WRITE")

model_name = "bert-finetuned-ner-accelerate"

# Create repo
repo_id = create_repo(model_name, token=token, exist_ok=True).repo_id
print(f"Repo created: {repo_id}")

accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(
    "../data/models/bert-finetuned-ner-accelerate",
    save_function=accelerator.save,
)
if accelerator.is_main_process:
    tokenizer.save_pretrained("../data/models/bert-finetuned-ner-accelerate")
    model.push_to_hub(model_name, token=token)
    tokenizer.push_to_hub(model_name, token=token)