In [None]:
# Bash cell: install required packages.
# transformers/datasets/tokenizers for model training, seqeval for NER metrics, flask for serving.
!pip install -q transformers datasets tokenizers seqeval accelerate huggingface_hub flask uvicorn[standard] gunicorn


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.0/85.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m517.7/517.7 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m456.8/456.8 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [None]:
# Import commonly used libs, set random seeds for reproducibility,
# and set an environment option to reduce tokenizers warnings.
import os
import random
import math
import json
from pprint import pprint

import numpy as np
import torch

# The Hugging Face `tokenizers` library emits a warning when the process forks
# after a tokenizer has been used; declaring the setting explicitly silences it.
# setdefault keeps any value the user exported before starting the kernel.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# reproducibility: seed every RNG this notebook touches
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# display device
print("Torch device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")


Torch device: Tesla T4


In [None]:
# Pull the dataset from the HF hub, then show its splits, its columns and one sample row.
from datasets import load_dataset

print("Loading dataset 'gtfintechlab/finer-ord' ... (this may take a few seconds)")
ds = load_dataset("gtfintechlab/finer-ord")

print("\nDataset splits:", list(ds.keys()))
print("\nTrain columns:")
pprint(ds['train'].column_names)

print("\nOne example (train[0]) — truncated display:")
sample = ds['train'][0]
# Lists are summarised (length plus the first 20 items); any other value is printed as-is.
for k, v in sample.items():
    summary = (
        f" - {k}: [list of length {len(v)}], sample slice: {v[:20]}"
        if isinstance(v, list)
        else f" - {k}: {v}"
    )
    print(summary)


Loading dataset 'gtfintechlab/finer-ord' ... (this may take a few seconds)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.csv: 0.00B [00:00, ?B/s]

val.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/80531 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10233 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25957 [00:00<?, ? examples/s]


Dataset splits: ['train', 'validation', 'test']

Train columns:
['gold_label', 'gold_token', 'doc_idx', 'sent_idx']

One example (train[0]) — truncated display:
 - gold_label: 0
 - gold_token: Kenyan
 - doc_idx: 0
 - sent_idx: 0


In [None]:
# Run this cell AFTER you have ds = load_dataset("gtfintechlab/finer-ord")
# It groups token-level rows into sentence-level examples using (doc_idx, sent_idx).

from datasets import DatasetDict, Dataset
from collections import defaultdict, OrderedDict
import numpy as np
from pprint import pprint
import os

def group_token_rows_to_sentences(split_ds):
    """
    Collapse a token-per-row split into one record per sentence.

    Input: split_ds — an iterable of dict-like rows that also exposes
    `column_names`. Expected fields per row:
      - gold_token (string, possibly None)
      - gold_label (int-like, possibly None)
      - doc_idx, sent_idx (grouping keys; treated as None when the column is absent)
    Output: list of dicts, ordered by first appearance of (doc_idx, sent_idx):
      - tokens: list[str]
      - ner_tags: list of labels (cast to int when possible, otherwise kept as-is)
      - doc_idx, sent_idx: the grouping key of the sentence
    Raises: RuntimeError if 'gold_token' or 'gold_label' is not a column.
    """
    columns = split_ds.column_names
    if 'gold_token' not in columns or 'gold_label' not in columns:
        raise RuntimeError(f"Expected token column 'gold_token' and label column 'gold_label' in split, found: {split_ds.column_names}")

    # Grouping columns are optional; a missing one contributes None to every key.
    doc_key = 'doc_idx' if 'doc_idx' in columns else None
    sent_key = 'sent_idx' if 'sent_idx' in columns else None

    sentences = OrderedDict()  # insertion order == order of first appearance
    for row in split_ds:
        key = (row.get(doc_key, None), row.get(sent_key, None))
        bucket = sentences.setdefault(key, {'tokens': [], 'ner_tags': []})

        word = row.get('gold_token', None)
        if word is None:
            # Drop the whole row so tokens and labels stay the same length.
            continue

        label = row.get('gold_label', None)
        if label is None:
            # Missing label falls back to 0 (default O-like).
            label = 0
        else:
            try:
                # Normalises numpy integer types to plain int.
                label = int(label)
            except Exception:
                # Non-numeric labels are kept unchanged (ints are expected).
                pass

        bucket['tokens'].append(str(word))
        bucket['ner_tags'].append(label)

    # Groups whose rows were all skipped are left out of the result.
    return [
        {'tokens': bucket['tokens'], 'ner_tags': bucket['ner_tags'], 'doc_idx': key[0], 'sent_idx': key[1]}
        for key, bucket in sentences.items()
        if len(bucket['tokens']) != 0
    ]

# Build standardized DatasetDict
#
# This cell rebinds `ds` to the sentence-level dataset at the end, so on a second
# run `ds` no longer contains the raw token rows and grouping would fail.
# Keep a handle on the raw dataset in `ds_raw` and always group from it, which
# makes the cell safe to re-run.
if 'gold_token' in ds['train'].column_names:
    ds_raw = ds
elif 'ds_raw' not in globals():
    raise RuntimeError(
        "`ds` is already sentence-level and the raw token-level dataset is not available; "
        "re-run the load_dataset cell first."
    )

new_splits = {}
all_label_values = set()
for split in ['train', 'validation', 'test']:
    print(f"Processing split: {split}")
    split_ds = ds_raw[split]
    grouped_examples = group_token_rows_to_sentences(split_ds)
    # collect every label value that occurs in this split
    for ex in grouped_examples:
        all_label_values.update(ex['ner_tags'])
    # Create a HuggingFace Dataset for this split
    new_splits[split] = Dataset.from_list(grouped_examples)
    print(f"  -> created {len(grouped_examples)} sentence-level examples for split '{split}'")

# Build label mapping: original label ints -> contiguous indices 0..K-1
all_label_values = sorted(all_label_values)
print("\nOriginal label id set (sample up to 50):", all_label_values[:50])
orig_to_contig = {orig: idx for idx, orig in enumerate(all_label_values)}
contig_to_orig = {idx: orig for orig, idx in orig_to_contig.items()}

# Create label names preserving original label ids: LABEL_{orig}
# NOTE(review): these generic names are not IOB-style tags (O / B-XXX / I-XXX).
# seqeval and the `ner` pipeline's entity grouping interpret tag prefixes, so
# entity-level metrics and grouped output may not be meaningful with LABEL_n
# names — confirm the id -> tag mapping on the dataset card and use real tag names.
label_names = [f"LABEL_{orig}" for orig in all_label_values]
label2id = {name: i for i, name in enumerate(label_names)}
id2label = {i: name for name, i in label2id.items()}

print("\nNumber of distinct original labels:", len(label_names))
print("First 20 label_names:", label_names[:20])

# Now remap ner_tags in the datasets from original ints -> contiguous ids
def remap_example_labels(example):
    """Return `example` with each ner_tags entry replaced by its contiguous id."""
    remapped = [orig_to_contig[int(x)] for x in example['ner_tags']]
    return {'tokens': example['tokens'], 'ner_tags': remapped, 'doc_idx': example.get('doc_idx'), 'sent_idx': example.get('sent_idx')}

final_splits = {}
for split, ds_split in new_splits.items():
    print(f"Remapping labels for split {split} ...")
    final_splits[split] = ds_split.map(remap_example_labels)

# Build the standardized DatasetDict
ds_standardized = DatasetDict(final_splits)

# Sanity-check a few examples
print("\nStandardization complete. Example outputs:")
for split in ['train','validation','test']:
    if len(ds_standardized[split])>0:
        print(f"\nSplit {split} - first example tokens (len={len(ds_standardized[split][0]['tokens'])}):")
        print(ds_standardized[split][0]['tokens'][:50])
        print("labels (first 50):", ds_standardized[split][0]['ner_tags'][:50])
    else:
        print(f"Split {split} is empty!")

# Assign to ds so subsequent cells use this standardized version
# (the raw token-level data stays reachable through ds_raw)
ds = ds_standardized
tokens_col = 'tokens'
labels_col = 'ner_tags'

print("\nYou can now continue the notebook using `ds`, `tokens_col='tokens'`, `labels_col='ner_tags'`.")
print("label_names sample (first 50):", label_names[:50])
print("label2id sample:", dict(list(label2id.items())[:20]))
print("id2label sample:", {k: id2label[k] for k in list(id2label.keys())[:20]})


Processing split: train
  -> created 3262 sentence-level examples for split 'train'
Processing split: validation
  -> created 402 sentence-level examples for split 'validation'
Processing split: test
  -> created 1075 sentence-level examples for split 'test'

Original label id set (sample up to 50): [0, 1, 2, 3, 4, 5, 6]

Number of distinct original labels: 7
First 20 label_names: ['LABEL_0', 'LABEL_1', 'LABEL_2', 'LABEL_3', 'LABEL_4', 'LABEL_5', 'LABEL_6']
Remapping labels for split train ...


Map:   0%|          | 0/3262 [00:00<?, ? examples/s]

Remapping labels for split validation ...


Map:   0%|          | 0/402 [00:00<?, ? examples/s]

Remapping labels for split test ...


Map:   0%|          | 0/1075 [00:00<?, ? examples/s]


Standardization complete. Example outputs:

Split train - first example tokens (len=49):
['Kenyan', 'Firms', 'Eye', 'Deals', 'During', 'Obama', 'Summit', 'Tagged', ':', 'The', 'Global', 'Entrepreneurship', 'Summit', ',', 'launched', 'by', 'President', 'Obama', 'in', '2009', ',', 'brings', 'together', 'entrepreneurs', 'and', 'investors', 'from', 'across', 'Africa', 'and', 'around', 'the', 'world', 'annually', 'to', 'showcase', 'innovative', 'projects', ',', 'exchange', 'new', 'ideas', ',', 'and', 'help', 'spur', 'economic', 'opportunity', '.']
labels (first 50): [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Split validation - first example tokens (len=12):
['Tip', ':', 'Use', 'comma', '(', ',', ')', 'to', 'separate', 'multiple', 'quotes', '.']
labels (first 50): [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Split test - first example tokens (len=17):
['H', 'ave', 'you', 'ever', 'felt', 'that'

In [None]:
# Train a WordPiece tokenizer from your standardized dataset (ds with tokens column).
# Adjust VOCAB_SIZE if you want larger/smaller vocab.
from tokenizers import Tokenizer, decoders
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from itertools import chain
import os

VOCAB_SIZE = 16000   # change to 20000-30000 for better coverage if you have time/compute
SPECIAL_TOKENS = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

def dataset_iterator_for_tokenizer(split):
    """Yield every non-empty sentence of `ds[split]` as one whitespace-joined string."""
    for ex in ds[split]:
        toks = ex.get('tokens') or ex.get('words')
        if toks and isinstance(toks, list) and len(toks) > 0:
            # join tokens to a whitespace-delimited string (trainer expects raw strings)
            yield " ".join(toks)

print("Preparing tokenizer training...")

tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
# With no decoder configured, decoding joins the pieces with spaces
# ("fin ##tech"); the WordPiece decoder merges "##" continuations back into words.
# This also affects the `word` field of grouped pipeline entities.
tokenizer.decoder = decoders.WordPiece(prefix="##")
# Named `wordpiece_trainer` (not `trainer`) so it cannot be mistaken for the
# transformers Trainer that later cells refer to as `trainer`.
wordpiece_trainer = WordPieceTrainer(vocab_size=VOCAB_SIZE, special_tokens=SPECIAL_TOKENS)

# Chain iterators from available splits (train/validation/test)
# NOTE(review): the vocabulary is also learned from validation/test text; restrict
# this to 'train' if held-out text must not influence the tokenizer — confirm intent.
iters = chain(
    dataset_iterator_for_tokenizer('train'),
    dataset_iterator_for_tokenizer('validation') if 'validation' in ds else (),
    dataset_iterator_for_tokenizer('test') if 'test' in ds else ()
)

# Train tokenizer (may take ~seconds-to-minutes depending on data and VOCAB_SIZE)
tokenizer.train_from_iterator(iters, trainer=wordpiece_trainer)

os.makedirs("tokenizer_saved", exist_ok=True)
tokenizer_json_path = "tokenizer_saved/tokenizer.json"
tokenizer.save(tokenizer_json_path)

print("Tokenizer training complete.")
print("Tokenizer saved to:", tokenizer_json_path)

# Quick verification: load as PreTrainedTokenizerFast and show example tokens
from transformers import PreTrainedTokenizerFast
tokenizer_fast = PreTrainedTokenizerFast(tokenizer_file=tokenizer_json_path,
                                         unk_token="[UNK]",
                                         pad_token="[PAD]",
                                         cls_token="[CLS]",
                                         sep_token="[SEP]",
                                         mask_token="[MASK]")
tokenizer_fast.model_max_length = 512

print("\nExample tokenization (no special tokens):")
example = "Apple announced a $2 billion acquisition of a fintech startup."
# Tokenize once and reuse the ids for both lines; both show the same first 30 ids.
example_ids = tokenizer_fast(example, add_special_tokens=False)["input_ids"][:30]
print(" text:", example)
print(" tokens:", example_ids)  # prints ids
print(" token->str (first 30 tokens ids mapped back):", tokenizer_fast.convert_ids_to_tokens(example_ids))
print("\nVocab size reported by tokenizer:", tokenizer_fast.vocab_size)


Preparing tokenizer training...
Tokenizer training complete.
Tokenizer saved to: tokenizer_saved/tokenizer.json

Example tokenization (no special tokens):
 text: Apple announced a $2 billion acquisition of a fintech startup.
 tokens: [1379, 1400, 67, 8, 22, 567, 2354, 219, 67, 436, 6674, 3387, 18]
 token->str (first 30 tokens ids mapped back): ['Apple', 'announced', 'a', '$', '2', 'billion', 'acquisition', 'of', 'a', 'fin', '##tech', 'startup', '.']

Vocab size reported by tokenizer: 16000


In [None]:
# Tokenize dataset and align word-level labels -> token-level labels
# Uses tokenizer_fast from previous step and ds with 'tokens' and 'ner_tags'.
import numpy as np
from tqdm.auto import tqdm

MAX_LEN = 128  # padded/truncated sequence length; must not exceed the model's max_position_embeddings

def tokenize_and_align_labels(examples, label_all_tokens=True, tokenizer=None, max_length=None):
    """
    Tokenize a batch of pre-split sentences and expand word labels to token labels.

    Args:
        examples: batch dict with 'tokens' (list of word lists) and 'ner_tags'
            (list of per-word label lists, aligned with 'tokens').
        label_all_tokens: when True (default, the original behavior) every
            sub-word piece carries its word's label; when False only the first
            piece of each word is labelled and later pieces get -100.
        tokenizer: callable tokenizer to use; defaults to the notebook-level
            `tokenizer_fast`. Its result must offer `word_ids(batch_index=i)`
            and item assignment.
        max_length: pad/truncate length; defaults to `MAX_LEN`.

    Returns:
        The tokenizer output with an added "labels" entry: one list per example,
        with -100 at positions that have no source word (special tokens, padding)
        so they are ignored during loss computation.
    """
    tok = tokenizer_fast if tokenizer is None else tokenizer
    limit = MAX_LEN if max_length is None else max_length

    # examples['tokens'] is a list of token lists (batch)
    tokenized_inputs = tok(
        examples['tokens'],
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        max_length=limit
    )

    all_labels = []
    for i, word_labels in enumerate(examples['ner_tags']):
        # word_ids[j] is the index of the source word of token j (None if there is none)
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # no source word: special token or padding
                label_ids.append(-100)
            elif label_all_tokens or word_idx != previous_word_idx:
                # first piece of a word, or a continuation piece that inherits the label
                label_ids.append(int(word_labels[word_idx]))
            else:
                # continuation piece excluded from the loss
                label_ids.append(-100)
            previous_word_idx = word_idx
        all_labels.append(label_ids)

    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs

# Run the alignment function over every split in batches. The original columns
# are dropped so that only the tokenizer outputs and "labels" remain.
print("Tokenizing and aligning labels (this may take a moment)...")
tokenized_ds = ds.map(tokenize_and_align_labels, batched=True, remove_columns=ds['train'].column_names)

# Report the size of each tokenized split
print("\nTokenized dataset splits and sizes:")
for s in tokenized_ds:
    print(f" - {s}: {len(tokenized_ds[s])} examples")

# Print the first training example in several views to check the alignment by eye
example = tokenized_ds['train'][0]
print("\nSample tokenized example (train[0]):")
preview_rows = (
    (" tokens (first 60 word-level):", ds['train'][0]['tokens'][:60]),
    (" input_ids (first 60 tokens):", example['input_ids'][:60]),
    (" tokens mapped back (first 60):", tokenizer_fast.convert_ids_to_tokens(example['input_ids'][:60])),
    (" labels (first 60):", example['labels'][:60]),
)
for caption, shown in preview_rows:
    print(caption, shown)
print("\nNote: -100 indicates tokens ignored during loss computation (special tokens/padding).")

# Persist the tokenized splits so later sessions can reload them (optional)
import os
os.makedirs("tokenized_ds", exist_ok=True)
tokenized_ds.save_to_disk("tokenized_ds")
print("\nSaved tokenized dataset to ./tokenized_ds")


Tokenizing and aligning labels (this may take a moment)...


Map:   0%|          | 0/3262 [00:00<?, ? examples/s]

Map:   0%|          | 0/402 [00:00<?, ? examples/s]

Map:   0%|          | 0/1075 [00:00<?, ? examples/s]


Tokenized dataset splits and sizes:
 - train: 3262 examples
 - validation: 402 examples
 - test: 1075 examples

Sample tokenized example (train[0]):
 tokens (first 60 word-level): ['Kenyan', 'Firms', 'Eye', 'Deals', 'During', 'Obama', 'Summit', 'Tagged', ':', 'The', 'Global', 'Entrepreneurship', 'Summit', ',', 'launched', 'by', 'President', 'Obama', 'in', '2009', ',', 'brings', 'together', 'entrepreneurs', 'and', 'investors', 'from', 'across', 'Africa', 'and', 'around', 'the', 'world', 'annually', 'to', 'showcase', 'innovative', 'projects', ',', 'exchange', 'new', 'ideas', ',', 'and', 'help', 'spur', 'economic', 'opportunity', '.']
 input_ids (first 60 tokens): [15754, 11442, 12227, 15607, 4244, 1846, 5568, 9259, 30, 267, 2585, 11514, 5568, 16, 3889, 283, 1513, 1846, 214, 2261, 16, 6008, 2414, 2569, 226, 731, 317, 1133, 2018, 226, 876, 204, 772, 8156, 218, 6012, 3877, 3911, 16, 2515, 404, 4515, 16, 226, 803, 8041, 942, 2202, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
 tokens mapped back (fi

Saving the dataset (0/1 shards):   0%|          | 0/3262 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/402 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1075 [00:00<?, ? examples/s]


Saved tokenized dataset to ./tokenized_ds


In [None]:
# Create a small BERT-like model configuration and initialize the model from scratch.
# Comments:
# - We choose a reduced architecture (hidden_size=256, 6 layers) to make from-scratch training feasible on Colab.
# - num_labels is derived from label_names created earlier.

from transformers import BertConfig, BertForTokenClassification

# Model hyperparameters (tweak if you have more GPU memory)
config = BertConfig(
    # len(tokenizer) also counts tokens added on top of the base vocabulary, so the
    # embedding table always covers every id the tokenizer can produce
    # (tokenizer.vocab_size reports the base vocabulary only).
    vocab_size=len(tokenizer_fast),
    hidden_size=256,                       # smaller than original BERT (768)
    num_hidden_layers=6,                   # fewer transformer layers
    num_attention_heads=8,                 # must divide hidden_size (256/8=32)
    intermediate_size=1024,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,           # upper bound on sequence length (MAX_LEN must stay <= this)
    type_vocab_size=1,
    pad_token_id=tokenizer_fast.pad_token_id,  # tie the padding id to the trained tokenizer instead of the config default
    num_labels=len(label_names),
    id2label={i: name for i, name in enumerate(label_names)},
    label2id={name: i for i, name in enumerate(label_names)}
)

# Instantiate model (random initialization) for token classification
model = BertForTokenClassification(config)

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

print("Model created from scratch. Device:", device)
print("Model config summary: hidden_size={}, layers={}, num_labels={}".format(
    config.hidden_size, config.num_hidden_layers, config.num_labels))


Model created from scratch. Device: cuda
Model config summary: hidden_size=256, layers=6, num_labels=7


In [None]:
# Run this in a Colab cell (bash). Installs evaluate + seqeval (and a version of datasets that still supports load_metric if needed).
# After this completes, run the next Python cell.
!pip install -q evaluate seqeval


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Run this cell after the `pip install evaluate seqeval` cell above has finished.
import numpy as np
import sys

# Prefer seqeval, loaded through `evaluate`. If loading fails for any reason the
# flag stays False and compute_metrics falls back to simple token-level scores.
try:
    from evaluate import load as evaluate_load
    metric = evaluate_load("seqeval")
except Exception as e:
    use_seqeval = False
    print("Could not load evaluate.seqeval (will fall back to token-level metrics).")
    print("Error:", e)
else:
    use_seqeval = True
    print("Using evaluate.load('seqeval').")

# id2label must already exist (it is created by the dataset-standardization cell)
if 'id2label' not in globals():
    raise RuntimeError("id2label is not defined. Run the dataset-standardization step that created id2label before this cell.")

# ... and it must be a dict whose keys are all ints
if not (isinstance(id2label, dict) and all(isinstance(k, int) for k in id2label.keys())):
    raise RuntimeError(f"id2label must be a dict mapping int->str. Current sample: {list(id2label.items())[:10]}")

print("id2label OK. num labels:", len(id2label))

def simple_token_metrics(true_predictions, true_labels, outside_label="LABEL_0"):
    """
    Fallback metric: token-level precision/recall/F1 (micro-averaged) over non-outside labels.

    Args:
        true_predictions: list of predicted label-name sequences (strings).
        true_labels: list of reference label-name sequences, aligned with
            `true_predictions`.
        outside_label: the label name (or an iterable of names) that marks
            "no entity" tokens. Defaults to "LABEL_0", the name the outside class
            is assumed to have in this notebook's mapping; pass "O" for
            IOB-style label names.

    Returns:
        dict with "precision", "recall" and "f1" as floats. A ratio whose
        denominator is zero is reported as 0.0.
    """
    outside = {outside_label} if isinstance(outside_label, str) else set(outside_label)

    tp = 0          # positions where prediction == reference and both are entity labels
    pred_count = 0  # positions predicted as an entity label
    true_count = 0  # positions whose reference is an entity label
    for preds, refs in zip(true_predictions, true_labels):
        for p, r in zip(preds, refs):
            # skip positions with a missing value (shouldn't happen)
            if p is None or r is None:
                continue
            p_is_entity = p not in outside
            r_is_entity = r not in outside
            if p_is_entity:
                pred_count += 1
            if r_is_entity:
                true_count += 1
            if p_is_entity and r_is_entity and p == r:
                tp += 1

    precision = tp / pred_count if pred_count > 0 else 0.0
    recall = tp / true_count if true_count > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    return {"precision": precision, "recall": recall, "f1": f1}

def compute_metrics(p):
    """
    Convert logits and label ids into label-name sequences and score them.

    p: tuple (predictions logits, label_ids); logits must be 3-D and label_ids 2-D.
    Positions whose label id is -100 are dropped before scoring.
    Returns a dict with "precision", "recall" and "f1" — from seqeval when
    `use_seqeval` is set, otherwise from simple_token_metrics.
    Raises RuntimeError on unexpected array ranks or on a label id that is not
    in id2label.
    """
    logits, gold_ids = p
    if logits is None or gold_ids is None:
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0}

    if logits.ndim != 3 or gold_ids.ndim != 2:
        raise RuntimeError(f"Unexpected shapes: predictions {getattr(logits,'shape',None)}, labels {getattr(gold_ids,'shape',None)}")

    # highest-scoring class per position
    best_ids = np.argmax(logits, axis=2)

    true_predictions, true_labels = [], []
    for pred_row, label_row in zip(best_ids, gold_ids):
        # keep only the (prediction, label) pairs at positions that are scored
        kept = [
            (int(p_id), int(l_id))
            for p_id, l_id in zip(pred_row, label_row)
            if int(l_id) != -100
        ]
        for _, l_int in kept:
            if l_int not in id2label:
                raise RuntimeError(f"Label id {l_int} not found in id2label mapping.")
        # an unknown predicted id is mapped to id 0 (defensive)
        true_predictions.append([id2label[p_int if p_int in id2label else 0] for p_int, _ in kept])
        true_labels.append([id2label[l_int] for _, l_int in kept])

    if not use_seqeval:
        # fallback micro token-level metrics
        return simple_token_metrics(true_predictions, true_labels)

    # seqeval takes lists of label-name sequences
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results.get("overall_precision", 0.0),
        "recall": results.get("overall_recall", 0.0),
        "f1": results.get("overall_f1", 0.0)
    }

print("compute_metrics defined (use_seqeval =", use_seqeval, ")")


Downloading builder script: 0.00B [00:00, ?B/s]

Using evaluate.load('seqeval').
id2label OK. num labels: 7
compute_metrics defined (use_seqeval = True )


In [None]:
import torch, gc

# Collect unreachable Python objects, then ask PyTorch to release its unused
# cached CUDA memory, and finally report what is still held.
gc.collect()
torch.cuda.empty_cache()

cuda_ok = torch.cuda.is_available()
print("CUDA available:", cuda_ok)
if cuda_ok:
    for caption, n_bytes in (
        ("Allocated MB:", torch.cuda.memory_allocated()),
        ("Reserved  MB:", torch.cuda.memory_reserved()),
    ):
        print(caption, n_bytes/1024**2)


CUDA available: True
Allocated MB: 34.59423828125
Reserved  MB: 36.0


In [None]:
import os

# Environment flags intended to switch off Weights & Biases logging and quiet its output.
os.environ.update({"WANDB_DISABLED": "true", "WANDB_SILENT": "true"})


In [None]:
# Build the transformers Trainer in this cell so training works on a fresh kernel:
# no earlier cell creates one, so `trainer.train()` cannot work on its own.
from transformers import Trainer, TrainingArguments

# NOTE(review): starting-point hyperparameters for the small from-scratch model — tune as needed.
training_args = TrainingArguments(
    output_dir="./finer-from-scratch-model",
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    learning_rate=3e-4,
    weight_decay=0.01,
    logging_steps=50,
    save_strategy="no",   # no intermediate checkpoints; the model is saved explicitly in a later cell
    report_to="none",     # no external loggers, which avoids "You must call wandb.init() before wandb.log()"
    seed=SEED,
)

# Inputs were already padded to a fixed length during tokenization, so the
# default data collator is sufficient.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    compute_metrics=compute_metrics,
)

train_result = trainer.train()


Step,Training Loss


Error: You must call wandb.init() before wandb.log()

In [None]:
# Run the Trainer's evaluation loop and print the metrics dict it returns.
# NOTE(review): which dataset is evaluated depends on the eval_dataset the Trainer
# was built with — presumably the validation split; confirm.
eval_metrics = trainer.evaluate()
print("\n=== VALIDATION METRICS ===")
print(eval_metrics)


In [None]:
# Save model + tokenizer into one directory so it can be loaded later by path
# (the pipeline cell and app.py both read from this directory).
# NOTE(review): presumably the same path as the training output_dir — confirm.
output_dir = "./finer-from-scratch-model"
trainer.save_model(output_dir)
tokenizer_fast.save_pretrained(output_dir)
print("Saved model and tokenizer to:", output_dir)


In [None]:
from transformers import pipeline

# Load the saved model + tokenizer from disk and run two sample sentences through it.
# aggregation_strategy="simple" merges adjacent tokens of the same entity; it is the
# replacement for the deprecated grouped_entities=True flag.
ner_pipe = pipeline("ner", model=output_dir, tokenizer=output_dir, aggregation_strategy="simple")

sample_texts = [
    "Apple announced a $2 billion acquisition of a fintech startup and AAPL shares rose 3%.",
    "President Obama launched the Global Entrepreneurship Summit in 2009."
]
for t in sample_texts:
    print("\nINPUT:", t)
    print("OUTPUT:", ner_pipe(t))


In [None]:
%%bash
# Write the Flask application. The heredoc delimiter is quoted, so the body is
# written verbatim (no shell expansion).
cat > app.py <<'PY'
"""Minimal Flask service: serves the demo page and a JSON /predict endpoint."""
from flask import Flask, request, jsonify, render_template
from transformers import pipeline
import os

MODEL_DIR = os.environ.get('MODEL_DIR', './finer-from-scratch-model')
app = Flask(__name__, static_folder="static", template_folder="templates")

# Load pipeline once at startup.
# aggregation_strategy="simple" replaces the deprecated grouped_entities=True.
ner = pipeline("ner", model=MODEL_DIR, tokenizer=MODEL_DIR, aggregation_strategy="simple")

@app.route('/')
def index():
    """Serve the single-page frontend."""
    return render_template('index.html')

@app.route('/predict', methods=['POST'])
def predict():
    """Run NER on the posted {"text": ...} body and return the entities as JSON."""
    # silent=True yields None for a malformed body instead of raising, so every
    # failure path answers with JSON that the frontend can parse.
    data = request.get_json(force=True, silent=True)
    text = data.get('text', '') if isinstance(data, dict) else ''
    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": "No text provided"}), 400
    try:
        entities = ner(text)
    except Exception:
        # Log the details server-side; do not send internal error text to the client.
        app.logger.exception("NER inference failed")
        return jsonify({"error": "Inference failed"}), 500
    result = [
        {
            "entity": e.get("entity_group"),
            "text": e.get("word"),
            "score": float(e.get("score", 0))
        }
        for e in entities
    ]
    return jsonify({"text": text, "entities": result})

if __name__ == "__main__":
    # The server listens on all interfaces, so the interactive debugger must be
    # opt-in: set FLASK_DEBUG=1 explicitly for local debugging only.
    app.run(host="0.0.0.0", port=5100, debug=os.environ.get("FLASK_DEBUG") == "1")
PY

# Create the folders that app.py is configured to use for templates and static files.
mkdir -p templates static
# Write the demo page: a textarea, a button and a results card that starts hidden.
# The page loads /static/style.css and /static/app.js.
# The heredoc delimiter is quoted, so the body is written verbatim.
cat > templates/index.html <<'HTML'
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>Financial News NER — Demo</title>
  <link rel="stylesheet" href="/static/style.css" />
</head>
<body>
  <div class="container">
    <h1>Financial News Entity Extractor</h1>
    <textarea id="inputText" placeholder="Paste a financial news paragraph here..." rows="8"></textarea>
    <button id="analyzeBtn">Extract Entities</button>

    <div id="results" class="card hidden">
      <h2>Entities</h2>
      <div id="entityList"></div>
    </div>

    <div class="note">
      <small>Model: trained from scratch on FiNER dataset.</small>
    </div>
  </div>

  <script src="/static/app.js"></script>
</body>
</html>
HTML

# Write the stylesheet for the demo page. `.hidden` is the class that app.js removes
# from the results card; `.entity` styles each result row.
cat > static/style.css <<'CSS'
body { font-family: Arial, sans-serif; background:#f4f6fb; color:#1a1a1a; margin:0; padding:40px; }
.container { max-width:900px; margin:0 auto; background:white; padding:24px; border-radius:12px; box-shadow:0 6px 18px rgba(20,20,40,0.08);}
h1 { margin-top:0; }
textarea { width:100%; padding:12px; font-size:15px; border-radius:8px; border:1px solid #ddd; resize:vertical; }
button { margin-top:12px; padding:10px 18px; border:none; border-radius:8px; cursor:pointer; background:#0b63ff; color:white; font-weight:600; }
.card { margin-top:20px; padding:12px; border-radius:8px; background:#fff; border:1px solid #eee;}
.hidden { display:none; }
.entity { padding:8px; border-radius:6px; margin-bottom:8px; background:#f7fbff; border-left:4px solid #0b63ff; }
.entity .meta { font-size:13px; color:#555; }
.note { margin-top:12px; color:#666; font-size:13px; }
CSS

# Write the frontend script. The heredoc delimiter is quoted, so backticks and ${...}
# in the JavaScript are written verbatim.
cat > static/app.js <<'JS'
// Replace the list content with one message row. textContent is used so that
// text coming from the server or the user is never parsed as HTML.
function showMessage(list, message) {
  const row = document.createElement('div');
  row.className = 'entity';
  row.textContent = message;
  list.innerHTML = '';
  list.appendChild(row);
}

// Build one result row: the entity text in bold, then its label and score.
function entityRow(e) {
  const row = document.createElement('div');
  row.className = 'entity';

  const head = document.createElement('div');
  const strong = document.createElement('strong');
  strong.textContent = e.text;
  head.appendChild(strong);

  const meta = document.createElement('div');
  meta.className = 'meta';
  meta.textContent = `${e.entity} — score: ${Number(e.score).toFixed(3)}`;

  row.appendChild(head);
  row.appendChild(meta);
  return row;
}

document.getElementById('analyzeBtn').addEventListener('click', async () => {
  const text = document.getElementById('inputText').value.trim();
  if (!text) { alert('Please paste some text.'); return; }
  const resDiv = document.getElementById('results');
  const list = document.getElementById('entityList');
  list.textContent = 'Loading...';
  resDiv.classList.remove('hidden');

  try {
    const resp = await fetch('/predict', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ text })
    });
    const data = await resp.json();
    // Treat a non-2xx status as an error even if the body carries no "error" field.
    if (!resp.ok || data.error) {
      showMessage(list, `Error: ${data.error || resp.statusText}`);
      return;
    }
    if (!data.entities || data.entities.length === 0) {
      showMessage(list, 'No entities found.');
      return;
    }
    list.innerHTML = '';
    data.entities.forEach(e => list.appendChild(entityRow(e)));
  } catch (err) {
    showMessage(list, `Request failed: ${err}`);
  }
});
JS

echo "Flask app and frontend files written: app.py, templates/index.html, static/*"


In [None]:
# Start the Flask app written by the cell above. The server runs in the foreground,
# so this cell keeps running until it is interrupted.
!python app.py


In [None]:
%%bash
# Overwrite app.py's run block to disable debug/reloader when launched from Colab.
# Everything from the `if __name__ == ...:` line to the end of the file is replaced;
# the rest of app.py is left untouched.
python - <<'PY'
import re
from pathlib import Path

RUN_BLOCK = (
    'if __name__ == "__main__":\n'
    '    app.run(host="0.0.0.0", port=5100, debug=False, threaded=True)\n'
)

p = Path("app.py")
text = p.read_text()

# [\s\S]+ consumes the rest of the file, newlines included. The heredoc is quoted,
# so Python receives the backslashes exactly as written here (single, not doubled).
# A function is used as the replacement so RUN_BLOCK is inserted verbatim, without
# regex-template backslash processing.
new_text, n_subs = re.subn(r"if __name__ == .+?:[\s\S]+", lambda _m: RUN_BLOCK, text)
if n_subs == 0:
    # Fail loudly instead of reporting success when nothing was changed.
    raise SystemExit("No `if __name__ == ...:` block found in app.py; nothing was patched")

p.write_text(new_text)
print("Patched app.py to run with debug=False")
PY


In [None]:
# Install pyngrok
!pip install -q pyngrok


In [None]:
# Run in Colab (the archive is written under /content).
# Bundles the Flask app, its templates/static folders, the saved model directory
# and the tokenizer directory into a single zip. Paths are relative to the current
# working directory.
!zip -r /content/finer-ner-app.zip app.py templates static finer-from-scratch-model tokenizer_saved


  adding: finer-from-scratch-model/ (stored 0%)
  adding: finer-from-scratch-model/runs/ (stored 0%)
  adding: finer-from-scratch-model/runs/Dec01_05-12-23_7346be78058d/ (stored 0%)
  adding: finer-from-scratch-model/runs/Dec01_05-12-23_7346be78058d/events.out.tfevents.1764565949.7346be78058d.531.0 (deflated 78%)
  adding: tokenizer_saved/ (stored 0%)
  adding: tokenizer_saved/tokenizer.json (deflated 71%)


In [None]:
# Offer the archive created by the zip cell above as a download through Colab's
# helper. The google.colab module is only importable inside a Colab runtime.
from google.colab import files
files.download('/content/finer-ner-app.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>