In [None]:
!pip install transformers datasets seqeval torch



In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"
!pip uninstall wandb -y

[0m

In [None]:
from datasets import load_dataset, ClassLabel, DatasetDict, Sequence, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np
from seqeval.metrics import classification_report
from sklearn.model_selection import train_test_split
import random
import torch
import pandas as pd
import ast

In [None]:
# Use the first CUDA GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cuda:0'

In [None]:
from google.colab import files

# Name the CSV is expected to have once it sits in the working directory.
DEFAULT_CSV_NAME = "ner_tag_food_log.csv"

# files.upload() returns a dict keyed by the names the uploaded files were saved under.
uploaded = files.upload()

# Prefer the expected name. If a differently named file was uploaded (for example a
# renamed duplicate), load that one instead of silently reading a stale copy.
# If nothing was uploaded, fall back to the expected name so a file already on disk still loads.
if DEFAULT_CSV_NAME in uploaded or not uploaded:
    csv_path = DEFAULT_CSV_NAME
else:
    csv_path = next(iter(uploaded))

my_data = load_dataset("csv", data_files=csv_path)

In [None]:
# Drop the "input" column; the later cells only read "tokens" and "ner_tags".
my_data = my_data.remove_columns(["input"])

In [None]:
# Materialise the "train" split as a plain list with one dict per row.
my_data = list(map(dict, my_data["train"]))

In [None]:
# First hold out 10% of all rows as the test set (fixed seed for repeatable splits).
train_val, test = train_test_split(
    my_data,
    random_state=42,
    test_size=0.1,
)
# Then carve 10% of the remaining rows off as the validation set.
train, val = train_test_split(
    train_val,
    random_state=42,
    test_size=0.1,
)

In [None]:
# Wrap the three row lists as Dataset objects under their split names.
dataset = DatasetDict(
    {
        split_name: Dataset.from_list(rows)
        for split_name, rows in (("train", train), ("validation", val), ("test", test))
    }
)

In [None]:
# Show the three splits built above as a quick sanity check.
print(dataset)

In [None]:
import ast
def fix_tokens(example):
    """Decode the ``tokens`` field of one example from its string form into a Python object.

    The value is parsed with ``ast.literal_eval``. A value that is not a string is
    left untouched, so mapping this function over data that has already been
    decoded (for example when the cell is re-run) is a no-op instead of an error.

    Args:
        example: Mapping with a ``"tokens"`` entry.

    Returns:
        The same ``example`` object, updated in place.
    """
    raw_tokens = example["tokens"]
    # literal_eval only accepts source text (or AST nodes); skip values already decoded.
    if isinstance(raw_tokens, str):
        example["tokens"] = ast.literal_eval(raw_tokens)
    return example

# Apply fix_tokens to every example of every split.
dataset = dataset.map(fix_tokens)

In [None]:
def fix_tags(example):
    """Decode the ``ner_tags`` field of one example from its string form into a Python object.

    The value is parsed with ``ast.literal_eval``. A value that is not a string is
    left untouched, so mapping this function over data that has already been
    decoded (for example when the cell is re-run) is a no-op instead of an error.

    Args:
        example: Mapping with a ``"ner_tags"`` entry.

    Returns:
        The same ``example`` object, updated in place.
    """
    raw_tags = example["ner_tags"]
    # literal_eval only accepts source text (or AST nodes); skip values already decoded.
    if isinstance(raw_tags, str):
        example["ner_tags"] = ast.literal_eval(raw_tags)
    return example

# Apply fix_tags to every example of every split.
dataset = dataset.map(fix_tags)

In [None]:
# Collect tags from every split, not only "train": encode_tags looks each tag up in
# label_to_id for all splits, so a tag that appears only in validation/test would
# otherwise raise a KeyError there.
unique_tags = {
    tag
    for split in dataset.values()
    for example in split
    for tag in example["ner_tags"]
}
# Sorting gives every tag a stable integer id across runs.
label_list = sorted(unique_tags)
label_to_id = {tag: i for i, tag in enumerate(label_list)}
id_to_label = {i: tag for tag, i in label_to_id.items()}

In [None]:
# Inspect the distinct tag values that make up the label set.
print(unique_tags)

In [None]:
def encode_tags(example):
    """Swap every tag in ``example["ner_tags"]`` for its integer id from ``label_to_id``.

    The example is updated in place and returned. A tag missing from
    ``label_to_id`` raises ``KeyError``.
    """
    tag_ids = []
    for tag_name in example["ner_tags"]:
        tag_ids.append(label_to_id[tag_name])
    example["ner_tags"] = tag_ids
    return example

# Convert the tags of every example in every split to integer ids.
# NOTE: run once per kernel session; a second run would look the already-encoded
# ids up in label_to_id, which is keyed by the original tag values.
dataset = dataset.map(encode_tags)

In [None]:
# Confirm the splits and their columns after the decoding and encoding steps.
print(dataset)

In [None]:
# Tokenizer for the "bert-base-uncased" checkpoint, the same checkpoint name the model cell loads.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


In [None]:
# Tokenize the first training example on its own to see how its words are split.
example = dataset["train"][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)

# One entry per produced token; the alignment function below treats None entries
# as positions that carry no word label.
word_ids = tokenized_input.word_ids()
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
word_ids

In [None]:
# Two separate statements: joining the prints with a comma builds a (None, None)
# tuple that the cell would echo as its output.
print(example['ner_tags'])
print(tokenized_input['input_ids'])

In [None]:
# Map the input ids of the sample back to their token strings and display them.
tokens = tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])
tokens

In [None]:
def tokenize_and_align_labels(examples, label_all_tokens=True, max_length=128):
    """Tokenize a batch of pre-split examples and align the word-level tags to the tokens.

    Args:
        examples: Batch with "tokens" (one list of words per example) and
            "ner_tags" (one list of label ids per example, indexed by word position).
        label_all_tokens: If True, every token belonging to a word receives that
            word's label; if False, only the first token of each word does and
            the remaining ones get -100.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        The tokenizer output with an added "labels" entry: one list of label ids
        per example. Positions without a word (word id ``None``) are labelled
        -100, the value compute_metrics filters out.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="np",
    )

    all_labels = []
    for i, labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                # No source word behind this position.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First token of a word, or a continuation token that should share the label.
                label_ids.append(labels[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        # Safety net: top the labels up with -100 should they come out shorter than the input ids.
        padding_length = len(tokenized_inputs["input_ids"][i]) - len(label_ids)
        label_ids += [-100] * padding_length
        all_labels.append(label_ids)

    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs

# batched=True hands the function a batch of examples per call, which is what
# tokenize_and_align_labels iterates over.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)



In [None]:
# BERT with a token-classification head sized to the label set; both directions
# of the label mapping are passed along with it.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    id2label=id_to_label,
    label2id=label_to_id,
    num_labels=len(label_list),
)

In [None]:
from seqeval.metrics import classification_report, f1_score

def compute_metrics(p):
    """Compute the seqeval F1 score for one evaluation pass.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-token class
            scores with the class dimension on axis 2; ``labels`` holds the
            reference label ids, with -100 marking positions to skip.

    Returns:
        ``{"f1": <float>}``. The per-class classification report is printed
        rather than returned, so that the metrics dict contains numbers only.
    """
    scores, label_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_labels = []
    true_preds = []
    for pred_row, label_row in zip(pred_ids, label_ids):
        # Keep only the positions that carry a real label (-100 marks ignored positions).
        kept = [(pred_id, label_id) for pred_id, label_id in zip(pred_row, label_row) if label_id != -100]
        true_labels.append([id_to_label[label_id] for _, label_id in kept])
        true_preds.append([id_to_label[pred_id] for pred_id, _ in kept])

    # The report is a multi-line string; show it here instead of logging it as a metric.
    print(classification_report(true_labels, true_preds, output_dict=False))

    return {"f1": f1_score(true_labels, true_preds)}


In [None]:
args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./ner-model",
    logging_dir="./logs",
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Optimisation settings.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

In [None]:
import inspect

# Newer transformers releases accept the tokenizer as `processing_class` and deprecate
# the `tokenizer` keyword; older releases only know `tokenizer`. Use whichever keyword
# the installed Trainer accepts so the cell runs on either.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

In [None]:
# Run fine-tuning; evaluation and checkpointing happen once per epoch as set in `args`.
trainer.train()

In [None]:
# "ner_tags" is stored as plain integer ids rather than a ClassLabel feature, so the
# dataset schema has no `.names` to read. Rebuild the ordered label names from the
# id -> label mapping instead.
label_list = [id_to_label[i] for i in range(len(id_to_label))]
print(label_list)
print(len(label_list))  # This should be passed as num_labels


In [None]:
# Show the column schema of the tokenized training split.
print(tokenized_dataset["train"].features)


In [None]:
from transformers import pipeline

# Write the fine-tuned model and the tokenizer files to the directory the pipeline
# loads from; per-epoch checkpointing alone only leaves checkpoint sub-folders there.
trainer.save_model("./ner-model")
tokenizer.save_pretrained("./ner-model")

token_classifier = pipeline("ner", model="./ner-model", tokenizer="./ner-model", aggregation_strategy="simple")
sentence = "I went for a 2km run this morning"
tokens = token_classifier(sentence)
# Only the last expression of a cell is displayed, so print the full result explicitly.
print(tokens)

# The model may find no entity in the sentence; avoid indexing an empty result.
token = tokens[0] if tokens else None
# Start and end provide an easy way to highlight words in the original text.
sentence[token["start"] : token["end"]] if token else None
