<a href="https://colab.research.google.com/github/thegoodgamer14/tech-spec-extractor/blob/main/baseline_distilbert_conll2003.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/thegoodgamer14/tech-spec-extractor.git
%cd tech-spec-extractor/notebooks

fatal: destination path 'tech-spec-extractor' already exists and is not an empty directory.
/content/tech-spec-extractor/notebooks


In [2]:
import sys

# Make Python modules stored in the Drive copy of the project importable.
# The membership check keeps sys.path free of duplicate entries when this
# cell is executed more than once in the same kernel session.
PROJECT_ROOT = '/content/drive/MyDrive/tech-spec-extractor'
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [3]:
from google.colab import drive

# Attach Google Drive at the location the project paths below are rooted in.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Project folders on the mounted Drive, all derived from a single root so the
# location only has to be changed in one place.
_DRIVE_PROJECT_ROOT = '/content/drive/MyDrive/tech-spec-extractor'
DATA_DIR = f'{_DRIVE_PROJECT_ROOT}/data'
MODELS_DIR = f'{_DRIVE_PROJECT_ROOT}/models'
RESULTS_DIR = f'{_DRIVE_PROJECT_ROOT}/results'

In [None]:
!pip install transformers
!pip install datasets
!pip install torch
!pip install numpy as np
!pip install pypdf2
!pip install seqeval
!pip install evaluate
!pip install huggingface_hub

In [6]:
import torch
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login (notebook_login is imported from
# huggingface_hub above), so no access token is written into the notebook.
# NOTE(review): no later cell in this notebook visibly uploads to the Hub, so
# this login may be unnecessary for the baseline run; confirm before keeping it.
notebook_login()

In [None]:
print("Loading CoNLL-2003 dataset...")
try:
    conll_dataset = load_dataset("conll2003")
    print("Dataset loaded successfully!")
except Exception as err:
    # Show the failure in the cell output, then re-raise so it is not swallowed.
    print(f"Error loading dataset: {err}")
    raise err

print("Dataset structure:")
print(conll_dataset)

# The tag vocabulary is read from the training split's feature schema; the two
# dictionaries are the forward and reverse views of the same enumeration.
train_split = conll_dataset["train"]
label_list = train_split.features["ner_tags"].feature.names
id_to_label = dict(enumerate(label_list))
label_to_id = {name: idx for idx, name in id_to_label.items()}
num_labels = len(label_list)

print(f"\nNER Tags: {label_list}")
print(f"Number of labels: {num_labels}")

# Peek at the first training sentence (at most ten tokens) with readable tags.
print("\nSample from training data:")
sample = train_split[0]
first_tokens = sample["tokens"][:10]
first_tag_ids = sample["ner_tags"][:10]
print("Tokens:", first_tokens)
print("NER tags:", [label_list[tag_id] for tag_id in first_tag_ids])

In [8]:
# The label vocabulary (label_list, label_to_id, id_to_label, num_labels) is
# built once, in the dataset-loading cell. Rebuilding it here would only
# duplicate that code, so this cell just displays the summary.
print(f"\nNER Tags: {label_list}")
print(f"Number of labels: {num_labels}")
print("\nSample from training data:")
sample = conll_dataset["train"][0]
print("Tokens:", sample["tokens"][:10])
print("NER tags:", [label_list[tag] for tag in sample["ner_tags"][:10]])


NER Tags: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
Number of labels: 9

Sample from training data:
Tokens: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
NER tags: ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


In [None]:
# NOTE(review): an uncased checkpoint discards capitalisation, which can be a
# useful cue for named entities; confirm this is intended for the baseline.
model_checkpoint = "distilbert-base-uncased"
print(f"Loading tokenizer and model for: {model_checkpoint}")

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Size the token-classification head for the dataset's tag set and hand the
# id<->tag mappings to the model alongside it.
label_config = dict(
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id,
)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, **label_config)
print("Model and tokenizer loaded successfully.")

In [None]:
print("Setting up preprocessing function...")

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level NER tags to the tokens.

    Args:
        examples: A batch mapping with "tokens" (one list of words per
            sentence) and "ner_tags" (one list of tag ids per sentence).

    Returns:
        The tokenizer output for the batch with an added "labels" entry: one
        list per sentence in which the first token of every word carries that
        word's tag id and every other position (tokens without a word id and
        continuation pieces of a word) is -100.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    aligned_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            # Only the first token of a word keeps the tag; tokens with no
            # word id and repeated word ids are masked with -100.
            starts_new_word = word is not None and word != last_word
            row_labels.append(word_tags[word] if starts_new_word else -100)
            last_word = word
        aligned_labels.append(row_labels)

    encoded["labels"] = aligned_labels
    return encoded

print("Applying preprocessing to the dataset...")
# batched=True passes whole batches to tokenize_and_align_labels, which
# iterates over the sentences of each batch itself.
tokenized_datasets = conll_dataset.map(
    tokenize_and_align_labels,
    batched=True,
)
print("Preprocessing complete.")
print(f"Tokenized dataset structure: {tokenized_datasets}")

# Show the leading entries of the first encoded training example as a sanity check.
print("\nSample from preprocessed training data:")
sample = tokenized_datasets["train"][0]
input_ids_head = sample['input_ids'][:10]
attention_mask_head = sample['attention_mask'][:10]
labels_head = sample['labels'][:10]
print(f"Input IDs (first 10): {input_ids_head}")
print(f"Attention Mask (first 10): {attention_mask_head}")
print(f"Labels (first 10): {labels_head}")

In [13]:
# Collator built from the same tokenizer used for preprocessing; it is passed
# to the Trainer below as `data_collator`.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
print("Data collator set up successfully.")

print("Setting up evaluation metrics using 'evaluate' and 'seqeval'...")
# Load the seqeval metric through the `evaluate` library; compute_metrics
# below calls its .compute() on the decoded tag sequences.
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    """Score predictions with seqeval, ignoring positions labelled -100.

    Args:
        p: A (predictions, labels) pair. The predicted class is taken as the
            argmax over axis 2 of ``predictions``; ``labels`` holds the
            reference tag ids, with -100 marking positions to skip.

    Returns:
        A dict with the overall "precision", "recall", "f1" and "accuracy"
        reported by seqeval.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id == -100:
                # Masked position: excluded from both sequences.
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    precision = results["overall_precision"]
    recall = results["overall_recall"]
    f1 = results["overall_f1"]
    accuracy = results["overall_accuracy"]
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }
print("Evaluation metric function defined.")

print("Configuring training arguments...")
# NOTE(review): this is a relative path, not one of the Drive folders defined
# in the configuration cell; confirm checkpoints need not be persisted to Drive.
output_dir = "./results_baseline"

# Optimisation hyperparameters.
optimisation_kwargs = dict(
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)
# Evaluate and checkpoint once per epoch, and reload the best checkpoint
# (highest F1) when training ends.
checkpoint_kwargs = dict(
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)
# Logging destination and frequency.
logging_kwargs = dict(
    logging_dir='./logs_baseline',
    logging_steps=100,
)

training_args = TrainingArguments(
    output_dir=output_dir,
    **optimisation_kwargs,
    **checkpoint_kwargs,
    **logging_kwargs,
)
print("Training arguments configured.")

print("Instantiating Trainer...")
# The tokenizer is passed as `processing_class`: the `tokenizer` keyword of
# Trainer is deprecated in current transformers releases and triggers a
# warning when used.
# Training uses the "train" split; per-epoch evaluation uses "validation".
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
print("Trainer instantiated successfully.")

Data collator set up successfully.
Setting up evaluation metrics using 'evaluate' and 'seqeval'...
Evaluation metric function defined.
Configuring training arguments...
Training arguments configured.
Instantiating Trainer...


  trainer = Trainer(


Trainer instantiated successfully.


In [None]:
print("Starting baseline model fine-tuning...")
try:
    train_result = trainer.train()
    print("Training finished successfully!")

    # Record the training metrics through the Trainer (log, then save).
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)

    print("\n--- Training Metrics ---")
    for key in metrics:
        value = metrics[key]
        print(f"{key}: {value:.4f}")

except Exception as err:
    # Print a hint for the reader, then re-raise so the failure stays visible.
    print(f"\nAn error occurred during training: {err}")
    print("This might be due to GPU memory limits. Try reducing batch sizes.")
    raise err

# Save the model and its tokenizer side by side in one directory.
final_model_dir = f"{output_dir}/final_model"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)
print(f"Baseline model saved to {output_dir}/final_model")

In [None]:
print("Evaluating the fine-tuned baseline model on the test set...")
eval_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])


def _format_metric(value):
    """Return ``value`` with four decimals, or ``'N/A'`` when the metric is absent."""
    # Applying the ``.4f`` format spec to the 'N/A' placeholder string would
    # raise ValueError, so the placeholder is produced here instead.
    return "N/A" if value is None else f"{value:.4f}"


print("\n--- Baseline Model CoNLL-2003 Test Set Results ---")
print(f"Precision: {_format_metric(eval_results.get('eval_precision'))}")
print(f"Recall:    {_format_metric(eval_results.get('eval_recall'))}")
print(f"F1-Score:  {_format_metric(eval_results.get('eval_f1'))}")
print(f"Accuracy:  {_format_metric(eval_results.get('eval_accuracy'))}")
print("----------------------------------------------------")

# Record the test-set metrics under their own split name.
trainer.log_metrics("eval_test", eval_results)
trainer.save_metrics("eval_test", eval_results)

print("Baseline model training and evaluation complete!")