Cell 1: Setup and Imports

In [5]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Install dependencies
!pip install -q transformers torch safetensors pytorch-crf scikit-learn

import torch
import json
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from safetensors.torch import load_file
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from collections import defaultdict
import torch.nn as nn
from transformers import AutoModel
from torchcrf import CRF

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

# Set your drive path
drive_path = "/content/drive/MyDrive/intent_project"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Device: cuda


Cell 2: Define CRF Model Class


In [None]:
class XLMRobertaWithCRF(nn.Module):
    def __init__(self, model_name, num_labels, id2label, label2id):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.transformer.config.hidden_size, num_labels)
        self.crf = CRF(num_labels, batch_first=True)
        self.id2label = id2label
        self.label2id = label2id
        self.num_labels = num_labels

    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        sequence_output = outputs
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        if labels is not None:
            labels = torch.where(labels == -100, torch.zeros_like(labels), labels)
            loss = -self.crf(logits, labels, mask=attention_mask.bool(), reduction='mean')
            return {'loss': loss, 'logits': logits}
        else:
            predictions = self.crf.decode(logits, mask=attention_mask.bool())
            return {'logits': logits, 'predictions': predictions}

print("✓ CRF model class defined")


✓ CRF model class defined


Cell 3: Load Intent Classification Model

In [None]:
print("\n=== Loading Intent Classification Model ===")

intent_model_dir = f'{drive_path}/xlm-roberta-intent-classifier-final'

intent_tokenizer = AutoTokenizer.from_pretrained(intent_model_dir)
intent_model = AutoModelForSequenceClassification.from_pretrained(intent_model_dir)
intent_model.to(device)
intent_model.eval()

# Load intent mappings
with open(f'{intent_model_dir}/intent2id.json', 'r') as f:
    intent2id = json.load(f)
with open(f'{intent_model_dir}/id2intent.json', 'r') as f:
    id2intent = json.load(f)

print(f"✓ Intent model loaded ({len(id2intent)} intents)")



=== Loading Intent Classification Model ===
✓ Intent model loaded (60 intents)


Cell 4: Load Slot Filling Model

In [None]:
print("\n=== Loading CRF Slot Filling Model ===")

slot_model_dir = f"{drive_path}/slot_filling_model_crf/final_model"

with open(f"{slot_model_dir}/id2label.json", 'r') as f:
    id2slot = json.load(f)
with open(f"{slot_model_dir}/label2id.json", 'r') as f:
    slot2id = json.load(f)

slot_tokenizer = AutoTokenizer.from_pretrained(slot_model_dir)

slot_model = XLMRobertaWithCRF(
    model_name="xlm-roberta-base",
    num_labels=len(slot2id),
    id2label=id2slot,
    label2id=slot2id
)

model_state = load_file(f"{slot_model_dir}/model.safetensors")
slot_model.load_state_dict(model_state)
slot_model.to(device)
slot_model.eval()

print(f"✓ Slot filling CRF model loaded ({len(slot2id)} labels)")



=== Loading CRF Slot Filling Model ===


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

✓ Slot filling CRF model loaded (111 labels)


Cell 5: Define Prediction Functions

In [None]:
def predict_intent(utterance):
    """Predict intent for a single utterance"""
    inputs = intent_tokenizer(
        utterance,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding="max_length"
    ).to(device)

    with torch.no_grad():
        outputs = intent_model(**inputs)
        logits = outputs.logits
        pred_id = logits.argmax(dim=-1).item()
        pred_label = id2intent[str(pred_id)]
        return pred_label

def extract_slots_crf(utterance):
    """Extract slots from utterance using CRF model"""
    words = utterance.split()

    inputs = slot_tokenizer(
        words,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding="max_length"
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        # Forward pass
        transformer_outputs = slot_model.transformer(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask']
        )
        sequence_output = transformer_outputs[0]  # Extract the sequence output tensor
        sequence_output = slot_model.dropout(sequence_output)
        logits = slot_model.classifier(sequence_output)

        # CRF decode
        predictions = slot_model.crf.decode(logits, mask=inputs['attention_mask'].bool())
        predictions = torch.tensor(predictions[0])

    # Get word IDs for alignment
    batch_encoding = slot_tokenizer(
        words,
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        padding="max_length"
    )
    word_ids = batch_encoding.word_ids(batch_index=0)

    # Map token predictions back to words
    word_predictions = []
    for word_idx in range(len(words)):
        word_preds = []
        for token_idx, wid in enumerate(word_ids):
            if wid == word_idx:
                word_preds.append(predictions[token_idx])
        if word_preds:
            word_predictions.append(word_preds[0].item())

    # Extract slots from BIO tags
    slots = []
    current_slot_type = None
    current_slot_words = []

    for word, pred_id in zip(words, word_predictions):
        label = id2slot[str(int(pred_id))]

        if label == 'O':
            if current_slot_type:
                slots.append({'type': current_slot_type, 'value': ' '.join(current_slot_words)})
                current_slot_type = None
                current_slot_words = []
        elif label.startswith('B-'):
            if current_slot_type:
                slots.append({'type': current_slot_type, 'value': ' '.join(current_slot_words)})
            current_slot_type = label[2:]
            current_slot_words = [word]
        elif label.startswith('I-'):
            slot_type = label[2:]
            if slot_type == current_slot_type:
                current_slot_words.append(word)
            else:
                if current_slot_type:
                    slots.append({'type': current_slot_type, 'value': ' '.join(current_slot_words)})
                current_slot_type = slot_type
                current_slot_words = [word]

    if current_slot_type:
        slots.append({'type': current_slot_type, 'value': ' '.join(current_slot_words)})

    return slots

print("✓ Prediction functions defined (fixed)")


✓ Prediction functions defined (fixed)


Cell 6: Load Test Data

In [None]:
import os
from datasets import load_dataset

# First, verify the path exists
base_path = "/content/drive/MyDrive/intent_project"
test_dir = os.path.join(base_path, "test")

print(f"Looking for test data at: {test_dir}")
print(f"Path exists: {os.path.exists(test_dir)}")

# List contents to see what's there
if os.path.exists(base_path):
    print(f"\nContents of {base_path}:")
    for item in os.listdir(base_path):
        print(f"  - {item}")


Looking for test data at: /content/drive/MyDrive/intent_project/test
Path exists: True

Contents of /content/drive/MyDrive/intent_project:
  - project_proposal.docx
  - .ipynb_checkpoints
  - data
  - train
  - validation
  - test
  - xlm-roberta-intent-classifier-final
  - NLP_IntentClassification.ipynb
  - combined_nlu_results.json
  - NLP_CombinedIntent+SlotFilling.ipynb
  - massive_51_langs_test.json
  - tokenized_dataset
  - slot_filling_model
  - results_intent_classification
  - team3_crosslingual_eval.json
  - ner_dataset_plain
  - tokenized_ner_dataset
  - slot_filling_model_plaintext
  - slot_filling_model_crf
  - SlotFilling_2.ipynb
  - SlotFilling_CRFvsNonCRF.ipynb
  - NLP_SlotFilling_1st.ipynb
  - CombinedIntent_NERSlotCRF.ipynb
  - app.py
  - splits
  - Project_Summary_Document.pages
  - Project_Summary_Document.pdf


In [None]:
import os
import json
from datasets import load_from_disk

# Load test split from HuggingFace dataset folder format
test_dir = "/content/drive/MyDrive/intent_project/test"

print(f"Loading test split from: {test_dir}")

# HuggingFace datasets saves splits in a specific format
# Try loading it as a dataset split
try:
    dataset = load_from_disk(test_dir)
    print(f"✓ Loaded dataset from disk")
except:
    print("Trying alternative loading method...")
    # If the above fails, list what's in the folder first
    print(f"\nContents of {test_dir}:")
    for item in os.listdir(test_dir):
        print(f"  - {item}")

# Convert to list of dicts
test_data = []
for item in dataset:
    test_data.append({
        'utt': item.get('utt', ''),
        'intent': item.get('intent', ''),
        'annot_utt': item.get('annot_utt', '')
    })

print(f"✓ Converted to list: {len(test_data)} examples")
if test_data:
    print(f"\nSample test example:")
    print(f"  utt: {test_data[0]['utt']}")
    print(f"  intent: {test_data[0]['intent']}")
    print(f"  annot_utt: {test_data[0]['annot_utt'][:100] if test_data[0]['annot_utt'] else 'N/A'}...")


Loading test split from: /content/drive/MyDrive/intent_project/test
✓ Loaded dataset from disk
✓ Converted to list: 59480 examples

Sample test example:
  utt: wake me up at five am this week
  intent: alarm_set
  annot_utt: wake me up at [time : five am] [date : this week]...


Cell 7: Intent Classification Evaluation

In [None]:
print("\n" + "="*70)
print("INTENT CLASSIFICATION EVALUATION")
print("="*70)

true_intents = []
pred_intents = []

for i, example in enumerate(test_data):
    utterance = example['utt']
    true_intent = example['intent']

    pred_intent = predict_intent(utterance)

    true_intents.append(true_intent)
    pred_intents.append(pred_intent)

    if (i + 1) % 100 == 0:
        print(f"Processed {i + 1}/{len(test_data)} examples...")

# Compute metrics
intent_accuracy = accuracy_score(true_intents, pred_intents)

# Micro-averaged metrics
intent_micro_p, intent_micro_r, intent_micro_f1, _ = precision_recall_fscore_support(
    true_intents, pred_intents, average='micro', zero_division=0
)

# Macro-averaged metrics
intent_macro_p, intent_macro_r, intent_macro_f1, _ = precision_recall_fscore_support(
    true_intents, pred_intents, average='macro', zero_division=0
)

# Weighted-averaged metrics
intent_weighted_p, intent_weighted_r, intent_weighted_f1, _ = precision_recall_fscore_support(
    true_intents, pred_intents, average='weighted', zero_division=0
)

print("\n" + "-"*70)
print("INTENT CLASSIFICATION RESULTS")
print("-"*70)
print(f"Accuracy:           {intent_accuracy:.4f}")
print(f"\nMicro-averaged:")
print(f"  Precision:        {intent_micro_p:.4f}")
print(f"  Recall:           {intent_micro_r:.4f}")
print(f"  F1-Score:         {intent_micro_f1:.4f}")
print(f"\nMacro-averaged:")
print(f"  Precision:        {intent_macro_p:.4f}")
print(f"  Recall:           {intent_macro_r:.4f}")
print(f"  F1-Score:         {intent_macro_f1:.4f}")
print(f"\nWeighted-averaged:")
print(f"  Precision:        {intent_weighted_p:.4f}")
print(f"  Recall:           {intent_weighted_r:.4f}")
print(f"  F1-Score:         {intent_weighted_f1:.4f}")



INTENT CLASSIFICATION EVALUATION
Processed 100/59480 examples...
Processed 200/59480 examples...
Processed 300/59480 examples...
Processed 400/59480 examples...
Processed 500/59480 examples...
Processed 600/59480 examples...
Processed 700/59480 examples...
Processed 800/59480 examples...
Processed 900/59480 examples...
Processed 1000/59480 examples...
Processed 1100/59480 examples...
Processed 1200/59480 examples...
Processed 1300/59480 examples...
Processed 1400/59480 examples...
Processed 1500/59480 examples...
Processed 1600/59480 examples...
Processed 1700/59480 examples...
Processed 1800/59480 examples...
Processed 1900/59480 examples...
Processed 2000/59480 examples...
Processed 2100/59480 examples...
Processed 2200/59480 examples...
Processed 2300/59480 examples...
Processed 2400/59480 examples...
Processed 2500/59480 examples...
Processed 2600/59480 examples...
Processed 2700/59480 examples...
Processed 2800/59480 examples...
Processed 2900/59480 examples...
Processed 3000/594

In [None]:
# Parse annotated utterances to extract gold slots
import re

def parse_annotated_utterance(annot_utt):
    """
    Parse annotated utterance to extract gold slots
    Example: "wake me up at [time : 9 AM] on [date : friday]"
    Returns: [{'type': 'time', 'value': '9 AM'}, {'type': 'date', 'value': 'friday'}]
    """
    slots = []
    pattern = r'\[([^:]+):\s*([^\]]+)\]'
    matches = re.findall(pattern, annot_utt)
    for slot_type, slot_value in matches:
        slots.append({
            'type': slot_type.strip(),
            'value': slot_value.strip()
        })
    return slots

print("✓ parse_annotated_utterance function defined")


✓ parse_annotated_utterance function defined


Cell 8: Slot Filling Evaluation (Entity-Level)

In [None]:
print("\n" + "="*70)
print("SLOT FILLING EVALUATION (Entity-Level)")
print("="*70)

all_true_slots = []
all_pred_slots = []

for i, example in enumerate(test_data):
    utterance = example['utt']
    annot_utt = example.get('annot_utt', '')

    # Extract ground truth slots
    true_slots = parse_annotated_utterance(annot_utt)

    # Predict slots
    pred_slots = extract_slots_crf(utterance)

    all_true_slots.append(true_slots)
    all_pred_slots.append(pred_slots)

    if (i + 1) % 100 == 0:
        print(f"Processed {i + 1}/{len(test_data)} examples...")

def compute_entity_metrics(true_slots_list, pred_slots_list):
    """Compute precision, recall, F1 for entity-level slot filling"""
    tp = 0  # True positives
    fp = 0  # False positives
    fn = 0  # False negatives

    for true_slots, pred_slots in zip(true_slots_list, pred_slots_list):
        true_set = {(s['type'], s['value']) for s in true_slots}
        pred_set = {(s['type'], s['value']) for s in pred_slots}

        tp += len(true_set & pred_set)
        fp += len(pred_set - true_set)
        fn += len(true_set - pred_set)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1, tp, fp, fn

slot_precision, slot_recall, slot_f1, tp, fp, fn = compute_entity_metrics(
    all_true_slots, all_pred_slots
)

print("\n" + "-"*70)
print("SLOT FILLING RESULTS (Entity-Level)")
print("-"*70)
print(f"True Positives:     {tp}")
print(f"False Positives:    {fp}")
print(f"False Negatives:    {fn}")
print(f"\nPrecision:          {slot_precision:.4f}")
print(f"Recall:             {slot_recall:.4f}")
print(f"F1-Score:           {slot_f1:.4f}")



SLOT FILLING EVALUATION (Entity-Level)
Processed 100/59480 examples...
Processed 200/59480 examples...
Processed 300/59480 examples...
Processed 400/59480 examples...
Processed 500/59480 examples...
Processed 600/59480 examples...
Processed 700/59480 examples...
Processed 800/59480 examples...
Processed 900/59480 examples...
Processed 1000/59480 examples...
Processed 1100/59480 examples...
Processed 1200/59480 examples...
Processed 1300/59480 examples...
Processed 1400/59480 examples...
Processed 1500/59480 examples...
Processed 1600/59480 examples...
Processed 1700/59480 examples...
Processed 1800/59480 examples...
Processed 1900/59480 examples...
Processed 2000/59480 examples...
Processed 2100/59480 examples...
Processed 2200/59480 examples...
Processed 2300/59480 examples...
Processed 2400/59480 examples...
Processed 2500/59480 examples...
Processed 2600/59480 examples...
Processed 2700/59480 examples...
Processed 2800/59480 examples...
Processed 2900/59480 examples...
Processed 30

Cell 9: Per-Slot-Type Metrics

In [None]:
print("\n" + "="*70)
print("PER-SLOT-TYPE METRICS")
print("="*70)

def compute_per_type_metrics(true_slots_list, pred_slots_list):
    """Compute metrics for each slot type separately"""
    slot_type_stats = defaultdict(lambda: {'tp': 0, 'fp': 0, 'fn': 0})

    for true_slots, pred_slots in zip(true_slots_list, pred_slots_list):
        true_by_type = defaultdict(set)
        pred_by_type = defaultdict(set)

        for slot in true_slots:
            true_by_type[slot['type']].add(slot['value'])

        for slot in pred_slots:
            pred_by_type[slot['type']].add(slot['value'])

        all_types = set(true_by_type.keys()) | set(pred_by_type.keys())

        for slot_type in all_types:
            true_vals = true_by_type[slot_type]
            pred_vals = pred_by_type[slot_type]

            slot_type_stats[slot_type]['tp'] += len(true_vals & pred_vals)
            slot_type_stats[slot_type]['fp'] += len(pred_vals - true_vals)
            slot_type_stats[slot_type]['fn'] += len(true_vals - pred_vals)

    per_type_metrics = {}
    for slot_type, stats in slot_type_stats.items():
        tp = stats['tp']
        fp = stats['fp']
        fn = stats['fn']

        p = tp / (tp + fp) if (tp + fp) > 0 else 0
        r = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * p * r / (p + r) if (p + r) > 0 else 0

        per_type_metrics[slot_type] = {
            'precision': p,
            'recall': r,
            'f1': f1,
            'support': tp + fn
        }

    return per_type_metrics

per_type_metrics = compute_per_type_metrics(all_true_slots, all_pred_slots)

print(f"\n{'Slot Type':<20} {'Precision':<12} {'Recall':<12} {'F1':<12} {'Support':<10}")
print("-"*70)

# FIXED LINE HERE:
sorted_types = sorted(per_type_metrics.items(), key=lambda x: x[1]['support'], reverse=True)
for slot_type, metrics in sorted_types[:15]:  # Top 15
    print(f"{slot_type:<20} {metrics['precision']:<12.4f} {metrics['recall']:<12.4f} "
          f"{metrics['f1']:<12.4f} {metrics['support']:<10}")

# Macro-average across slot types
slot_macro_p = np.mean([m['precision'] for m in per_type_metrics.values()])
slot_macro_r = np.mean([m['recall'] for m in per_type_metrics.values()])
slot_macro_f1 = np.mean([m['f1'] for m in per_type_metrics.values()])

print("\n" + "-"*70)
print("SLOT FILLING - MACRO-AVERAGED (across slot types)")
print("-"*70)
print(f"Macro Precision:    {slot_macro_p:.4f}")
print(f"Macro Recall:       {slot_macro_r:.4f}")
print(f"Macro F1-Score:     {slot_macro_f1:.4f}")



PER-SLOT-TYPE METRICS

Slot Type            Precision    Recall       F1           Support   
----------------------------------------------------------------------
date                 0.7660       0.8014       0.7833       8300      
place_name           0.7741       0.7272       0.7499       5583      
event_name           0.7050       0.6577       0.6805       5185      
person               0.7521       0.8209       0.7850       4349      
time                 0.6772       0.6237       0.6493       3827      
media_type           0.7976       0.7676       0.7823       2552      
business_name        0.7370       0.7134       0.7250       1835      
weather_descriptor   0.7238       0.6957       0.7095       1620      
food_type            0.6524       0.7140       0.6818       1430      
transport_type       0.8580       0.8447       0.8513       1281      
list_name            0.6928       0.6766       0.6846       1203      
artist_name          0.6834       0.7303       0.7061

Cell 10: Final Summary and Save Results

In [4]:
print("\n" + "="*70)
print("FINAL SUMMARY")
print("="*70)
print(f"\nIntent Classification:")
print(f"  Accuracy:         {intent_accuracy:.4f}")
print(f"  Micro F1:         {intent_micro_f1:.4f}")
print(f"  Macro F1:         {intent_macro_f1:.4f}")
print(f"\nSlot Filling (Entity-Level):")
print(f"  Precision:        {slot_precision:.4f}")
print(f"  Recall:           {slot_recall:.4f}")
print(f"  F1-Score:         {slot_f1:.4f}")
print(f"\nSlot Filling (Macro across types):")
print(f"  Macro Precision:  {slot_macro_p:.4f}")
print(f"  Macro Recall:     {slot_macro_r:.4f}")
print(f"  Macro F1:         {slot_macro_f1:.4f}")
print("\n" + "="*70)

# Save results to JSON
results = {
    'intent_classification': {
        'accuracy': float(intent_accuracy),
        'micro': {
            'precision': float(intent_micro_p),
            'recall': float(intent_micro_r),
            'f1': float(intent_micro_f1)
        },
        'macro': {
            'precision': float(intent_macro_p),
            'recall': float(intent_macro_r),
            'f1': float(intent_macro_f1)
        },
        'weighted': {
            'precision': float(intent_weighted_p),
            'recall': float(intent_weighted_r),
            'f1': float(intent_weighted_f1)
        }
    },
    'slot_filling': {
        'entity_level': {
            'precision': float(slot_precision),
            'recall': float(slot_recall),
            'f1': float(slot_f1),
            'true_positives': int(tp),
            'false_positives': int(fp),
            'false_negatives': int(fn)
        },
        'macro_across_types': {
            'precision': float(slot_macro_p),
            'recall': float(slot_macro_r),
            'f1': float(slot_macro_f1)
        },
        'per_type_metrics': {k: {kk: float(vv) if kk != 'support' else int(vv)
                                  for kk, vv in v.items()}
                             for k, v in per_type_metrics.items()}
    }
}

results_path = f"{drive_path}/evaluation_results.json"
with open(results_path, 'w') as f:
    json.dump(results, f, indent=2)

print(f"\n✓ Results saved to: {results_path}")



FINAL SUMMARY

Intent Classification:


NameError: name 'intent_accuracy' is not defined