**2: SLOT FILLING WITH CRF**

In [2]:
# Install required packages
!pip install -q transformers datasets torch scikit-learn seaborn matplotlib pandas numpy

In [3]:
import torch
import numpy as np
import pandas as pd
import json
import os
import re
from datasets import Dataset, DatasetDict, load_dataset, concatenate_datasets, load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

print("✓ Libraries imported")

✓ Libraries imported


In [4]:

print("\n=== Loading Existing Datasets ===")

drive_path = '/content/drive/MyDrive/intent_project'

# Check what's in the directory
print(f"\nContents of {drive_path}:")
contents = os.listdir(drive_path)
for item in contents:
    print(f"  - {item}")

# Load the datasets - try different possible names
try:
    # Try loading from a DatasetDict folder
    final_dataset = load_from_disk(f'{drive_path}/final_multilingual_dataset')
    print("\n✓ Loaded from 'final_multilingual_dataset'")
except:
    try:
        # Try loading individual splits
        final_dataset = DatasetDict({
            'train': load_from_disk(f'{drive_path}/train'),
            'validation': load_from_disk(f'{drive_path}/validation'),
            'test': load_from_disk(f'{drive_path}/test')
        })
        print("\n✓ Loaded from individual train/validation/test folders")
    except:
        # Try alternate names
        try:
            final_dataset = DatasetDict({
                'train': load_from_disk(f'{drive_path}/train_dataset'),
                'validation': load_from_disk(f'{drive_path}/val_dataset'),
                'test': load_from_disk(f'{drive_path}/test_dataset')
            })
            print("\n✓ Loaded from train_dataset/val_dataset/test_dataset folders")
        except Exception as e:
            print(f"\n Error loading datasets: {e}")
            print("\nPlease check the exact folder names in your Drive and update the code.")
            raise

print(f"\n✓ Datasets loaded:")
print(f"  Train: {len(final_dataset['train']):,}")
print(f"  Validation: {len(final_dataset['validation']):,}")
print(f"  Test: {len(final_dataset['test']):,}")


=== Loading Existing Datasets ===

Contents of /content/drive/MyDrive/intent_project:
  - Untitled.ipynb
  - project_proposal.docx
  - .DS_Store
  - .ipynb_checkpoints
  - data
  - train
  - validation
  - test
  - tokenized_dataset
  - results_intent_classification
  - xlm-roberta-intent-classifier-final
  - slot_filling_model

✓ Loaded from individual train/validation/test folders

✓ Datasets loaded:
  Train: 230,280
  Validation: 40,660
  Test: 59,480


In [5]:
print("\n=== Creating Slot BIO Tags ===")

def extract_slots_from_annotation(annot_utt):
    """Extract slot types from annotated utterance"""
    pattern = r'\[([^\]]+)\s*:\s*([^\]]+)\]'
    slots = re.findall(pattern, annot_utt)
    return [(slot_type.strip(), slot_value.strip()) for slot_type, slot_value in slots]

# Extract all unique slot types from the entire training dataset
all_slot_types = set()
# sample_size = min(10000, len(final_dataset['train'])) # Removed sampling

for i in range(len(final_dataset['train'])): # Iterate over the full training dataset
    annot_utt = final_dataset['train'][i]['annot_utt']
    slots = extract_slots_from_annotation(annot_utt)
    for slot_type, _ in slots:
        all_slot_types.add(slot_type)

slot_types = sorted(all_slot_types)
print(f"\n✓ Found {len(slot_types)} unique slot types")

# Create BIO tags: O, B-slot, I-slot
bio_tags = ['O'] + [f'B-{slot}' for slot in slot_types] + [f'I-{slot}' for slot in slot_types]
slot2id = {tag: idx for idx, tag in enumerate(bio_tags)}
id2slot = {idx: tag for tag, idx in slot2id.items()}

print(f"✓ Total BIO tags: {len(bio_tags)}")

# Save mappings
os.makedirs(f'{drive_path}/slot_filling_model', exist_ok=True)
with open(f'{drive_path}/slot_filling_model/slot2id.json', 'w') as f:
    json.dump(slot2id, f, indent=2)
with open(f'{drive_path}/slot_filling_model/id2slot.json', 'w') as f:
    json.dump(id2slot, f, indent=2)

print("✓ Slot mappings saved")


=== Creating Slot BIO Tags ===

✓ Found 55 unique slot types
✓ Total BIO tags: 111
✓ Slot mappings saved


In [26]:
# ============================================================
# STEP 4 (FIXED): Better Tokenization and Label Alignment
# ============================================================

print("\n=== FIXED: Tokenizing and Aligning Labels ===")

model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def extract_slots_from_annotation_v2(annot_utt):
    """Extract slot types and exact positions from annotated utterance"""
    pattern = r'\[([^\]]+)\s*:\s*([^\]]+)\]'

    # Find all slots with their positions
    slots_with_pos = []
    for match in re.finditer(pattern, annot_utt):
        slot_type = match.group(1).strip()
        slot_value = match.group(2).strip()
        start_pos = match.start()
        end_pos = match.end()
        slots_with_pos.append({
            'type': slot_type,
            'value': slot_value,
            'start': start_pos,
            'end': end_pos
        })

    return slots_with_pos

def tokenize_and_align_labels_v2(examples):
    """Better token-to-label alignment using word positions"""

    tokenized_inputs = tokenizer(
        examples['utt'],
        truncation=True,
        max_length=128,
        padding='max_length',
        return_offsets_mapping=True  # Get character offsets!
    )

    all_labels = []

    for i, (utt, annot_utt) in enumerate(zip(examples['utt'], examples['annot_utt'])):
        # Remove annotations from utterance to get clean text
        clean_utt = re.sub(r'\[[^\]]+:[^\]]+\]', lambda m: m.group(0)[m.group(0).find(':')+1:-1], annot_utt)

        # Extract slots with their text
        slots = extract_slots_from_annotation_v2(annot_utt)

        # Map slot values to their actual position in clean text
        slot_ranges = []
        for slot in slots:
            # Find the slot value in the clean utterance
            slot_value_clean = slot['value']
            pos = clean_utt.find(slot_value_clean)
            if pos != -1:
                slot_ranges.append({
                    'type': slot['type'],
                    'start': pos,
                    'end': pos + len(slot_value_clean)
                })

        # Get offsets for this example
        offsets = tokenized_inputs['offset_mapping'][i]

        labels = []
        for (start_char, end_char) in offsets:
            # Skip special tokens
            if start_char == 0 and end_char == 0:
                labels.append(-100)
                continue

            label = 'O'

            # Check which slot this token belongs to
            for slot_range in slot_ranges:
                # Token overlaps with slot
                if start_char < slot_range['end'] and end_char > slot_range['start']:
                    # First token of slot gets B-, rest get I-
                    if start_char >= slot_range['start']:
                        label = f"B-{slot_range['type']}"
                    else:
                        label = f"I-{slot_range['type']}"
                    break

            labels.append(slot2id[label])

        all_labels.append(labels)

    # Remove offset_mapping as it's not needed for training
    tokenized_inputs.pop('offset_mapping', None)
    tokenized_inputs['labels'] = all_labels

    return tokenized_inputs

# Process datasets with FIXED alignment
print("Processing train split...")
train_sample = final_dataset['train'].select(range(min(15000, len(final_dataset['train']))))
slot_train = train_sample.map(
    tokenize_and_align_labels_v2,
    batched=True,
    remove_columns=train_sample.column_names,
    batch_size=16
)

print("Processing validation split...")
val_sample = final_dataset['validation'].select(range(min(3000, len(final_dataset['validation']))))
slot_val = val_sample.map(
    tokenize_and_align_labels_v2,
    batched=True,
    remove_columns=val_sample.column_names,
    batch_size=16
)

print("Processing test split...")
test_sample = final_dataset['test'].select(range(min(3000, len(final_dataset['test']))))
slot_test = test_sample.map(
    tokenize_and_align_labels_v2,
    batched=True,
    remove_columns=test_sample.column_names,
    batch_size=16
)

slot_dataset = DatasetDict({
    'train': slot_train,
    'validation': slot_val,
    'test': slot_test
})

print("\n✓ IMPROVED tokenization complete")
print(f"  Train: {len(slot_dataset['train']):,}")
print(f"  Validation: {len(slot_dataset['validation']):,}")
print(f"  Test: {len(slot_dataset['test']):,}")



=== FIXED: Tokenizing and Aligning Labels ===
Processing train split...


Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Processing validation split...


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Processing test split...


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]


✓ IMPROVED tokenization complete
  Train: 15,000
  Validation: 3,000
  Test: 3,000


In [27]:
print("\n=== Loading Model ===")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(slot2id),
    id2label=id2slot,
    label2id=slot2id
)
model.to(device)

print(f"✓ Model loaded ({model.num_parameters():,} parameters)")

# Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Flatten and remove ignored labels
    true_labels = []
    pred_labels = []

    for i in range(len(labels)):
        for j in range(len(labels[i])):
            if labels[i][j] != -100:
                true_labels.append(labels[i][j])
                pred_labels.append(predictions[i][j])

    accuracy = accuracy_score(true_labels, pred_labels)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, pred_labels, average='weighted', zero_division=0
    )

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Training arguments
output_dir = f'{drive_path}/slot_filling_model'

training_args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,  # A100 optimized
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=200,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    save_total_limit=2,
    fp16=True,
    report_to='none',
    seed=42,
)

print(f"✓ Training configuration ready")

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== Loading Model ===
Device: cuda
✓ Model loaded (277,538,415 parameters)
✓ Training configuration ready


In [28]:
print("TRAINING SLOT FILLING MODEL")
print("="*70 + "\n")

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=slot_dataset['train'],
    eval_dataset=slot_dataset['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

train_result = trainer.train()

print("\n" + "="*70)
print("✅ TRAINING COMPLETE")
print("="*70)
print(f"Training loss: {train_result.training_loss:.4f}")
print(f"Training time: {train_result.metrics['train_runtime']/3600:.2f} hours")

TRAINING SLOT FILLING MODEL



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7891,0.722866,0.837458,0.803882,0.837458,0.809252
2,0.5349,0.503728,0.882645,0.857369,0.882645,0.865401
3,0.4348,0.463974,0.888659,0.871727,0.888659,0.876494



✅ TRAINING COMPLETE
Training loss: 0.8537
Training time: 0.04 hours


In [29]:
print("\n=== Saving Final Model ===")

final_model_dir = f'{output_dir}/final_model'
os.makedirs(final_model_dir, exist_ok=True)

# Save model and tokenizer
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

# Ensure config.json exists
config_path = os.path.join(final_model_dir, 'config.json')
if not os.path.exists(config_path):
    model.config.save_pretrained(final_model_dir)

print(f"✓ Model saved to: {final_model_dir}")

# Verify all files saved
print("\nSaved files:")
print(os.listdir(final_model_dir))


=== Saving Final Model ===
✓ Model saved to: /content/drive/MyDrive/intent_project/slot_filling_model/final_model

Saved files:
['config.json', 'model.safetensors', 'tokenizer_config.json', 'special_tokens_map.json', 'sentencepiece.bpe.model', 'tokenizer.json', 'training_args.bin', 'test_results.json']


In [30]:
print("\n=== Evaluating on Test Set ===")

test_results = trainer.evaluate(slot_dataset['test'])

print("\nTest Results:")
print(f"  Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"  Precision: {test_results['eval_precision']:.4f}")
print(f"  Recall: {test_results['eval_recall']:.4f}")
print(f"  F1 Score: {test_results['eval_f1']:.4f}")

# Save results
with open(f'{final_model_dir}/test_results.json', 'w') as f:
    json.dump(test_results, f, indent=2)

print("\n" + "="*70)
print(f"\nAll files saved to: {final_model_dir}")


=== Evaluating on Test Set ===



Test Results:
  Accuracy: 0.8913
  Precision: 0.8763
  Recall: 0.8913
  F1 Score: 0.8798


All files saved to: /content/drive/MyDrive/intent_project/slot_filling_model/final_model


**SLOT FILLING INFERENCE TESTING**

In [31]:
import torch
import json
import os
from transformers import AutoTokenizer, AutoModelForTokenClassification
print("\n=== Loading Model and Tokenizer ===")

# Paths
drive_path = '/content/drive/MyDrive/intent_project'
slot_model_dir = f'{drive_path}/slot_filling_model/final_model'

# Install safetensors for loading (if needed)
!pip install -q safetensors

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(slot_model_dir)
print("✓ Tokenizer loaded")

# Load model
slot_model = AutoModelForTokenClassification.from_pretrained(slot_model_dir)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
slot_model.to(device)
slot_model.eval()
print(f"✓ Model loaded on {device}")

# Load slot mappings
with open(f'{drive_path}/slot_filling_model/slot2id.json', 'r') as f:
    slot2id = json.load(f)
with open(f'{drive_path}/slot_filling_model/id2slot.json', 'r') as f:
    id2slot = json.load(f)
print(f"✓ Slot mappings loaded ({len(slot2id)} tags)")



=== Loading Model and Tokenizer ===
✓ Tokenizer loaded
✓ Model loaded on cuda
✓ Slot mappings loaded (111 tags)


In [51]:
def extract_slots(utterance, tokenizer, model, id2slot, device):
    """Production-ready slot extraction with proper joining logic"""

    inputs = tokenizer(
        utterance,
        return_tensors='pt',
        truncation=True,
        max_length=128,
        padding='max_length'
    )

    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Predict
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1).squeeze().cpu().numpy()

    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze().cpu().numpy())

    # Decode BIO tags
    slots_raw = []
    current_slot_type = None
    current_slot_tokens = []

    for token, pred_id in zip(tokens, predictions):
        slot_label = id2slot[str(int(pred_id))]

        # Skip special tokens
        if token in ['<s>', '</s>', '<pad>']:
            if current_slot_type and current_slot_tokens:
                slots_raw.append({
                    'type': current_slot_type,
                    'tokens': current_slot_tokens
                })
            current_slot_type = None
            current_slot_tokens = []
            continue

        # B- tag: start new slot
        if slot_label.startswith('B-'):
            # Save previous
            if current_slot_type and current_slot_tokens:
                slots_raw.append({
                    'type': current_slot_type,
                    'tokens': current_slot_tokens
                })

            # Start new
            current_slot_type = slot_label[2:]
            current_slot_tokens = [token]

        # I- tag: continue same type
        elif slot_label.startswith('I-'):
            slot_type = slot_label[2:]
            if slot_type == current_slot_type and current_slot_type:
                current_slot_tokens.append(token)
            else:
                # Type changed
                if current_slot_type and current_slot_tokens:
                    slots_raw.append({
                        'type': current_slot_type,
                        'tokens': current_slot_tokens
                    })
                current_slot_type = slot_type
                current_slot_tokens = [token]

        # O tag: end slot
        elif slot_label == 'O':
            if current_slot_type and current_slot_tokens:
                slots_raw.append({
                    'type': current_slot_type,
                    'tokens': current_slot_tokens
                })
            current_slot_type = None
            current_slot_tokens = []

    # Last slot
    if current_slot_type and current_slot_tokens:
        slots_raw.append({
            'type': current_slot_type,
            'tokens': current_slot_tokens
        })

    # Post-processing: join consecutive slots of same type ONLY if they're subword continuations
    slots_merged = []
    i = 0
    while i < len(slots_raw):
        current = slots_raw[i]
        merged_tokens = list(current['tokens'])

        # Look ahead
        j = i + 1
        while j < len(slots_raw) and slots_raw[j]['type'] == current['type']:
            next_slot = slots_raw[j]
            next_token = next_slot['tokens'][0]

            # Only merge if next token is a continuation (doesn't start with ▁)
            # This means it's a subword fragment of the previous word
            if not next_token.startswith('▁'):
                merged_tokens.extend(next_slot['tokens'])
                j += 1
            else:
                break

        # Convert tokens to string
        slot_value = ''
        for token in merged_tokens:
            if token.startswith('▁'):
                if slot_value:
                    slot_value += ' '
                slot_value += token[1:]
            else:
                slot_value += token

        slot_value = slot_value.strip()
        if slot_value:
            slots_merged.append((current['type'], slot_value))

        i = j

    return slots_merged

In [52]:
# TESTING SLOT FILLING
# =======================

print("\n" + "="*60)
print("PRODUCTION SLOT EXTRACTION")
print("="*60 + "\n")

test_utterances = [
    "Wake me up at 6 AM tomorrow",
    "Order a cappuccino and a croissant",
    "Book a table at an Italian restaurant at 7pm",
    "Play some jazz music by Miles Davis",
    "Remind me to call John next Friday",
    "What's the weather in Paris this weekend",
    "Set alarm for 8 AM on Monday",
]

for i, utterance in enumerate(test_utterances, 1):
    print(f"{i}. Utterance: {utterance}")

    slots = extract_slots(utterance, tokenizer, slot_model, id2slot, device)

    if slots:
        print("   Detected slots:")
        for slot_type, slot_value in slots:
            print(f"     ✓ [{slot_type}]: {slot_value}")
    else:
        print("   No slots detected")

    print("-" * 70)

print("\n Production extraction complete!")



PRODUCTION SLOT EXTRACTION

1. Utterance: Wake me up at 6 AM tomorrow
   Detected slots:
     ✓ [time]: 6
     ✓ [time]: AM
     ✓ [date]: tomorrow
----------------------------------------------------------------------
2. Utterance: Order a cappuccino and a croissant
   Detected slots:
     ✓ [food_type]: cappuccino
     ✓ [food_type]: croissant
----------------------------------------------------------------------
3. Utterance: Book a table at an Italian restaurant at 7pm
   Detected slots:
     ✓ [business_type]: Italian restaurant
     ✓ [time]: 7pm
----------------------------------------------------------------------
4. Utterance: Play some jazz music by Miles Davis
   Detected slots:
     ✓ [music_genre]: jazz
     ✓ [artist_name]: Miles
     ✓ [artist_name]: Davis
----------------------------------------------------------------------
5. Utterance: Remind me to call John next Friday
   Detected slots:
     ✓ [event_name]: call
     ✓ [person]: John
     ✓ [date]: next
     ✓ [da

In [53]:
# STEP 4: Interactive Testing
# ============================================================

print("\n" + "="*70)
print("INTERACTIVE SLOT EXTRACTION")
print("="*70)
print("\nType utterances to extract slots (or 'quit' to exit)\n")

while True:
    user_input = input("Enter utterance: ").strip()

    if user_input.lower() in ['quit', 'exit', 'q', '']:
        print("\n✅ Testing complete!")
        break

    slots = extract_slots(user_input, tokenizer, slot_model, id2slot, device)

    if slots:
        print("\nDetected slots:")
        for slot_type, slot_value in slots:
            print(f"  [{slot_type}]: {slot_value}")
    else:
        print("\nNo slots detected")
    print()



INTERACTIVE SLOT EXTRACTION

Type utterances to extract slots (or 'quit' to exit)

Enter utterance: कल सुबह 6 बजे मुझे जगा देना

Detected slots:
  [date]: कल
  [time]: 6
  [time]: बजे

Enter utterance: రేపు జాన్ కి ఈమెయిల్ పంపమని నాకు గుర్తు చేయి

Detected slots:
  [date]: రేపు
  [person]: జాన్

Enter utterance: remind me to send an email tomorrow to john

Detected slots:
  [date]: tomorrow
  [date]: to
  [person]: john

Enter utterance: quit

✅ Testing complete!
