In [1]:
import transformers
import datasets

# Record the installed library versions so this run can be reproduced later.
for library_label, library_module in (
    ("Transformers", transformers),
    ("Datasets", datasets),
):
    print(f"{library_label} version: {library_module.__version__}")

Transformers version: 4.57.1
Datasets version: 4.0.0


In [2]:
from google.colab import drive

# Mount Google Drive at /content/drive; the save cell later in this notebook
# writes its artifacts to a path under this mount point.
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
from datasets import load_dataset

# Download (or reuse the cached copy of) the banking77 dataset
dataset = load_dataset("banking77")

# Pull out the pieces that the summary below reports on
train_rows = dataset["train"]
test_rows = dataset["test"]
all_class_names = train_rows.features["label"].names

# Overall structure of the splits
print("=== Dataset Structure ===")
print(dataset)

# Split sizes and number of distinct labels
print("\n=== Dataset Details ===")
print(f"Training samples: {len(train_rows)}")
print(f"Test samples: {len(test_rows)}")
print(f"Number of classes: {len(all_class_names)}")

# Peek at the first few human-readable class names
print(f"First 10 class names: {all_class_names[:10]}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/298k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/93.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10003 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3080 [00:00<?, ? examples/s]

=== Dataset Structure ===
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10003
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3080
    })
})

=== Dataset Details ===
Training samples: 10003
Test samples: 3080
Number of classes: 77
First 10 class names: ['activate_my_card', 'age_limit', 'apple_pay_or_google_pay', 'atm_support', 'automatic_top_up', 'balance_not_updated_after_bank_transfer', 'balance_not_updated_after_cheque_or_cash_deposit', 'beneficiary_not_allowed', 'cancel_transfer', 'card_about_to_expire']


In [4]:
import matplotlib.pyplot as plt
from collections import Counter

# Exploratory analysis of the training split: class balance, query length,
# a few sample rows, and imbalance metrics.
train_split = dataset["train"]
label_names = train_split.features["label"].names
# Derive the class count from the label feature rather than hard-coding 77, so
# every per-class statistic below stays correct if the label set ever changes.
num_classes = len(label_names)

# Analyze class distribution
train_labels = train_split["label"]
label_counts = Counter(train_labels)
# Counter only holds labels that occur at least once. Register absent classes
# with a zero count (after counting, so the tie order of observed labels is
# unchanged) so an empty class is reported instead of being silently skipped.
for label_idx in range(num_classes):
    label_counts.setdefault(label_idx, 0)
ranked_counts = label_counts.most_common()

print("=== Class Distribution Analysis ===")
print(f"Most common classes: {ranked_counts[:5]}")
print(f"Least common classes: {ranked_counts[-5:]}")

print("\n=== Class Distribution with Names ===")

# Both ends of the ranking get the same report, so loop instead of duplicating it.
for heading, entries in (
    ("\n📈 MOST COMMON CLASSES:", ranked_counts[:5]),
    ("\n📉 LEAST COMMON CLASSES:", ranked_counts[-5:]),
):
    print(heading)
    for label_idx, count in entries:
        class_name = label_names[label_idx]
        print(f"  {class_name} (Label {label_idx}): {count} examples")

# Text length analysis (whitespace-separated words, not model tokens)
text_lengths = [len(text.split()) for text in train_split["text"]]
print("\n=== Text Length Analysis ===")
print(f"Average words per query: {sum(text_lengths)/len(text_lengths):.1f}")
print(f"Min words: {min(text_lengths)}, Max words: {max(text_lengths)}")

# Sample examination
print("\n=== Sample Queries ===")
for i in range(min(3, len(train_split))):
    example = train_split[i]  # fetch the row once instead of once per field
    text = example["text"]
    label = example["label"]
    label_name = label_names[label]
    print(f"Sample {i}: '{text}' → {label_name} (label {label})")

# Calculate imbalance metrics
total_samples = len(train_labels)
max_samples = max(label_counts.values())
min_samples = min(label_counts.values())
avg_samples = total_samples / num_classes

print("\n⚖️ IMBALANCE ANALYSIS:")
print(f"Total training samples: {total_samples}")
print(f"Largest class: {max_samples} samples")
print(f"Smallest class: {min_samples} samples")
if min_samples > 0:
    print(f"Imbalance ratio: {max_samples/min_samples:.1f}x")
else:
    # A ratio against an empty class is undefined; say so rather than crash.
    print("Imbalance ratio: undefined (at least one class has no training samples)")
print(f"Average samples per class: {avg_samples:.1f}")

# Check how many classes are below average
below_avg = sum(1 for count in label_counts.values() if count < avg_samples)
print(
    f"Classes below average: {below_avg}/{num_classes} "
    f"({below_avg/num_classes*100:.1f}%)"
)

=== Class Distribution Analysis ===
Most common classes: [(15, 187), (28, 182), (6, 181), (75, 180), (19, 177)]
Least common classes: [(41, 82), (18, 61), (10, 59), (72, 41), (23, 35)]

=== Class Distribution with Names ===

📈 MOST COMMON CLASSES:
  card_payment_fee_charged (Label 15): 187 examples
  direct_debit_payment_not_recognised (Label 28): 182 examples
  balance_not_updated_after_cheque_or_cash_deposit (Label 6): 181 examples
  wrong_amount_of_cash_received (Label 75): 180 examples
  cash_withdrawal_charge (Label 19): 177 examples

📉 LEAST COMMON CLASSES:
  lost_or_stolen_card (Label 41): 82 examples
  card_swallowed (Label 18): 61 examples
  card_acceptance (Label 10): 59 examples
  virtual_card_not_working (Label 72): 41 examples
  contactless_not_working (Label 23): 35 examples

=== Text Length Analysis ===
Average words per query: 11.9
Min words: 2, Max words: 79

=== Sample Queries ===
Sample 0: 'I am still waiting on my card?' → card_arrival (label 11)
Sample 1: 'What can

In [5]:
from transformers import AutoTokenizer

# Load the tokenizer published with the distilbert-base-uncased checkpoint
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("=== Tokenizer Information ===")
print(f"Tokenizer vocab size: {tokenizer.vocab_size}")
print(f"Special tokens: {tokenizer.all_special_tokens}")

# A few hand-written queries to show what the tokenizer produces
test_queries = [
    "I want to check my account balance",
    "My card was stolen! What should I do?",
    "International wire transfer to Germany",
]

print("\n=== Tokenization Examples ===")
for query in test_queries:
    # tokenize() returns the token strings; encode() returns the ids, which
    # also include the [CLS]/[SEP] markers visible in the decoded text.
    tokens = tokenizer.tokenize(query)
    token_ids = tokenizer.encode(query)

    report_lines = [
        f"Query: '{query}'",
        f"Tokens: {tokens}",
        f"Token IDs: {token_ids}",
        f"Length: {len(tokens)} tokens",
        f"Decoded: {tokenizer.decode(token_ids)}",
        "---",
    ]
    print("\n".join(report_lines))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

=== Tokenizer Information ===
Tokenizer vocab size: 30522
Special tokens: ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

=== Tokenization Examples ===
Query: 'I want to check my account balance'
Tokens: ['i', 'want', 'to', 'check', 'my', 'account', 'balance']
Token IDs: [101, 1045, 2215, 2000, 4638, 2026, 4070, 5703, 102]
Length: 7 tokens
Decoded: [CLS] i want to check my account balance [SEP]
---
Query: 'My card was stolen! What should I do?'
Tokens: ['my', 'card', 'was', 'stolen', '!', 'what', 'should', 'i', 'do', '?']
Token IDs: [101, 2026, 4003, 2001, 7376, 999, 2054, 2323, 1045, 2079, 1029, 102]
Length: 10 tokens
Decoded: [CLS] my card was stolen! what should i do? [SEP]
---
Query: 'International wire transfer to Germany'
Tokens: ['international', 'wire', 'transfer', 'to', 'germany']
Token IDs: [101, 2248, 7318, 4651, 2000, 2762, 102]
Length: 5 tokens
Decoded: [CLS] international wire transfer to germany [SEP]
---


In [6]:
def _as_list(value):
    """Return ``value`` as a list, wrapping a lone scalar or string in one."""
    if isinstance(value, list):
        return value
    if isinstance(value, (str, bytes)):
        # Strings are iterable, but here they always mean "one example".
        return [value]
    if hasattr(value, "tolist"):
        # Array-like objects expose tolist(); a 0-d value converts to a scalar.
        converted = value.tolist()
        return converted if isinstance(converted, list) else [converted]
    try:
        return list(value)
    except TypeError:
        # Not iterable: a single scalar label.
        return [value]


def tokenize_function(batch, max_length=128, padding=True):
    """Tokenize a batch of examples and attach their labels.

    Args:
        batch: Mapping with a ``"text"`` entry (one string or a sequence of
            strings) and a ``"label"`` entry (one label or a sequence of
            labels, aligned with the texts).
        max_length: Maximum sequence length; longer inputs are truncated.
        padding: Forwarded to the tokenizer. With the default ``True`` every
            call pads to the longest sequence *in that call*, so when this
            function is used with ``Dataset.map(batched=True)`` the padded
            length can differ from one batch to the next. Pass
            ``"max_length"`` for a uniform length, or ``False`` to leave
            padding to a data collator.

    Returns:
        The tokenizer output for the texts, with an extra ``"labels"`` entry
        holding the labels as a list.

    Raises:
        ValueError: If the number of texts and labels differ.
    """
    texts = _as_list(batch["text"])
    # Normalise any sequence type (tuple, array, ...) to a flat list; wrapping
    # a whole non-list sequence in a one-element list would misalign labels.
    labels = _as_list(batch["label"])

    if len(texts) != len(labels):
        raise ValueError(
            f"Got {len(texts)} texts but {len(labels)} labels; they must align."
        )

    # Tokenize with truncation and padding
    tokenized = tokenizer(
        texts, padding=padding, truncation=True, max_length=max_length
    )

    # Add labels to the output
    tokenized["labels"] = labels
    return tokenized


# Smoke-test tokenize_function on the first three training rows
print("🧪 Testing tokenization function...")
small_batch = dataset["train"][:3]
test_tokenized = tokenize_function(small_batch)
print("✅ Tokenization successful!")

# Report how many rows came back and how long each padded row is
sample_input_ids = test_tokenized["input_ids"]
print(f"Processed {len(sample_input_ids)} examples")
print(f"Each example has {len(sample_input_ids[0])} tokens (with padding)")

🧪 Testing tokenization function...
✅ Tokenization successful!
Processed 3 examples
Each example has 18 tokens (with padding)


In [7]:
# Run tokenize_function over every split; the original columns are dropped so
# only the tokenizer output and the labels remain.
print("Starting full dataset tokenization...")
raw_column_names = dataset["train"].column_names
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=1000,  # rows handed to tokenize_function per call
    remove_columns=raw_column_names,
)

print("=== Tokenized Dataset ===")
print(tokenized_datasets)

# Inspect the first training row in detail
first_example = tokenized_datasets["train"][0]
first_attention_mask = first_example["attention_mask"]

print("\n=== Tokenized Dataset Verification ===")
print(f"First training example keys: {list(first_example.keys())}")
print(f"Input IDs type: {type(first_example['input_ids'])}")
print(f"Input IDs length: {len(first_example['input_ids'])}")
print(f"Attention mask length: {len(first_attention_mask)}")
print(f"Label: {first_example['labels']}")

# The mask entries are summed to count real tokens; the rest is padding.
print("\n=== Padding Check ===")
real_tokens = sum(first_attention_mask)
total_tokens = len(first_attention_mask)
print(f"Real tokens: {real_tokens}, Padding tokens: {total_tokens - real_tokens}")

print("✅ Tokenization completed successfully!")

Starting full dataset tokenization...


Map:   0%|          | 0/10003 [00:00<?, ? examples/s]

Map:   0%|          | 0/3080 [00:00<?, ? examples/s]

=== Tokenized Dataset ===
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10003
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3080
    })
})

=== Tokenized Dataset Verification ===
First training example keys: ['input_ids', 'attention_mask', 'labels']
Input IDs type: <class 'list'>
Input IDs length: 79
Attention mask length: 79
Label: 11

=== Padding Check ===
Real tokens: 10, Padding tokens: 69
✅ Tokenization completed successfully!


In [11]:
import json

output_dir = "/content/drive/My Drive/Colab Notebooks/Week 8 banking77/models/banking77-classifier"
# Derive the dataset location from output_dir so the two paths cannot drift apart.
dataset_save_path = f"{output_dir}/tokenized_datasets"

# Save the tokenizer files to output_dir and the tokenized datasets beneath it
tokenizer.save_pretrained(output_dir)
tokenized_datasets.save_to_disk(dataset_save_path)
print(f"✅ Tokenized dataset saved to: {dataset_save_path}")

# Also save the label mappings (important!)
label_names = dataset["train"].features["label"].names
label_mappings = {
    "id2label": dict(enumerate(label_names)),
    "label2id": {label: i for i, label in enumerate(label_names)},
}

# NOTE: JSON object keys are always strings, so the integer ids in "id2label"
# are written as "0", "1", ...; convert them back with int() after loading.
# An explicit encoding keeps the file independent of the platform default.
with open(f"{dataset_save_path}/label_mappings.json", "w", encoding="utf-8") as f:
    json.dump(label_mappings, f)

print("✅ Label mappings saved")

Saving the dataset (0/1 shards):   0%|          | 0/10003 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3080 [00:00<?, ? examples/s]

✅ Tokenized dataset saved to: /content/drive/My Drive/Colab Notebooks/Week 8 banking77/models/banking77-classifier/tokenized_datasets
✅ Label mappings saved
