In [1]:
# Mount Google Drive
from google.colab import drive

drive.mount("/content/drive")

print("=== Loading Model and Resources for Analysis ===")

# Define your specific paths
model_path = "/content/drive/My Drive/Colab Notebooks/Week 8 banking77/models/banking77-classifier"
class_weights_path = (
    "/content/drive/My Drive/Colab Notebooks/Week 8 banking77/class_weights.pkl"
)

# Load model and tokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

print(f"‚úÖ Model loaded: {model.num_parameters():,} parameters")

# Load class weights and label mappings
import pickle

with open(class_weights_path, "rb") as f:
    class_data = pickle.load(f)

class_weights = class_data["class_weights"]
id2label = class_data["id2label"]
label2id = class_data["label2id"]

print(f"‚úÖ Class weights loaded: {len(class_weights)} classes")
print(f"‚úÖ Label mappings loaded: {len(id2label)} intent types")

Mounted at /content/drive
=== Loading Model and Resources for Analysis ===
‚úÖ Model loaded: 67,012,685 parameters
‚úÖ Class weights loaded: 77 classes
‚úÖ Label mappings loaded: 77 intent types


In [2]:
from datasets import load_from_disk

# Update this path to where you saved your tokenized datasets
tokenized_data_path = "/content/drive/My Drive/Colab Notebooks/Week 8 banking77/models/banking77-classifier/tokenized_datasets"
tokenized_datasets = load_from_disk(tokenized_data_path)

print(f"‚úÖ Tokenized data loaded:")
print(f"   - Test set: {len(tokenized_datasets['test'])} examples")
print(f"   - Train set: {len(tokenized_datasets['train'])} examples")

‚úÖ Tokenized data loaded:
   - Test set: 3080 examples
   - Train set: 10003 examples


In [3]:
import torch
from torch.utils.data import DataLoader
import numpy as np
from transformers import DataCollatorWithPadding  # Import DataCollatorWithPadding


def get_predictions(model, dataset, batch_size=16):
    """Get predictions without using Trainer - direct model inference"""
    # Use a data collator to handle padding and tensor conversion
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    dataloader = DataLoader(
        dataset, batch_size=batch_size, collate_fn=data_collator
    )  # Add collate_fn
    model.eval()

    all_logits = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            # Move batch to the same device as the model
            batch = {
                k: v.to(model.device) for k, v in batch.items()
            }  # Move batch to device

            outputs = model(**batch)
            all_logits.append(outputs.logits.cpu())
            all_labels.append(
                batch["labels"].cpu()
            )  # Get labels from the collated batch

    logits = torch.cat(all_logits).numpy()
    labels = torch.cat(all_labels).numpy()
    predictions = np.argmax(logits, axis=1)

    return predictions, labels, logits


print("üß™ Running predictions on test set...")
predictions, true_labels, logits = get_predictions(model, tokenized_datasets["test"])
print(f"‚úÖ Predictions complete: {len(predictions)} test examples")

üß™ Running predictions on test set...
‚úÖ Predictions complete: 3080 test examples


In [4]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [5]:
from sklearn.metrics import classification_report, confusion_matrix
import evaluate

print("=== Comprehensive Evaluation ===")

# Calculate metrics
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

accuracy = accuracy_metric.compute(predictions=predictions, references=true_labels)
f1 = f1_metric.compute(
    predictions=predictions, references=true_labels, average="weighted"
)

print(f"üìä Overall Performance:")
print(f"   - Accuracy: {accuracy['accuracy']:.4f}")
print(f"   - Weighted F1: {f1['f1']:.4f}")

# Classification report for first 10 classes
print("\n=== Per-Class Performance (First 10 Classes) ===")
class_report = classification_report(
    true_labels,
    predictions,
    labels=list(range(10)),  # Specify the labels (integer IDs) to include
    target_names=[
        id2label[i] for i in range(10)
    ],  # Provide target_names for these labels
    output_dict=False,
)
print(class_report)

=== Comprehensive Evaluation ===


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

üìä Overall Performance:
   - Accuracy: 0.8672
   - Weighted F1: 0.8664

=== Per-Class Performance (First 10 Classes) ===
                                                  precision    recall  f1-score   support

                                activate_my_card       1.00      0.90      0.95        40
                                       age_limit       0.93      0.97      0.95        40
                         apple_pay_or_google_pay       1.00      1.00      1.00        40
                                     atm_support       0.97      0.95      0.96        40
                                automatic_top_up       1.00      0.88      0.93        40
         balance_not_updated_after_bank_transfer       0.66      0.72      0.69        40
balance_not_updated_after_cheque_or_cash_deposit       0.89      0.97      0.93        40
                         beneficiary_not_allowed       0.96      0.65      0.78        40
                                 cancel_transfer       0.97      0

In [6]:
from collections import Counter

print("\n=== Recreating Class Distribution ===")
train_labels = tokenized_datasets["train"]["labels"]
label_counts = Counter(train_labels)

print("Class distribution (top 5 and bottom 5):")
print("Most common classes:")
for label_idx, count in label_counts.most_common(5):
    class_name = id2label[label_idx]  # Access using integer key
    print(f"  {class_name}: {count} examples")

print("Least common classes:")
for label_idx, count in label_counts.most_common()[-5:]:
    class_name = id2label[label_idx]  # Access using integer key
    print(f"  {class_name}: {count} examples")

# Get the 10 least common classes (rare classes)
rare_classes = label_counts.most_common()[-10:]

print("\n=== Rare Class Performance Analysis ===")

print("Rare classes (bottom 10 by training count):")
for label_idx, count in rare_classes:
    class_name = id2label[label_idx]  # Access using integer key

    # Calculate performance for this class
    class_mask = true_labels == label_idx
    if class_mask.sum() > 0:  # Ensure we have test examples
        class_accuracy = (predictions[class_mask] == true_labels[class_mask]).mean()
        # Safely access class_weights if it's a list/tensor, otherwise note as N/A
        weight = (
            class_weights[label_idx]
            if isinstance(class_weights, (list, torch.Tensor, np.ndarray))
            and label_idx < len(class_weights)
            else "N/A"
        )

        print(f"  {class_name}:")
        print(f"    - Train examples: {count}")
        print(f"    - Test accuracy: {class_accuracy:.1%}")
        print(f"    - Test examples: {class_mask.sum()}")
        if weight != "N/A":
            # Format tensor values if necessary
            if isinstance(weight, torch.Tensor):
                print(f"    - Class weight: {weight.item():.3f}")
            else:
                print(f"    - Class weight: {weight:.3f}")
    else:
        print(
            f"  {class_name}: No test examples available (had {count} train examples)"
        )


=== Recreating Class Distribution ===
Class distribution (top 5 and bottom 5):
Most common classes:
  card_payment_fee_charged: 187 examples
  direct_debit_payment_not_recognised: 182 examples
  balance_not_updated_after_cheque_or_cash_deposit: 181 examples
  wrong_amount_of_cash_received: 180 examples
  cash_withdrawal_charge: 177 examples
Least common classes:
  lost_or_stolen_card: 82 examples
  card_swallowed: 61 examples
  card_acceptance: 59 examples
  virtual_card_not_working: 41 examples
  contactless_not_working: 35 examples

=== Rare Class Performance Analysis ===
Rare classes (bottom 10 by training count):
  top_up_limits:
    - Train examples: 97
    - Test accuracy: 97.5%
    - Test examples: 40
    - Class weight: 1.339
  get_disposable_virtual_card:
    - Train examples: 97
    - Test accuracy: 75.0%
    - Test examples: 40
    - Class weight: 1.339
  receiving_money:
    - Train examples: 95
    - Test accuracy: 90.0%
    - Test examples: 40
    - Class weight: 1.367
 

In [7]:
print("=== Error Analysis ===")

# Find most confused pairs for rare classes
from collections import Counter

print("Common misclassifications for rare classes:")
for label_idx, count in rare_classes[:5]:  # First 5 rare classes
    class_name = id2label[label_idx]  # Access using integer key

    # Find predictions for this true class
    class_mask = true_labels == label_idx
    if class_mask.sum() > 0:
        wrong_predictions = predictions[class_mask][
            predictions[class_mask] != label_idx
        ]
        if len(wrong_predictions) > 0:
            most_common_wrong = Counter(wrong_predictions).most_common(1)[0]
            wrong_class_name = id2label[
                most_common_wrong[0]
            ]  # Access using integer key
            wrong_count = most_common_wrong[1]
            total_class_examples = (
                class_mask.sum()
            )  # Use class_mask.sum() for total examples in this class

            print(
                f"  {class_name} ‚Üí {wrong_class_name}: {wrong_count}/{total_class_examples} times"
            )
        else:
            print(f"  {class_name}: No misclassifications")
    else:
        print(f"  {class_name}: No test examples available for error analysis")

=== Error Analysis ===
Common misclassifications for rare classes:
  top_up_limits ‚Üí getting_spare_card: 1/40 times
  get_disposable_virtual_card ‚Üí getting_virtual_card: 5/40 times
  receiving_money ‚Üí exchange_via_app: 2/40 times
  atm_support ‚Üí declined_cash_withdrawal: 2/40 times
  compromised_card ‚Üí terminate_account: 4/40 times


In [8]:
print("=== Sample Predictions ===")

test_samples = [
    "I need to check my account balance",
    "My card was stolen what should I do",
    "I want to apply for a loan",
    "How do I transfer money to another account",
    "My card is about to expire",  # This was a rare class!
]

for i, sample in enumerate(test_samples):
    inputs = tokenizer(sample, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

    predicted_class = outputs.logits.argmax(-1).item()
    confidence = probabilities.max().item()
    class_name = id2label[predicted_class]  # Access using integer key

    print(f"\n{i+1}. '{sample}'")
    print(f"   ‚Üí {class_name} (confidence: {confidence:.3f})")

    # Show top 3 predictions for low confidence
    if confidence < 0.7:
        top3_probs, top3_indices = torch.topk(probabilities, 3)
        print(f"   Low confidence - Top 3 alternatives:")
        for j, (prob, idx) in enumerate(zip(top3_probs[0], top3_indices[0])):
            alt_class = id2label[idx.item()]  # Access using integer key
            print(f"     {j+1}. {alt_class} ({prob:.3f})")

=== Sample Predictions ===

1. 'I need to check my account balance'
   ‚Üí balance_not_updated_after_cheque_or_cash_deposit (confidence: 0.133)
   Low confidence - Top 3 alternatives:
     1. balance_not_updated_after_cheque_or_cash_deposit (0.133)
     2. transfer_into_account (0.113)
     3. top_up_by_cash_or_cheque (0.077)

2. 'My card was stolen what should I do'
   ‚Üí lost_or_stolen_card (confidence: 0.569)
   Low confidence - Top 3 alternatives:
     1. lost_or_stolen_card (0.569)
     2. card_not_working (0.055)
     3. card_arrival (0.050)

3. 'I want to apply for a loan'
   ‚Üí receiving_money (confidence: 0.114)
   Low confidence - Top 3 alternatives:
     1. receiving_money (0.114)
     2. transfer_fee_charged (0.100)
     3. transfer_into_account (0.097)

4. 'How do I transfer money to another account'
   ‚Üí transfer_into_account (confidence: 0.695)
   Low confidence - Top 3 alternatives:
     1. transfer_into_account (0.695)
     2. balance_not_updated_after_bank_transfe

In [10]:
try:
    from huggingface_hub import notebook_login

    notebook_login()

    # Push to Hub
    model.push_to_hub("zhanghanxue/banking77-weighted-classifier")
    tokenizer.push_to_hub("zhanghanxue/banking77-weighted-classifier")
    print("‚úÖ Model pushed to Hugging Face Hub!")

except Exception as e:
    print(f"Note: Model not pushed to Hub. {e}")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...v6f4n0t/model.safetensors:   0%|          |  573kB /  268MB            

README.md: 0.00B [00:00, ?B/s]

‚úÖ Model pushed to Hugging Face Hub!
