# NLP From Scratch: Classifying Names with a Character-Level RNN 🇦🇺

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vuhung16au/pytorch-mastery/blob/main/examples/pytorch-nlp/classify-names-character-level-RNN.ipynb)
[![View on GitHub](https://img.shields.io/badge/View_on-GitHub-blue?logo=github)](https://github.com/vuhung16au/pytorch-mastery/blob/main/examples/pytorch-nlp/classify-names-character-level-RNN.ipynb)

A comprehensive introduction to character-level Recurrent Neural Networks (RNNs) for name classification using PyTorch, featuring Australian names and locations with Vietnamese multilingual support.

## Learning Objectives

By the end of this notebook, you will:

- 🔤 **Master character-level text processing** with PyTorch
- 🧠 **Build an RNN from scratch** for sequence classification
- 🇦🇺 **Classify Australian names and locations** by origin/type
- 🌏 **Handle multilingual text** with English-Vietnamese examples
- 🔄 **Compare with TensorFlow** approaches for RNN implementation
- 📊 **Implement comprehensive logging** with TensorBoard

## What You'll Build

1. **Australian Name Origin Classifier** - Classify names by ethnic origin (English, Irish, Greek, Vietnamese, etc.)
2. **Location Type Classifier** - Distinguish between cities, suburbs, landmarks, and natural features
3. **Character-level RNN Architecture** - Build vanilla RNN, LSTM, and GRU variants
4. **Multilingual Support** - Handle both English and Vietnamese character sets

---

In [None]:
# Environment Detection and Setup
import sys
import subprocess
import os
import time

# Detect the runtime environment.
# Colab is recognised by its "google.colab" module already being loaded;
# Kaggle by a loaded "kaggle_secrets" module or by the substring "kaggle" in
# the KAGGLE_URL_BASE environment variable. Anything else counts as local.
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules or "kaggle" in os.environ.get('KAGGLE_URL_BASE', '')
IS_LOCAL = not (IS_COLAB or IS_KAGGLE)

print(f"Environment detected:")
print(f"  - Local: {IS_LOCAL}")
print(f"  - Google Colab: {IS_COLAB}")
print(f"  - Kaggle: {IS_KAGGLE}")

# Platform-specific system setup.
# The `!` lines are IPython shell escapes, so this cell only runs inside a
# notebook kernel (it is not valid plain Python).
if IS_COLAB:
    print("\nSetting up Google Colab environment...")
    !apt update -qq
    !apt install -y -qq software-properties-common
elif IS_KAGGLE:
    print("\nSetting up Kaggle environment...")
    # Kaggle usually has most packages pre-installed
else:
    print("\nSetting up local environment...")

In [None]:
# Install required packages for this notebook
required_packages = [
    "torch",
    "pandas",
    "seaborn",
    "matplotlib",
    "tensorboard",
    "scikit-learn",
    "numpy"
]

print("Installing required packages...")
for package in required_packages:
    if IS_COLAB or IS_KAGGLE:
        !pip install -q {package}
    else:
        try:
            subprocess.run([sys.executable, "-m", "pip", "install", "-q", package],
                          capture_output=True, check=True)
        except subprocess.CalledProcessError:
            print(f"Note: {package} installation skipped (likely already installed)")
    print(f"✓ {package}")

print("\n📦 Package installation completed!")

In [None]:
# Import essential libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

# Data handling and visualization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Text processing and utilities
import re
import string
import unicodedata
import random
from collections import defaultdict, Counter
import time
from datetime import datetime
import platform

# Set style for better notebook aesthetics (seaborn theme/palette plus a
# default matplotlib figure size used by every later plot)
sns.set_style("whitegrid")
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

# Set random seeds for reproducibility: PyTorch, NumPy and Python's `random`
# each keep their own generator, so all three are seeded with the same value
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

print(f"✅ PyTorch {torch.__version__} ready!")
print(f"📊 Libraries imported successfully!")

In [None]:
def detect_device():
    """
    Pick the PyTorch device to use and print a short hardware summary.

    Backends are probed in this order and the first available one wins:
    1. CUDA (NVIDIA GPU)
    2. MPS (Apple Silicon)
    3. CPU

    Returns:
        tuple: ``(torch.device, str)`` - the selected device and a
        human-readable description of it for logging.
    """
    # --- CUDA -------------------------------------------------------------
    if torch.cuda.is_available():
        name = torch.cuda.get_device_name(0)
        # total_memory is in bytes; divide by 1024**3 for the figure printed as "GB"
        memory_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3

        print("🚀 Using CUDA acceleration")
        print(f"   GPU: {name}")
        print(f"   CUDA Version: {torch.version.cuda}")
        print(f"   GPU Memory: {memory_gb:.1f} GB")

        return torch.device("cuda"), f"CUDA GPU: {name}"

    # --- MPS --------------------------------------------------------------
    # Older PyTorch builds have no `torch.backends.mps` attribute at all.
    mps_backend = getattr(torch.backends, 'mps', None)
    if mps_backend is not None and mps_backend.is_available():
        host = platform.uname()

        print("🍎 Using Apple Silicon MPS acceleration")
        print(f"   System: {host.system} {host.release}")
        print(f"   Machine: {host.machine}")
        print(f"   Processor: {host.processor}")

        return torch.device("mps"), "Apple Silicon MPS"

    # --- CPU fallback -----------------------------------------------------
    threads = torch.get_num_threads()
    host = platform.uname()

    print("💻 Using CPU (no GPU acceleration detected)")
    print(f"   Processor: {host.processor}")
    print(f"   PyTorch Threads: {threads}")
    print(f"   System: {host.system} {host.release}")

    # Hints for CPU-only setups
    print("\n💡 CPU Optimization Tips:")
    print("   • Reduce batch size to prevent memory issues")
    print("   • Consider using smaller models for faster training")
    print(f"   • Enable PyTorch optimizations: torch.set_num_threads({threads})")

    return torch.device("cpu"), "CPU (No GPU acceleration available)"

# Usage in all PyTorch notebooks: run the detection once at import time and
# keep both the device object and its description
device, device_info = detect_device()
print(f"\n✅ PyTorch device selected: {device}")
print(f"📊 Device info: {device_info}")

# Set global device for the notebook (later cells read DEVICE, not `device`)
DEVICE = device

## 🇦🇺 Australian Names and Locations Dataset

We'll create a comprehensive dataset featuring Australian names classified by ethnic origin, and Australian locations classified by type. This follows the repository's Australian context policy while providing practical multilingual examples.

In [None]:
def create_australian_names_dataset():
    """
    Create the surname dataset used for ethnic-origin classification.

    The names are hard-coded, grouped by origin label, and written in plain
    ASCII (no diacritics).

    Exact duplicate ``(name, origin)`` pairs are removed so the same sample
    cannot end up on both sides of a train/validation split; the first
    occurrence keeps its position.

    NOTE(review): a few surnames are listed under two different origins
    (e.g. 'Smith' and 'Wilson' under English and Irish, 'Tang' and 'Cao'
    under Vietnamese and Chinese). Those pairs are kept because their labels
    differ - confirm whether the ambiguity is intended.

    Returns:
        list[tuple[str, str]]: ``(name, origin)`` pairs in declaration order.
    """

    # Australian names by ethnic origin
    names_by_origin = {
        'English': [
            'Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Wilson', 'Davis', 'Taylor',
            'Anderson', 'Thomas', 'Jackson', 'White', 'Harris', 'Martin', 'Thompson',
            'Garcia', 'Martinez', 'Robinson', 'Clark', 'Lewis', 'Walker', 'Hall',
            'Allen', 'Young', 'King', 'Wright', 'Lopez', 'Hill', 'Scott', 'Green',
            'Adams', 'Baker', 'Gonzalez', 'Nelson', 'Carter', 'Mitchell', 'Perez'
        ],
        'Irish': [
            'Murphy', 'Kelly', 'Sullivan', 'Walsh', 'Smith', 'OBrien', 'Byrne', 'Ryan',
            'Connor', 'ONeill', 'Reilly', 'Doyle', 'McCarthy', 'Gallagher', 'Doherty',
            'Kennedy', 'Lynch', 'Murray', 'Quinn', 'Moore', 'McLaughlin', 'Carroll',
            'Connolly', 'Daly', 'Connell', 'Wilson', 'Dunne', 'Brennan', 'Burke',
            'Collins', 'Campbell', 'Clarke', 'Johnston', 'Hughes', 'Farrell'
        ],
        'Italian': [
            'Rossi', 'Russo', 'Ferrari', 'Esposito', 'Bianchi', 'Romano', 'Colombo',
            'Ricci', 'Marino', 'Greco', 'Bruno', 'Gallo', 'Conti', 'DeLuca',
            'Mancini', 'Costa', 'Giordano', 'Rizzo', 'Lombardi', 'Moretti',
            'Barbieri', 'Fontana', 'Santoro', 'Mariani', 'Rinaldi', 'Caruso',
            'Ferrara', 'Galli', 'Martini', 'Leone', 'Longo', 'Gentile', 'Martinelli'
        ],
        'Greek': [
            'Papadopoulos', 'Georgiou', 'Dimitriou', 'Andreou', 'Nikolaou', 'Christou',
            'Ioannou', 'Constantinou', 'Antoniou', 'Savva', 'Charalambous', 'Stylianou',
            'Petrou', 'Michaelidou', 'Hadjisavvas', 'Kokkinos', 'Stavrou', 'Loizou',
            'Panayiotou', 'Economou', 'Demetriou', 'Philippou', 'Vassiliou', 'Kyprianou',
            'Theodorou', 'Christodoulou', 'Anastasiadou', 'Hadjiconstantinou'
        ],
        'Vietnamese': [
            'Nguyen', 'Tran', 'Le', 'Pham', 'Hoang', 'Huynh', 'Vo', 'Vu', 'Dang', 'Bui',
            'Do', 'Ho', 'Ngo', 'Duong', 'Ly', 'Trinh', 'Dinh', 'Thai', 'Cao', 'Lam',
            'Phan', 'Truong', 'Tang', 'Doan', 'Mai', 'Ton', 'Ha', 'Chau', 'Bach', 'Kim',
            'Luu', 'Ong', 'Tong', 'Quan', 'Dam', 'Khang', 'Thang', 'Phung', 'Duc', 'Vinh'
        ],
        'Chinese': [
            'Wang', 'Li', 'Zhang', 'Liu', 'Chen', 'Yang', 'Huang', 'Zhao', 'Wu', 'Zhou',
            'Xu', 'Sun', 'Ma', 'Zhu', 'Hu', 'Guo', 'He', 'Lin', 'Gao', 'Luo',
            'Zheng', 'Liang', 'Xie', 'Tang', 'Song', 'Xu', 'Han', 'Feng', 'Deng', 'Cao',
            'Peng', 'Zeng', 'Xiao', 'Tian', 'Pan', 'Cheng', 'Wei', 'Jiang', 'Yu', 'Shi'
        ]
    }

    # Flatten to (name, origin) pairs. dict.fromkeys drops exact repeats while
    # preserving first-seen order (dicts keep insertion order).
    names_data = list(dict.fromkeys(
        (name, origin)
        for origin, names in names_by_origin.items()
        for name in names
    ))

    return names_data

def create_australian_locations_dataset():
    """
    Create the Australian place-name dataset used for location-type
    classification.

    The entries are hard-coded single words grouped under four labels:
    'City', 'Suburb', 'Landmark' and 'Natural'.

    Exact duplicate ``(location, type)`` pairs are removed so the same sample
    cannot end up on both sides of a train/validation split; the first
    occurrence keeps its position.

    Returns:
        list[tuple[str, str]]: ``(location, type)`` pairs in declaration order.
    """

    locations_by_type = {
        'City': [
            'Sydney', 'Melbourne', 'Brisbane', 'Perth', 'Adelaide', 'Darwin', 'Hobart', 'Canberra',
            'Newcastle', 'Wollongong', 'Geelong', 'Townsville', 'Cairns', 'Ballarat', 'Bendigo',
            'Albury', 'Wodonga', 'Shepparton', 'Wagga', 'Rockhampton', 'Bundaberg', 'Hervey',
            'Toowoomba', 'Mackay', 'Gladstone', 'Warrnambool', 'Mildura', 'Launceston'
        ],
        'Suburb': [
            'Bondi', 'Manly', 'Paddington', 'Surry', 'Newtown', 'Leichhardt', 'Balmain',
            'Toorak', 'Brighton', 'Camberwell', 'Hawthorn', 'Richmond', 'Fitzroy', 'Carlton',
            'Southbank', 'Docklands', 'Fortitude', 'Paddington', 'Milton', 'Ascot',
            'Cottesloe', 'Subiaco', 'Fremantle', 'Scarborough', 'Joondalup', 'Midland'
        ],
        'Landmark': [
            'Opera', 'Harbour', 'Luna', 'Royal', 'Federation', 'Parliament', 'Story', 'Shrine',
            'Botanic', 'Observatory', 'Anzac', 'War', 'National', 'Australian', 'Museum',
            'Gallery', 'Library', 'University', 'Stadium', 'Arena', 'Centre', 'Tower',
            'Bridge', 'Wharf', 'Market', 'Square', 'Gardens', 'Reserve', 'Park'
        ],
        'Natural': [
            'Uluru', 'Kakadu', 'Daintree', 'Grampians', 'Flinders', 'Cradle', 'Freycinet',
            'Wilsons', 'Kosciuszko', 'Alpine', 'Snowy', 'Murray', 'Darling', 'Cooper',
            'Murrumbidgee', 'Lachlan', 'Macquarie', 'Hawkesbury', 'Yarra', 'Maribyrnong',
            'Barwon', 'Goulburn', 'Campaspe', 'Loddon', 'Wimmera', 'Glenelg', 'Torrens'
        ]
    }

    # Flatten to (location, type) pairs. dict.fromkeys drops exact repeats
    # (e.g. 'Paddington' is listed twice under 'Suburb') while preserving
    # first-seen order.
    locations_data = list(dict.fromkeys(
        (location, location_type)
        for location_type, locations in locations_by_type.items()
        for location in locations
    ))

    return locations_data

# Create the datasets: two flat lists of (text, label) tuples
names_data = create_australian_names_dataset()
locations_data = create_australian_locations_dataset()

print("🇦🇺 Australian Names and Locations Dataset Created")
print("=" * 55)
print(f"   Names dataset: {len(names_data)} entries")
print(f"   Locations dataset: {len(locations_data)} entries")

# Show sample data: the first three names for a hand-picked subset of origins
print("\n📝 Sample Names by Origin:")
names_df = pd.DataFrame(names_data, columns=['Name', 'Origin'])
for origin in ['English', 'Vietnamese', 'Greek']:
    samples = names_df[names_df['Origin'] == origin]['Name'].head(3).tolist()
    print(f"   {origin}: {', '.join(samples)}")

# Same preview for every location type
print("\n🏛️ Sample Locations by Type:")
locations_df = pd.DataFrame(locations_data, columns=['Location', 'Type'])
for loc_type in ['City', 'Suburb', 'Landmark', 'Natural']:
    samples = locations_df[locations_df['Type'] == loc_type]['Location'].head(3).tolist()
    print(f"   {loc_type}: {', '.join(samples)}")

# Distribution visualization: side-by-side bar charts of samples per class
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Names distribution
sns.countplot(data=names_df, x='Origin', ax=ax1)
ax1.set_title('Australian Names by Ethnic Origin')
ax1.tick_params(axis='x', rotation=45)

# Locations distribution
sns.countplot(data=locations_df, x='Type', ax=ax2)
ax2.set_title('Australian Locations by Type')
ax2.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print("\n✅ Datasets prepared for character-level RNN training!")

## 🔤 Character-Level Text Processing

Character-level processing is fundamental to our RNN. We'll create utilities to:

1. **Build character vocabulary** from both English and Vietnamese text
2. **Convert text to tensors** and vice versa
3. **Handle Unicode characters** for multilingual support
4. **Normalize text** for consistent processing

In [None]:
class CharacterProcessor:
    """
    Character-level text processing for Australian multilingual names and locations.

    Text is folded to plain ASCII (Vietnamese diacritics are stripped) and each
    remaining character is mapped to an integer index.

    Index layout:
        * index 0 (``PAD_INDEX``) is reserved for padding and for characters
          that are not in the vocabulary - it never denotes a real letter, so
          it can safely be used as a padding value and as an embedding
          ``padding_idx``;
        * real letters occupy indices ``1 .. len(all_letters)``.

    Attributes:
        all_letters: String of the letters currently in the vocabulary.
        n_letters: Size of the index space, i.e. ``len(all_letters) + 1``
            (the extra slot is the reserved index 0). Use this as the
            embedding / one-hot width.
        letter_to_index: Mapping letter -> index (values start at 1).
        index_to_letter: Inverse mapping index -> letter.
    """

    # Index reserved for padding / unknown characters.
    PAD_INDEX = 0

    # Alphabet accepted before (and while) a vocabulary is learned from data.
    _BASE_LETTERS = string.ascii_letters + " .,;'-"

    # Letters that NFD normalisation cannot decompose into base + accent.
    _EXTRA_FOLDS = str.maketrans({'Đ': 'D', 'đ': 'd'})

    def __init__(self):
        self.all_letters = self._BASE_LETTERS
        self.n_letters = 0
        self.letter_to_index = {}
        self.index_to_letter = {}
        # Index the default alphabet so the processor is usable even before
        # build_vocabulary() is called.
        self._index_letters()

    def _index_letters(self):
        """Rebuild n_letters and both index mappings from ``all_letters``."""
        self.letter_to_index = {
            letter: i for i, letter in enumerate(self.all_letters, start=1)
        }
        self.index_to_letter = {i: letter for letter, i in self.letter_to_index.items()}
        self.n_letters = len(self.all_letters) + 1

    def _fold(self, text, allowed):
        """Strip diacritics from ``text`` and keep only characters in ``allowed``."""
        decomposed = unicodedata.normalize('NFD', text.translate(self._EXTRA_FOLDS))
        return ''.join(
            c for c in decomposed
            if unicodedata.category(c) != 'Mn'  # drop combining accent marks
            and c in allowed
        )

    def unicode_to_ascii(self, text):
        """
        Convert Unicode characters to ASCII for Vietnamese names.

        Characters that are not part of the current vocabulary
        (``all_letters``) are dropped, so the result may be shorter than the
        input.

        Examples:
        - 'Nguyễn' -> 'Nguyen'
        - 'Trần' -> 'Tran'
        - 'Đặng' -> 'Dang'
        """
        return self._fold(text, self.all_letters)

    def build_vocabulary(self, text_data):
        """
        Build character vocabulary from text data.

        The vocabulary is always derived from the base alphabet, not from a
        previously built vocabulary, so calling this method again with new
        data starts from scratch instead of shrinking the existing set.

        Args:
            text_data: List of (text, label) tuples
        """
        all_characters = set()

        # Collect all unique characters
        for text, _ in text_data:
            all_characters.update(self._fold(text, self._BASE_LETTERS))

        # Sort for consistency
        self.all_letters = ''.join(sorted(all_characters))
        self._index_letters()

        print(f"📝 Character vocabulary built:")
        print(f"   Unique characters: {len(self.all_letters)}")
        print(f"   Character set: {self.all_letters[:50]}{'...' if len(self.all_letters) > 50 else ''}")

    def text_to_tensor(self, text):
        """
        Convert text to PyTorch tensor.

        Rough TensorFlow analogue:
            tf.strings.unicode_decode(text, 'UTF-8')

        Args:
            text: Input text string

        Returns:
            torch.Tensor: 1-D long tensor of character indices. Its length is
            the length of the normalised text, which can be smaller than
            ``len(text)``.
        """
        normalized = self.unicode_to_ascii(text)
        indices = [self.letter_to_index.get(char, self.PAD_INDEX) for char in normalized]
        return torch.tensor(indices, dtype=torch.long)

    def tensor_to_text(self, tensor):
        """
        Convert tensor back to text string.

        Indices without a letter (including the reserved index 0) are skipped.

        Args:
            tensor: PyTorch tensor of character indices

        Returns:
            str: Reconstructed text
        """
        # .cpu() works for any device type (CUDA, MPS, ...), unlike .numpy().
        indices = tensor.detach().cpu().reshape(-1).tolist()
        return ''.join(self.index_to_letter.get(int(idx), '') for idx in indices)

    def char_to_onehot(self, char_index, device=None):
        """
        Convert character index to one-hot vector.

        Args:
            char_index: Index of character
            device: PyTorch device for tensor

        Returns:
            torch.Tensor: One-hot vector of length ``n_letters``; all zeros
            when ``char_index`` is out of range.
        """
        if device is None:
            device = torch.device('cpu')

        onehot = torch.zeros(self.n_letters, device=device)
        if 0 <= char_index < self.n_letters:
            onehot[char_index] = 1
        return onehot

    def text_to_onehot_sequence(self, text, device=None):
        """
        Convert text to sequence of one-hot vectors.

        Args:
            text: Input text
            device: PyTorch device

        Returns:
            torch.Tensor: Sequence tensor [seq_len, n_letters]
        """
        if device is None:
            device = torch.device('cpu')

        indices = self.text_to_tensor(text).to(device)
        positions = torch.arange(indices.numel(), device=device)

        # One row per character, with a single 1 at that character's index.
        onehot_sequence = torch.zeros(indices.numel(), self.n_letters, device=device)
        onehot_sequence[positions, indices] = 1

        return onehot_sequence

# Create character processor and build vocabulary
char_processor = CharacterProcessor()

# Combine both datasets for vocabulary building so a single processor can
# encode names as well as locations
all_data = names_data + locations_data
char_processor.build_vocabulary(all_data)

# Test character processing: round-trip a few strings through
# text -> index tensor -> text and print each stage
print("\n🧪 Testing Character Processing:")
test_names = ['Nguyen', 'Papadopoulos', 'Sydney', 'Uluru']
for name in test_names:
    tensor = char_processor.text_to_tensor(name)
    reconstructed = char_processor.tensor_to_text(tensor)
    print(f"   '{name}' -> {tensor.tolist()} -> '{reconstructed}'")

# Show character vocabulary details (only the first 10 mapping entries)
print(f"\n📚 Character Vocabulary Details:")
print(f"   Total characters: {char_processor.n_letters}")
print(f"   Character mapping sample: {dict(list(char_processor.letter_to_index.items())[:10])}")

print("\n✅ Character processing system ready!")

## 🗃️ PyTorch Dataset Implementation

We'll create a custom PyTorch Dataset class that handles our character-level data efficiently. This follows PyTorch best practices and enables easy integration with DataLoader for batching.

In [None]:
class AustralianNamesDataset(Dataset):
    """
    PyTorch Dataset for Australian names and locations with character-level processing.

    TensorFlow equivalent:
        tf.data.Dataset.from_tensor_slices((texts, labels))

    This dataset handles:
    - Character-level tokenization (delegated to the character processor)
    - Variable sequence lengths (items are returned unpadded)
    - Label encoding for classification

    Each instance fits its own ``LabelEncoder`` on the labels it is given, so
    two instances agree on label ids only when they were built from the same
    set of labels.
    """

    def __init__(self, text_data, char_processor, transform=None):
        """
        Initialize the dataset.

        Args:
            text_data: List of (text, label) tuples
            char_processor: CharacterProcessor instance
            transform: Optional callable applied to the character tensor of
                every item
        """
        self.data = text_data
        self.char_processor = char_processor
        self.transform = transform

        # Fit the encoder and encode every label once up front, instead of
        # calling LabelEncoder.transform for each __getitem__.
        labels = [label for _, label in text_data]
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(labels)
        self._encoded_labels = torch.as_tensor(
            self.label_encoder.transform(labels), dtype=torch.long
        )

        self.label_names = self.label_encoder.classes_
        self.num_classes = len(self.label_names)

        print(f"📊 Dataset initialized:")
        print(f"   Samples: {len(self.data)}")
        print(f"   Classes: {self.num_classes}")
        print(f"   Labels: {list(self.label_names)}")

    def __len__(self):
        """Return the number of (text, label) samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """
        Return one sample.

        Args:
            idx: Integer position of the sample

        Returns:
            tuple: (char_tensor, label) where ``char_tensor`` is the output of
            ``char_processor.text_to_tensor`` (after ``transform``, if any)
            and ``label`` is a 0-dim long tensor holding the encoded class id.
        """
        text, _ = self.data[idx]

        # Convert text to character tensor
        char_tensor = self.char_processor.text_to_tensor(text)

        if self.transform:
            char_tensor = self.transform(char_tensor)

        return char_tensor, self._encoded_labels[idx]

    def get_label_name(self, encoded_label):
        """Convert encoded label (int or 0-dim tensor) back to its name."""
        return self.label_encoder.inverse_transform([int(encoded_label)])[0]

def collate_sequences(batch):
    """
    Collate variable-length index sequences into one padded batch.

    Every sequence is right-padded with zeros up to the longest sequence in
    the batch so that the batch can be stacked into a single tensor.

    TensorFlow equivalent:
        tf.keras.preprocessing.sequence.pad_sequences()

    Args:
        batch: Iterable of (sequence_tensor, label_tensor) pairs, where each
            sequence is a 1-D tensor.

    Returns:
        tuple: (sequences [batch, max_len], labels [batch], lengths [batch]);
        ``lengths`` holds the unpadded length of each sequence as a long tensor.
    """
    char_seqs, targets = zip(*batch)

    # Remember the true lengths before padding hides them
    lengths_tensor = torch.tensor([len(seq) for seq in char_seqs], dtype=torch.long)

    # Right-pad with 0 up to the longest sequence in this batch
    sequences_tensor = torch.nn.utils.rnn.pad_sequence(
        list(char_seqs), batch_first=True, padding_value=0
    )
    labels_tensor = torch.stack(targets)

    return sequences_tensor, labels_tensor, lengths_tensor

# Create datasets for names and locations separately (each dataset fits its
# own label encoder on the labels it receives)
names_dataset = AustralianNamesDataset(names_data, char_processor)
locations_dataset = AustralianNamesDataset(locations_data, char_processor)

# Split names dataset for training/validation: 80/20, stratified by label so
# both splits contain every origin; random_state fixes the split
train_names, val_names = train_test_split(names_data, test_size=0.2, random_state=42, 
                                        stratify=[label for _, label in names_data])

train_names_dataset = AustralianNamesDataset(train_names, char_processor)
val_names_dataset = AustralianNamesDataset(val_names, char_processor)

# Create data loaders (smaller batches when running on CPU)
batch_size = 32 if DEVICE.type == 'cpu' else 64

# Training loader reshuffles every epoch; collate_sequences pads each batch
train_loader = DataLoader(
    train_names_dataset, 
    batch_size=batch_size, 
    shuffle=True,
    collate_fn=collate_sequences,
    pin_memory=torch.cuda.is_available()
)

# Validation loader keeps a fixed order
val_loader = DataLoader(
    val_names_dataset, 
    batch_size=batch_size, 
    shuffle=False,
    collate_fn=collate_sequences,
    pin_memory=torch.cuda.is_available()
)

print(f"\n📦 Data Loaders Created:")
print(f"   Training batches: {len(train_loader)}")
print(f"   Validation batches: {len(val_loader)}")
print(f"   Batch size: {batch_size}")

# Test data loading: pull one batch and print the shapes produced by
# collate_sequences (padded sequences, labels, true lengths)
sample_batch = next(iter(train_loader))
sequences, labels, lengths = sample_batch
print(f"\n🧪 Sample batch shapes:")
print(f"   Sequences: {sequences.shape}")
print(f"   Labels: {labels.shape}")
print(f"   Lengths: {lengths.shape}")
print(f"   Sample sequence length range: {lengths.min().item()}-{lengths.max().item()}")

## 🧠 Character-Level RNN Architecture

We'll implement a character-level RNN that processes names one character at a time. The model architecture includes:

1. **Character Embedding** - Convert character indices to dense vectors
2. **RNN Layer** - Process character sequences (LSTM/GRU variants)
3. **Classification Head** - Map final hidden state to class predictions
4. **Attention Mechanism** - Optional attention over character sequence

In [None]:
class CharacterLevelRNN(nn.Module):
    """
    Character-level RNN for Australian name and location classification.

    Pipeline: character indices -> embedding -> (bi)RNN/LSTM/GRU -> final
    hidden state of the last layer -> two-layer classifier -> class logits.

    TensorFlow equivalent:
        model = tf.keras.Sequential([
            tf.keras.layers.Embedding(vocab_size, embed_dim),
            tf.keras.layers.LSTM(hidden_dim, return_sequences=False),
            tf.keras.layers.Dense(num_classes, activation='softmax')
        ])

    Key differences from TensorFlow:
    - Explicit forward pass definition
    - Device management with .to(device)
    - The model returns raw logits; softmax is applied only in predict()

    Index 0 of the vocabulary is treated as padding (``padding_idx=0``): its
    embedding is the zero vector and receives no gradient.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes,
                 rnn_type='LSTM', num_layers=1, dropout=0.2, bidirectional=False):
        """
        Initialize the character-level RNN model.

        Args:
            vocab_size: Size of character vocabulary
            embed_dim: Embedding dimension
            hidden_dim: Hidden state dimension
            num_classes: Number of classification classes
            rnn_type: Type of RNN ('RNN', 'LSTM', 'GRU'); any value other
                than 'LSTM' or 'GRU' selects the vanilla RNN
            num_layers: Number of RNN layers
            dropout: Dropout rate (applied to embeddings and inside the
                classifier; between RNN layers only when num_layers > 1)
            bidirectional: Whether to use bidirectional RNN
        """
        super().__init__()

        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes
        self.rnn_type = rnn_type
        self.num_layers = num_layers
        self.bidirectional = bidirectional

        # Embedding layer - maps character indices to dense vectors
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # Options shared by all recurrent variants. PyTorch only applies RNN
        # dropout between layers, so it is disabled for a single layer.
        rnn_kwargs = dict(
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,
            bidirectional=bidirectional,
        )

        # RNN layer - choose between RNN, LSTM, and GRU
        if rnn_type == 'LSTM':
            self.rnn = nn.LSTM(embed_dim, hidden_dim, num_layers, **rnn_kwargs)
        elif rnn_type == 'GRU':
            self.rnn = nn.GRU(embed_dim, hidden_dim, num_layers, **rnn_kwargs)
        else:  # Vanilla RNN
            self.rnn = nn.RNN(embed_dim, hidden_dim, num_layers,
                              nonlinearity='relu', **rnn_kwargs)

        # Forward and backward states are concatenated when bidirectional
        final_hidden_dim = hidden_dim * 2 if bidirectional else hidden_dim

        # Dropout layer (applied to the embeddings)
        self.dropout = nn.Dropout(dropout)

        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(final_hidden_dim, final_hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(final_hidden_dim // 2, num_classes)
        )

        # Initialize weights
        self._init_weights()

    def _init_weights(self):
        """Initialize model weights using Xavier/Glorot initialization."""
        for name, param in self.named_parameters():
            if 'weight' in name:
                if param.dim() > 1:
                    nn.init.xavier_uniform_(param)
            elif 'bias' in name:
                nn.init.constant_(param, 0)

        # The loop above also re-initialises the embedding matrix, which
        # overwrites the zero row nn.Embedding sets up for padding_idx.
        # Restore it so padding really embeds to the zero vector.
        if self.embedding.padding_idx is not None:
            with torch.no_grad():
                self.embedding.weight[self.embedding.padding_idx].zero_()

    def forward(self, sequences, lengths=None):
        """
        Forward pass through the character-level RNN.

        Args:
            sequences: Input character sequences [batch_size, seq_len]
            lengths: Actual sequence lengths [batch_size]. When given, padded
                positions are skipped via sequence packing; every length must
                be at least 1 and at most seq_len. When omitted, padded
                positions are processed like ordinary time steps.

        Returns:
            torch.Tensor: Class logits [batch_size, num_classes]

        Raises:
            ValueError: If ``sequences`` is not a 2-D tensor.
        """
        if sequences.dim() != 2:
            raise ValueError(
                f"sequences must be 2-D [batch_size, seq_len], got shape {tuple(sequences.shape)}"
            )

        # Embedding lookup + dropout: [batch_size, seq_len, embed_dim]
        embedded = self.dropout(self.embedding(sequences))

        if lengths is not None:
            # enforce_sorted=False lets PyTorch sort by length internally and
            # return the hidden state in the original batch order.
            # pack_padded_sequence requires the lengths on the CPU.
            rnn_input = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
            )
        else:
            rnn_input = embedded

        _, hidden = self.rnn(rnn_input)

        # LSTM returns (h_n, c_n); RNN and GRU return h_n directly.
        # h_n: [num_layers * num_directions, batch, hidden_dim]
        final_hidden = hidden[0] if isinstance(hidden, tuple) else hidden

        # Take the last layer's hidden state
        if self.bidirectional:
            # The last two entries are the last layer's forward and backward states
            final_hidden = torch.cat((final_hidden[-2], final_hidden[-1]), dim=1)
        else:
            final_hidden = final_hidden[-1]  # [batch, hidden_dim]

        # Classification
        return self.classifier(final_hidden)  # [batch_size, num_classes]

    def predict(self, text, char_processor, device=None):
        """
        Predict class for a single text input.

        Switches the model to eval mode and leaves it there.

        Args:
            text: Input text string
            char_processor: Object with a ``text_to_tensor(text)`` method
                returning a 1-D long tensor (e.g. a CharacterProcessor)
            device: PyTorch device; defaults to the device of the model's
                parameters

        Returns:
            tuple: (predicted_class_idx, confidence_scores) where
            ``confidence_scores`` is a NumPy array of softmax probabilities
            with one entry per class.

        Raises:
            ValueError: If no characters of ``text`` survive encoding.
        """
        if device is None:
            device = next(self.parameters()).device

        self.eval()
        with torch.no_grad():
            # Convert text to tensor
            char_tensor = char_processor.text_to_tensor(text).unsqueeze(0).to(device)

            # Use the encoded length, not len(text): the processor may drop
            # characters (accents, symbols outside the vocabulary).
            encoded_length = char_tensor.size(1)
            if encoded_length == 0:
                raise ValueError(
                    f"Cannot classify {text!r}: it contains no characters from the vocabulary"
                )
            length_tensor = torch.tensor([encoded_length], dtype=torch.long)

            # Forward pass
            logits = self(char_tensor, length_tensor)
            probabilities = F.softmax(logits, dim=1)

            predicted_class = torch.argmax(probabilities, dim=1).item()
            confidence = probabilities[0].cpu().numpy()

        return predicted_class, confidence

# Model configuration: keyword arguments passed straight to CharacterLevelRNN.
# vocab_size and num_classes are taken from the processor / training dataset
# so they always match the data.
model_config = {
    'vocab_size': char_processor.n_letters,
    'embed_dim': 64,
    'hidden_dim': 128,
    'num_classes': train_names_dataset.num_classes,
    'rnn_type': 'LSTM',  # Can be 'RNN', 'LSTM', or 'GRU'
    'num_layers': 2,
    'dropout': 0.3,
    'bidirectional': True
}

# Create model and move to device
model = CharacterLevelRNN(**model_config).to(DEVICE)

print(f"🧠 Character-Level RNN Model Created")
print("=" * 45)
print(f"   Architecture: {model_config['rnn_type']}")
print(f"   Vocabulary size: {model_config['vocab_size']}")
print(f"   Embedding dimension: {model_config['embed_dim']}")
print(f"   Hidden dimension: {model_config['hidden_dim']}")
print(f"   Number of classes: {model_config['num_classes']}")
print(f"   Layers: {model_config['num_layers']}")
print(f"   Bidirectional: {model_config['bidirectional']}")
print(f"   Device: {DEVICE}")

# Count parameters (total vs. those with requires_grad=True)
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"\n📊 Model Parameters:")
print(f"   Total parameters: {total_params:,}")
print(f"   Trainable parameters: {trainable_params:,}")

# Test model with sample input: one training batch, moved to the model's device
sample_sequences, sample_labels, sample_lengths = next(iter(train_loader))
sample_sequences = sample_sequences.to(DEVICE)
sample_lengths = sample_lengths.to(DEVICE)

# no_grad: this is only a shape check, no gradients are needed.
# The model is not switched to eval mode here, so dropout is still active.
with torch.no_grad():
    sample_output = model(sample_sequences, sample_lengths)
    print(f"\n🧪 Sample model output:")
    print(f"   Input shape: {sample_sequences.shape}")
    print(f"   Output shape: {sample_output.shape}")
    print(f"   Output range: [{sample_output.min().item():.3f}, {sample_output.max().item():.3f}]")

print("\n✅ Character-level RNN model ready for training!")