In [2]:
# ===== URDU CONVERSATIONAL CHATBOT WITH SPAN CORRUPTION =====
# Complete Implementation for Question-Answering Task

import math
import pickle
import random
import re
from collections import Counter
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# ==================== DATA PREPROCESSING ====================

class UrduTextPreprocessor:
    """Normalize Urdu text and derive span-corruption training pairs."""

    @staticmethod
    def normalize_urdu(text: str) -> str:
        """Return *text* with diacritics stripped and letter variants unified.

        Args:
            text: Raw text. Any non-string value is treated as missing.

        Returns:
            The normalized string with whitespace runs collapsed to single
            spaces, or ``""`` when *text* is not a string.
        """
        if not isinstance(text, str):
            return ""

        # Remove diacritics (zabar, zer, pesh, etc.)
        text = re.sub(r'[\u064B-\u065F]', '', text)

        # Standardize Alef forms
        text = text.replace('ٱ', 'ا').replace('أ', 'ا').replace('إ', 'ا').replace('آ', 'ا')

        # Standardize Yeh forms
        text = text.replace('ى', 'ی').replace('ئ', 'ی')

        # Standardize Heh forms
        text = text.replace('ۃ', 'ہ').replace('ھ', 'ہ')

        # split()/join collapses whitespace runs and trims both ends
        return ' '.join(text.split())

    @staticmethod
    def span_corruption(sentence: str, mask_ratio: float = 0.15,
                       max_span_length: int = 3) -> Tuple[Optional[str], Optional[str]]:
        """Replace random word spans with a single ``[MASK]`` token each.

        Args:
            sentence: Original sentence; words are whitespace-separated.
            mask_ratio: Fraction of words to mask. At least one word is
                always masked.
            max_span_length: Maximum number of consecutive words per span.

        Returns:
            ``(corrupted_text, original_text)``, or ``(None, None)`` when the
            sentence has fewer than three words.

        Raises:
            ValueError: If *max_span_length* is smaller than 1.
        """
        if max_span_length < 1:
            raise ValueError("max_span_length must be at least 1")

        words = sentence.split()
        if len(words) < 3:
            return None, None

        # Number of words we are allowed to mask in total
        budget = max(1, int(len(words) * mask_ratio))

        corrupted = list(words)
        masked_positions = set()

        # Bounded retries: overlapping draws are rejected, so give up after 50
        attempts = 0
        while len(masked_positions) < budget and attempts < 50:
            attempts += 1

            # Never draw a span longer than what is left of the budget;
            # otherwise a short sentence could be masked far beyond mask_ratio
            remaining = budget - len(masked_positions)
            span_len = random.randint(
                1, min(max_span_length, remaining, len(words)))
            start_idx = random.randint(0, len(words) - span_len)

            new_positions = range(start_idx, start_idx + span_len)
            if masked_positions.isdisjoint(new_positions):
                for i in new_positions:
                    corrupted[i] = '[MASK]'
                masked_positions.update(new_positions)

        # Collapse runs of adjacent [MASK] tokens into a single one
        cleaned = []
        for token in corrupted:
            if token == '[MASK]' and cleaned and cleaned[-1] == '[MASK]':
                continue
            cleaned.append(token)

        return ' '.join(cleaned), sentence


class Vocabulary:
    """Word-level vocabulary with frequency-based pruning.

    Indices 0-4 are reserved for ``<PAD>``, ``<SOS>``, ``<EOS>``, ``<UNK>``
    and ``[MASK]``; corpus words are numbered from 5 upwards by
    ``build_vocab``.
    """

    def __init__(self, min_freq: int = 2):
        self.word2idx = {
            '<PAD>': 0,
            '<SOS>': 1,
            '<EOS>': 2,
            '<UNK>': 3,
            '[MASK]': 4
        }
        self.idx2word = {v: k for k, v in self.word2idx.items()}
        self.word_count = Counter()
        self.min_freq = min_freq
        # Next free index; derived from the table instead of a magic number
        self.n_words = len(self.word2idx)

    def add_sentence(self, sentence: str):
        """Count every whitespace-separated word of *sentence*."""
        self.word_count.update(sentence.split())

    def build_vocab(self):
        """Assign indices to counted words seen at least ``min_freq`` times.

        Words that already have an index (including the special tokens) are
        skipped, so calling this more than once does not renumber anything.
        """
        for word, count in self.word_count.items():
            if count >= self.min_freq and word not in self.word2idx:
                self.word2idx[word] = self.n_words
                self.idx2word[self.n_words] = word
                self.n_words += 1

    def encode(self, sentence: str, max_len: Optional[int] = None) -> List[int]:
        """Convert *sentence* to ``[<SOS>, ..., <EOS>]`` indices.

        Args:
            sentence: Whitespace-separated text; unknown words map to
                ``<UNK>``.
            max_len: Maximum number of word tokens kept (the two boundary
                tokens are added on top). ``None`` disables truncation.

        Returns:
            The index list, at most ``max_len + 2`` long when *max_len* is
            given.
        """
        unk = self.word2idx['<UNK>']
        tokens = [self.word2idx.get(word, unk) for word in sentence.split()]

        # Compare against None so that max_len=0 truncates instead of being
        # mistaken for "no limit"
        if max_len is not None:
            tokens = tokens[:max_len]

        return [self.word2idx['<SOS>']] + tokens + [self.word2idx['<EOS>']]

    def decode(self, indices: List[int]) -> str:
        """Convert indices back to text, stopping at the first ``<EOS>``.

        ``<PAD>`` and ``<SOS>`` are dropped; indices without a known word are
        rendered as ``<UNK>``.
        """
        eos = self.word2idx['<EOS>']
        skipped = (self.word2idx['<PAD>'], self.word2idx['<SOS>'])

        words = []
        for idx in indices:
            if idx == eos:
                break
            if idx not in skipped:
                words.append(self.idx2word.get(idx, '<UNK>'))
        return ' '.join(words)


class UrduQADataset(Dataset):
    """Fixed-width tensor view over (source, target) sentence pairs.

    Every item is a pair of index tensors of length ``max_len + 2``: the
    encoded sentence (with its boundary tokens) followed by zeros.
    """

    def __init__(self, data_pairs: List[Tuple[str, str]], vocab: Vocabulary,
                 max_len: int = 50):
        self.data = data_pairs
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        source_text, target_text = self.data[idx]
        width = self.max_len + 2  # room for the two boundary tokens

        def to_fixed_width(text):
            # Encode, right-pad with index 0, then cut to the fixed width
            ids = self.vocab.encode(text, self.max_len)
            ids = ids + [0] * (width - len(ids))
            return torch.tensor(ids[:width])

        source_tensor = to_fixed_width(source_text)
        target_tensor = to_fixed_width(target_text)
        return source_tensor, target_tensor


# ==================== MODEL ARCHITECTURE ====================

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information, then apply dropout.

    Args:
        d_model: Embedding width. Odd values are supported.
        max_len: Number of positions precomputed in the table.
        dropout: Dropout probability applied to the summed output.
    """

    def __init__(self, d_model: int, max_len: int = 5000, dropout: float = 0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                            (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine columns,
        # so drop the surplus frequency instead of failing on a shape mismatch
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        # Buffer: follows .to(device) and is saved, but is not a parameter
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``dropout(x + pe[:, :x.size(1)])``.

        Raises:
            ValueError: If dimension 1 of *x* is longer than the table.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"Sequence length {seq_len} exceeds positional encoding "
                f"max_len {self.pe.size(1)}")
        x = x + self.pe[:, :seq_len]
        return self.dropout(x)


class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention computed over several heads in parallel."""

    def __init__(self, d_model: int, num_heads: int, dropout: float = 0.1):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # width of a single head

        # Input projections for queries, keys and values, plus the output mix
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Return ``(weighted_values, attention_weights)`` for per-head inputs.

        Positions where ``mask == 0`` get a score of ``-1e9`` before the
        softmax.
        """
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        weights = self.dropout(torch.softmax(scores, dim=-1))
        return torch.matmul(weights, V), weights

    def split_heads(self, x):
        """Reshape (batch, seq, d_model) into (batch, heads, seq, d_k)."""
        batch, length, _ = x.size()
        per_head = x.view(batch, length, self.num_heads, self.d_k)
        return per_head.transpose(1, 2)

    def combine_heads(self, x):
        """Reshape (batch, heads, seq, d_k) back into (batch, seq, d_model)."""
        batch, _, length, _ = x.size()
        merged = x.transpose(1, 2).contiguous()
        return merged.view(batch, length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head, then merge and project out."""
        queries = self.split_heads(self.W_q(Q))
        keys = self.split_heads(self.W_k(K))
        values = self.split_heads(self.W_v(V))

        context, weights = self.scaled_dot_product_attention(
            queries, keys, values, mask)

        return self.W_o(self.combine_heads(context)), weights


class FeedForward(nn.Module):
    """Two-layer position-wise MLP: expand to d_ff, ReLU, dropout, project back."""

    def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)   # expansion
        self.linear2 = nn.Linear(d_ff, d_model)   # projection back to d_model
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        hidden = self.relu(self.linear1(x))
        hidden = self.dropout(hidden)
        return self.linear2(hidden)


class EncoderLayer(nn.Module):
    """Encoder block: self-attention then feed-forward, each post-normalized."""

    def __init__(self, d_model: int, num_heads: int, d_ff: int, dropout: float = 0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Sub-layer 1: self-attention, residual add, layer norm
        attended, _ = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attended))

        # Sub-layer 2: feed-forward, residual add, layer norm
        transformed = self.feed_forward(x)
        return self.norm2(x + self.dropout(transformed))


class DecoderLayer(nn.Module):
    """Decoder block: masked self-attention, cross-attention, feed-forward."""

    def __init__(self, d_model: int, num_heads: int, d_ff: int, dropout: float = 0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.cross_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        # Sub-layer 1: self-attention over the target, restricted by tgt_mask
        self_context, _ = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_context))

        # Sub-layer 2: queries from the decoder, keys/values from the encoder
        cross_context, _ = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(cross_context))

        # Sub-layer 3: position-wise feed-forward
        transformed = self.feed_forward(x)
        return self.norm3(x + self.dropout(transformed))


class Transformer(nn.Module):
    """Encoder-decoder Transformer for Urdu sequence-to-sequence modelling.

    Token id 0 is treated as padding when attention masks are built.
    """

    def __init__(self, vocab_size: int, d_model: int = 256, num_heads: int = 2,
                 num_encoder_layers: int = 2, num_decoder_layers: int = 2,
                 d_ff: int = 1024, dropout: float = 0.1, max_len: int = 52):
        super().__init__()

        self.d_model = d_model
        self.vocab_size = vocab_size

        # Separate embedding tables for the encoder and decoder inputs
        self.encoder_embedding = nn.Embedding(vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(vocab_size, d_model)

        # One positional table shared by both stacks
        self.pos_encoding = PositionalEncoding(d_model, max_len, dropout)

        self.encoder_layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_encoder_layers)
        ])

        self.decoder_layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_decoder_layers)
        ])

        # Projection from decoder states to vocabulary logits
        self.output_linear = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)

        self._init_weights()

    def _init_weights(self):
        """Xavier-initialize every parameter with more than one dimension."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def create_padding_mask(self, seq):
        """Return a boolean mask that is False where *seq* equals 0.

        For a 2-D ``(batch, seq_len)`` input the result has shape
        ``(batch, 1, 1, seq_len)``.
        """
        return (seq != 0).unsqueeze(1).unsqueeze(2)

    def create_look_ahead_mask(self, size):
        """Return a ``(size, size)`` boolean lower-triangular mask (CPU).

        Entry ``[i, j]`` is True when ``j <= i``, i.e. position ``i`` may
        attend to itself and to earlier positions only.
        """
        return torch.triu(torch.ones(size, size), diagonal=1) == 0

    def _target_mask(self, tgt):
        """Combine the padding and look-ahead masks for decoder self-attention."""
        causal = self.create_look_ahead_mask(tgt.size(1)).to(tgt.device)
        return self.create_padding_mask(tgt) & causal

    def encode(self, src, src_mask):
        """Embed *src*, add positions and run the encoder stack."""
        x = self.encoder_embedding(src) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)

        for layer in self.encoder_layers:
            x = layer(x, src_mask)

        return x

    def decode(self, tgt, enc_output, src_mask, tgt_mask):
        """Embed *tgt*, add positions and run the decoder stack."""
        x = self.decoder_embedding(tgt) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)

        for layer in self.decoder_layers:
            x = layer(x, enc_output, src_mask, tgt_mask)

        return x

    def forward(self, src, tgt):
        """Return vocabulary logits for every position of *tgt*."""
        src_mask = self.create_padding_mask(src)
        tgt_mask = self._target_mask(tgt)

        enc_output = self.encode(src, src_mask)
        dec_output = self.decode(tgt, enc_output, src_mask, tgt_mask)

        return self.output_linear(dec_output)

    def generate(self, src, vocab, max_len=50, strategy='greedy', beam_size=3):
        """Generate a token-id sequence for a single source sentence.

        The model is switched to eval mode and left there.

        Args:
            src: Source indices of shape ``(1, src_len)``.
            vocab: Object whose ``word2idx`` maps ``'<SOS>'`` and ``'<EOS>'``
                to their indices.
            max_len: Maximum number of generated tokens; capped at the size
                of the positional-encoding table.
            strategy: ``'greedy'`` for greedy decoding; any other value
                selects beam search.
            beam_size: Number of hypotheses kept by beam search.

        Returns:
            List of token ids starting with ``<SOS>``.

        Raises:
            ValueError: If *src* is not a single 2-D sequence, or if beam
                search is requested with ``beam_size < 1``.
        """
        if src.dim() != 2 or src.size(0) != 1:
            raise ValueError(
                "generate() expects src of shape (1, src_len); "
                f"got {tuple(src.shape)}")

        self.eval()
        device = src.device

        # The decoder input reaches max_len tokens on the last step and the
        # positional table cannot encode more positions than it holds
        max_len = min(max_len, self.pos_encoding.pe.size(1))

        with torch.no_grad():
            src_mask = self.create_padding_mask(src)
            enc_output = self.encode(src, src_mask)

            tgt = torch.tensor([[vocab.word2idx['<SOS>']]], device=device)

            if strategy == 'greedy':
                return self._greedy_decode(tgt, enc_output, src_mask, vocab, max_len)
            return self._beam_search(tgt, enc_output, src_mask, vocab, max_len, beam_size)

    def _next_token_logits(self, tgt, enc_output, src_mask):
        """Return ``(batch, vocab_size)`` logits for the token following *tgt*."""
        dec_output = self.decode(tgt, enc_output, src_mask, self._target_mask(tgt))
        # Only the last position predicts the next token
        return self.output_linear(dec_output[:, -1, :])

    def _greedy_decode(self, tgt, enc_output, src_mask, vocab, max_len):
        """Append the highest-scoring token until ``<EOS>`` or *max_len* steps."""
        eos_idx = vocab.word2idx['<EOS>']

        for _ in range(max_len):
            logits = self._next_token_logits(tgt, enc_output, src_mask)
            next_token = logits.argmax(dim=-1, keepdim=True)  # (1, 1)
            tgt = torch.cat([tgt, next_token], dim=1)

            if next_token.item() == eos_idx:
                break

        return tgt.squeeze(0).tolist()

    def _beam_search(self, tgt, enc_output, src_mask, vocab, max_len, beam_size):
        """Beam-search decoding for a single source sentence.

        Keeps up to *beam_size* unfinished hypotheses ranked by summed
        log-probability. Hypotheses that emit ``<EOS>`` are set aside, and the
        returned sequence is the one with the best length-normalized score.
        """
        if beam_size < 1:
            raise ValueError("beam_size must be at least 1")

        eos_idx = vocab.word2idx['<EOS>']
        beams = [(0.0, tgt)]  # (summed log-prob, (1, length) token tensor)
        finished = []

        for _ in range(max_len):
            candidates = []
            for score, seq in beams:
                logits = self._next_token_logits(seq, enc_output, src_mask)
                log_probs = torch.log_softmax(logits, dim=-1).squeeze(0)
                k = min(beam_size, log_probs.size(-1))
                top_scores, top_ids = log_probs.topk(k)
                for step_score, token_id in zip(top_scores.tolist(), top_ids.tolist()):
                    extended = torch.cat([seq, seq.new_tensor([[token_id]])], dim=1)
                    candidates.append((score + step_score, extended))

            candidates.sort(key=lambda item: item[0], reverse=True)

            beams = []
            for score, seq in candidates[:beam_size]:
                if seq[0, -1].item() == eos_idx:
                    finished.append((score, seq))
                else:
                    beams.append((score, seq))

            if not beams:
                break

        # Hypotheses cut off by max_len still compete with the finished ones
        finished.extend(beams)

        # Normalize by length so short outputs are not favoured automatically
        _, best_seq = max(finished, key=lambda item: item[0] / item[1].size(1))
        return best_seq.squeeze(0).tolist()


# ==================== TRAINING ====================

def train_epoch(model, dataloader, optimizer, criterion, device, clip=1.0):
    """Train *model* for one pass over *dataloader* with teacher forcing.

    Args:
        model: Module called as ``model(src, tgt_input)`` that returns logits
            whose last dimension is the vocabulary.
        dataloader: Sized iterable yielding ``(src, tgt)`` index tensors.
        optimizer: Optimizer over the model parameters.
        criterion: Loss taking flattened logits and flattened target ids.
        device: Device the batches are moved to.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Mean of the per-batch losses, or ``nan`` when the dataloader yields
        no batches.
    """
    model.train()

    num_batches = len(dataloader)
    if num_batches == 0:
        # No data means no defined average; avoid dividing by zero
        return float('nan')

    total_loss = 0.0

    for batch_idx, (src, tgt) in enumerate(dataloader):
        src, tgt = src.to(device), tgt.to(device)

        # Teacher forcing: the decoder sees tokens [0, T-1) and must predict
        # tokens [1, T)
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]

        optimizer.zero_grad()

        output = model(src, tgt_input)

        # Flatten to (batch * seq, vocab) and (batch * seq,) for the loss
        loss = criterion(output.reshape(-1, output.shape[-1]),
                         tgt_output.reshape(-1))

        loss.backward()

        # Clip the global gradient norm before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        total_loss += loss.item()

        if batch_idx % 50 == 0:
            print(f'  Batch {batch_idx}/{num_batches}, Loss: {loss.item():.4f}')

    return total_loss / num_batches


def evaluate(model, dataloader, criterion, device):
    """Compute the mean teacher-forced loss of *model* over *dataloader*.

    The model is put in eval mode and gradients are disabled.

    Args:
        model: Module called as ``model(src, tgt_input)``.
        dataloader: Sized iterable yielding ``(src, tgt)`` index tensors.
        criterion: Loss taking flattened logits and flattened target ids.
        device: Device the batches are moved to.

    Returns:
        Mean of the per-batch losses, or ``nan`` when the dataloader yields
        no batches (e.g. an empty validation split).
    """
    model.eval()

    num_batches = len(dataloader)
    if num_batches == 0:
        # An empty split has no defined loss; avoid dividing by zero
        return float('nan')

    total_loss = 0.0

    with torch.no_grad():
        for src, tgt in dataloader:
            src, tgt = src.to(device), tgt.to(device)

            # Same input/target shift as in training
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]

            output = model(src, tgt_input)
            loss = criterion(output.reshape(-1, output.shape[-1]),
                             tgt_output.reshape(-1))
            total_loss += loss.item()

    return total_loss / num_batches


# ==================== MAIN PIPELINE ====================

def prepare_data(tsv_path: str, test_mode=False):
    """Load sentences, build span-corruption pairs and split them.

    Args:
        tsv_path: Tab-separated file with a ``sentence`` column.
        test_mode: When true, only the first 1000 sentences are used.

    Returns:
        ``(train_data, val_data, test_data, vocab)`` where each split is a
        list of ``(corrupted, original)`` pairs and *vocab* is built from the
        training split only.

    Raises:
        KeyError: If the file has no ``sentence`` column.
    """
    print("Loading dataset...")
    df = pd.read_csv(tsv_path, sep='\t')

    if 'sentence' not in df.columns:
        raise KeyError(
            f"Expected a 'sentence' column in {tsv_path}; "
            f"found columns {list(df.columns)}")

    sentences = df['sentence'].dropna().tolist()

    if test_mode:
        sentences = sentences[:1000]  # Use subset for testing

    print(f"Total sentences: {len(sentences)}")

    # Normalize first, then filter: normalize_urdu maps non-string cells to
    # "" and a sentence may become empty once diacritics are removed
    preprocessor = UrduTextPreprocessor()
    normalized = (preprocessor.normalize_urdu(s) for s in sentences)
    sentences = [s for s in normalized if s]

    print("Applying span corruption...")
    data_pairs = []
    for sent in sentences:
        corrupted, original = preprocessor.span_corruption(sent, mask_ratio=0.15)
        if corrupted and original:
            data_pairs.append((corrupted, original))

    print(f"Created {len(data_pairs)} training pairs")

    # 80/10/10 split in file order.
    # NOTE(review): no shuffling happens here; confirm the file is not sorted
    # in a way that biases the splits.
    split1 = int(0.8 * len(data_pairs))
    split2 = int(0.9 * len(data_pairs))

    train_data = data_pairs[:split1]
    val_data = data_pairs[split1:split2]
    test_data = data_pairs[split2:]

    # Build the vocabulary from the training split only, so validation and
    # test words do not leak into it
    print("Building vocabulary...")
    vocab = Vocabulary(min_freq=2)
    for src, tgt in train_data:
        vocab.add_sentence(src)
        vocab.add_sentence(tgt)
    vocab.build_vocab()

    print(f"Vocabulary size: {vocab.n_words}")
    print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")

    return train_data, val_data, test_data, vocab


def main(data_path: str = 'data/final_main_dataset.tsv', test_mode: bool = False,
         checkpoint_path: str = 'best_urdu_chatbot.pt',
         vocab_path: str = 'vocab.pkl'):
    """Train the Urdu span-corruption Transformer end to end.

    Args:
        data_path: TSV file passed to ``prepare_data``.
        test_mode: Forwarded to ``prepare_data`` to train on a small subset.
        checkpoint_path: Where the best checkpoint (lowest validation loss)
            is written.
        vocab_path: Where the pickled vocabulary is written.

    Raises:
        ValueError: If the data yields an empty training or validation split.
    """
    # Hyperparameters
    BATCH_SIZE = 32
    EPOCHS = 20
    D_MODEL = 256
    NUM_HEADS = 2
    NUM_ENCODER_LAYERS = 2
    NUM_DECODER_LAYERS = 2
    D_FF = 1024
    DROPOUT = 0.1
    LEARNING_RATE = 3e-4
    MAX_LEN = 50

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    train_data, val_data, test_data, vocab = prepare_data(data_path, test_mode=test_mode)

    if not train_data or not val_data:
        raise ValueError(
            "Dataset is too small to form non-empty train and validation splits")

    # Persist the vocabulary before training so that any checkpoint written
    # below can be used even if the run is interrupted
    with open(vocab_path, 'wb') as f:
        pickle.dump(vocab, f)

    train_dataset = UrduQADataset(train_data, vocab, MAX_LEN)
    val_dataset = UrduQADataset(val_data, vocab, MAX_LEN)
    test_dataset = UrduQADataset(test_data, vocab, MAX_LEN)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

    model = Transformer(
        vocab_size=vocab.n_words,
        d_model=D_MODEL,
        num_heads=NUM_HEADS,
        num_encoder_layers=NUM_ENCODER_LAYERS,
        num_decoder_layers=NUM_DECODER_LAYERS,
        d_ff=D_FF,
        dropout=DROPOUT,
        max_len=MAX_LEN + 2  # sequences carry <SOS> and <EOS> on top of MAX_LEN
    ).to(device)

    print(f"\nModel parameters: {sum(p.numel() for p in model.parameters()):,}")

    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    # Padding positions do not contribute to the loss
    criterion = nn.CrossEntropyLoss(ignore_index=vocab.word2idx['<PAD>'])

    best_val_loss = float('inf')
    best_state = None  # in-memory copy of the best weights for the final test

    print("\n" + "="*50)
    print("Starting Training...")
    print("="*50 + "\n")

    for epoch in range(EPOCHS):
        print(f"Epoch {epoch + 1}/{EPOCHS}")

        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        val_loss = evaluate(model, val_loader, criterion, device)

        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

        # Keep the checkpoint with the lowest validation loss
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_state = {name: tensor.detach().clone()
                          for name, tensor in model.state_dict().items()}
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': val_loss,
            }, checkpoint_path)
            print("✓ Model saved!")

        print()

    # Report held-out performance of the best weights; the test split was
    # previously built but never evaluated
    if len(test_loader) > 0:
        if best_state is not None:
            model.load_state_dict(best_state)
        test_loss = evaluate(model, test_loader, criterion, device)
        print(f"Test Loss: {test_loss:.4f}")

    print("\n" + "="*50)
    print("Training Complete!")
    print("="*50)

# Run the training pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()

Using device: cpu
Loading dataset...


FileNotFoundError: [Errno 2] No such file or directory: 'data/final_main_dataset.tsv'