In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import itertools
import numpy as np
from typing import List, Tuple

In [2]:
class ParenthesisDataset:
    def __init__(self, num_samples: int = 10000):
        self.brackets = ['()', '[]', '{}']
        self.samples, self.labels = self.generate_dataset(num_samples)
    
    def generate_balanced_string(self, max_length: int = 20) -> str:
        """Generate a balanced parenthesis string."""
        possible_brackets = self.brackets
        length = random.randint(2, max_length)
        stack = []
        result = []
        
        while len(result) < length:
            if not stack or random.random() < 0.5:
                # Push an opening bracket
                bracket = random.choice(possible_brackets)[0]
                result.append(bracket)
                stack.append(bracket)
            else:
                # Pop a closing bracket
                opening = stack.pop()
                closing = {'(': ')', '[': ']', '{': '}'}[opening]
                result.append(closing)
        
        # Ensure all brackets are closed
        while stack:
            opening = stack.pop()
            closing = {'(': ')', '[': ']', '{': '}'}[opening]
            result.append(closing)
        
        return ''.join(result)
    
    def generate_unbalanced_string(self, max_length: int = 20) -> str:
        """Generate an unbalanced parenthesis string."""
        if random.random() < 0.5:
            # Mismatched brackets
            base_balanced = self.generate_balanced_string(max_length)
            mis_index = random.randint(0, len(base_balanced) - 1)
            base_list = list(base_balanced)
            base_list[mis_index] = random.choice(['(', ')', '[', ']', '{', '}'])
            return ''.join(base_list)
        else:
            # Unbalanced brackets
            base = list(self.generate_balanced_string(max_length))
            # Remove some closing or opening brackets
            remove_count = random.randint(1, len(base) // 2)
            for _ in range(remove_count):
                remove_index = random.randint(0, len(base) - 1)
                base.pop(remove_index)
            return ''.join(base)
    
    def generate_dataset(self, num_samples: int) -> Tuple[List[str], List[int]]:
        """Generate balanced and unbalanced dataset."""
        samples = []
        labels = []
        
        for _ in range(num_samples // 2):
            samples.append(self.generate_balanced_string())
            labels.append(1)  # Balanced
            
            samples.append(self.generate_unbalanced_string())
            labels.append(0)  # Unbalanced
        
        return samples, labels

In [3]:
class ParenthesisTokenizer:
    def __init__(self):
        self.special_tokens = {
            'PAD': 0,
            'START': 1,
            'END': 2
        }
        self.token_to_index = {
            '(': 3, ')': 4,
            '[': 5, ']': 6,
            '{': 7, '}': 8
        }
        self.index_to_token = {v: k for k, v in self.token_to_index.items()}
        self.vocab_size = len(self.special_tokens) + len(self.token_to_index)
    
    def encode(self, text: str, max_length: int = 50) -> List[int]:
        """Tokenize a string with START and END tokens and padding."""
        tokens = [self.special_tokens['START']]
        tokens.extend([self.token_to_index[char] for char in text])
        tokens.append(self.special_tokens['END'])
        
        # Pad or truncate
        if len(tokens) > max_length:
            tokens = tokens[:max_length]
        else:
            tokens += [self.special_tokens['PAD']] * (max_length - len(tokens))
        
        return tokens
    
    def create_mask(self, tokens: List[int]) -> torch.Tensor:
        """Create an attention mask that ignores padding tokens."""
        return torch.tensor([1 if token != self.special_tokens['PAD'] else 0 for token in tokens])

In [4]:
class TransformerParenthesisClassifier(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int = 64, 
                 num_heads: int = 4, num_layers: int = 2):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim, 
            nhead=num_heads,
            dropout=0.1
        )
        
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, 
            num_layers=num_layers
        )
        
        self.classifier = nn.Sequential(
            nn.Linear(embedding_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )
    
    def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        x = self.embedding(x)
        x = x.permute(1, 0, 2)  # (seq_len, batch, embedding_dim)
        
        # Apply transformer encoder with attention mask
        x = self.transformer_encoder(x, src_key_padding_mask=mask == 0)
        
        # Global average pooling
        x = x.mean(dim=0)
        
        return self.classifier(x)

In [5]:
def train_model(model, dataset, tokenizer, batch_size=32, epochs=10):
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.BCELoss()
    
    for epoch in range(epochs):
        total_loss = 0
        
        # Shuffle dataset
        indices = list(range(len(dataset.samples)))
        random.shuffle(indices)
        
        for i in range(0, len(indices), batch_size):
            batch_indices = indices[i:i+batch_size]
            
            # Prepare batch
            batch_samples = [dataset.samples[j] for j in batch_indices]
            batch_labels = [dataset.labels[j] for j in batch_indices]
            
            # Tokenize and create tensors
            batch_tokens = [tokenizer.encode(sample) for sample in batch_samples]
            batch_masks = [tokenizer.create_mask(tokens) for tokens in batch_tokens]
            
            batch_tokens = torch.tensor(batch_tokens)
            batch_masks = torch.tensor(batch_masks)
            batch_labels = torch.tensor(batch_labels, dtype=torch.float32)
            
            # Zero gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(batch_tokens, batch_masks).squeeze()
            loss = criterion(outputs, batch_labels)
            
            # Backward pass and optimize
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(indices)}")
    
    return model

def evaluate_model(model, dataset, tokenizer):
    model.eval()
    correct = 0
    total = len(dataset.samples)
    
    with torch.no_grad():
        for sample, label in zip(dataset.samples, dataset.labels):
            tokens = torch.tensor(tokenizer.encode(sample)).unsqueeze(0)
            mask = tokenizer.create_mask(tokenizer.encode(sample)).unsqueeze(0)
            
            output = model(tokens, mask).squeeze().item()
            prediction = 1 if output > 0.5 else 0
            
            if prediction == label:
                correct += 1
    
    accuracy = correct / total
    print(f"Model Accuracy: {accuracy * 100:.2f}%")
    return model

In [6]:
torch.manual_seed(42)
random.seed(42)

# Create Dataset
dataset = ParenthesisDataset(num_samples=10000)

# Create Tokenizer
tokenizer = ParenthesisTokenizer()

# Create Model
model = TransformerParenthesisClassifier(
    vocab_size=tokenizer.vocab_size,
    embedding_dim=64,
    num_heads=4,
    num_layers=2
)

# Train Model
trained_model = train_model(model, dataset, tokenizer)

# Evaluate Model
evaluated_model = evaluate_model(trained_model, dataset, tokenizer)



TypeError: only integer tensors of a single element can be converted to an index