## 1Ô∏è‚É£ System-Level Optimization

Set optimal environment variables **before** importing PyTorch.

In [1]:
import os

# Optimize PyTorch/CUDA performance
os.environ['OMP_NUM_THREADS'] = '4'
os.environ['MKL_NUM_THREADS'] = '4'
os.environ['CUDA_LAUNCH_BLOCKING'] = '0'
os.environ['TORCH_CUDNN_V8_API_ENABLED'] = '1'
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

print("‚úÖ Environment variables optimized")

‚úÖ Environment variables optimized


## 2Ô∏è‚É£ Imports & Setup

In [2]:
import re
import logging
import warnings
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, Sampler
from torch.cuda.amp import autocast
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm.auto import tqdm
import json
import random

warnings.filterwarnings('ignore')

# Check GPU
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

PyTorch version: 2.8.0+cu126
CUDA available: False


## 3Ô∏è‚É£ Configuration

**üìù EDIT THESE PATHS:**

In [3]:
@dataclass
class UltraConfig:
    """Ultra-optimized configuration"""
    
    # ============ PATHS - EDIT THESE ============
    test_data_path: str = "/kaggle/input/deep-past-initiative-machine-translation/test.csv"
    model_path: str = "/kaggle/input/final-byt5/byt5-akkadian-optimized-34x"
    output_dir: str = "/kaggle/working/"
    
    # ============ PROCESSING ============
    max_length: int = 512
    batch_size: int = 8  # Will auto-tune if use_auto_batch_size=True
    num_workers: int = 4  # Increased for better throughput
    
    # ============ GENERATION ============
    num_beams: int = 8
    max_new_tokens: int = 512
    length_penalty: float = 1.5
    repetition_penalty: float = 1.2
    early_stopping: bool = True
    no_repeat_ngram_size: int = 0  # Set to 3 if you see repetition
    
    # ============ OPTIMIZATIONS ============
    use_mixed_precision: bool = True      # FP16 for 2x speedup
    use_better_transformer: bool = True   # 20-50% speedup
    use_bucket_batching: bool = True      # 20-40% less padding
    use_vectorized_postproc: bool = True  # 3-5x faster postproc
    use_adaptive_beams: bool = True       # Smart beam allocation
    use_auto_batch_size: bool = False     # Auto-find optimal batch size
    
    # ============ OTHER ============
    aggressive_postprocessing: bool = True
    checkpoint_freq: int = 100
    num_buckets: int = 4  # For bucket batching
    
    def __post_init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        Path(self.output_dir).mkdir(exist_ok=True, parents=True)
        
        if not torch.cuda.is_available():
            self.use_mixed_precision = False
            self.use_better_transformer = False

# Create config
config = UltraConfig()

print("\nüìã Configuration:")
print(f"  Device: {config.device}")
print(f"  Batch size: {config.batch_size}")
print(f"  Beams: {config.num_beams}")
print(f"\nüöÄ Optimizations:")
print(f"  Mixed Precision: {config.use_mixed_precision}")
print(f"  BetterTransformer: {config.use_better_transformer}")
print(f"  Bucket Batching: {config.use_bucket_batching}")
print(f"  Vectorized Postproc: {config.use_vectorized_postproc}")
print(f"  Adaptive Beams: {config.use_adaptive_beams}")


üìã Configuration:
  Device: cpu
  Batch size: 8
  Beams: 8

üöÄ Optimizations:
  Mixed Precision: False
  BetterTransformer: False
  Bucket Batching: True
  Vectorized Postproc: True
  Adaptive Beams: True


## 4Ô∏è‚É£ Logging Setup

In [4]:
def setup_logging(output_dir: str = './outputs'):
    """Setup logging"""
    Path(output_dir).mkdir(exist_ok=True, parents=True)
    log_file = Path(output_dir) / 'inference_ultra.log'
    
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(),
            logging.FileHandler(log_file)
        ]
    )
    return logging.getLogger(__name__)

logger = setup_logging(config.output_dir)
logger.info("Logging initialized")

2026-02-02 12:45:52,061 - INFO - Logging initialized


## 5Ô∏è‚É£ Optimized Text Preprocessor

Uses pre-compiled regex patterns for speed.

In [5]:
class OptimizedPreprocessor:
    """Preprocessor with pre-compiled patterns"""
    
    def __init__(self):
        # Pre-compile regex patterns (20-30% faster)
        self.patterns = {
            'big_gap': re.compile(r'(\.{3,}|‚Ä¶+|‚Ä¶‚Ä¶)'),
            'small_gap': re.compile(r'(xx+|\s+x\s+)'),
        }
    
    def preprocess_input_text(self, text: str) -> str:
        """Single text preprocessing"""
        if pd.isna(text):
            return ""
        
        text = str(text)
        text = self.patterns['big_gap'].sub('<big_gap>', text)
        text = self.patterns['small_gap'].sub('<gap>', text)
        
        return text
    
    def preprocess_batch(self, texts: List[str]) -> List[str]:
        """Vectorized batch preprocessing (faster)"""
        s = pd.Series(texts).fillna("")
        s = s.astype(str)
        s = s.str.replace(self.patterns['big_gap'], '<big_gap>', regex=True)
        s = s.str.replace(self.patterns['small_gap'], '<gap>', regex=True)
        return s.tolist()

# Test
preprocessor = OptimizedPreprocessor()
test = "lugal ... xxx mu.2.kam"
print(f"Test input:  {test}")
print(f"Preprocessed: {preprocessor.preprocess_input_text(test)}")

Test input:  lugal ... xxx mu.2.kam
Preprocessed: lugal <big_gap> <gap> mu.2.kam


## 6Ô∏è‚É£ Vectorized Postprocessor

Uses pandas for batch operations ‚Üí **3-5x faster** than loop-based postprocessing.

In [6]:
class VectorizedPostprocessor:
    """Ultra-fast vectorized postprocessing"""
    
    def __init__(self, aggressive: bool = True):
        self.aggressive = aggressive
        
        # Pre-compile ALL patterns
        self.patterns = {
            'gap': re.compile(r'(\[x\]|\(x\)|\bx\b)', re.I),
            'big_gap': re.compile(r'(\.{3,}|‚Ä¶|\[\.+\])'),
            'annotations': re.compile(r'\((fem|plur|pl|sing|singular|plural|\?|!)\..\s*\w*\)', re.I),
            'repeated_words': re.compile(r'\b(\w+)(?:\s+\1\b)+'),
            'whitespace': re.compile(r'\s+'),
            'punct_space': re.compile(r'\s+([.,:])'),
            'repeated_punct': re.compile(r'([.,])\1+'),
        }
        
        # Character translation tables
        self.subscript_trans = str.maketrans("‚ÇÄ‚ÇÅ‚ÇÇ‚ÇÉ‚ÇÑ‚ÇÖ‚ÇÜ‚Çá‚Çà‚Çâ", "0123456789")
        self.special_chars_trans = str.maketrans('·∏´·∏™', 'hH')
        self.forbidden_chars = '!?()"‚Äî‚Äî<>‚åà‚åã‚åä[]+ æ/;'
        self.forbidden_trans = str.maketrans('', '', self.forbidden_chars)
    
    def postprocess_batch(self, translations: List[str]) -> List[str]:
        """Vectorized batch postprocessing - 3-5x faster than loop"""
        
        # Convert to Series for vectorized operations
        s = pd.Series(translations)
        
        # Filter invalid entries
        valid_mask = s.apply(lambda x: isinstance(x, str) and x.strip())
        if not valid_mask.all():
            s[~valid_mask] = ""
        
        # Basic cleaning (always applied)
        s = s.str.translate(self.special_chars_trans)
        s = s.str.translate(self.subscript_trans)
        s = s.str.replace(self.patterns['whitespace'], ' ', regex=True)
        s = s.str.strip()
        
        if self.aggressive:
            # Normalize gaps
            s = s.str.replace(self.patterns['gap'], '<gap>', regex=True)
            s = s.str.replace(self.patterns['big_gap'], '<big_gap>', regex=True)
            
            # Merge adjacent gaps
            s = s.str.replace('<gap> <gap>', '<big_gap>', regex=False)
            s = s.str.replace('<big_gap> <big_gap>', '<big_gap>', regex=False)
            
            # Remove annotations
            s = s.str.replace(self.patterns['annotations'], '', regex=True)
            
            # Protect gaps during char removal
            s = s.str.replace('<gap>', '\x00GAP\x00', regex=False)
            s = s.str.replace('<big_gap>', '\x00BIG\x00', regex=False)
            
            # Remove forbidden characters
            s = s.str.translate(self.forbidden_trans)
            
            # Restore gaps
            s = s.str.replace('\x00GAP\x00', ' <gap> ', regex=False)
            s = s.str.replace('\x00BIG\x00', ' <big_gap> ', regex=False)
            
            # Fractions (vectorized)
            s = s.str.replace(r'(\d+)\.5\b', r'\1¬Ω', regex=True)
            s = s.str.replace(r'\b0\.5\b', '¬Ω', regex=True)
            s = s.str.replace(r'(\d+)\.25\b', r'\1¬º', regex=True)
            s = s.str.replace(r'\b0\.25\b', '¬º', regex=True)
            s = s.str.replace(r'(\d+)\.75\b', r'\1¬æ', regex=True)
            s = s.str.replace(r'\b0\.75\b', '¬æ', regex=True)
            
            # Remove repeated words
            s = s.str.replace(self.patterns['repeated_words'], r'\1', regex=True)
            
            # Remove repeated n-grams
            for n in range(4, 1, -1):
                pattern = r'\b((?:\w+\s+){' + str(n-1) + r'}\w+)(?:\s+\1\b)+'
                s = s.str.replace(pattern, r'\1', regex=True)
            
            # Fix punctuation
            s = s.str.replace(self.patterns['punct_space'], r'\1', regex=True)
            s = s.str.replace(self.patterns['repeated_punct'], r'\1', regex=True)
            
            # Final cleanup
            s = s.str.replace(self.patterns['whitespace'], ' ', regex=True)
            s = s.str.strip().str.strip('-').str.strip()
        
        return s.tolist()

# Test
postprocessor = VectorizedPostprocessor(aggressive=config.aggressive_postprocessing)
test_outputs = [
    "The king (plur.) took the city... [x] [x]",
    "He spoke spoke to the assembly"
]
cleaned = postprocessor.postprocess_batch(test_outputs)
print("Test postprocessing:")
for orig, clean in zip(test_outputs, cleaned):
    print(f"  {orig}")
    print(f"  ‚Üí {clean}")

Test postprocessing:
  The king (plur.) took the city... [x] [x]
  ‚Üí The king plur. took the city <big_gap>
  He spoke spoke to the assembly
  ‚Üí He spoke to the assembly


## 7Ô∏è‚É£ Bucket Batch Sampler

Groups samples by length to minimize padding ‚Üí **20-40% faster**.

In [7]:
class BucketBatchSampler(Sampler):
    """Batch samples by similar length to minimize padding"""
    
    def __init__(self, dataset, batch_size: int, num_buckets: int = 4, shuffle: bool = False):
        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle
        
        # Calculate lengths
        lengths = [len(text.split()) for _, text in dataset]
        
        # Sort indices by length
        sorted_indices = sorted(range(len(lengths)), key=lambda i: lengths[i])
        
        # Create buckets
        bucket_size = len(sorted_indices) // num_buckets
        self.buckets = []
        for i in range(num_buckets):
            start = i * bucket_size
            end = None if i == num_buckets - 1 else (i + 1) * bucket_size
            self.buckets.append(sorted_indices[start:end])
        
        # Log bucket info
        logger.info(f"Created {num_buckets} buckets:")
        for i, bucket in enumerate(self.buckets):
            bucket_lengths = [lengths[idx] for idx in bucket]
            logger.info(f"  Bucket {i}: {len(bucket)} samples, "
                       f"length range [{min(bucket_lengths)}, {max(bucket_lengths)}]")
    
    def __iter__(self):
        for bucket in self.buckets:
            if self.shuffle:
                random.shuffle(bucket)
            
            for i in range(0, len(bucket), self.batch_size):
                yield bucket[i:i+self.batch_size]
    
    def __len__(self):
        return sum((len(b) + self.batch_size - 1) // self.batch_size for b in self.buckets)

## 8Ô∏è‚É£ Dataset Class

In [8]:
class AkkadianDataset(Dataset):
    """Optimized dataset with batch preprocessing"""
    
    def __init__(self, dataframe: pd.DataFrame, preprocessor: OptimizedPreprocessor):
        self.sample_ids = dataframe['id'].tolist()
        
        # Batch preprocess (faster than loop)
        raw_texts = dataframe['transliteration'].tolist()
        preprocessed = preprocessor.preprocess_batch(raw_texts)
        
        # Add task prefix
        self.input_texts = [
            "translate Akkadian to English: " + text
            for text in preprocessed
        ]
        
        logger.info(f"Dataset created with {len(self.sample_ids)} samples")
    
    def __len__(self):
        return len(self.sample_ids)
    
    def __getitem__(self, index: int):
        return self.sample_ids[index], self.input_texts[index]

## 9Ô∏è‚É£ Ultra-Optimized Inference Engine

Main inference engine with all optimizations.

In [9]:
class UltraInferenceEngine:
    """Ultra-optimized inference engine"""
    
    def __init__(self, config: UltraConfig):
        self.config = config
        self.preprocessor = OptimizedPreprocessor()
        self.postprocessor = VectorizedPostprocessor(aggressive=config.aggressive_postprocessing)
        self.results = []
        
        # Load model
        self._load_model()
    
    def _load_model(self):
        """Load and optimize model"""
        logger.info(f"Loading model from {self.config.model_path}")
        
        try:
            self.model = AutoModelForSeq2SeqLM.from_pretrained(
                self.config.model_path
            ).to(self.config.device).eval()
            
            self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_path)
            
            num_params = sum(p.numel() for p in self.model.parameters())
            logger.info(f"Model loaded: {num_params:,} parameters")
            
            # Apply BetterTransformer
            if self.config.use_better_transformer and torch.cuda.is_available():
                try:
                    from optimum.bettertransformer import BetterTransformer
                    logger.info("Applying BetterTransformer...")
                    self.model = BetterTransformer.transform(self.model)
                    logger.info("‚úÖ BetterTransformer applied (20-50% speedup)")
                except ImportError:
                    logger.warning("‚ö†Ô∏è  'optimum' not installed, skipping BetterTransformer")
                    logger.warning("   Install with: !pip install optimum")
                except Exception as e:
                    logger.warning(f"‚ö†Ô∏è  BetterTransformer failed: {e}")
            
        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            raise
    
    def _collate_fn(self, batch_samples):
        """Collate function"""
        batch_ids = [s[0] for s in batch_samples]
        batch_texts = [s[1] for s in batch_samples]
        
        tokenized = self.tokenizer(
            batch_texts,
            max_length=self.config.max_length,
            padding=True,
            truncation=True,
            return_tensors="pt"
        )
        
        return batch_ids, tokenized
    
    def find_optimal_batch_size(self, dataset, start_bs: int = 32):
        """Binary search for optimal batch size"""
        logger.info("üîç Finding optimal batch size...")
        
        max_bs = start_bs
        min_bs = 1
        
        while max_bs - min_bs > 1:
            test_bs = (max_bs + min_bs) // 2
            
            try:
                test_batch = [dataset[i] for i in range(min(test_bs, len(dataset)))]
                ids, inputs = self._collate_fn(test_batch)
                
                with torch.inference_mode():
                    if self.config.use_mixed_precision:
                        with autocast():
                            outputs = self.model.generate(
                                input_ids=inputs.input_ids.to(self.config.device),
                                attention_mask=inputs.attention_mask.to(self.config.device),
                                num_beams=self.config.num_beams,
                                max_new_tokens=64,
                                use_cache=True
                            )
                    else:
                        outputs = self.model.generate(
                            input_ids=inputs.input_ids.to(self.config.device),
                            attention_mask=inputs.attention_mask.to(self.config.device),
                            num_beams=self.config.num_beams,
                            max_new_tokens=64,
                            use_cache=True
                        )
                
                min_bs = test_bs
                logger.info(f"  ‚úÖ Batch size {test_bs} works")
                
                del outputs, inputs
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                
            except RuntimeError as e:
                if "out of memory" in str(e):
                    max_bs = test_bs
                    logger.info(f"  ‚ùå Batch size {test_bs} OOM")
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
                else:
                    raise
        
        optimal = min_bs
        logger.info(f"üéØ Optimal batch size: {optimal}")
        return optimal
    
    def _get_adaptive_beam_size(self, input_ids, attention_mask):
        """Adaptive beam size based on complexity"""
        if not self.config.use_adaptive_beams:
            return self.config.num_beams
        
        lengths = attention_mask.sum(dim=1)
        
        # Short ‚Üí fewer beams, Long ‚Üí more beams
        beam_sizes = torch.where(
            lengths < 100,
            torch.tensor(max(4, self.config.num_beams // 2)),
            torch.tensor(self.config.num_beams)
        )
        
        return beam_sizes[0].item()
    
    def _save_checkpoint(self):
        """Save checkpoint"""
        if len(self.results) > 0 and len(self.results) % self.config.checkpoint_freq == 0:
            path = Path(self.config.output_dir) / f"checkpoint_{len(self.results)}.csv"
            df = pd.DataFrame(self.results, columns=['id', 'translation'])
            df.to_csv(path, index=False)
            logger.info(f"üíæ Checkpoint: {len(self.results)} translations")
    
    def run_inference(self, test_df: pd.DataFrame) -> pd.DataFrame:
        """Run ultra-optimized inference"""
        logger.info("üöÄ Starting ULTRA-OPTIMIZED inference")
        
        # Create dataset
        dataset = AkkadianDataset(test_df, self.preprocessor)
        
        # Auto-find batch size
        if self.config.use_auto_batch_size:
            optimal_bs = self.find_optimal_batch_size(dataset)
            self.config.batch_size = optimal_bs
        
        # Create dataloader
        if self.config.use_bucket_batching:
            batch_sampler = BucketBatchSampler(
                dataset, 
                self.config.batch_size,
                num_buckets=self.config.num_buckets
            )
            dataloader = DataLoader(
                dataset,
                batch_sampler=batch_sampler,
                num_workers=self.config.num_workers,
                collate_fn=self._collate_fn,
                pin_memory=True,
                prefetch_factor=2,
                persistent_workers=True if self.config.num_workers > 0 else False
            )
        else:
            dataloader = DataLoader(
                dataset,
                batch_size=self.config.batch_size,
                shuffle=False,
                num_workers=self.config.num_workers,
                collate_fn=self._collate_fn,
                pin_memory=True,
                prefetch_factor=2,
                persistent_workers=True if self.config.num_workers > 0 else False
            )
        
        logger.info(f"DataLoader created: {len(dataloader)} batches")
        logger.info(f"Active optimizations:")
        logger.info(f"  ‚úÖ Mixed Precision: {self.config.use_mixed_precision}")
        logger.info(f"  ‚úÖ BetterTransformer: {self.config.use_better_transformer}")
        logger.info(f"  ‚úÖ Bucket Batching: {self.config.use_bucket_batching}")
        logger.info(f"  ‚úÖ Vectorized Postproc: {self.config.use_vectorized_postproc}")
        logger.info(f"  ‚úÖ Adaptive Beams: {self.config.use_adaptive_beams}")
        
        # Generation config
        # Generation config
        base_gen_config = {
            "max_new_tokens": self.config.max_new_tokens,
            "length_penalty": self.config.length_penalty,
            "repetition_penalty": self.config.repetition_penalty,  # ADD THIS LINE
            "early_stopping": self.config.early_stopping,
            "use_cache": True,
        }
        if self.config.no_repeat_ngram_size > 0:
            base_gen_config["no_repeat_ngram_size"] = self.config.no_repeat_ngram_size
        
        # Run inference
        self.results = []
        
        with torch.inference_mode():
            for batch_idx, (batch_ids, tokenized) in enumerate(tqdm(dataloader, desc="üöÄ Translating")):
                try:
                    input_ids = tokenized.input_ids.to(self.config.device)
                    attention_mask = tokenized.attention_mask.to(self.config.device)
                    
                    # Adaptive beam size
                    beam_size = self._get_adaptive_beam_size(input_ids, attention_mask)
                    gen_config = {**base_gen_config, "num_beams": beam_size}
                    
                    # Generate
                    if self.config.use_mixed_precision:
                        with autocast():
                            outputs = self.model.generate(
                                input_ids=input_ids,
                                attention_mask=attention_mask,
                                **gen_config
                            )
                    else:
                        outputs = self.model.generate(
                            input_ids=input_ids,
                            attention_mask=attention_mask,
                            **gen_config
                        )
                    
                    # Decode
                    translations = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
                    
                    # Postprocess (vectorized)
                    if self.config.use_vectorized_postproc:
                        cleaned = self.postprocessor.postprocess_batch(translations)
                    else:
                        # Fallback to single processing
                        cleaned = [self.postprocessor.postprocess_batch([t])[0] for t in translations]
                    
                    # Store
                    self.results.extend(zip(batch_ids, cleaned))
                    
                    # Checkpoint
                    self._save_checkpoint()
                    
                    # Memory cleanup
                    if torch.cuda.is_available() and batch_idx % 10 == 0:
                        torch.cuda.empty_cache()
                    
                except Exception as e:
                    logger.error(f"‚ùå Batch {batch_idx} error: {e}")
                    self.results.extend([(bid, "") for bid in batch_ids])
                    continue
        
        logger.info("‚úÖ Inference completed")
        
        # Create results
        results_df = pd.DataFrame(self.results, columns=['id', 'translation'])
        self._validate_results(results_df)
        
        return results_df
    
    def _validate_results(self, df: pd.DataFrame):
        """Validation report"""
        print("\n" + "="*60)
        print("üìä VALIDATION REPORT")
        print("="*60)
        
        empty = df['translation'].str.strip().eq('').sum()
        print(f"\nEmpty: {empty} ({empty/len(df)*100:.2f}%)")
        
        lengths = df['translation'].str.len()
        print(f"\nüìè Length stats:")
        print(f"   Mean: {lengths.mean():.1f}, Median: {lengths.median():.1f}")
        print(f"   Min: {lengths.min()}, Max: {lengths.max()}")
        
        short = ((lengths < 5) & (lengths > 0)).sum()
        if short > 0:
            print(f"   ‚ö†Ô∏è  {short} very short translations")
        
        print(f"\nüìù Sample translations:")
        for idx in [0, len(df)//2, -1]:
            s = df.iloc[idx]
            preview = s['translation'][:70] + "..." if len(s['translation']) > 70 else s['translation']
            print(f"   ID {s['id']:4d}: {preview}")
        
        print("\n" + "="*60 + "\n")

print("‚úÖ Inference engine defined")

‚úÖ Inference engine defined


## üîü Load Test Data

In [10]:
logger.info(f"Loading test data from {config.test_data_path}")

test_df = pd.read_csv(config.test_data_path, encoding='utf-8')
logger.info(f"‚úÖ Loaded {len(test_df)} test samples")

print("\nFirst 5 samples:")
print(test_df.head())

2026-02-02 12:45:52,280 - INFO - Loading test data from /kaggle/input/deep-past-initiative-machine-translation/test.csv
2026-02-02 12:45:52,296 - INFO - ‚úÖ Loaded 4 test samples



First 5 samples:
   id   text_id  line_start  line_end  \
0   0  332fda50           1         7   
1   1  332fda50           7        14   
2   2  332fda50          14        24   
3   3  332fda50          25        30   

                                     transliteration  
0  um-ma k√†-ru-um k√†-ni-ia-ma a-na aa-q√≠-il‚Ä¶ da-t...  
1  i-na mup-p√¨-im aa a-lim(ki) ia-t√π u‚Äû-m√¨-im a-n...  
2  ki-ma mup-p√¨-ni ta-√°a-me-a-ni a-ma-kam lu a-na...  
3  me-+e-er mup-p√¨-ni a-na k√†-ar k√†-ar-ma √∫ wa-ba...  


## 1Ô∏è‚É£1Ô∏è‚É£ Run Ultra-Optimized Inference

**This is the main cell - all optimizations are active!**

In [11]:
# Create engine
engine = UltraInferenceEngine(config)

# Run inference
results_df = engine.run_inference(test_df)

2026-02-02 12:45:52,335 - INFO - Loading model from /kaggle/input/final-byt5/byt5-akkadian-optimized-34x
2026-02-02 12:45:55.614879: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770036355.859199      17 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770036355.932100      17 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770036356.519880      17 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770036356.519934      17 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same targe

üöÄ Translating:   0%|          | 0/4 [00:00<?, ?it/s]

2026-02-02 12:48:00,384 - INFO - ‚úÖ Inference completed



üìä VALIDATION REPORT

Empty: 0 (0.00%)

üìè Length stats:
   Mean: 154.0, Median: 148.0
   Min: 71, Max: 249

üìù Sample translations:
   ID    0: Thus says the Kanesh colony: Speak to our messengers, every single day...
   ID    1: In the tablet of the City, you wrote to me in the tablet of the City. ...
   ID    2: As soon as you hear our letter, there he has given either for anything...




## 1Ô∏è‚É£2Ô∏è‚É£ Save Results

In [12]:
# Save submission
output_path = Path(config.output_dir) / 'submission.csv'
results_df.to_csv(output_path, index=False)
logger.info(f"\n‚úÖ Submission saved to {output_path}")

# Save config
config_dict = {
    "batch_size": config.batch_size,
    "num_beams": config.num_beams,
    "length_penalty": config.length_penalty,
    "no_repeat_ngram_size": config.no_repeat_ngram_size,
    "optimizations": {
        "mixed_precision": config.use_mixed_precision,
        "better_transformer": config.use_better_transformer,
        "bucket_batching": config.use_bucket_batching,
        "vectorized_postproc": config.use_vectorized_postproc,
        "adaptive_beams": config.use_adaptive_beams,
    }
}

config_path = Path(config.output_dir) / 'ultra_config.json'
with open(config_path, 'w') as f:
    json.dump(config_dict, f, indent=2)

print("\n" + "="*60)
print("üéâ ULTRA-OPTIMIZED INFERENCE COMPLETE!")
print("="*60)
print(f"Submission file: {output_path}")
print(f"Config file: {config_path}")
print(f"Log file: {Path(config.output_dir) / 'inference_ultra.log'}")
print(f"Total translations: {len(results_df)}")
print("="*60)

2026-02-02 12:48:00,445 - INFO - 
‚úÖ Submission saved to /kaggle/working/submission.csv



üéâ ULTRA-OPTIMIZED INFERENCE COMPLETE!
Submission file: /kaggle/working/submission.csv
Config file: /kaggle/working/ultra_config.json
Log file: /kaggle/working/inference_ultra.log
Total translations: 4


## 1Ô∏è‚É£3Ô∏è‚É£ [Optional] Inspect Results

In [13]:
# Load submission
submission = pd.read_csv(output_path)

print(f"Submission shape: {submission.shape}")
print(f"\nFirst 10 translations:")
print(submission.head(10))

print(f"\nLast 10 translations:")
print(submission.tail(10))

# Statistics
lengths = submission['translation'].str.len()
print(f"\nLength distribution:")
print(lengths.describe())

# Check for issues
empty = submission['translation'].str.strip().eq('').sum()
print(f"\nEmpty translations: {empty}")

if empty > 0:
    print("\nEmpty translation IDs:")
    print(submission[submission['translation'].str.strip().eq('')]['id'].tolist())

Submission shape: (4, 2)

First 10 translations:
   id                                        translation
0   0  Thus says the Kanesh colony: Speak to our mess...
1   3  I sent our tablet to every single place and th...
2   1  In the tablet of the City, you wrote to me in ...
3   2  As soon as you hear our letter, there he has g...

Last 10 translations:
   id                                        translation
0   0  Thus says the Kanesh colony: Speak to our mess...
1   3  I sent our tablet to every single place and th...
2   1  In the tablet of the City, you wrote to me in ...
3   2  As soon as you hear our letter, there he has g...

Length distribution:
count      4.000000
mean     154.000000
std       82.093443
min       71.000000
25%       94.250000
50%      148.000000
75%      207.750000
max      249.000000
Name: translation, dtype: float64

Empty translations: 0
