In [None]:
# pip list | grep -E "numpy|scipy|transformers"
# pip install "numpy<2.0"
# pip install sacremoses

In [8]:
from typing import Dict, List, Optional

import torch
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

class OrderedBackTranslator:
    """Round-trip (Hindi -> English -> Hindi) translator that keeps sentence order.

    Two seq2seq checkpoints are loaded once in ``__init__``
    ("Helsinki-NLP/opus-mt-hi-en" and "Helsinki-NLP/opus-mt-en-hi") and reused
    for every batch. ``process_file`` reads a text file line by line, pushes the
    sentences through both models in fixed-size batches, and writes one
    back-translated sentence per line in the same order they were read.
    """

    # Class-level defaults for the token budget and beam width. They are used
    # both when tokenising and when generating, and can be overridden per
    # instance through __init__.
    max_length = 128
    num_beams = 5

    def __init__(self, batch_size=32, max_length=128, num_beams=5):
        """Load both translation models onto the best available device.

        Args:
            batch_size: Number of sentences translated per forward pass.
            max_length: Maximum token length for tokeniser truncation and for
                generated sequences.
            num_beams: Beam width used by ``generate``.

        Raises:
            ValueError: If ``batch_size`` is not a positive integer.
        """
        # Validate before the (slow) model loading so a bad value fails fast;
        # a zero step would otherwise only blow up later inside range().
        if not isinstance(batch_size, int) or batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        print("Loading Hindi->English model...")
        self.hi2en_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-hi-en").to(self.device)
        self.hi2en_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-hi-en")

        print("Loading English->Hindi model...")
        self.en2hi_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-hi").to(self.device)
        self.en2hi_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")

        self.batch_size = batch_size
        self.max_length = max_length
        self.num_beams = num_beams

    def translate_batch(self, texts: List[str], model, tokenizer) -> List[str]:
        """Translate ``texts`` with the given model/tokenizer pair.

        Args:
            texts: Sentences to translate.
            model: Object exposing ``generate(**inputs, ...)``.
            tokenizer: Callable that encodes ``texts`` and exposes ``batch_decode``.

        Returns:
            One decoded string per generated sequence, in generation order.
            An empty input returns an empty list without calling the model.
        """
        if not texts:
            return []

        inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True,
                           max_length=self.max_length).to(self.device)

        with torch.no_grad():
            # Plain beam search. The previous `temperature=0.8` argument was
            # dropped: sampling is not enabled (no do_sample=True), so it had
            # no effect on the output and only produced a warning.
            outputs = model.generate(
                **inputs,
                max_length=self.max_length,
                num_beams=self.num_beams,
                num_return_sequences=1,
            )

        return tokenizer.batch_decode(outputs, skip_special_tokens=True)

    @staticmethod
    def _read_sentences(input_file: str, max_lines: Optional[int] = None) -> List[str]:
        """Return the stripped, non-empty lines of ``input_file`` in file order."""
        sentences = []
        with open(input_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:  # Blank lines are dropped, not translated
                    sentences.append(line)
                # Truthiness check on purpose: None and 0 both mean "no limit".
                if max_lines and len(sentences) >= max_lines:
                    break
        return sentences

    def process_file(self, input_file: str, output_file: str, max_lines: Optional[int] = None):
        """Back-translate ``input_file`` and write the result to ``output_file``.

        Args:
            input_file: UTF-8 text file with one sentence per line.
            output_file: Destination path; overwritten if it exists.
            max_lines: Stop after this many non-empty lines. ``None`` (or 0)
                processes the whole file.

        Raises:
            RuntimeError: If the number of translations does not match the
                number of sentences read.

        Note:
            Blank input lines are skipped, so output line numbers match the
            input only when the input contains no blank lines.
        """
        print(f"Reading {input_file}...")
        ordered_sentences = self._read_sentences(input_file, max_lines)

        total_lines = len(ordered_sentences)
        print(f"Processing {total_lines} lines...")

        # Batches are consumed sequentially and appended in order, so
        # translations[k] always corresponds to ordered_sentences[k].
        translations = []
        for i in tqdm(range(0, total_lines, self.batch_size), desc="Translating"):
            batch = ordered_sentences[i:i + self.batch_size]

            # Hindi -> English -> Hindi
            english = self.translate_batch(batch, self.hi2en_model, self.hi2en_tokenizer)
            hindi = self.translate_batch(english, self.en2hi_model, self.en2hi_tokenizer)

            translations.extend(hindi)

        # A count mismatch would silently misalign every following line.
        if len(translations) != total_lines:
            raise RuntimeError(
                f"Expected {total_lines} translations but got {len(translations)}"
            )

        # Write one back-translated sentence per line. The earlier
        # `orig != trans` branch wrote the same text on both paths, so a
        # single write is equivalent.
        print(f"Writing output to {output_file}...")
        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(f"{trans}\n" for trans in translations)

def print_sample_comparisons(originals: List[str], translations: List[str], num_samples: int = 5):
    """Print the first few original/generated sentence pairs side by side.

    Args:
        originals: Source sentences.
        translations: Generated sentences, positionally aligned with ``originals``.
        num_samples: Maximum number of pairs to print.

    The number of pairs shown is bounded by the shorter of the two lists, so a
    ``translations`` list shorter than ``originals`` no longer raises
    ``IndexError``.
    """
    separator = "-" * 80
    print("\nSample comparisons (Original → Generated):")
    print(separator)

    # range() of a non-positive count is empty, so num_samples <= 0 prints
    # only the header.
    shown = min(num_samples, len(originals), len(translations))
    for idx in range(shown):
        print(f"Original {idx+1}: {originals[idx]}")
        print(f"Generated {idx+1}: {translations[idx]}")
        print(separator)

def main():
    """Run a 10-sentence trial, then optionally back-translate the full corpus.

    The trial output goes to "test_output.src". The full run only starts when
    the user answers 'y' (case-insensitive) at the prompt.
    """
    source_path = "wikiExtractsData/data/train_merge.src"
    full_output_path = "wikiExtractsData/data/train_merge_with_errors.src"

    back_translator = OrderedBackTranslator(batch_size=32)

    # Trial run on a small prefix so the result can be inspected first
    print("\nTesting with first 10 sentences...")
    back_translator.process_file(source_path, "test_output.src", max_lines=10)

    # Anything other than 'y' aborts before the expensive full pass
    answer = input("\nContinue with full file? (y/n): ")
    if answer.lower() != 'y':
        print("Stopped after test run")
        return

    print("\nProcessing full file...")
    back_translator.process_file(source_path, full_output_path)

# Entry point: run the trial/full pipeline when this code executes as the
# top-level module.
if __name__ == "__main__":
    main()

Using device: cuda
Loading Hindi->English model...
Loading English->Hindi model...

Testing with first 10 sentences...
Reading wikiExtractsData/data/train_merge.src...
Processing 10 lines...


Translating: 100%|██████████| 1/1 [00:00<00:00,  1.15it/s]


Writing output to test_output.src...



Continue with full file? (y/n):  n


Stopped after test run


In [7]:
def test_file_ordering(input_file: str, output_file: str, num_lines: int = 5):
    """Compare the first few lines of input and output files"""
    print(f"First {num_lines} lines comparison:")
    print("-" * 80)
    
    with open(input_file, 'r', encoding='utf-8') as f:
        input_lines = [line.strip() for line in f if line.strip()][:num_lines]
    
    with open(output_file, 'r', encoding='utf-8') as f:
        output_lines = [line.strip() for line in f if line.strip()][:num_lines]
    
    for i, (input_line, output_line) in enumerate(zip(input_lines, output_lines), 1):
        print(f"Line {i}:")
        print(f"Input:  {input_line}")
        print(f"Output: {output_line}")
        print("-" * 80)

# Compare the source corpus with "test_output.src", the file written by the
# 10-sentence trial run; that file must already exist because it is opened
# for reading.
if __name__ == "__main__":
    # Test with your files
    test_file_ordering(
        "wikiExtractsData/data/train_merge.src",
        "test_output.src"
    )

First 5 lines comparison:
--------------------------------------------------------------------------------
Line 1:
Input:  तब राजा को आभास हुआ कि ब्राह्मण और कोई नहीं बल्कि देवों का वास्तुकार विश्वकर्मा थी .
Output: तब राजा को एहसास हुआ कि ब्रारान दुनिया का पेशा था।
--------------------------------------------------------------------------------
Line 2:
Input:  अनेक समुदायों में देह को नदी में प्रवाहित करने की परंपरा हैं , ताकि पानी में रहने वाले विभिन्न जीवों को आहार उपलब्ध हो सके .
Output: अनेक समुदायों में, शरीर को नदी में बहने देने के लिए परम्पराएँ हैं, ताकि पानी में रहनेवाले विभिन्‍न प्राणी भोजन से प्रदान किए जा सकें ।
--------------------------------------------------------------------------------
Line 3:
Input:  डीएनए क्षति और उत्परिवर्तन के बीच अंतर करना अत्यंत महत्वपूर्ण हैं .
Output: डी. और उत्परिवर्तन के बीच फर्क सबसे महत्वपूर्ण है.
--------------------------------------------------------------------------------
Line 4:
Input:  यह खाना बनाने के काम आती है .
Output: यह खाना ब