In [23]:
pip install torch torchvision torchaudio

Note: you may need to restart the kernel to use updated packages.


In [22]:
pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [21]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [20]:
pip install sacrebleu

Note: you may need to restart the kernel to use updated packages.


In [19]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [18]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [15]:
pip install evaluate

Collecting evaluate
  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Using cached evaluate-0.4.3-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [12]:
pip install torch

Note: you may need to restart the kernel to use updated packages.


In [13]:
pip install numpy pandas

Note: you may need to restart the kernel to use updated packages.


In [None]:

import sys
import logging
import os
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM, 
    AutoModelForSequenceClassification, Trainer, TrainingArguments,
    DataCollatorForSeq2Seq, DataCollatorWithPadding
)
from sacrebleu import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from sklearn.metrics import f1_score, precision_recall_fscore_support
import evaluate
from torch.utils.data import DataLoader
import re
import nltk
nltk.download('wordnet')
nltk.download('punkt')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
#################################
# 1. MACHINE TRANSLATION MODULE #
#################################
class HindiTranslationModel:
    def __init__(self, model_name="ai4bharat/IndicBART", max_length=128):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
        self.max_length = max_length
        
    def preprocess_translation_data(self, examples, src_lang="en", tgt_lang="hi"):
        """Tokenize and prepare inputs for translation model."""
        inputs = [example for example in examples[src_lang]]
        targets = [example for example in examples[tgt_lang]]
        
        model_inputs = self.tokenizer(
            inputs, 
            max_length=self.max_length, 
            padding="max_length", 
            truncation=True,
            return_tensors="pt"
        )
        
        with self.tokenizer.as_target_tokenizer():
            labels = self.tokenizer(
                targets, 
                max_length=self.max_length, 
                padding="max_length", 
                truncation=True,
                return_tensors="pt"
            ).input_ids
            
        model_inputs["labels"] = labels
        return model_inputs
    
    def load_iitb_corpus(self, path_to_data, sample_size=None):
        """Load and preprocess the IIT Bombay English-Hindi Parallel Corpus."""
        # For demonstration, using a simpler approach - in production, use proper data loading
        if os.path.exists(path_to_data):
            df = pd.read_csv(path_to_data, sep='\t', names=['en', 'hi'])
            if sample_size:
                df = df.sample(sample_size, random_state=42)
        else:
            # Create a small demo dataset if file doesn't exist
            print(f"Warning: {path_to_data} not found. Creating a small demo dataset.")
            data = {
                'en': ["Hello, how are you?", "Where is the library?", "I like Indian food."],
                'hi': ["नमस्ते, आप कैसे हैं?", "पुस्तकालय कहां है?", "मुझे भारतीय खाना पसंद है।"]
            }
            df = pd.DataFrame(data)
        
        # Split into train-val-test
        train_df = df.sample(frac=0.8, random_state=42)
        temp_df = df.drop(train_df.index)
        val_df = temp_df.sample(frac=0.5, random_state=42)
        test_df = temp_df.drop(val_df.index)
        
        # Convert to HF datasets
        train_dataset = Dataset.from_pandas(train_df)
        val_dataset = Dataset.from_pandas(val_df)
        test_dataset = Dataset.from_pandas(test_df)
        
        return train_dataset, val_dataset, test_dataset
    
    def load_domain_specific_data(self, paths):
        """Load domain-specific parallel data for improved domain adaptation."""
        datasets = []
        for domain, path in paths.items():
            if os.path.exists(path):
                df = pd.read_csv(path, sep='\t', names=['en', 'hi'])
                df['domain'] = domain
                datasets.append(df)
            else:
                print(f"Warning: {path} not found. Skipping {domain} dataset.")
        
        if not datasets:
            return None, None, None
        
        combined_df = pd.concat(datasets)
        
        # Split into train-val-test
        train_df = combined_df.sample(frac=0.8, random_state=42)
        temp_df = combined_df.drop(train_df.index)
        val_df = temp_df.sample(frac=0.5, random_state=42)
        test_df = temp_df.drop(val_df.index)
        
        # Convert to HF datasets
        train_dataset = Dataset.from_pandas(train_df)
        val_dataset = Dataset.from_pandas(val_df)
        test_dataset = Dataset.from_pandas(test_df)
        
        return train_dataset, val_dataset, test_dataset
    
    def compute_metrics(self, eval_preds):
        """Calculate BLEU and METEOR scores for translation evaluation."""
        preds, labels = eval_preds
        
        # Replace -100 in the labels as we can't decode them
        labels = np.where(labels != -100, labels, self.tokenizer.pad_token_id)
        
        # Decode predictions and references
        decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
        
        # Calculate BLEU score
        bleu = corpus_bleu(decoded_preds, [decoded_labels]).score
        
        # Calculate METEOR score (sample for demonstration)
        # In practice, calculate for all samples and average
        meteor_scores = []
        for pred, ref in zip(decoded_preds[:5], decoded_labels[:5]):  # Taking first 5 for demo
            tokenized_pred = nltk.word_tokenize(pred)
            tokenized_ref = nltk.word_tokenize(ref)
            meteor_scores.append(meteor_score([tokenized_ref], tokenized_pred))
        
        meteor = np.mean(meteor_scores) if meteor_scores else 0
        
        return {"bleu": bleu, "meteor": meteor}
    
    def fine_tune(self, train_dataset, val_dataset, output_dir, epochs=3, batch_size=16):
        """Fine-tune the translation model on Hindi-English parallel data."""
        # Preprocess datasets
        train_dataset = train_dataset.map(
            lambda examples: self.preprocess_translation_data(examples),
            batched=True,
            remove_columns=train_dataset.column_names
        )
        val_dataset = val_dataset.map(
            lambda examples: self.preprocess_translation_data(examples),
            batched=True,
            remove_columns=val_dataset.column_names
        )
        
        data_collator = DataCollatorForSeq2Seq(
            tokenizer=self.tokenizer,
            model=self.model,
            padding=True
        )
        
        # Training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            evaluation_strategy="epoch",
            learning_rate=5e-5,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            weight_decay=0.01,
            save_total_limit=3,
            num_train_epochs=epochs,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            logging_dir=f"{output_dir}/logs",
            report_to="tensorboard",
        )
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
            compute_metrics=self.compute_metrics
        )
        trainer.train()
        trainer.save_model(f"{output_dir}/final_model")
        
        return trainer
    
    def translate(self, text, src_lang="en", tgt_lang="hi"):
        """Translate text from source language to target language."""
        inputs = self.tokenizer(text, return_tensors="pt").to(device)
        outputs = self.model.generate(
            **inputs,
            max_length=self.max_length,
            num_beams=5,
            early_stopping=True,
            no_repeat_ngram_size=2
        )
        translation = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return translation
    
    def batch_translate(self, texts, src_lang="en", tgt_lang="hi"):
        """Translate a batch of texts."""
        translations = []
        for text in texts:
            translation = self.translate(text, src_lang, tgt_lang)
            translations.append(translation)
        return translations
    
    def evaluate_idiomatic_expressions(self, idiomatic_dataset):
        """Specifically evaluate translation quality on idiomatic expressions."""
        sources = idiomatic_dataset['en']
        references = idiomatic_dataset['hi']
        
        # Translate sources
        translations = self.batch_translate(sources)
        
        # Compute metrics
        bleu = corpus_bleu(translations, [references]).score
        
        # Compute METEOR for a sample
        meteor_scores = []
        for trans, ref in zip(translations[:20], references[:20]):  # Sample for demonstration
            tokenized_trans = nltk.word_tokenize(trans)
            tokenized_ref = nltk.word_tokenize(ref)
            meteor_scores.append(meteor_score([tokenized_ref], tokenized_trans))
        
        meteor = np.mean(meteor_scores) if meteor_scores else 0
        return {"bleu": bleu, "meteor": meteor}
############################
# 2. TEXT GENERATION MODULE #
############################
class HindiTextGenerator:
    def __init__(self, model_name="ai4bharat/IndicBART", max_length=512):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
        self.max_length = max_length
        
    def preprocess_generation_data(self, examples, prompt_col="prompt", completion_col="completion"):
        """Preprocess data for text generation training."""
        model_inputs = self.tokenizer(
            examples[prompt_col], 
            max_length=self.max_length, 
            padding="max_length", 
            truncation=True,
            return_tensors="pt"
        )
        
        labels = self.tokenizer(
            examples[completion_col], 
            max_length=self.max_length, 
            padding="max_length", 
            truncation=True,
            return_tensors="pt"
        ).input_ids
        
        model_inputs["labels"] = labels
        return model_inputs
    
    def load_hindi_generation_data(self, path):
        """Load data for Hindi text generation."""
        if os.path.exists(path):
            df = pd.read_csv(path)
        else:
            print(f"Warning: {path} not found. Creating a small demo dataset.")
            data = {
                'prompt': [
                    "हिंदी में एक कहानी लिखें जो एक किसान के जीवन के बारे में हो।",
                    "भारत की संस्कृति के बारे में एक निबंध लिखें।"
                ],
                'completion': [
                    "एक गाँव में एक किसान रहता था। उसका नाम राम था...",
                    "भारत एक विविधताओं वाला देश है जहाँ अनेक संस्कृतियाँ मिलकर रहती हैं..."
                ]
            }
            df = pd.DataFrame(data)
        
        # Split into train-val
        train_df = df.sample(frac=0.8, random_state=42)
        val_df = df.drop(train_df.index)
        
        # Convert to HF datasets
        train_dataset = Dataset.from_pandas(train_df)
        val_dataset = Dataset.from_pandas(val_df)
        
        return train_dataset, val_dataset
    
    def load_cultural_dataset(self, path):
        """Load culturally rich Hindi dataset for better cultural context."""
        if os.path.exists(path):
            df = pd.read_csv(path)
        else:
            print(f"Warning: {path} not found. Creating a small demo cultural dataset.")
            data = {
                'prompt': [
                    "होली के त्योहार पर एक छोटा निबंध लिखें।",
                    "भारत के गणतंत्र दिवस के महत्व पर प्रकाश डालें।"
                ],
                'completion': [
                    "होली रंगों का त्योहार है जो भारत में बड़े हर्षोल्लास के साथ मनाया जाता है...",
                    "26 जनवरी, 1950 को भारत का संविधान लागू हुआ और भारत एक पूर्ण गणराज्य बना..."
                ]
            }
            df = pd.DataFrame(data)
        
        # Split into train-val
        train_df = df.sample(frac=0.8, random_state=42)
        val_df = df.drop(train_df.index)
        
        # Convert to HF datasets
        train_dataset = Dataset.from_pandas(train_df)
        val_dataset = Dataset.from_pandas(val_df)
        
        return train_dataset, val_dataset
    
    def compute_perplexity(self, eval_dataset):
        """Compute perplexity on evaluation dataset."""
        eval_dataloader = DataLoader(
            eval_dataset, 
            batch_size=8, 
            shuffle=False
        )
        
        self.model.eval()
        total_loss = 0
        total_length = 0
        
        with torch.no_grad():
            for batch in eval_dataloader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)
                
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                
                loss = outputs.loss
                total_loss += loss.item() * input_ids.size(0)
                total_length += input_ids.size(0)
        
        perplexity = torch.exp(torch.tensor(total_loss / total_length))
        return perplexity.item()
    
    def fine_tune(self, train_dataset, val_dataset, output_dir, epochs=3, batch_size=8):
        """Fine-tune the text generation model."""
        # Preprocess datasets
        train_dataset = train_dataset.map(
            lambda examples: self.preprocess_generation_data(examples),
            batched=True,
            remove_columns=train_dataset.column_names
        )
        val_dataset = val_dataset.map(
            lambda examples: self.preprocess_generation_data(examples),
            batched=True,
            remove_columns=val_dataset.column_names
        )

        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
        training_args = TrainingArguments(
            output_dir=output_dir,
            evaluation_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            weight_decay=0.01,
            save_total_limit=3,
            num_train_epochs=epochs,
            fp16=torch.cuda.is_available(),
            logging_dir=f"{output_dir}/logs"
        )
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=self.tokenizer,
            data_collator=data_collator
        )
        trainer.train()
        
        # Compute perplexity
        perplexity = self.compute_perplexity(val_dataset)
        print(f"Model perplexity: {perplexity}")
        
        # Save model
        trainer.save_model(f"{output_dir}/final_model")
        
        return trainer, perplexity
    
    def generate_text(self, prompt, max_length=100, num_return_sequences=1):
        """Generate Hindi text based on prompt."""
        inputs = self.tokenizer(prompt, return_tensors="pt").to(device)
        outputs = self.model.generate(
            **inputs,
            max_length=max_length,
            num_beams=5,
            no_repeat_ngram_size=2,
            num_return_sequences=num_return_sequences,
            temperature=0.8,
            top_k=50,
            top_p=0.95,
            do_sample=True
        )
        generated_texts = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
        return generated_texts
#################################
# 3. SENTIMENT ANALYSIS MODULE #
#################################
class HindiSentimentAnalyzer:
    def __init__(self, model_name="ai4bharat/MuRIL-base", num_labels=3):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=num_labels
        ).to(device)
        self.id2label = {0: "negative", 1: "neutral", 2: "positive"}
        self.label2id = {"negative": 0, "neutral": 1, "positive": 2}
    
    def preprocess_sentiment_data(self, examples, text_col="text", label_col="label", max_length=128):
        """Preprocess data for sentiment analysis."""
        model_inputs = self.tokenizer(
            examples[text_col], 
            max_length=max_length, 
            padding="max_length", 
            truncation=True,
            return_tensors="pt"
        )
        
        model_inputs["labels"] = examples[label_col]
        return model_inputs
    
    def load_hindi_sentiment_data(self, path):
        """Load Hindi sentiment analysis dataset."""
        if os.path.exists(path):
            df = pd.read_csv(path)
        else:
            print(f"Warning: {path} not found. Creating a small demo sentiment dataset.")
            data = {
                'text': [
                    "यह फिल्म बहुत अच्छी थी, मुझे बहुत पसंद आई।",
                    "सेवा की गुणवत्ता औसत थी, कुछ खास नहीं।",
                    "मैं इस उत्पाद से बिल्कुल संतुष्ट नहीं हूं, पैसे बर्बाद हो गए।"
                ],
                'label': [2, 1, 0]  # positive, neutral, negative
            }
            df = pd.DataFrame(data)
        train_df = df.sample(frac=0.7, random_state=42)
        temp_df = df.drop(train_df.index)
        val_df = temp_df.sample(frac=0.5, random_state=42)
        test_df = temp_df.drop(val_df.index)
        train_dataset = Dataset.from_pandas(train_df)
        val_dataset = Dataset.from_pandas(val_df)
        test_dataset = Dataset.from_pandas(test_df)
        
        return train_dataset, val_dataset, test_dataset
    
    def load_domain_specific_sentiment_data(self, paths):
        """Load domain-specific sentiment datasets for domain adaptation."""
        datasets = []
        for domain, path in paths.items():
            if os.path.exists(path):
                df = pd.read_csv(path)
                df['domain'] = domain
                datasets.append(df)
            else:
                print(f"Warning: {path} not found. Skipping {domain} dataset.")
        
        if not datasets:
            return None, None, None
        
        combined_df = pd.concat(datasets)
        
        # Split into train-val-test
        train_df = combined_df.sample(frac=0.7, random_state=42)
        temp_df = combined_df.drop(train_df.index)
        val_df = temp_df.sample(frac=0.5, random_state=42)
        test_df = temp_df.drop(val_df.index)
        
        # Convert to HF datasets
        train_dataset = Dataset.from_pandas(train_df)
        val_dataset = Dataset.from_pandas(val_df)
        test_dataset = Dataset.from_pandas(test_df)
        
        return train_dataset, val_dataset, test_dataset
    
    def compute_metrics(self, eval_preds):
        """Compute F1 score and other metrics for sentiment analysis."""
        predictions, labels = eval_preds
        predictions = np.argmax(predictions, axis=1)
        
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, predictions, average='weighted'
        )
        
        accuracy = (predictions == labels).mean()
        
        return {
            "accuracy": accuracy,
            "f1": f1,
            "precision": precision,
            "recall": recall
        }
    
    def fine_tune(self, train_dataset, val_dataset, output_dir, epochs=5, batch_size=16):
        """Fine-tune sentiment analysis model."""
        # Preprocess datasets
        train_dataset = train_dataset.map(
            lambda examples: self.preprocess_sentiment_data(examples),
            batched=True,
            remove_columns=train_dataset.column_names
        )
        val_dataset = val_dataset.map(
            lambda examples: self.preprocess_sentiment_data(examples),
            batched=True,
            remove_columns=val_dataset.column_names
        )
        
        # Data collator
        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
        
        # Training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            evaluation_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            weight_decay=0.01,
            save_total_limit=3,
            num_train_epochs=epochs,
            fp16=torch.cuda.is_available(),
            logging_dir=f"{output_dir}/logs"
        )
        
        # Initialize trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
            compute_metrics=self.compute_metrics
        )
        
        # Train model
        trainer.train()
        
        # Save model
        trainer.save_model(f"{output_dir}/final_model")
        
        return trainer
    
    def predict_sentiment(self, text):
        """Predict sentiment of Hindi text."""
        inputs = self.tokenizer(text, return_tensors="pt").to(device)
        
        # Predict
        with torch.no_grad():
            outputs = self.model(**inputs)
            predictions = torch.softmax(outputs.logits, dim=1)
            predicted_class = torch.argmax(predictions, dim=1).item()
        
        # Get sentiment and confidence
        sentiment = self.id2label[predicted_class]
        confidence = predictions[0][predicted_class].item()
        
        return {
            "sentiment": sentiment,
            "confidence": confidence,
            "probabilities": {
                self.id2label[i]: prob.item() 
                for i, prob in enumerate(predictions[0])
            }
        }


##############################
# 4. INTEGRATED PIPELINE    #
##############################

class HindiNLPPipeline:
    def __init__(self):
        self.translation_model = None
        self.generation_model = None
        self.sentiment_model = None
    
    def initialize_translation(self, model_name="ai4bharat/IndicBART"):
        """Initialize translation model."""
        self.translation_model = HindiTranslationModel(model_name)
        return self.translation_model
    
    def initialize_generation(self, model_name="ai4bharat/IndicBART"):
        """Initialize text generation model."""
        self.generation_model = HindiTextGenerator(model_name)
        return self.generation_model
    
    def initialize_sentiment(self, model_name="ai4bharat/MuRIL-base"):
        """Initialize sentiment analysis model."""
        self.sentiment_model = HindiSentimentAnalyzer(model_name)
        return self.sentiment_model
    
    def translate(self, text, src_lang="en", tgt_lang="hi"):
        """Translate text using the translation model."""
        if not self.translation_model:
            self.initialize_translation()
        return self.translation_model.translate(text, src_lang, tgt_lang)
    
    def generate(self, prompt, max_length=100):
        """Generate text using the generation model."""
        if not self.generation_model:
            self.initialize_generation()
        return self.generation_model.generate_text(prompt, max_length)
    
    def analyze_sentiment(self, text):
        """Analyze sentiment of text using the sentiment model."""
        if not self.sentiment_model:
            self.initialize_sentiment()
        return self.sentiment_model.predict_sentiment(text)
    
    def process_hinglish(self, text):
        """Process code-mixed Hinglish text."""
        roman_to_hindi = {
            'namaste': 'नमस्ते',
            'kaise': 'कैसे',
            'ho': 'हो',
            'aap': 'आप',
            'main': 'मैं',
            'hoon': 'हूँ',
            # Add more mappings for common words
        }
        
        words = text.split()
        for i, word in enumerate(words):
            if word.lower() in roman_to_hindi:
                words[i] = roman_to_hindi[word.lower()]
        
        processed_text = ' '.join(words)
        return processed_text
    
    def end_to_end_processing(self, text, task="all"):
        """Perform end-to-end processing on input text."""
        results = {}
        
        # Check if text is Hinglish and process if needed
        if any(ord(c) < 128 for c in text) and any(ord(c) >= 128 for c in text):
            text = self.process_hinglish(text)
            results["processed_text"] = text
        
        # Perform requested tasks
        if task in ["translate", "all"]:
            # Auto-detect language and translate to Hindi if not already in Hindi
            is_hindi = all(ord(c) >= 128 for c in text if c.strip())
            if not is_hindi:
                translation = self.translate(text, src_lang="en", tgt_lang="hi")
                results["translation"] = translation
        
        if task in ["generate", "all"]:
            # If text is a prompt, generate content based on it
            generated_text = self.generate(text)
            results["generated_text"] = generated_text
        
        if task in ["sentiment", "all"]:
            # Analyze sentiment
            sentiment_result = self.analyze_sentiment(text)
            results["sentiment_analysis"] = sentiment_result
        
        return results
#######################################
# 5. SPECIALIZED CULTURAL ADAPTATION  #
#######################################
class CulturalAdaptation:
    def __init__(self, base_model):
        """Initialize with a base model (translation, generation, or sentiment)."""
        self.base_model = base_model
        self.culture_specific_data = {}
    
    def load_cultural_expressions(self, path):
        """Load dataset of cultural expressions, idioms, and references."""
        if os.path.exists(path):
            df = pd.read_csv(path)
            self.culture_specific_data = df.to_dict('records')
            return True
        else:
            print(f"Warning: {path} not found. Cultural adaptation will be limited.")
            # Sample data
            self.culture_specific_data = [
                {"expression": "लालन-पालन", "meaning": "upbringing", "context": "family"},
                {"expression": "अतिथि देवो भव:", "meaning": "guest is god", "context": "hospitality"}
            ]
            return False
    
    def enhance_translation(self, text, translation):
        """Enhance translation with cultural context."""
        enhanced = translation
        for item in self.culture_specific_data:
            if item["expression"] in text:
                # If cultural expression is found, ensure it's properly translated
                if item["meaning"] not in translation.lower():
                    # This is simplified - in practice, would need more sophisticated replacement
                    enhanced = enhanced + f" [{item['expression']}: {item['meaning']}]"
        
        return enhanced
    
    def enhance_text_generation(self, prompt, generated_text):
        """Enhance generated text with cultural context."""
        # Check if prompt involves cultural themes
        cultural_contexts = set([item["context"] for item in self.culture_specific_data])
        
        relevant_contexts = []
        for context in cultural_contexts:
            if context in prompt.lower():
                relevant_contexts.append(context)
        
        if not relevant_contexts:
            return generated_text
        
        # Find relevant cultural expressions for these contexts
        relevant_expressions = [
            item for item in self.culture_specific_data 
            if item["context"] in relevant_contexts
        ]
        enhanced_text = generated_text
        for expr in relevant_expressions[:2]:  # Limit to 2 expressions to avoid overloading
            if expr["expression"] not in enhanced_text:
                # Find suitable place to insert expression
                sentences = enhanced_text.split('।')
                if len(sentences) > 1:
                    insertion_point = len(sentences) // 2
                    sentences[insertion_point] = f"{sentences[insertion_point]} {expr['expression']}"
                    enhanced_text = '।'.join(sentences)
        
        return enhanced_text
    
    def enhance_sentiment_analysis(self, text, sentiment_result):
        """Enhance sentiment analysis with cultural context."""
        enhanced_result = sentiment_result.copy()
        
        # Check for cultural expressions that might affect sentiment
        for item in self.culture_specific_data:
            if item["expression"] in text:
                # Add cultural context to analysis
                if "cultural_context" not in enhanced_result:
                    enhanced_result["cultural_context"] = []
                
                enhanced_result["cultural_context"].append({
                    "expression": item["expression"],
                    "meaning": item["meaning"],
                    "context": item["context"]
                })
                
                # Adjust confidence if necessary
                if enhanced_result["confidence"] > 0.8:
                    enhanced_result["confidence"] *= 0.95  # Slightly reduce very high confidence
                
                # Add explanation
                if "explanation" not in enhanced_result:
                    enhanced_result["explanation"] = []
                
                enhanced_result["explanation"].append(
                    f"Result may be influenced by cultural expression '{item['expression']}'."
                )
        
        return enhanced_result
#################################
# 6. CODE-MIXING HANDLER       #
#################################
class HinglishProcessor:
    def __init__(self):
        """Initialize Hinglish processor for code-mixed text."""
        self.hindi_tokenizer = None
        self.english_tokenizer = None
        self.transliteration_map = self._load_transliteration_map()
        
    def _load_transliteration_map(self):
        """Load or create transliteration mapping from Roman to Devanagari."""
        return {
            # Basic vowels
            'a': 'अ', 'aa': 'आ', 'i': 'इ', 'ee': 'ई', 'u': 'उ', 'oo': 'ऊ',
            'e': 'ए', 'ai': 'ऐ', 'o': 'ओ', 'au': 'औ',
            
            # Consonants
            'k': 'क', 'kh': 'ख', 'g': 'ग', 'gh': 'घ', 'ng': 'ङ',
            'ch': 'च', 'chh': 'छ', 'j': 'ज', 'jh': 'झ', 'n': 'ञ',
            't': 'ट', 'th': 'ठ', 'd': 'ड', 'dh': 'ढ', 'n': 'ण',
            'th': 'त', 'thh': 'थ', 'd': 'द', 'dh': 'ध', 'n': 'न',
            'p': 'प', 'ph': 'फ', 'b': 'ब', 'bh': 'भ', 'm': 'म',
            'y': 'य', 'r': 'र', 'l': 'ल', 'v': 'व', 'sh': 'श',
            's': 'स', 'h': 'ह',
            
            # Common words
            'hai': 'है', 'hain': 'हैं', 'kya': 'क्या', 'main': 'मैं',
            'aap': 'आप', 'tum': 'तुम', 'yeh': 'यह', 'woh': 'वह',
            'aur': 'और', 'par': 'पर', 'mein': 'में', 'se': 'से',
            'ka': 'का', 'ki': 'की', 'ke': 'के', 'ko': 'को',
            'namaste': 'नमस्ते', 'dhanyavaad': 'धन्यवाद'
        }
    
    def detect_language_word(self, word):
        """Detect if a word is English, Hindi in Roman script, or Hindi in Devanagari."""
        if any(0x900 <= ord(c) <= 0x97F for c in word):
            return "hindi_dev"  # Hindi in Devanagari
        
        # Check if word is in transliteration map
        if word.lower() in self.transliteration_map:
            return "hindi_roman"  # Hindi in Roman script
        
        # Basic heuristic for Hindi words in Roman script
        hindi_suffixes = ['aa', 'ee', 'oo', 'ai', 'au', 'an', 'en', 'in']
        if any(word.lower().endswith(suffix) for suffix in hindi_suffixes):
            return "hindi_roman"
        
        return "english"  # Default to English
    
    def transliterate_roman_to_devanagari(self, text):
        """Transliterate Roman script Hindi to Devanagari."""
        words = text.split()
        transliterated_words = []
        
        for word in words:
            language = self.detect_language_word(word)
            
            if language == "hindi_dev":
                transliterated_words.append(word)  # Already Devanagari
            elif language == "hindi_roman":
                if word.lower() in self.transliteration_map:
                    transliterated_words.append(self.transliteration_map[word.lower()])
                else:
                    # Try to match parts of words - simplified approach
                    # A real implementation would use a more sophisticated algorithm
                    processed_word = word
                    for rom, dev in self.transliteration_map.items():
                        processed_word = processed_word.replace(rom, dev)
                    transliterated_words.append(processed_word)
            else:
                # Keep English words as is
                transliterated_words.append(word)
        
        return " ".join(transliterated_words)
    
    def normalize_hinglish(self, text):
        """Normalize Hinglish text for better processing."""
        # Transliterate Hindi words in Roman script to Devanagari
        normalized_text = self.transliterate_roman_to_devanagari(text)
        
        # Handle English words and punctuation
        # Keep this in a more consistent format for downstream processing
        
        return normalized_text
    
    def tag_language_tokens(self, text):
        """Tag each token in the text with its language."""
        words = text.split()
        tagged_words = []
        
        for word in words:
            language = self.detect_language_word(word)
            tagged_words.append((word, language))
        
        return tagged_words
    
    def preprocess_for_nlp_tasks(self, text):
        """Preprocess Hinglish text for NLP tasks."""
        # First, normalize the text
        normalized_text = self.normalize_hinglish(text)
        
        # Tag tokens with languages
        tagged_tokens = self.tag_language_tokens(normalized_text)
        
        # Create two versions of the text:
        # 1. Hindi-only (with English words preserved but marked)
        # 2. Fully transliterated (all Hindi in Devanagari)
        
        hindi_only = []
        fully_transliterated = []
        
        for word, lang in tagged_tokens:
            if lang == "english":
                hindi_only.append(f"[ENG]{word}[/ENG]")
                fully_transliterated.append(word)  # Keep English as is
            elif lang == "hindi_roman":
                if word.lower() in self.transliteration_map:
                    dev_word = self.transliteration_map[word.lower()]
                    hindi_only.append(dev_word)
                    fully_transliterated.append(dev_word)
                else:
                    hindi_only.append(word)  # Keep as is if not in map
                    fully_transliterated.append(word)
            else:  # hindi_dev
                hindi_only.append(word)
                fully_transliterated.append(word)
        
        return {
            "normalized": normalized_text,
            "hindi_only": " ".join(hindi_only),
            "fully_transliterated": " ".join(fully_transliterated),
            "tagged_tokens": tagged_tokens
        }


#################################
# 7. DOMAIN ADAPTATION MODULE   #
#################################

class DomainAdapter:
    def __init__(self, base_model, model_type="translation"):
        """
        Initialize domain adapter for specialized Hindi NLP tasks.
        
        Args:
            base_model: The base model to adapt (translation, generation, or sentiment)
            model_type: Type of model ("translation", "generation", or "sentiment")
        """
        self.base_model = base_model
        self.model_type = model_type
        self.domain_data = {}
    
    def load_domain_data(self, domain, path):
        """Load domain-specific data."""
        if os.path.exists(path):
            if self.model_type == "translation":
                # Load parallel corpus for the domain
                df = pd.read_csv(path, sep='\t', names=['en', 'hi'])
            elif self.model_type == "generation":
                # Load prompts and completions for the domain
                df = pd.read_csv(path)
            elif self.model_type == "sentiment":
                # Load text and sentiment labels for the domain
                df = pd.read_csv(path)
            
            self.domain_data[domain] = df
            return True
        else:
            print(f"Warning: {path} not found. Domain adaptation for {domain} will be limited.")
            return False
    
    def create_domain_specific_dataset(self, domain):
        """Create domain-specific dataset for fine-tuning."""
        if domain not in self.domain_data:
            print(f"Error: Data for domain {domain} not loaded.")
            return None
        
        df = self.domain_data[domain]
        
        # Split into train-val
        train_df = df.sample(frac=0.8, random_state=42)
        val_df = df.drop(train_df.index)
        
        # Convert to HF datasets
        train_dataset = Dataset.from_pandas(train_df)
        val_dataset = Dataset.from_pandas(val_df)
        
        return train_dataset, val_dataset
    
    def augment_domain_data(self, domain):
        """Augment domain data using basic techniques."""
        if domain not in self.domain_data:
            print(f"Error: Data for domain {domain} not loaded.")
            return
        
        df = self.domain_data[domain]
        augmented_rows = []
        
        for idx, row in df.iterrows():
            if self.model_type == "translation":
                # For translation: create variations by adding/removing words
                en_text = row['en']
                hi_text = row['hi']
                
                # Simple augmentation: add/remove determiners or adjectives
                # This is a simplified approach - more sophisticated techniques would be used
                augmented_rows.append({
                    'en': en_text.replace('the ', ''),
                    'hi': hi_text
                })
                
            elif self.model_type == "generation":
                # For generation: create variations of prompts
                prompt = row['prompt']
                completion = row['completion']
                
                # Rephrase prompt slightly
                augmented_rows.append({
                    'prompt': f"कृपया {prompt}",  # Add "please" in Hindi
                    'completion': completion
                })
                
            elif self.model_type == "sentiment":
                # For sentiment: simple word replacements/additions
                text = row['text']
                label = row['label']
                
                # Add intensifiers for positive/negative sentiments
                if label == 2:  # positive
                    augmented_rows.append({
                        'text': f"बहुत {text}",  # Add "very" in Hindi
                        'label': label
                    })
                elif label == 0:  # negative
                    augmented_rows.append({
                        'text': f"बिल्कुल {text}",  # Add "absolutely" in Hindi
                        'label': label
                    })
        
        # Add augmented data to original dataset
        augmented_df = pd.DataFrame(augmented_rows)
        self.domain_data[domain] = pd.concat([df, augmented_df])
    
    def fine_tune_domain_model(self, domain, output_dir, epochs=3):
        """Fine-tune model on domain-specific data."""
        if domain not in self.domain_data:
            print(f"Error: Data for domain {domain} not loaded.")
            return None
        
        # Create domain dataset
        train_dataset, val_dataset = self.create_domain_specific_dataset(domain)
        
        # Determine how to fine-tune based on model type
        if self.model_type == "translation":
            trainer = self.base_model.fine_tune(
                train_dataset, 
                val_dataset, 
                output_dir=f"{output_dir}/{domain}", 
                epochs=epochs
            )
        elif self.model_type == "generation":
            trainer, _ = self.base_model.fine_tune(
                train_dataset, 
                val_dataset, 
                output_dir=f"{output_dir}/{domain}", 
                epochs=epochs
            )
        elif self.model_type == "sentiment":
            trainer = self.base_model.fine_tune(
                train_dataset, 
                val_dataset, 
                output_dir=f"{output_dir}/{domain}", 
                epochs=epochs
            )
        
        print(f"Domain adaptation for {domain} completed.")
        return trainer


#################################
# 8. EVALUATION MODULE         #
#################################

class HindiNLPEvaluator:
    def __init__(self):
        """Initialize evaluator for Hindi NLP tasks."""
        self.metrics = {
            "translation": ["bleu", "meteor", "human_eval"],
            "generation": ["perplexity", "human_eval"],
            "sentiment": ["f1", "accuracy", "precision", "recall"]
        }
        self.human_evaluators = []  # Would contain evaluator information in real implementation
    
    def evaluate_translation(self, model, test_dataset, metrics=None):
        """
        Evaluate translation model with specified metrics.
        
        Args:
            model: HindiTranslationModel to evaluate
            test_dataset: Test dataset with source and reference translations
            metrics: List of metrics to use (defaults to all available)
        
        Returns:
            Dictionary of evaluation results
        """
        if metrics is None:
            metrics = ["bleu", "meteor"]
        
        results = {}
        sources = test_dataset['en']
        references = test_dataset['hi']
        
        # Generate translations
        translations = []
        for source in sources:
            translation = model.translate(source)
            translations.append(translation)
        
        # Calculate BLEU
        if "bleu" in metrics:
            bleu = corpus_bleu(translations, [references]).score
            results["bleu"] = bleu
        
        # Calculate METEOR for a sample (for efficiency)
        if "meteor" in metrics:
            meteor_scores = []
            sample_size = min(50, len(translations))  # Limit sample size
            for trans, ref in zip(translations[:sample_size], references[:sample_size]):
                tokenized_trans = nltk.word_tokenize(trans)
                tokenized_ref = nltk.word_tokenize(ref)
                meteor_scores.append(meteor_score([tokenized_ref], tokenized_trans))
            
            meteor = np.mean(meteor_scores) if meteor_scores else 0
            results["meteor"] = meteor
        
        # Human evaluation would be implemented here in a real system
        if "human_eval" in metrics and self.human_evaluators:
            # Placeholder for human evaluation logic
            results["human_eval"] = {
                "fluency": 0.0,
                "adequacy": 0.0,
                "idiomaticity": 0.0
            }
        
        return results
    
    def evaluate_generation(self, model, test_prompts, metrics=None):
        """
        Evaluate text generation model.
        
        Args:
            model: HindiTextGenerator to evaluate
            test_prompts: List of prompts to generate from
            metrics: List of metrics to use
        
        Returns:
            Dictionary of evaluation results
        """
        if metrics is None:
            metrics = ["perplexity"]
        
        results = {}
        
        # Generate text for each prompt
        generated_texts = []
        for prompt in test_prompts:
            generated = model.generate_text(prompt)[0]
            generated_texts.append(generated)
        
        # Calculate perplexity if applicable
        if "perplexity" in metrics and hasattr(model, "compute_perplexity"):
            # This would require test dataset in the right format
            # results["perplexity"] = model.compute_perplexity(test_dataset)
            results["perplexity"] = "Needs formatted test dataset"
        
        # Human evaluation would be implemented here
        if "human_eval" in metrics and self.human_evaluators:
            # Placeholder for human evaluation logic
            results["human_eval"] = {
                "fluency": 0.0,
                "coherence": 0.0,
                "cultural_relevance": 0.0
            }
        
        return results
    
    def evaluate_sentiment(self, model, test_dataset, metrics=None):
        """
        Evaluate sentiment analysis model.
        
        Args:
            model: HindiSentimentAnalyzer to evaluate
            test_dataset: Test dataset with text and labels
            metrics: List of metrics to use
        
        Returns:
            Dictionary of evaluation results
        """
        if metrics is None:
            metrics = ["f1", "accuracy", "precision", "recall"]
        
        results = {}
        texts = test_dataset['text']
        true_labels = test_dataset['label']
        
        # Generate predictions
        predicted_labels = []
        predicted_probs = []
        
        for text in texts:
            prediction = model.predict_sentiment(text)
            label_id = model.label2id[prediction["sentiment"]]
            predicted_labels.append(label_id)
            
            # Store probabilities for each class for ROC analysis
            probs = [prediction["probabilities"][label] for label in model.id2label.values()]
            predicted_probs.append(probs)
        
        # Calculate metrics
        if any(metric in ["f1", "precision", "recall"] for metric in metrics):
            precision, recall, f1, _ = precision_recall_fscore_support(
                true_labels, predicted_labels, average='weighted'
            )
            
            if "precision" in metrics:
                results["precision"] = precision
            if "recall" in metrics:
                results["recall"] = recall
            if "f1" in metrics:
                results["f1"] = f1
        
        if "accuracy" in metrics:
            accuracy = (np.array(predicted_labels) == np.array(true_labels)).mean()
            results["accuracy"] = accuracy
        
        return results
    
    def evaluate_across_domains(self, model, domain_datasets, task="translation"):
        """
        Evaluate model performance across different domains.
        
        Args:
            model: Model to evaluate
            domain_datasets: Dictionary of domain-specific test datasets
            task: Task type ("translation", "generation", "sentiment")
        
        Returns:
            Dictionary of evaluation results by domain
        """
        domain_results = {}
        
        for domain, dataset in domain_datasets.items():
            print(f"Evaluating {task} performance on {domain} domain...")
            
            if task == "translation":
                results = self.evaluate_translation(model, dataset)
            elif task == "generation":
                # For generation, dataset should contain prompts
                prompts = dataset['prompt'] if 'prompt' in dataset.column_names else dataset['text']
                results = self.evaluate_generation(model, prompts)
            elif task == "sentiment":
                results = self.evaluate_sentiment(model, dataset)
            
            domain_results[domain] = results
        
        return domain_results
    
    def evaluate_cultural_context(self, model, cultural_test_set, task="translation"):
        """
        Specifically evaluate model's handling of cultural context.
        
        Args:
            model: Model to evaluate
            cultural_test_set: Test set rich in cultural references
            task: Task type
        
        Returns:
            Evaluation results focused on cultural aspects
        """
        # This would be implemented with specific cultural evaluation metrics
        # Placeholder implementation
        print("Evaluating cultural context handling...")
        
        if task == "translation":
            results = self.evaluate_translation(model, cultural_test_set)
        elif task == "generation":
            prompts = cultural_test_set['prompt'] if 'prompt' in cultural_test_set.column_names else cultural_test_set['text']
            results = self.evaluate_generation(model, prompts)
        elif task == "sentiment":
            results = self.evaluate_sentiment(model, cultural_test_set)
        
        # Additional cultural-specific metrics would be added here
        
        return results


#################################
# 9. MAIN APPLICATION          #
#################################

class HindiNLPApp:
    def __init__(self):
        """Initialize the Hindi NLP application framework."""
        self.pipeline = HindiNLPPipeline()
        self.hinglish_processor = HinglishProcessor()
        self.evaluator = HindiNLPEvaluator()
        self.cultural_adapter = None
        self.domain_adapters = {}
    
    def initialize_models(self, translation_model="ai4bharat/IndicBART", 
                          generation_model="ai4bharat/IndicBART",
                          sentiment_model="ai4bharat/MuRIL-base"):
        """Initialize all models."""
        print("Initializing translation model...")
        self.pipeline.initialize_translation(translation_model)
        
        print("Initializing text generation model...")
        self.pipeline.initialize_generation(generation_model)
        
        print("Initializing sentiment analysis model...")
        self.pipeline.initialize_sentiment(sentiment_model)
        
        # Initialize cultural adaptation for all models
        self.cultural_adapter = CulturalAdaptation(self.pipeline)
        self.cultural_adapter.load_cultural_expressions("path/to/cultural_expressions.csv")
        
        # Initialize domain adapters
        self.domain_adapters["translation"] = DomainAdapter(
            self.pipeline.translation_model, model_type="translation"
        )
        self.domain_adapters["generation"] = DomainAdapter(
            self.pipeline.generation_model, model_type="generation"
        )
        self.domain_adapters["sentiment"] = DomainAdapter(
            self.pipeline.sentiment_model, model_type="sentiment"
        )
        
        print("All models initialized successfully.")
    
    def load_domain_datasets(self, domains=["legal", "technical", "medical"]):
        """Load domain-specific datasets for all tasks."""
        for domain in domains:
            # Paths would be configured properly in real implementation
            print(f"Loading {domain} domain datasets...")
            
            # For translation
            self.domain_adapters["translation"].load_domain_data(
                domain, f"data/domains/{domain}/translation.csv"
            )
            
            # For generation
            self.domain_adapters["generation"].load_domain_data(
                domain, f"data/domains/{domain}/generation.csv"
            )
            
            # For sentiment
            self.domain_adapters["sentiment"].load_domain_data(
                domain, f"data/domains/{domain}/sentiment.csv"
            )
    
    def fine_tune_all_models(self, output_dir="models/fine_tuned"):
        """Fine-tune all models on available datasets."""
        # Ensure output directory exists
        os.makedirs(output_dir, exist_ok=True)
        
        # Fine-tune translation model
        print("Fine-tuning translation model...")
        if self.pipeline.translation_model:
            train_dataset, val_dataset, _ = self.pipeline.translation_model.load_iitb_corpus(
                "data/iitb_corpus.csv", sample_size=10000
            )
            self.pipeline.translation_model.fine_tune(
                train_dataset, val_dataset, f"{output_dir}/translation"
            )
        
        # Fine-tune generation model
        print("Fine-tuning text generation model...")
        if self.pipeline.generation_model:
            train_dataset, val_dataset = self.pipeline.generation_model.load_hindi_generation_data(
                "data/hindi_generation.csv"
            )
            self.pipeline.generation_model.fine_tune(
                train_dataset, val_dataset, f"{output_dir}/generation"
            )
        
        # Fine-tune sentiment model
        print("Fine-tuning sentiment analysis model...")
        if self.pipeline.sentiment_model:
            train_dataset, val_dataset, _ = self.pipeline.sentiment_model.load_hindi_sentiment_data(
                "data/hindi_sentiment.csv"
            )
            self.pipeline.sentiment_model.fine_tune(
                train_dataset, val_dataset, f"{output_dir}/sentiment"
            )
        
        print("Basic fine-tuning completed for all models.")
    
    def fine_tune_domain_models(self, domains=["legal", "technical", "medical"], 
                               output_dir="models/domain_adapted"):
        """Fine-tune models for specific domains."""
        # Ensure output directory exists
        os.makedirs(output_dir, exist_ok=True)
        
        for domain in domains:
            print(f"Fine-tuning models for {domain} domain...")
            
            # Fine-tune translation for domain
            if "translation" in self.domain_adapters:
                self.domain_adapters["translation"].fine_tune_domain_model(
                    domain, f"{output_dir}/translation"
                )
            
            # Fine-tune generation for domain
            if "generation" in self.domain_adapters:
                self.domain_adapters["generation"].fine_tune_domain_model(
                    domain, f"{output_dir}/generation"
                )
            
            # Fine-tune sentiment for domain
            if "sentiment" in self.domain_adapters:
                self.domain_adapters["sentiment"].fine_tune_domain_model(
                    domain, f"{output_dir}/sentiment"
                )
    
    def evaluate_all_models(self, test_datasets):
        """Evaluate all models on test datasets."""
        results = {}
        
        # Evaluate translation model
        if self.pipeline.translation_model and "translation" in test_datasets:
            print("Evaluating translation model...")
            results["translation"] = self.evaluator.evaluate_translation(
                self.pipeline.translation_model, test_datasets["translation"]
            )
        
        # Evaluate generation model
        if self.pipeline.generation_model and "generation" in test_datasets:
            print("Evaluating text generation model...")
            prompts = test_datasets["generation"]['prompt'] if 'prompt' in test_datasets["generation"].column_names else test_datasets["generation"]['text']
            results["generation"] = self.evaluator.evaluate_generation(
                self.pipeline.generation_model, prompts
            )
        
        # Evaluate sentiment model
        if self.pipeline.sentiment_model and "sentiment" in test_datasets:
            print("Evaluating sentiment analysis model...")
            results["sentiment"] = self.evaluator.evaluate_sentiment(
                self.pipeline.sentiment_model, test_datasets["sentiment"]
            )
        
        return results
    
    def process_text(self, text, task="all"):
        """Process text using the appropriate pipeline based on task."""
        # First, check if text is Hinglish and process if needed
        if any(0x900 <= ord(c) <= 0x97F for c in text) and any(ord(c) < 128 for c in text):
            print("Detected code-mixed text (Hinglish), preprocessing...")
            processed = self.hinglish_processor.preprocess_for_nlp_tasks(text)
            text = processed["hindi_only"]  # Use Hindi-only version
        
        # Process with the main pipeline
        results = self.pipeline.end_to_end_processing(text, task)
        
        # Enhanced with cultural context if applicable
        if task in ["translate", "all"] and "translation" in results:
            results["translation_with_cultural_context"] = self.cultural_adapter.enhance_translation(
                text, results["translation"]
            )
        
        if task in ["generate", "all"] and "generated_text" in results:
            results["generated_text_with_cultural_context"] = self.cultural_adapter.enhance_text_generation(
                text, results["generated_text"][0]
            )
        
        if task in ["sentiment", "all"] and "sentiment_analysis" in results:
            results["sentiment_with_cultural_context"] = self.cultural_adapter.enhance_sentiment_analysis(
                text, results["sentiment_analysis"]
            )
        
        return results
    
    def demo(self):
        """Run a demonstration of the Hindi NLP capabilities."""
        print("Hindi NLP Framework Demonstration")
        print("================================\n")
        
        # 1. Translation demo
        print("1. Machine Translation Demo")
        english_texts = [
            "Hello, how are you?",
            "India is a diverse country with many cultures and languages.",
            "The weather is quite pleasant today."
        ]
        
        print("\nEnglish to Hindi Translation:")
        for text in english_texts:
            translation = self.pipeline.translate(text)
            print(f"EN: {text}")
            print(f"HI: {translation}\n")
        
        # 2. Text Generation demo
        print("\n2. Hindi Text Generation Demo")
        hindi_prompts = [
            "भारत की संस्कृति के बारे में लिखें।",
            "दिल्ली शहर का वर्णन करें।"
        ]
        
        print("\nHindi Text Generation:")
        for prompt in hindi_prompts:
            generated = self.pipeline.generate(prompt)[0]
            print(f"Prompt: {prompt}")
            print(f"Generated: {generated[:200]}...\n")
        
        # 3. Sentiment Analysis demo
# Complete the Sentiment Analysis Demo section
        hindi_texts = [
            "यह फिल्म बहुत अच्छी थी, मुझे बहुत पसंद आई।",
            "सेवा बहुत खराब थी और खाना भी ठंडा था।",
            "इस उत्पाद के बारे में मेरी कोई विशेष राय नहीं है।"
        ]
        
        print("\nHindi Sentiment Analysis:")
        for text in hindi_texts:
            sentiment = self.pipeline.analyze_sentiment(text)
            print(f"Text: {text}")
            print(f"Sentiment: {sentiment['sentiment']}, Confidence: {sentiment['confidence']:.2f}\n")
        
        # 4. Hinglish Processing demo
        print("\n4. Hinglish Processing Demo")
        hinglish_texts = [
            "Main kal movie dekhne gaya tha and it was amazing!",
            "Kya aap mujhe bata sakte hain ki yeh kaise kaam karta hai?",
            "The weather aaj bahut acha hai."
        ]
        
        print("\nHinglish Processing:")
        for text in hinglish_texts:
            processed = self.hinglish_processor.preprocess_for_nlp_tasks(text)
            print(f"Original: {text}")
            print(f"Normalized: {processed['normalized']}")
            print(f"Hindi Only: {processed['hindi_only']}")
            print(f"Fully Transliterated: {processed['fully_transliterated']}\n")
        
        # 5. Cultural Adaptation demo
        print("\n5. Cultural Adaptation Demo")
        cultural_texts = [
            "During Diwali, people light lamps and celebrate with family.",
            "The wedding ceremony was followed by a grand reception."
        ]
        
        print("\nCultural Context Enhancement:")
        for text in cultural_texts:
            translation = self.pipeline.translate(text)
            enhanced = self.cultural_adapter.enhance_translation(text, translation)
            print(f"Original: {text}")
            print(f"Basic Translation: {translation}")
            print(f"Culturally Enhanced: {enhanced}\n")
        
        print("\nDemonstration completed successfully!")

    def run_batch_processing(self, input_file, output_file, task="translate"):
        """Process a batch of texts from a file."""
        # Check if file exists
        if not os.path.exists(input_file):
            print(f"Error: Input file {input_file} not found.")
            return False
        
        # Read input file
        with open(input_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        
        # Process each line
        results = []
        for i, line in enumerate(lines):
            line = line.strip()
            if line:
                print(f"Processing line {i+1}/{len(lines)}...")
                result = self.process_text(line, task)
                results.append(result)
        
        # Write results to output file
        with open(output_file, 'w', encoding='utf-8') as f:
            for i, result in enumerate(results):
                f.write(f"Input {i+1}: {lines[i].strip()}\n")
                
                if task == "translate" or task == "all":
                    if "translation" in result:
                        f.write(f"Translation: {result['translation']}\n")
                    if "translation_with_cultural_context" in result:
                        f.write(f"Enhanced Translation: {result['translation_with_cultural_context']}\n")
                
                if task == "generate" or task == "all":
                    if "generated_text" in result:
                        f.write(f"Generated Text: {result['generated_text'][0][:200]}...\n")
                
                if task == "sentiment" or task == "all":
                    if "sentiment_analysis" in result:
                        sentiment = result["sentiment_analysis"]
                        f.write(f"Sentiment: {sentiment['sentiment']}, Confidence: {sentiment['confidence']:.2f}\n")
                
                f.write("\n" + "-"*50 + "\n\n")
        
        print(f"Batch processing completed. Results saved to {output_file}")
        return True

    def save_models(self, output_dir="models/saved"):
        """Save all trained models."""
        os.makedirs(output_dir, exist_ok=True)
        
        # Save translation model
        if self.pipeline.translation_model:
            print("Saving translation model...")
            self.pipeline.translation_model.save_model(f"{output_dir}/translation")
        
        # Save generation model
        if self.pipeline.generation_model:
            print("Saving text generation model...")
            self.pipeline.generation_model.save_model(f"{output_dir}/generation")
        
        # Save sentiment model
        if self.pipeline.sentiment_model:
            print("Saving sentiment analysis model...")
            self.pipeline.sentiment_model.save_model(f"{output_dir}/sentiment")
        
        print(f"All models saved to {output_dir}")

    def load_models(self, input_dir="models/saved"):
        """Load saved models."""
        # Load translation model
        if os.path.exists(f"{input_dir}/translation"):
            print("Loading translation model...")
            self.pipeline.translation_model.load_model(f"{input_dir}/translation")
        
        # Load generation model
        if os.path.exists(f"{input_dir}/generation"):
            print("Loading text generation model...")
            self.pipeline.generation_model.load_model(f"{input_dir}/generation")
        
        # Load sentiment model
        if os.path.exists(f"{input_dir}/sentiment"):
            print("Loading sentiment analysis model...")
            self.pipeline.sentiment_model.load_model(f"{input_dir}/sentiment")
        
        print("Models loaded successfully.")


#################################
# 10. API INTEGRATION MODULE   #
#################################

class HindiNLPAPI:
    def __init__(self, app):
        """Initialize API with reference to main application."""
        self.app = app
    
    def translate_text(self, text, source_language="en", target_language="hi", domain=None):
        """API endpoint for translation."""
        response = {
            "input_text": text,
            "source_language": source_language,
            "target_language": target_language,
        }
        
        try:
            # Check if domain-specific translation is requested
            if domain and domain in self.app.domain_adapters["translation"].domain_data:
                # Use domain-adapted model if available
                # In a real implementation, would load domain model
                print(f"Using domain-adapted model for {domain}")
            
            # Process translation
            result = self.app.process_text(text, task="translate")
            
            if "translation" in result:
                response["translation"] = result["translation"]
            
            if "translation_with_cultural_context" in result:
                response["enhanced_translation"] = result["translation_with_cultural_context"]
            
            response["status"] = "success"
            
        except Exception as e:
            response["status"] = "error"
            response["error"] = str(e)
        
        return response
    
    def generate_text(self, prompt, max_length=100, num_return=1, domain=None):
        """API endpoint for text generation."""
        response = {
            "prompt": prompt,
            "max_length": max_length,
            "num_return": num_return
        }
        
        try:
            # Check if domain-specific generation is requested
            if domain and domain in self.app.domain_adapters["generation"].domain_data:
                # Use domain-adapted model if available
                print(f"Using domain-adapted model for {domain}")
            
            # Process generation
            result = self.app.process_text(prompt, task="generate")
            
            if "generated_text" in result:
                response["generated_text"] = result["generated_text"]
            
            if "generated_text_with_cultural_context" in result:
                response["enhanced_generated_text"] = result["generated_text_with_cultural_context"]
            
            response["status"] = "success"
            
        except Exception as e:
            response["status"] = "error"
            response["error"] = str(e)
        
        return response
    
    def analyze_sentiment(self, text, include_probabilities=False, domain=None):
        """API endpoint for sentiment analysis."""
        response = {
            "input_text": text,
            "include_probabilities": include_probabilities
        }
        
        try:
            # Check if domain-specific sentiment analysis is requested
            if domain and domain in self.app.domain_adapters["sentiment"].domain_data:
                # Use domain-adapted model if available
                print(f"Using domain-adapted model for {domain}")
            
            # Process sentiment analysis
            result = self.app.process_text(text, task="sentiment")
            
            if "sentiment_analysis" in result:
                sentiment = result["sentiment_analysis"]
                response["sentiment"] = sentiment["sentiment"]
                response["confidence"] = sentiment["confidence"]
                
                if include_probabilities and "probabilities" in sentiment:
                    response["probabilities"] = sentiment["probabilities"]
            
            if "sentiment_with_cultural_context" in result:
                response["enhanced_sentiment"] = result["sentiment_with_cultural_context"]
            
            response["status"] = "success"
            
        except Exception as e:
            response["status"] = "error"
            response["error"] = str(e)
        
        return response
    
    def process_hinglish(self, text, task="translate"):
        """API endpoint for processing Hinglish text."""
        response = {
            "input_text": text,
            "task": task
        }
        
        try:
            # First preprocess Hinglish
            processed = self.app.hinglish_processor.preprocess_for_nlp_tasks(text)
            response["preprocessed"] = processed
            
            # Then perform the requested task
            result = self.app.process_text(processed["hindi_only"], task)
            
            # Add task-specific results
            if task == "translate" and "translation" in result:
                response["translation"] = result["translation"]
            
            if task == "generate" and "generated_text" in result:
                response["generated_text"] = result["generated_text"]
            
            if task == "sentiment" and "sentiment_analysis" in result:
                response["sentiment"] = result["sentiment_analysis"]
            
            response["status"] = "success"
            
        except Exception as e:
            response["status"] = "error"
            response["error"] = str(e)
        
        return response
    
    def full_pipeline(self, text):
        """API endpoint for running full pipeline on text."""
        response = {
            "input_text": text
        }
        
        try:
            # Run all NLP tasks
            result = self.app.process_text(text, task="all")
            response.update(result)
            response["status"] = "success"
            
        except Exception as e:
            response["status"] = "error"
            response["error"] = str(e)
        
        return response


#################################
# 11. CONFIG AND UTILITIES     #
#################################

class HindiNLPConfig:
    """Configuration class for Hindi NLP Framework."""
    
    DEFAULT_CONFIG = {
        "models": {
            "translation": {
                "path": "ai4bharat/IndicBART",
                "device": "cuda" if torch.cuda.is_available() else "cpu",
                "batch_size": 16
            },
            "generation": {
                "path": "ai4bharat/IndicBART",
                "device": "cuda" if torch.cuda.is_available() else "cpu",
                "batch_size": 8,
                "max_length": 100
            },
            "sentiment": {
                "path": "ai4bharat/MuRIL-base",
                "device": "cuda" if torch.cuda.is_available() else "cpu",
                "batch_size": 32
            }
        },
        "data": {
            "translation": {
                "train": "data/iitb_corpus_train.csv",
                "val": "data/iitb_corpus_val.csv",
                "test": "data/iitb_corpus_test.csv"
            },
            "generation": {
                "train": "data/hindi_generation_train.csv",
                "val": "data/hindi_generation_val.csv",
                "test": "data/hindi_generation_test.csv"
            },
            "sentiment": {
                "train": "data/hindi_sentiment_train.csv",
                "val": "data/hindi_sentiment_val.csv",
                "test": "data/hindi_sentiment_test.csv"
            }
        },
        "domains": ["general", "legal", "technical", "medical", "entertainment"],
        "output_dir": "models/fine_tuned",
        "domain_output_dir": "models/domain_adapted",
        "logging": {
            "level": "INFO",
            "file": "hindi_nlp.log"
        },
        "api": {
            "host": "localhost",
            "port": 8000,
            "debug": True
        }
    }
    
    def __init__(self, config_file=None):
        """Initialize configuration, optionally from file."""
        self.config = self.DEFAULT_CONFIG
        
        if config_file and os.path.exists(config_file):
            self.load_config(config_file)
    
    def load_config(self, config_file):
        """Load configuration from JSON file."""
        try:
            with open(config_file, 'r') as f:
                loaded_config = json.load(f)
                
            # Update config with loaded values
            self._update_dict(self.config, loaded_config)
            print(f"Configuration loaded from {config_file}")
            
        except Exception as e:
            print(f"Error loading configuration: {str(e)}")
    
    def _update_dict(self, d, u):
        """Recursively update nested dictionary."""
        for k, v in u.items():
            if isinstance(v, dict) and k in d and isinstance(d[k], dict):
                self._update_dict(d[k], v)
            else:
                d[k] = v
    
    def save_config(self, config_file):
        """Save current configuration to JSON file."""
        try:
            with open(config_file, 'w') as f:
                json.dump(self.config, f, indent=2)
            print(f"Configuration saved to {config_file}")
            return True
        except Exception as e:
            print(f"Error saving configuration: {str(e)}")
            return False
    
    def get(self, key, default=None):
        """Get configuration value by key path."""
        keys = key.split('.')
        value = self.config
        
        try:
            for k in keys:
                value = value[k]
            return value
        except (KeyError, TypeError):
            return default
    
    def set(self, key, value):
        """Set configuration value by key path."""
        keys = key.split('.')
        d = self.config
        
        for k in keys[:-1]:
            if k not in d or not isinstance(d[k], dict):
                d[k] = {}
            d = d[k]
        
        d[keys[-1]] = value


class HindiNLPLogger:
    """Logging class for Hindi NLP Framework."""
    
    def __init__(self, config):
        """Initialize logger with configuration."""
        self.config = config
        self.logger = self._setup_logger()
    
    def _setup_logger(self):
        """Set up logging configuration."""
        logger = logging.getLogger("HindiNLP")
        
        # Get log level from config
        level_str = self.config.get("logging.level", "INFO")
        level = getattr(logging, level_str)
        
        logger.setLevel(level)
        
        # Create console handler
        console_handler = logging.StreamHandler()
        console_handler.setLevel(level)
        
        # Create file handler if specified
        log_file = self.config.get("logging.file")
        if log_file:
            os.makedirs(os.path.dirname(log_file), exist_ok=True)
            file_handler = logging.FileHandler(log_file)
            file_handler.setLevel(level)
            
            # Create formatter
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
            )
            file_handler.setFormatter(formatter)
            
            # Add handler to logger
            logger.addHandler(file_handler)
        
        # Add console handler
        logger.addHandler(console_handler)
        
        return logger
    
    def info(self, message):
        """Log info message."""
        self.logger.info(message)
    
    def warning(self, message):
        """Log warning message."""
        self.logger.warning(message)
    
    def error(self, message):
        """Log error message."""
        self.logger.error(message)
    
    def debug(self, message):
        """Log debug message."""
        self.logger.debug(message)
    
    def critical(self, message):
        """Log critical message."""
        self.logger.critical(message)


#################################
# 12. COMMAND LINE INTERFACE   #
#################################

class HindiNLPCLI:
    """Command Line Interface for Hindi NLP Framework."""
    
    def __init__(self, app):
        """Initialize CLI with reference to main application."""
        self.app = app
    
    def parse_args(self):
        """Parse command line arguments."""
        parser = argparse.ArgumentParser(description="Hindi NLP Framework CLI")
        
        # Main subparsers
        subparsers = parser.add_subparsers(dest="command", help="Command")
        
        # Translation command
        translate_parser = subparsers.add_parser("translate", help="Translate text")
        translate_parser.add_argument("text", help="Text to translate")
        translate_parser.add_argument("--src", default="en", help="Source language (default: en)")
        translate_parser.add_argument("--tgt", default="hi", help="Target language (default: hi)")
        translate_parser.add_argument("--cultural", action="store_true", help="Use cultural enhancement")
        translate_parser.add_argument("--domain", help="Specify domain for translation")
        
        # Generation command
        generate_parser = subparsers.add_parser("generate", help="Generate text")
        generate_parser.add_argument("prompt", help="Prompt for text generation")
        generate_parser.add_argument("--length", type=int, default=100, help="Maximum length (default: 100)")
        generate_parser.add_argument("--num", type=int, default=1, help="Number of generations (default: 1)")
        generate_parser.add_argument("--cultural", action="store_true", help="Use cultural enhancement")
        generate_parser.add_argument("--domain", help="Specify domain for generation")
        
        # Sentiment command
        sentiment_parser = subparsers.add_parser("sentiment", help="Analyze sentiment")
        sentiment_parser.add_argument("text", help="Text for sentiment analysis")
        sentiment_parser.add_argument("--probs", action="store_true", help="Include class probabilities")
        sentiment_parser.add_argument("--cultural", action="store_true", help="Use cultural enhancement")
        sentiment_parser.add_argument("--domain", help="Specify domain for sentiment analysis")
        
        # Hinglish command
        hinglish_parser = subparsers.add_parser("hinglish", help="Process Hinglish text")
        hinglish_parser.add_argument("text", help="Hinglish text to process")
        hinglish_parser.add_argument("--task", choices=["translate", "generate", "sentiment", "all"],
                                     default="all", help="Task to perform on processed text")
        
        # Batch processing command
        batch_parser = subparsers.add_parser("batch", help="Process batch of texts")
        batch_parser.add_argument("input_file", help="Input file with texts")
        batch_parser.add_argument("output_file", help="Output file for results")
        batch_parser.add_argument("--task", choices=["translate", "generate", "sentiment", "all"],
                                 default="translate", help="Task to perform (default: translate)")
        
        # Demo command
        subparsers.add_parser("demo", help="Run demonstration")
        
        # Initialize models command
        init_parser = subparsers.add_parser("init", help="Initialize models")
        init_parser.add_argument("--translation", help="Translation model path")
        init_parser.add_argument("--generation", help="Generation model path")
        init_parser.add_argument("--sentiment", help="Sentiment model path")
        
        # Fine-tune command
        finetune_parser = subparsers.add_parser("finetune", help="Fine-tune models")
        finetune_parser.add_argument("--models", choices=["all", "translation", "generation", "sentiment"],
                                    default="all", help="Models to fine-tune (default: all)")
        finetune_parser.add_argument("--domains", nargs="+", help="Domains to fine-tune for")
        finetune_parser.add_argument("--output", default="models/fine_tuned", help="Output directory")
        
        # Evaluate command
        evaluate_parser = subparsers.add_parser("evaluate", help="Evaluate models")
        evaluate_parser.add_argument("--models", choices=["all", "translation", "generation", "sentiment"],
                                    default="all", help="Models to evaluate (default: all)")
        evaluate_parser.add_argument("--test-data", help="Test data directory")
        
        return parser.parse_args()
    
    def run(self):
        """Run CLI application."""
        args = self.parse_args()
        
        if args.command == "translate":
            result = self.app.process_text(args.text, task="translate")
            if "translation" in result:
                print(f"Translation: {result['translation']}")
            if args.cultural and "translation_with_cultural_context" in result:
                print(f"Enhanced Translation: {result['translation_with_cultural_context']}")
        
        elif args.command == "generate":
            result = self.app.process_text(args.prompt, task="generate")
            if "generated_text" in result:
                for i, text in enumerate(result["generated_text"]):
                    print(f"Generated Text {i+1}:")
                    print(text)
            if args.cultural and "generated_text_with_cultural_context" in result:
                print(f"Enhanced Generation: {result['generated_text_with_cultural_context']}")
        
        elif args.command == "sentiment":
            result = self.app.process_text(args.text, task="sentiment")
            if "sentiment_analysis" in result:
                sentiment = result["sentiment_analysis"]
                print(f"Sentiment: {sentiment['sentiment']}")
                print(f"Confidence: {sentiment['confidence']:.2f}")
                if args.probs and "probabilities" in sentiment:
                    print("Class Probabilities:")
                    for label, prob in sentiment["probabilities"].items():
                        print(f"  {label}: {prob:.2f}")
            if args.cultural and "sentiment_with_cultural_context" in result:
                print(f"Enhanced Sentiment Analysis: {result['sentiment_with_cultural_context']}")
        
        elif args.command == "hinglish":
            processed = self.app.hinglish_processor.preprocess_for_nlp_tasks(args.text)
            print("Hinglish Processing Results:")
            print(f"Normalized: {processed['normalized']}")
            print(f"Hindi Only: {processed['hindi_only']}")
            print(f"Fully Transliterated: {processed['fully_transliterated']}")
            
            if args.task != "all":
                result = self.app.process_text(processed["hindi_only"], task=args.task)
                if args.task == "translate" and "translation" in result:
                    print(f"\nTranslation: {result['translation']}")
                elif args.task == "generate" and "generated_text" in result:
                    print(f"\nGenerated Text: {result['generated_text'][0]}")
                elif args.task == "sentiment" and "sentiment_analysis" in result:
                    sentiment = result["sentiment_analysis"]
                    print(f"\nSentiment: {sentiment['sentiment']}")
                    print(f"Confidence: {sentiment['confidence']:.2f}")
        
        elif args.command == "batch":
            self.app.run_batch_processing(args.input_file, args.output_file, args.task)
        
        elif args.command == "demo":
            self.app.demo()
        
        elif args.command == "init":
            translation_model = args.translation or "ai4bharat/IndicBART"
            generation_model = args.generation or "ai4bharat/IndicBART"
            sentiment_model = args.sentiment or "ai4bharat/MuRIL-base"
            
            self.app.initialize_models(
                translation_model=translation_model,
                generation_model=generation_model,
                sentiment_model=sentiment_model
            )
        
        elif args.command == "finetune":
            if args.models == "all" or args.models == "translation":
                print("Fine-tuning translation model...")
                # Fine-tuning logic
            
            if args.models == "all" or args.models == "generation":
                print("Fine-tuning generation model...")
                # Fine-tuning logic
            
            if args.models == "all" or args.models == "sentiment":
                print("Fine-tuning sentiment model...")
                # Fine-tuning logic
            
            if args.domains:
                print(f"Fine-tuning for domains: {', '.join(args.domains)}")
                self.app.fine_tune_domain_models(domains=args.domains, output_dir=args.output)
        
        elif args.command == "evaluate":
            print("Evaluating models...")
            # Evaluation logic based on args.models
        
        else:
            print("Please specify a command. Use --help for options.")


#################################
# 13. MAIN EXECUTION           #
#################################

def main():
    """Main function to run the Hindi NLP framework."""
    # Load configuration
    config = HindiNLPConfig("config.json")
    
    # Set up logging
    logger = HindiNLPLogger(config)
    logger.info("Starting Hindi NLP Framework")
    
    # Initialize main application
    app = HindiNLPApp()
    
    # Initialize API
    api = HindiNLPAPI(app)
    
    # Initialize CLI
    cli = HindiNLPCLI(app)
    
    # Parse command line arguments and run
    if len(sys.argv) > 1:
        cli.run()
    else:
        # If no arguments, run demo
        logger.info("No command specified, running demo")
        app.initialize_models()
        app.demo()


if __name__ == "__main__":
    main()

Using device: cpu


[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1028)>
[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1028)>


FileNotFoundError: [Errno 2] No such file or directory: ''