# In [None]:
# 🌐 Language Detection and Processing
# TODO: Implement multilingual capabilities

import re
from collections import defaultdict, Counter
from itertools import combinations

import networkx as nx # Assuming networkx is used in EntityRelationshipMapper which is called by this class
import numpy as np
from deep_translator import GoogleTranslator
from langdetect import detect_langs, LangDetectException
from sentence_transformers import SentenceTransformer # Added this import


class MultilingualProcessor:
    """
    Multilingual text processing: language detection, translation,
    cross-lingual similarity and keyword-based cultural-context extraction.

    Language detection is delegated to ``langdetect``, translation to
    ``deep_translator.GoogleTranslator`` and embeddings to a
    ``SentenceTransformer`` model. The cultural-context features are plain
    substring lookups against the small keyword tables built in ``__init__``.
    """

    # Detections whose probability is at or below this value are discarded.
    MIN_DETECTION_CONFIDENCE = 0.1

    # Inclusive (low, high) band for len(translation) / len(source); a ratio
    # outside the band marks the translation as suspect.
    LENGTH_RATIO_BOUNDS = (0.7, 1.3)

    # Four-digit years 1800-2099. The group is non-capturing so that
    # ``findall`` returns the whole year rather than just the century prefix.
    _YEAR_PATTERN = re.compile(r'\b(?:18|19|20)\d{2}\b')

    # Placeholder event names (extend with a real DB or API). Lower-case,
    # because they are matched against lower-cased text.
    _HISTORICAL_EVENTS = ('world war', 'cold war', 'industrial revolution', 'french revolution')

    def __init__(self):
        """Load the sentence-embedding model and build the keyword tables."""
        self.embedding_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

        # Idioms per language; stored lower-case because they are matched
        # against lower-cased text.
        self.idioms = {
            'en': ['break the ice', 'spill the beans', 'kick the bucket'],
            'es': ['poner los puntos sobre las íes', 'estar en las nubes'],
            'fr': ['poser un lapin', 'donner sa langue au chat']
        }

        # Regional references per language (lower-cased at match time).
        self.regional_keywords = {
            'en': ['New York', 'Wall Street', 'Thanksgiving'],
            'es': ['Madrid', 'La Tomatina', 'Santiago'],
            'fr': ['Paris', 'Bastille', 'Lyon']
        }

        # Historical/social nuances keywords (example)
        self.social_nuances = {
            'en': ['civil rights', 'Black Lives Matter', 'Me Too'],
            'es': ['derechos civiles', 'Black Lives Matter', 'Me Too'],
            'fr': ['droits civiques', 'Black Lives Matter', 'Me Too']
        }

    def detect_language(self, text):
        """
        Detect the language(s) of ``text`` with confidence scores.

        Args:
            text: The text to analyse.

        Returns:
            List of (language_code, confidence) tuples sorted by confidence
            descending. ``[("unknown", 0.0)]`` is returned when ``text`` is
            empty, whitespace-only or not a string, when the detector raises
            ``LangDetectException``, or when no candidate clears
            ``MIN_DETECTION_CONFIDENCE`` - so the result is never empty.

        NOTE(review): langdetect documents its algorithm as non-deterministic
        on short or ambiguous text; seed ``DetectorFactory`` at start-up if
        reproducible results are required.
        """
        unknown = [("unknown", 0.0)]

        # Nothing to detect; skip the detector call entirely.
        if not isinstance(text, str) or not text.strip():
            return unknown

        try:
            detections = detect_langs(text)
        except LangDetectException:
            return unknown

        ranked = sorted(
            ((det.lang, det.prob) for det in detections
             if det.prob > self.MIN_DETECTION_CONFIDENCE),
            key=lambda pair: pair[1],
            reverse=True,
        )
        return ranked or unknown

    def translate_text(self, text, target_language='en'):
        """
        Translate ``text`` and attach a heuristic quality score plus
        cultural-adaptation hints.

        Args:
            text: Source text; the source language is auto-detected by the
                translator.
            target_language: Language code to translate into.

        Returns:
            dict with 'translated_text', 'quality_score' and
            'cultural_adaptation_notes'. On any failure the dict also carries
            an 'error' message, with an empty translation, a score of 0.0 and
            no notes.
        """
        # Deliberately broad best-effort boundary: any failure while building
        # the translator, translating or scoring is reported through the
        # 'error' key instead of propagating to the caller.
        try:
            translator = GoogleTranslator(source='auto', target=target_language)
            translated = translator.translate(text)

            quality_score = self._assess_translation_quality(text, translated)
            cultural_adaptation_notes = self._detect_cultural_elements(text, target_language)

            return {
                'translated_text': translated,
                'quality_score': quality_score,
                'cultural_adaptation_notes': cultural_adaptation_notes
            }
        except Exception as e:
            return {'error': str(e), 'translated_text': '', 'quality_score': 0.0, 'cultural_adaptation_notes': []}

    def _assess_translation_quality(self, source_text, translated_text):
        """
        Score a translation using a character-length-ratio heuristic.

        Returns:
            1.0 when len(translated_text) / len(source_text) lies inside
            ``LENGTH_RATIO_BOUNDS`` (inclusive), otherwise 0.5 to flag a
            suspect translation. An empty source is treated as length 1 to
            avoid division by zero.

        Could be expanded with a language-specific stop-word check or LM scoring.
        """
        low, high = self.LENGTH_RATIO_BOUNDS
        ratio = len(translated_text) / max(len(source_text), 1)
        return 1.0 if low <= ratio <= high else 0.5

    def _detect_cultural_elements(self, text, target_language):
        """
        List idioms present in ``text`` that might need cultural adaptation.

        Idioms of every known language are checked with a case-insensitive
        substring match. ``target_language`` is accepted for interface
        compatibility but is not currently used.

        Returns:
            List of human-readable note strings, one per idiom found.
        """
        # Lower-case once rather than once per idiom.
        text_lower = text.lower()

        notes = []
        for lang, idiom_list in self.idioms.items():
            for idiom in idiom_list:
                if idiom in text_lower:
                    notes.append(f"Contains idiom '{idiom}' from {lang} language; consider cultural adaptation.")
        return notes

    def analyze_cross_lingual(self, articles_by_language):
        """
        Compare coverage across languages via mean sentence embeddings.

        Args:
            articles_by_language: dict(lang_code -> list of article texts)

        Returns:
            dict of (lang1, lang2) -> cosine similarity (plain ``float``) of
            the two languages' mean embeddings. Each unordered pair appears
            once, in the input dict's iteration order. A language with no
            articles gets a zero vector, and any pair involving a zero-norm
            vector scores 0.0.
        """
        mean_embeddings = {}
        for lang, articles in articles_by_language.items():
            if not articles:
                # No text to embed: use a zero vector so the pair scores 0.0.
                mean_embeddings[lang] = np.zeros(self.embedding_model.get_sentence_embedding_dimension())
                continue
            embeddings = self.embedding_model.encode(articles, convert_to_tensor=True)
            mean_embeddings[lang] = embeddings.mean(axis=0).cpu().numpy()

        # Each norm is needed for several pairs; compute it once per language.
        norms = {lang: np.linalg.norm(vec) for lang, vec in mean_embeddings.items()}

        similarity_matrix = {}
        for lang1, lang2 in combinations(mean_embeddings, 2):
            if norms[lang1] == 0 or norms[lang2] == 0:
                sim = 0.0
            else:
                dot = np.dot(mean_embeddings[lang1], mean_embeddings[lang2])
                # Cast the NumPy scalar to a built-in float for callers.
                sim = float(dot / (norms[lang1] * norms[lang2]))
            similarity_matrix[(lang1, lang2)] = sim

        return similarity_matrix

    def extract_cultural_context(self, text, source_language):
        """
        Identify cultural references in ``text`` by keyword lookup.

        Args:
            text: The text to scan (matching is case-insensitive).
            source_language: Key into the per-language keyword tables; an
                unknown language simply yields empty lists.

        Returns dict with:
        - idioms_found: list
        - regional_references: list
        - historical_context: dict with 'years_mentioned' and 'events_mentioned'
        - social_nuances: list
        """
        text_lower = text.lower()

        idioms_found = [idiom for idiom in self.idioms.get(source_language, [])
                        if idiom in text_lower]

        regional_refs = [region for region in self.regional_keywords.get(source_language, [])
                         if region.lower() in text_lower]

        historical_context = self._extract_historical_context(text_lower)

        social_nuances_found = [nuance for nuance in self.social_nuances.get(source_language, [])
                                if nuance.lower() in text_lower]

        return {
            'idioms_found': idioms_found,
            'regional_references': regional_refs,
            'historical_context': historical_context,
            'social_nuances': social_nuances_found
        }

    def _extract_historical_context(self, text_lower):
        """
        Naive historical-context detection: years and known event names.

        Args:
            text_lower: Already lower-cased text.

        Returns:
            dict with 'years_mentioned' (unique four-digit years 1800-2099 as
            strings, sorted ascending) and 'events_mentioned' (entries of
            ``_HISTORICAL_EVENTS`` found as substrings).

        This can be improved by using named entity recognition or knowledge bases.
        """
        # De-duplicate, then sort so the output order is deterministic.
        years = sorted(set(self._YEAR_PATTERN.findall(text_lower)))

        found_events = [event for event in self._HISTORICAL_EVENTS if event in text_lower]

        return {'years_mentioned': years, 'events_mentioned': found_events}


if __name__ == "__main__":
    # Manual smoke run: exercise each public method once and print the result.
    demo = MultilingualProcessor()

    # 1) Language detection on a mixed French/English snippet.
    mixed_snippet = "Bonjour, comment ça va? Hello!"
    detected = demo.detect_language(mixed_snippet)
    print("Language Detection:", detected)

    # 2) Translation of a short French sentence into English.
    translation = demo.translate_text("Je suis très content.", target_language='en')
    print("Translation:", translation)

    # 3) Cross-lingual similarity over small per-language article sets.
    sample_articles = {
        'en': ["The economy is growing steadily.", "New policies affect markets."],
        'es': ["La economía está creciendo lentamente.", "Nuevas políticas afectan los mercados."],
        'fr': ["L'économie connaît une croissance stable.", "De nouvelles politiques impactent les marchés."]
    }
    similarities = demo.analyze_cross_lingual(sample_articles)
    print("Cross-lingual Similarity:", similarities)

    # 4) Cultural-context extraction on an English sentence.
    english_sentence = "He decided to break the ice at the party in New York during the Cold War."
    context = demo.extract_cultural_context(english_sentence, 'en')
    print("Cultural Context:", context)