In [None]:
from typing import List, Tuple
from lingua import Language, LanguageDetectorBuilder


class LanguageDetector:
    """
    Detects the language of a text using the Lingua library.

    Detection is restricted to English, French and German. Results are
    reported as lowercase ISO 639-1 codes ("en", "fr", "de").
    """

    SUPPORTED_LANGUAGES = {"en", "fr", "de"}  # Using set for fast lookup
    DEFAULT_LANGUAGE = "en"  # Returned (with score 1.0) when nothing qualifies

    def __init__(self):
        """
        Initializes the LanguageDetector with a Lingua model for supported languages.
        """
        languages = [Language.ENGLISH, Language.FRENCH, Language.GERMAN]
        self.detector = LanguageDetectorBuilder.from_languages(*languages).build()

    def detect_language(self, text: str, k: int = 3, threshold: float = 0.0) -> Tuple[List[str], List[float]]:
        """
        Return up to ``k`` supported languages for ``text`` with their confidence.

        Args:
            text: The text to analyse.
            k: Maximum number of languages to return. Must be at least 1.
            threshold: Minimum confidence (inclusive) a language needs in
                order to be kept.

        Returns:
            A ``(languages, scores)`` pair of equal-length lists, in the order
            the underlying detector reports its candidates. When no candidate
            qualifies, ``([DEFAULT_LANGUAGE], [1.0])`` is returned instead.

        Raises:
            ValueError: If ``k`` is smaller than 1.
            RuntimeError: If the underlying detector fails; the original
                exception is chained as ``__cause__``.

        Note:
            With the default ``threshold`` of 0.0 every candidate whose
            confidence is non-negative passes the filter, so the default
            language fallback is only reached when the detector returns no
            usable candidate or a positive threshold is supplied.
        """
        # A non-positive k would never satisfy the `== k` stop condition below
        # and would silently disable the limit, so reject it explicitly.
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")

        try:
            confidence_values = self.detector.compute_language_confidence_values(text)
        except Exception as e:
            # RuntimeError is still caught by callers using `except Exception`.
            raise RuntimeError(f"An unexpected error occurred during language detection: {e}") from e

        detected_languages: List[str] = []
        detected_scores: List[float] = []

        for lang_prob in confidence_values:
            # Enum member name (e.g. "EN") -> lowercase code used by SUPPORTED_LANGUAGES.
            iso_code = lang_prob.language.iso_code_639_1.name.lower()
            if iso_code in self.SUPPORTED_LANGUAGES and lang_prob.value >= threshold:
                detected_languages.append(iso_code)
                detected_scores.append(lang_prob.value)
                if len(detected_languages) == k:
                    break

        # Nothing qualified: fall back to the default language with full confidence.
        if not detected_languages:
            detected_languages.append(self.DEFAULT_LANGUAGE)
            detected_scores.append(1.0)

        return detected_languages, detected_scores

In [46]:
# Smoke test: request only the best candidate and drop anything scoring below 0.3.
detector = LanguageDetector()
langs, scores = detector.detect_language(text="report csr", k=1, threshold=0.3)
for label, result in (("Languages:", langs), ("Scores:", scores)):
    print(label, result)

Languages: ['en']
Scores: [0.5227425370744224]


In [None]:
# Full test run of the language detector
detector = LanguageDetector()

# Sample inputs for the detector, grouped by the kind of difficulty they probe.
test_cases = [
    # Simple cases
    "paiement", "payment", "zahlung",
    # Short, ambiguous words
    "le", "the", "der",
    # Short phrases
    "bonjour monde", "hello world", "hallo welt",
    # Words shared between languages
    "merci",  # French, but also known in English
    "merci beaucoup",
    "danke",
    "danke schon",
    # Longer texts
    "Le système de paiement est sécurisé et fiable.",
    "The payment system is secure and reliable.",
    "Das Zahlungssystem ist sicher und zuverlässig.",
    # Ambiguous texts (mixed vocabulary)
    "paiement payment", "zahlung paiement", "le the der",
    # Technical terms
    "facture", "invoice", "rechnung",
    # Very short texts
    "ok", "oui", "ja",
    # Special characters
    "paiement!", "zahlung?", "payment.",
    # Empty or almost empty text
    "", "   ", "123", "!@#",
    # Compound expressions
    "paiement en ligne", "online payment", "online-zahlung",
    # Multilingual text
    "Le paiement et the payment sont identiques",
    "Das ist ein test und payment test",
    # Spelling mistakes
    "payement",  # French misspelling
    "paymant",   # English misspelling
    "zalung",    # German misspelling
    # Numbers and symbols
    "paiement 100 euros", "payment $50", "zahlung 200€",
    # Pronouns and articles
    "le paiement", "the payment", "die zahlung",
    # Conjugated verbs
    "je paie", "I pay", "ich zahle",
    # Random text
    "xyz abc def", "blabla test", "random text here",
    # Very common words
    "de la", "of the", "von der",
    # Accent variants
    "paiment",  # no diacritic
    "zahlüng",  # with ü
    "paîment",  # with î
]

# Run every sample through the detector and print the ranked languages.
print("=== TESTS COMPLETS DU DÉTECTEUR DE LANGUE ===\n")

for case_no, text in enumerate(test_cases, start=1):
    header = f"Test {case_no:2d}: '{text}'"
    try:
        langs, scores = detector.detect_language(text, k=3)
        print(header)
        print(f"  Langues détectées: {langs}")
        formatted_scores = [f"{score:.4f}" for score in scores]
        print(f"  Scores: {formatted_scores}")

        # Sanity check: every reported code must be one the detector supports.
        if any(lang not in detector.SUPPORTED_LANGUAGES for lang in langs):
            print("  ⚠️  Attention: langues non supportées détectées!")

        print()

    except Exception as e:
        # Keep going so one failing sample does not hide the remaining results.
        print(header)
        print(f"  ❌ Erreur: {e}\n")

# Targeted tests checking that each of the three languages is detected
print("=== TESTS SPÉCIFIQUES POUR CHAQUE LANGUE ===\n")

french_tests = [
    "paiement",
    "facture",
    "commande",
    "livraison",
    "retour",
    "Le client a effectué un paiement sécurisé.",
    "Merci pour votre commande, elle sera livrée demain.",
    "Service client disponible 24h/24.",
]

english_tests = [
    "payment",
    "invoice",
    "order",
    "delivery",
    "return",
    "The customer made a secure payment.",
    "Thank you for your order, it will be delivered tomorrow.",
    "Customer service available 24/7.",
]

german_tests = [
    "zahlung",
    "rechnung",
    "bestellung",
    "lieferung",
    "rückgabe",
    "Der kunde hat eine sichere zahlung durchgeführt.",
    "Vielen dank für ihre bestellung, sie wird morgen geliefert.",
    "Kundenservice rund um die uhr verfügbar.",
]

# Only the first five entries (the single words) of each list are exercised.
for heading, samples in (
    ("Tests français:", french_tests),
    ("\nTests anglais:", english_tests),
    ("\nTests allemand:", german_tests),
):
    print(heading)
    for text in samples[:5]:
        langs, scores = detector.detect_language(text, k=3)
        formatted_scores = [f"{s:.3f}" for s in scores]
        print(f"  '{text}' -> {langs} (scores: {formatted_scores})")

# Edge-case tests
print("\n=== TESTS DES CAS LIMITES ===\n")

# (input text, human-readable description) pairs
edge_cases = [
    ("", "Texte vide"),
    ("   ", "Espaces seulement"),
    ("123456", "Nombres seulement"),
    ("!@#$%^&*()", "Symboles seulement"),
    ("a", "Lettre seule"),
    ("I", "Pronom anglais court"),
    ("OK", "Abréviation"),
    ("etc.", "Abréviation avec point"),
]

for text, description in edge_cases:
    langs, scores = detector.detect_language(text, k=3)
    formatted_scores = [f"{s:.3f}" for s in scores]
    print(f"{description}: '{text}'")
    print(f"  Résultat: {langs} (scores: {formatted_scores})")

    # Heuristic: the fallback yields the default language with score 1.0.
    # NOTE(review): a genuine default-language result scoring exactly 1.0
    # matches this check too, so the two cases cannot be told apart here.
    looks_like_fallback = langs[0] == detector.DEFAULT_LANGUAGE and scores[0] == 1.0
    if looks_like_fallback:
        print(f"  🔄 Utilise la langue par défaut ({detector.DEFAULT_LANGUAGE})")
    print()