# NLP - Scalable, Accurate, Rule-Based pipeline


In [None]:
# ============================================================
# 🧩 Mercury App Configuration
# ============================================================
# Declares the Mercury app metadata and the four input widgets that the
# execution section at the bottom of the file reads: the uploaded file,
# the translation toggle, the confidence-threshold slider and the run button.
# NOTE(review): the widget labels below contain mis-encoded emoji bytes;
# confirm the file's encoding before relying on how they render.
import mercury as mr

app = mr.App(
    title="Advanced NLP Text Classification Dashboard - Multilingual",
    description="Upload CSV or Excel files to classify transcripts with intelligent language detection, translation, and sentiment analysis."
)

file = mr.File(label="üìÅ Upload Dataset (.csv or .xlsx)")
enable_translation = mr.Checkbox(label="üåç Enable Automatic Translation", value=True)
translation_confidence = mr.Slider(label="üéØ Translation Confidence Threshold", value=0.7, min=0.0, max=1.0, step=0.05)
run_button = mr.Button(label="üöÄ Run NLP Pipeline")


# ============================================================
# 📦 Imports & Configuration
# ============================================================
import os, re, time, warnings, html, unicodedata
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
from textblob import TextBlob
from afinn import Afinn
from langdetect import detect, detect_langs, DetectorFactory, LangDetectException
from deep_translator import GoogleTranslator
from typing import Dict, Tuple
from collections import Counter

# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# Pin langdetect's seed (presumably so detection is repeatable between runs;
# confirm against the langdetect documentation).
DetectorFactory.seed = 0
# Single shared AFINN scorer, created once at import time.
af = Afinn()

# Configuration
NUM_THREADS = 8  # max_workers for the ThreadPoolExecutor in run_pipeline
MIN_TEXT_LENGTH = 3  # minimum whitespace-separated word count process_row will classify
MIN_TRANSLATION_LENGTH = 3  # minimum character count smart_translate will translate
SENTIMENT_WEIGHTS = {'textblob': 0.6, 'afinn': 0.4}  # relative weights for the TextBlob and AFINN scores


# ============================================================
# 📚 Enhanced Category and Subcategory Keywords
# ============================================================
# Category label -> keywords/phrases that vote for that category in
# predict_category. Matching is done against lowercased text: single words of
# three characters or fewer are matched on word boundaries, everything else as
# a plain substring, and multi-word phrases count 1.5 instead of 1.
# NOTE: "lag" is listed under both "playback issue" and "performance issue".
TOPIC_KEYWORDS = {
    "login issue": [
        "login", "log in", "sign in", "sign-in", "signin", "sign out", "sign-out", "signout",
        "password", "forgot password", "reset password", "authentication",
        "verify account", "verification code", "2fa", "two-factor", "two factor",
        "unable to access account", "can't log in", "cannot login"
    ],
    "account issue": [
        "account", "profile", "username", "display name",
        "linked account", "merge account", "multiple accounts",
        "email change", "update details", "account disabled",
        "account locked", "deactivate account", "delete account"
    ],
    "playback issue": [
        "playback", "stream", "music not playing", "song not playing",
        "track skipped", "buffering", "lag", "pause", "stuck",
        "stops suddenly", "won't play", "audio issue", "no sound",
        "silence", "volume problem", "audio quality"
    ],
    "device issue": [
        "bluetooth", "speaker", "carplay", "android auto", "smart tv",
        "echo", "alexa", "chromecast", "airplay", "headphones",
        "device not showing", "device disconnected", "connection issue"
    ],
    "content restriction": [
        "song not available", "track unavailable", "region restriction",
        "country restriction", "not licensed", "greyed out", "removed song",
        "can't find song", "missing track"
    ],
    "ad issue": [
        "ads", "advertisement", "too many ads", "ad volume",
        "ad playing", "premium ads", "commercials", "ad frequency"
    ],
    "recommendation issue": [
        "recommendations", "discover weekly", "radio", "algorithm",
        "curated", "autoplay", "song suggestions", "not relevant",
        "bad recommendations"
    ],
    "ui issue": [
        "interface", "layout", "design", "dark mode", "theme",
        "buttons not working", "search not working", "filter not working",
        "navigation", "menu"
    ],
    "general feedback": [
        "suggestion", "feedback", "recommend", "love spotify",
        "like app", "app improvement", "feature request", "enhancement"
    ],
    "network failure": [
        "network", "connectivity", "internet", "server",
        "connection failed", "offline", "not connecting",
        "spotify down", "timeout", "dns", "proxy", "vpn"
    ],
    "app crash": [
        "crash", "crashed", "app closed", "stopped working", "freeze",
        "freezing", "hang", "bug", "error message", "glitch",
        "unresponsive", "not responding"
    ],
    "performance issue": [
        "slow", "lag", "delay", "performance", "loading", "slow loading",
        "takes forever", "laggy"
    ],
    "data sync issue": [
        "sync", "not syncing", "listening history", "recently played",
        "activity feed", "spotify connect", "data lost", "missing data",
        "playlist not syncing"
    ],
    "subscription issue": [
        "subscription", "plan", "premium", "cancel", "renew",
        "billing", "charged", "payment", "refund", "invoice",
        "upgrade", "downgrade", "free trial", "family plan",
        "student plan", "gift card", "promo code", "spotify wrapped",
        "card", "payment failed"
    ],
}

# Category label -> {subcategory label -> keywords}. Only the categories that
# appear as keys here can receive a subcategory; predict_subcategory returns
# ("", 0) for every other category.
SUBCATEGORY_KEYWORDS = {
    "subscription issue": {
        "payment": ["refund", "charged", "billing", "invoice", "payment", "payment failed", "card declined"],
        "cancel": ["cancel", "unsubscribe", "stop subscription", "end subscription"],
        "upgrade": ["upgrade", "family plan", "student plan", "premium", "switch plan"],
    },
    "account issue": {
        "login": ["login", "password", "signin", "sign in", "authentication"],
        "profile": ["profile", "email", "username", "display name", "account settings"],
    },
    "device issue": {
        "mobile": ["phone", "android", "iphone", "ios", "mobile app"],
        "car": ["carplay", "android auto", "car", "vehicle"],
        "smart_device": ["alexa", "echo", "chromecast", "smart tv", "airplay"],
    },
}

# Enhanced sentiment keywords
# Lookup tables for the rule-based part of the sentiment classifier. All word
# and phrase lists are tested as substrings of the lowercased text by
# classify_sentiment_with_confidence; "meaningless_patterns" holds regexes that
# is_meaningless_text applies with re.match to the lowercased text.
SENTIMENT_KEYWORDS = {
    # Any hit returns "negative" with confidence 0.85.
    "very_negative": [
        "bad", "poor", "terrible", "awful", "worst", "horrible", "pathetic",
        "disappointing", "useless", "waste", "boring", "dull", "confusing",
        "frustrating", "annoying", "unpleasant", "uncomfortable"
    ],
    # Any hit returns "negative" with confidence 0.75.
    "moderate_negative": [
        "cold", "asleep", "feeling asleep", "sleepy", "tired", "exhausting",
        "no equipment", "no tools", "lack of", "missing", "insufficient"
    ],
    # Checked before the single-word lists; any hit returns "negative" at 0.9.
    "negative_phrases": [
        "room was cold", "cold room", "feeling asleep",
        "no equipment", "had to write everything", "lack of equipment"
    ],
    # Any hit returns "positive" with confidence 0.85.
    "positive": [
        "very good", "excellent", "engaging", "superb", "amazing",
        "all good", "good going", "overall nice", "good", "interactive",
        "mind blowing", "service", "experience", "overall good",
        "everything good", "rocking", "everything is fine",
        "informative", "helpful", "great", "wonderful", "fantastic", "awesome", "outstanding",
        "brilliant", "impressive", "valuable", "useful", "beneficial", "effective",
        "satisfactory", "satisfied", "pleased", "happy", "enjoy", "enjoyed", "loved", "perfect"
    ],
    # Checked first of all; a hit yields "neutral" unless a strong
    # positive/negative word is also present.
    "neutral_phrases": [
        "nothing else", "no additional comments", "no comments", "nothing",
        "nothing to add", "no comment", "none so far", "no other comment", "nothing more",
        "no more comments", "nothing in specific", "nothing specific", "nothing in particular"
    ],
    # Whole-string patterns: a single ASCII letter, digits only, N/A variants,
    # nil/none variants, bare ok/yes/no answers, punctuation only, and one
    # character repeated two or more times.
    "meaningless_patterns": [
        r"^[a-zA-Z]$", r"^[0-9]+$",
        r"^(na|n/a|n\.a|n\|a|n\\a|n\?a|ma|n\./a|n-a)$",
        r"^(nil|none|non|nope)$",
        r"^(ok|okay|yes|no|y|n)$",
        r"^[^\w\s]+$",
        r"^(.)\1+$"
    ]
}

# Pre-compile regex patterns
# Captures (lazily) the text after each "Consumer:" label, stopping before a
# " | YYYY-MM-DD" date, a " | Agent:" label, or the end of the string.
# NOTE(review): the inline (?i) duplicates the re.IGNORECASE flag.
CONSUMER_PATTERN_PRIMARY = re.compile(
    r"(?i)Consumer:\s*(.*?)(?=\s*\|\s*\d{4}-\d{2}-\d{2}|$|\s*\|\s*Agent:)",
    re.IGNORECASE
)
# Looser variant used when the primary pattern finds nothing: captures up to
# the next "|" or the end of the string.
CONSUMER_PATTERN_FALLBACK = re.compile(
    r"(?i)Consumer:\s*(.*?)(?=\||$)",
    re.IGNORECASE
)

# Text cleaning regex patterns
URL_RE = re.compile(r"https?://\S+|www\.\S+")  # http(s):// links and bare www. links
EMAIL_RE = re.compile(r"\S+@\S+\.\S+")  # any whitespace-free token shaped like x@y.z
HTML_TAG_RE = re.compile(r"<[^>]+>")  # anything between "<" and the next ">"
MULTI_WS_RE = re.compile(r"\s+")  # runs of whitespace
REPEATED_PUNCT_RE = re.compile(r"([!?.,])\1{1,}")  # the same one of ! ? . , repeated back to back
NON_PRINTABLE_RE = re.compile(r"[\x00-\x1f\x7f-\x9f]")  # code points 0x00-0x1F and 0x7F-0x9F
# Runs of characters from four hard-coded code-point ranges:
# U+1F300-U+1F6FF, U+1F900-U+1F9FF, U+1F1E0-U+1F1FF and U+2700-U+27BF.
EMOJI_RE = re.compile(
    "["
    "\U0001F300-\U0001F6FF"
    "\U0001F900-\U0001F9FF"
    "\U0001F1E0-\U0001F1FF"
    "\U00002700-\U000027BF"
    "]+", flags=re.UNICODE
)


# ============================================================
# 🧹 Enhanced Text Cleaning
# ============================================================

def clean_text(raw):
    """Normalize one raw text value into a compact, analysis-ready string.

    Missing values (None or anything pandas treats as NA) become "".
    Otherwise the value is stringified, a few known mis-encoded accent
    sequences are collapsed, HTML entities are unescaped, the text is
    NFKC-normalized, and URLs, e-mail addresses, HTML tags, emoji and
    control characters are blanked out. Repeated punctuation is reduced to
    one mark, whitespace runs to one space, and leading/trailing characters
    that are neither word characters nor apostrophes are trimmed.
    """
    if raw is None or pd.isna(raw):
        return ""

    # Known double-encoded sequences and the shorter form each collapses to.
    encoding_repairs = (
        ('√É¬±', '√±'),
        ('√É¬°', '√°'),
        ('√É¬©', '√©'),
        ('√É¬≠', '√≠'),
        ('√É¬≥', '√≥'),
        ('√É¬∫', '√∫'),
    )
    cleaned = str(raw).strip()
    for broken, repaired in encoding_repairs:
        cleaned = cleaned.replace(broken, repaired)

    cleaned = unicodedata.normalize("NFKC", html.unescape(cleaned))

    # Each of these noise classes is replaced by a single space, in this order.
    for noise_re in (URL_RE, EMAIL_RE, HTML_TAG_RE, EMOJI_RE, NON_PRINTABLE_RE):
        cleaned = noise_re.sub(" ", cleaned)

    # "!!!" -> "!", "..." -> "." and so on.
    cleaned = REPEATED_PUNCT_RE.sub(r"\1", cleaned)

    cleaned = MULTI_WS_RE.sub(" ", cleaned.strip())
    # Trim edge characters that are neither word characters nor apostrophes.
    cleaned = re.sub(r"^[^\w']+|[^\w']+$", "", cleaned)
    return cleaned.strip()


# ============================================================
# 🔍 Intelligent Meaningless Text Detection
# ============================================================

def is_meaningless_text(text):
    """Decide whether *text* carries no analyzable content.

    Applies a series of heuristics in order and stops at the first that
    fires: empty input, placeholder answers such as "N/A" or "ok" (with
    separators removed), the regexes in
    SENTIMENT_KEYWORDS["meaningless_patterns"], single characters,
    punctuation-only strings, short placeholders, strings without any
    alphabetic word, one word repeated, and strings that are mostly digits.

    Words are recognized with a Unicode-aware letter class, so accented or
    non-Latin text (e.g. "más información", Cyrillic, CJK) counts as having
    words instead of being discarded as "no_words" before language
    detection and translation can see it. For pure-ASCII input the word
    matching is identical to ``\\b[a-zA-Z]+\\b``.

    Returns:
        (is_meaningless: bool, reason: str) - reason is "meaningful" when
        the text passes every check.
    """
    if not text or not text.strip():
        return True, "empty"

    text = text.strip()
    t_lower = text.lower()

    # Drop separator characters so "n/a", "n.a.", "n - a" all compare as "na".
    t_normalized = t_lower
    for separator in ("/", "\\", "|", ".", "-", "?", " ", "_"):
        t_normalized = t_normalized.replace(separator, "")

    # Check normalized against meaningless patterns
    meaningless_normalized = ["na", "ma", "nil", "none", "non", "ok", "yes", "no"]
    if t_normalized in meaningless_normalized:
        return True, f"meaningless_normalized: {t_normalized}"

    # Check against meaningless patterns
    for pattern in SENTIMENT_KEYWORDS["meaningless_patterns"]:
        if re.match(pattern, t_lower, re.IGNORECASE):
            return True, f"matched_pattern: {pattern}"

    # Single character (except 'i' or 'a' which could be meaningful).
    # NOTE(review): with the current pattern list a single ASCII letter is
    # already caught by r"^[a-zA-Z]$" above, so the 'i'/'a' exemption only
    # matters if that pattern is removed.
    if len(text) == 1 and t_lower not in ['i', 'a']:
        return True, "single_character"

    # Only punctuation or special characters
    if not any(c.isalnum() for c in text):
        return True, "no_alphanumeric"

    # Very short and no meaningful words
    if len(text) <= 4:
        meaningless_short = ['na', 'n/a', 'na.', 'nil', 'ok', 'no', 'yes', 'g', 'ma']
        if t_lower in meaningless_short or t_normalized in meaningless_short:
            return True, "short_meaningless"

    # Alphabetic words in any script: [^\W\d_] is "word character that is
    # neither a digit nor an underscore", i.e. a Unicode letter.
    words = re.findall(r"\b[^\W\d_]+\b", text)
    if len(words) == 0:
        return True, "no_words"

    # Repeated same word
    if len(words) > 1 and len(set(words)) == 1:
        return True, "repeated_word"

    # High ratio of numbers to text
    num_count = sum(c.isdigit() for c in text)
    if len(text) > 0 and num_count / len(text) > 0.5:
        return True, "mostly_numbers"

    return False, "meaningful"


# ============================================================
# 🌍 Advanced Language Detection
# ============================================================

def detect_language_with_confidence(text):
    """Guess the language of *text* and how sure the guess is.

    Cheap vocabulary heuristics run first, in this order: Spanish, then
    French / Portuguese / Italian, then English. Only when none of them is
    decisive is langdetect asked for its most probable language.

    Returns:
        (language_code, confidence_score). Texts shorter than three
        characters are reported as ("en", 1.0); detection failures as
        ("unknown", 0.0), or ("en", 0.6) when langdetect gives up on text
        that is more than 90% ASCII.
    """
    if not text or len(text) < 3:
        return "en", 1.0

    try:
        tokens = set(re.findall(r'\b\w+\b', text.lower()))

        # Spanish is tested before every other language.
        spanish_vocab = {
            'tengo', 'nada', 'mas', 'que', 'agregar', 'muy', 'esta', 'dia',
            'gracias', 'como', 'mejorar', 'estoy', 'trabajo', 'comentarios',
            'adicionales', 'cada', 'aprendo', 'agradezco', 'tiempo', 'dedicacion',
            'ningun', 'fue', 'excelente', 'sesion',
        }
        # Character sequences that, when present in the raw text, force "es".
        spanish_marks = (
            '√°', '√©', '√≠', '√≥', '√∫', '√±', '√º', '√É',
            '√É¬±', '√É¬°', '√É¬©', '√É¬≠', '√É¬≥', '√É¬∫',
        )
        spanish_hits = len(tokens & spanish_vocab)
        if spanish_hits >= 2 or any(mark in text for mark in spanish_marks):
            return "es", 0.95

        # Other Romance languages, tried in this fixed order; two shared
        # vocabulary words are enough.
        romance_vocab = (
            ('fr', {'je', 'tu', 'est', 'avec', 'pour', 'dans', 'mais', 'bien', 'tr√®s', 'merci'}),
            ('pt', {'eu', 'voc√™', 'est√°', 'para', 'com', 'mas', 'bem', 'muito', 'obrigado'}),
            ('it', {'io', 'tu', 'con', 'per', 'ma', 'bene', 'molto', 'grazie'}),
        )
        for code, vocab in romance_vocab:
            if len(tokens & vocab) >= 2:
                return code, 0.90

        english_vocab = {
            'the', 'is', 'are', 'was', 'were', 'have', 'has', 'had', 'will', 'would',
            'could', 'should', 'can', 'may', 'must', 'do', 'does', 'did',
            'a', 'an', 'and', 'or', 'but', 'if', 'then', 'than',
            'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they',
        }
        english_hits = len(tokens & english_vocab)

        if english_hits >= 3:
            return "en", 0.99
        # Two hits are accepted only for texts of at most 15 words.
        if english_hits >= 2 and len(text.split()) <= 15:
            return "en", 0.95

        # Almost pure ASCII plus at least one English word and no Spanish one.
        ascii_share = sum(1 for ch in text if ord(ch) < 128) / len(text)
        if ascii_share > 0.95 and english_hits >= 1 and spanish_hits == 0:
            return "en", 0.90

        # Fall back to the statistical detector and take its top candidate.
        best = detect_langs(text)[0]

        # Two English words outweigh a guess outside the handled languages.
        if best.lang not in ('en', 'es', 'fr', 'pt', 'it') and english_hits >= 2:
            return "en", 0.85

        return best.lang, best.prob

    except LangDetectException:
        if text:
            ascii_share = sum(1 for ch in text if ord(ch) < 128) / len(text)
            if ascii_share > 0.9:
                return "en", 0.6
        return "unknown", 0.0
    except Exception:
        return "unknown", 0.0


# ============================================================
# 🔄 Intelligent Translation
# ============================================================

def smart_translate(text, lang_code, confidence, threshold):
    """Translate *text* to English when the detected language warrants it.

    Args:
        text: Cleaned text to translate.
        lang_code: Language code from detect_language_with_confidence.
        confidence: Confidence attached to lang_code (0.0-1.0).
        threshold: Minimum confidence for languages other than es/fr/pt/it;
            those four always use a fixed minimum of 0.5.

    Returns:
        (translated_text, was_translated, translation_info). When no
        translation happens the original text is returned unchanged and
        translation_info names the reason ("already_english",
        "low_confidence_*", "too_short", "likely_english_*",
        "empty_translation", "too_similar_*" or "failed_*").
    """
    if lang_code == 'en':
        return text, False, "already_english"

    is_romance = lang_code in ('es', 'fr', 'pt', 'it')

    # Romance languages use a fixed, lower threshold.
    min_confidence = 0.5 if is_romance else threshold
    if confidence < min_confidence:
        return text, False, f"low_confidence_{confidence:.2f}"

    if len(text) < MIN_TRANSLATION_LENGTH:
        short_translatable = ['gracias', 'merci', 'obrigado', 'grazie', 'si', 'oui', 'sim']
        if text.lower() not in short_translatable:
            return text, False, "too_short"

    # Texts that already contain common English words are left alone
    # (this check is skipped for Romance languages).
    if not is_romance:
        common_english_words = {
            'the', 'is', 'are', 'was', 'were', 'have', 'has', 'had', 'will',
            'i', 'you', 'he', 'she', 'it', 'we', 'they',
            'not', 'more', 'all', 'some', 'any', 'learned', 'implementing',
        }
        words_in_text = set(re.findall(r'\b\w+\b', text.lower()))
        english_word_matches = len(words_in_text & common_english_words)

        if english_word_matches >= 2:
            return text, False, f"likely_english_{english_word_matches}_matches"

    # Attempt translation; only the network call itself sits in the try.
    try:
        translated = GoogleTranslator(source='auto', target='en').translate(text)
    except Exception as e:
        return text, False, f"failed_{str(e)[:30]}"

    # The translator can hand back None or an empty string. Reporting that
    # as a successful translation would replace the analysis text with a
    # non-string and crash the caller, so treat it as "not translated".
    if not isinstance(translated, str) or not translated.strip():
        return text, False, "empty_translation"

    # A result that is nearly identical, position by position, to the input
    # is taken as "nothing was translated" (skipped for Romance languages).
    if not is_romance:
        same_positions = sum(a == b for a, b in zip(text.lower(), translated.lower()))
        similarity_ratio = same_positions / max(len(text), len(translated))
        if similarity_ratio > 0.8:
            return text, False, f"too_similar_{similarity_ratio:.2f}"

    return translated, True, f"success_{lang_code}"


def extract_consumer_text(transcript: str) -> str:
    """Return everything the consumer said in *transcript* as one string.

    Each "Consumer:" segment is captured with the primary pattern; if that
    finds nothing the looser fallback pattern is tried. Segments are
    stripped, empty ones dropped, and the rest joined with single spaces.
    Anything that is not a str yields "".
    """
    if not isinstance(transcript, str):
        return ""

    segments = (
        CONSUMER_PATTERN_PRIMARY.findall(transcript + " ")
        or CONSUMER_PATTERN_FALLBACK.findall(transcript + "|")
    )
    trimmed = (segment.strip() for segment in segments)
    return " ".join(segment for segment in trimmed if segment)


# ============================================================
# 💭 Enhanced Sentiment Classification
# ============================================================

def has_positive_override(text):
    """Tell whether a negative-sounding opener is followed by praise.

    True when the lowercased text matches "nothing ... <positive word>" or
    "no ... complain/issue/problem ... <positive word>", or when it starts
    with "nothing", "no other" or "no " and contains any of a fixed set of
    positive words anywhere. Empty input is never an override.
    """
    if not text:
        return False

    lowered = text.lower().strip()

    override_regexes = (
        r"nothing.*(?:happy|good|great|excellent|awesome|perfect|love|amazing)",
        r"no.*(?:complain|issue|problem).*(?:good|great|excellent|awesome|perfect)",
    )
    if any(re.search(regex, lowered) for regex in override_regexes):
        return True

    # The plain-substring fallback only applies to these openers.
    if not lowered.startswith(("nothing", "no other", "no ")):
        return False

    upbeat_terms = (
        "happy", "good", "great", "excellent", "awesome", "perfect",
        "amazing", "wonderful", "fantastic", "love", "enjoyed", "helpful",
    )
    for term in upbeat_terms:
        if term in lowered:
            return True
    return False


def classify_sentiment_with_confidence(text):
    """Classify *text* as positive, negative or neutral.

    Rules are applied in order and the first that fires wins:

    1. Empty text -> ("", 0.0); meaningless text -> ("", 1.0).
    2. A neutral phrase ("nothing", "no comments", ...) with no strong
       positive/negative word and no positive override -> neutral.
    3. Negative phrases, then very-negative and moderate-negative keywords.
    4. Positive override ("nothing ... good"), then positive keywords.
    5. Weighted blend of TextBlob polarity and AFINN score (weights from
       SENTIMENT_WEIGHTS), thresholded at +/-0.15.

    All keyword tests are substring tests on the lowercased text.

    The neutral gate in step 2 consults has_positive_override: "nothing" is
    itself a neutral phrase, so without that check a text such as
    "nothing, I'm happy with it" returns neutral and the override in step 4
    can never fire for the "nothing ..." inputs it was written for.

    Returns:
        (sentiment, confidence_score) - sentiment is "" for empty or
        meaningless text.
    """
    if not text or not text.strip():
        return "", 0.0

    t_lower = text.lower().strip()

    if is_meaningless_text(text)[0]:
        return "", 1.0

    # Neutral phrases are checked first, but a strong sentiment word or a
    # positive override keeps the text in play for the later rules.
    if any(phrase in t_lower for phrase in SENTIMENT_KEYWORDS.get("neutral_phrases", [])):
        strong_positive = ["excellent", "amazing", "wonderful", "fantastic", "awesome", "loved", "perfect"]
        strong_negative = ["terrible", "awful", "horrible", "worst", "pathetic", "useless", "waste"]

        has_strong_positive = any(word in t_lower for word in strong_positive)
        has_strong_negative = any(word in t_lower for word in strong_negative)

        if (not has_strong_positive and not has_strong_negative
                and not has_positive_override(text)):
            return "neutral", 0.85

    # Negative detection
    for phrase in SENTIMENT_KEYWORDS.get("negative_phrases", []):
        if phrase.lower() in t_lower:
            return "negative", 0.9

    for keyword in SENTIMENT_KEYWORDS["very_negative"]:
        if keyword.lower() in t_lower:
            return "negative", 0.85

    for keyword in SENTIMENT_KEYWORDS.get("moderate_negative", []):
        if keyword.lower() in t_lower:
            return "negative", 0.75

    # Positive detection
    if has_positive_override(text):
        return "positive", 0.9

    for keyword in SENTIMENT_KEYWORDS["positive"]:
        if keyword.lower() in t_lower:
            return "positive", 0.85

    # Hybrid scoring: AFINN's score is divided by 5 before blending.
    try:
        tb_score = TextBlob(text).sentiment.polarity
        af_score = af.score(text) / 5.0
        combined_score = (SENTIMENT_WEIGHTS['textblob'] * tb_score
                          + SENTIMENT_WEIGHTS['afinn'] * af_score)

        confidence = min(abs(combined_score) * 2, 0.9)

        if combined_score <= -0.15:
            return "negative", confidence
        elif combined_score >= 0.15:
            return "positive", confidence
        else:
            return "neutral", max(confidence, 0.6)
    except Exception:
        # Best effort: a scoring failure degrades to a low-confidence
        # neutral, without swallowing KeyboardInterrupt/SystemExit.
        return "neutral", 0.3


def predict_category(text: str) -> Tuple[str, int]:
    """Pick the category whose keywords best match *text*.

    Every keyword found in the lowercased text adds to its category's tally:
    single words of at most three characters must match on word boundaries
    and add 1; all other keywords match as plain substrings and add 1.5 when
    they contain a space, otherwise 1. The first category with the highest
    tally wins.

    Returns:
        (category, score) with the score truncated to an int, or ("", 0)
        when nothing matched.
    """
    haystack = text.lower()
    winner = ""
    top_tally = 0

    for label, terms in TOPIC_KEYWORDS.items():
        tally = 0
        for term in terms:
            short_single_word = len(term.split()) == 1 and len(term) <= 3
            if short_single_word:
                # Short tokens such as "ads" or "bug" need word boundaries.
                if re.search(r'\b' + re.escape(term) + r'\b', haystack):
                    tally += 1
            elif term in haystack:
                tally += 1.5 if ' ' in term else 1

        # Strict ">" keeps the earliest category on ties.
        if tally > top_tally:
            winner, top_tally = label, tally

    if top_tally > 0:
        return (winner, int(top_tally))
    return ("", int(top_tally))


def predict_subcategory(category: str, text: str) -> Tuple[str, int]:
    """Pick the best-matching subcategory of *category* for *text*.

    Scoring mirrors predict_category: single words of at most three
    characters must match on word boundaries and add 1; other keywords match
    as substrings and add 1.5 when they contain a space, otherwise 1.

    Returns:
        (subcategory, score) with the score truncated to an int. ("", 0) is
        returned when the category is empty, has no subcategory table, or
        none of its keywords matched.
    """
    if not category or category not in SUBCATEGORY_KEYWORDS:
        return ("", 0)

    haystack = text.lower()
    winner = ""
    top_tally = 0

    for label, terms in SUBCATEGORY_KEYWORDS[category].items():
        tally = 0
        for term in terms:
            short_single_word = len(term.split()) == 1 and len(term) <= 3
            if short_single_word:
                if re.search(r'\b' + re.escape(term) + r'\b', haystack):
                    tally += 1
            elif term in haystack:
                tally += 1.5 if ' ' in term else 1

        # Strict ">" keeps the earliest subcategory on ties.
        if tally > top_tally:
            winner, top_tally = label, tally

    return (winner, int(top_tally))


def apply_rules(text: str, preds: Dict) -> Dict:
    """Override keyword predictions with hard business rules.

    Payment vocabulary ("refund", "charged", "billing", "payment failed")
    forces category "subscription issue" / subcategory "payment", and
    "refund" or "charged" additionally force a negative sentiment.
    Otherwise, "cancel" together with "subscription" forces the "cancel"
    subcategory. *preds* is modified in place and also returned.
    """
    lowered = text.lower()

    payment_terms = ("refund", "charged", "billing", "payment failed")
    if any(term in lowered for term in payment_terms):
        preds["category"] = "subscription issue"
        preds["subcategory"] = "payment"
        # Money already taken or being reclaimed is treated as a complaint.
        if "refund" in lowered or "charged" in lowered:
            preds["sentiment"] = "negative"
        return preds

    if "cancel" in lowered and "subscription" in lowered:
        preds["category"] = "subscription issue"
        preds["subcategory"] = "cancel"

    return preds


# ============================================================
# 🔄 Core Row Processing
# ============================================================

def process_row(row: Dict, translation_enabled: bool, trans_threshold: float) -> Dict:
    """Run the full analysis for one input record.

    Steps: extract the consumer's part of the transcript, clean it, drop
    empty or meaningless text, detect the language, optionally translate to
    English, then predict category, subcategory and sentiment and apply the
    rule overrides.

    Args:
        row: Record with "Conversation Id" and "transcripts" keys.
        translation_enabled: Whether non-English text may be translated.
        trans_threshold: Confidence threshold passed to smart_translate.

    Returns:
        Dict with the seven output columns. Category, Subcategory and
        Sentiment are "" when the text is empty, meaningless, of unknown
        language, or shorter than MIN_TEXT_LENGTH words.
    """
    conversation_id = row.get("Conversation Id", "")
    consumer_text = extract_consumer_text(str(row.get("transcripts", "")))
    cleaned_text = clean_text(consumer_text)

    def build_result(translated="", category="", subcategory="", sentiment=""):
        # Every exit path emits the same columns in the same order.
        return {
            "Conversation Id": conversation_id,
            "Consumer_Text": consumer_text,
            "Cleaned_Text": cleaned_text,
            "Translated_Text": translated,
            "Category": category,
            "Subcategory": subcategory,
            "Sentiment": sentiment,
        }

    # Nothing left after cleaning.
    if not cleaned_text.strip():
        return build_result()

    # Placeholder answers such as "N/A" are not classified.
    if is_meaningless_text(cleaned_text)[0]:
        return build_result()

    lang_code, lang_confidence = detect_language_with_confidence(cleaned_text)

    translated_text = ""
    text_for_analysis = cleaned_text

    if translation_enabled and lang_code != 'en':
        candidate, did_translate, _ = smart_translate(
            cleaned_text, lang_code, lang_confidence, trans_threshold
        )
        if did_translate:
            translated_text = candidate
            text_for_analysis = candidate

    # Unknown language or too few words: keep the texts, leave labels blank.
    too_short = len(text_for_analysis.split()) < MIN_TEXT_LENGTH
    if lang_code == 'unknown' or too_short:
        return build_result(translated=translated_text)

    category, _ = predict_category(text_for_analysis)
    subcategory, _ = predict_subcategory(category, text_for_analysis)
    sentiment, _ = classify_sentiment_with_confidence(text_for_analysis)

    preds = apply_rules(
        text_for_analysis,
        {"category": category, "subcategory": subcategory, "sentiment": sentiment},
    )

    return build_result(
        translated=translated_text,
        category=preds["category"],
        subcategory=preds["subcategory"],
        sentiment=preds["sentiment"],
    )


# ============================================================
# 🧮 Run NLP Pipeline
# ============================================================

def _read_dataset(file_path, original_name):
    """Load the uploaded file as a DataFrame, trying the likeliest reader first."""
    readers = [("Excel", pd.read_excel), ("CSV", pd.read_csv)]
    # The extension only decides which reader goes first; the other one is
    # still tried, because uploads may arrive without a usable name.
    if original_name.lower().endswith('.csv'):
        readers.reverse()

    errors = []
    for label, reader in readers:
        try:
            return reader(file_path)
        except Exception as exc:
            errors.append(f"{label} read error: {exc}")

    raise ValueError(
        "Could not read file. Upload a .csv or .xlsx file. " + " | ".join(errors)
    )


def run_pipeline(uploaded_file, translation_enabled, trans_threshold):
    """Classify every transcript in the uploaded file and save the result.

    Args:
        uploaded_file: Object exposing ``filepath`` and, optionally,
            ``filename`` or ``name`` (the Mercury file widget).
        translation_enabled: Whether non-English text may be translated.
        trans_threshold: Confidence threshold passed on to process_row.

    Returns:
        (out_df, output_filename, output_path, stats): the result frame, the
        name and absolute path of the CSV written to the working directory,
        and a dict of summary counts plus ``elapsed_time``.

    Raises:
        ValueError: If the file cannot be read as CSV or Excel, or lacks the
            'Conversation Id' / 'transcripts' columns.
    """
    start = time.time()

    file_path = uploaded_file.filepath
    original_name = getattr(uploaded_file, 'filename', '') or getattr(uploaded_file, 'name', '')

    df = _read_dataset(file_path, original_name)

    if "Conversation Id" not in df.columns or "transcripts" not in df.columns:
        raise ValueError("Input file must contain 'Conversation Id' and 'transcripts' columns.")

    rows = df.to_dict("records")
    total_rows = len(rows)

    # One slot per input row: futures complete in arbitrary order, and
    # writing each result at its own index keeps the output aligned with the
    # input file.
    results = [None] * total_rows
    progress_step = max(1, total_rows // 10)

    # Parallel processing
    with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        future_to_index = {
            executor.submit(process_row, row, translation_enabled, trans_threshold): index
            for index, row in enumerate(rows)
        }

        for processed, future in enumerate(as_completed(future_to_index), start=1):
            results[future_to_index[future]] = future.result()

            if processed % progress_step == 0:
                progress_pct = (processed / total_rows) * 100
                print(f"‚è≥ Progress: {processed:,}/{total_rows:,} ({progress_pct:.1f}%)")

    # Naming the columns explicitly keeps them present even when the input
    # has no rows, so the statistics below never hit a missing column.
    output_columns = [
        "Conversation Id", "Consumer_Text", "Cleaned_Text", "Translated_Text",
        "Category", "Subcategory", "Sentiment",
    ]
    out_df = pd.DataFrame(results, columns=output_columns)

    # Save to file
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    output_filename = f"sentiment_output_{timestamp}.csv"
    output_path = os.path.abspath(output_filename)
    out_df.to_csv(output_path, index=False)

    elapsed = time.time() - start
    print(f"‚úÖ Completed in {elapsed:.2f}s. Processed {len(out_df)} rows.")

    # Calculate statistics
    stats = {
        'total_rows': total_rows,
        'valid_categories': (out_df['Category'] != '').sum(),
        'valid_subcategories': (out_df['Subcategory'] != '').sum(),
        'valid_sentiments': (out_df['Sentiment'] != '').sum(),
        'translated_count': (out_df['Translated_Text'] != '').sum(),
        'meaningless_count': ((out_df['Category'] == '') & (out_df['Sentiment'] == '')).sum(),
        'elapsed_time': elapsed,
    }

    return out_df, output_filename, output_path, stats


# ============================================================
# 🖥️ Execute & Display Results
# ============================================================
# When the run button was clicked, run the pipeline on the uploaded file and
# render a summary; otherwise render the welcome/help text.
if run_button.clicked:
    if file is None or not hasattr(file, 'filepath') or file.filepath is None:
        mr.Markdown("### ‚ö†Ô∏è Please upload a .csv or .xlsx file before running the pipeline.")
    else:
        try:
            mr.Markdown("### üîÑ Processing...")
            mr.Markdown(f"**Translation enabled:** {'Yes ‚úì' if enable_translation.value else 'No ‚úó'}")
            mr.Markdown(f"**Translation confidence threshold:** {translation_confidence.value:.2f}")

            df_result, output_filename, output_path, stats = run_pipeline(
                file,
                enable_translation.value,
                translation_confidence.value
            )

            total_rows = stats['total_rows']
            elapsed_time = stats['elapsed_time']

            def _pct(count):
                # Share of all rows in percent; 0.0 for an empty dataset
                # instead of a ZeroDivisionError.
                return count / total_rows * 100 if total_rows else 0.0

            # Guard against a zero elapsed time on coarse clocks.
            rows_per_sec = total_rows / elapsed_time if elapsed_time > 0 else 0.0

            mr.Markdown("### ‚úÖ Processing Complete!")
            mr.Markdown(f"**Total rows processed:** {total_rows:,}")
            mr.Markdown(f"**Processing time:** {elapsed_time:.2f} seconds")
            mr.Markdown(f"**Processing speed:** {rows_per_sec:.1f} rows/sec")

            # Statistics
            mr.Markdown("---")
            mr.Markdown("### üìä Classification Statistics")
            mr.Markdown(f"**Valid Category Classifications:** {stats['valid_categories']:,} ({_pct(stats['valid_categories']):.1f}%)")
            mr.Markdown(f"**Valid Subcategory Classifications:** {stats['valid_subcategories']:,} ({_pct(stats['valid_subcategories']):.1f}%)")
            mr.Markdown(f"**Valid Sentiment Classifications:** {stats['valid_sentiments']:,} ({_pct(stats['valid_sentiments']):.1f}%)")
            mr.Markdown(f"**Meaningless/Filtered Texts:** {stats['meaningless_count']:,} ({_pct(stats['meaningless_count']):.1f}%)")

            if enable_translation.value:
                mr.Markdown(f"**Translated Texts:** {stats['translated_count']:,} ({_pct(stats['translated_count']):.1f}%)")

            # Category distribution (five most frequent non-blank categories)
            mr.Markdown("---")
            mr.Markdown("### üìÇ Top Categories")
            category_counts = df_result[df_result['Category'] != '']['Category'].value_counts().head(5)
            for category, count in category_counts.items():
                mr.Markdown(f"- **{category}**: {count:,} ({_pct(count):.1f}%)")

            # Sentiment distribution
            mr.Markdown("---")
            mr.Markdown("### üí≠ Sentiment Distribution")
            sentiment_counts = df_result['Sentiment'].value_counts()
            for sentiment in ['positive', 'neutral', 'negative', '']:
                count = sentiment_counts.get(sentiment, 0)
                label = sentiment.upper() if sentiment else "BLANK (meaningless)"
                mr.Markdown(f"- **{label}**: {count:,} ({_pct(count):.1f}%)")

            # Sample comments (first two rows per sentiment)
            mr.Markdown("---")
            mr.Markdown("### üìù Sample Comments by Sentiment")
            for sentiment in ['positive', 'neutral', 'negative']:
                sample = df_result[df_result['Sentiment'] == sentiment].head(2)
                if not sample.empty:
                    mr.Markdown(f"**{sentiment.upper()}:**")
                    for _, row in sample.iterrows():
                        text = row['Translated_Text'] if row['Translated_Text'] else row['Cleaned_Text']
                        trans_marker = " [Translated]" if row['Translated_Text'] else ""
                        # Only mark the sample as cut off when it really was.
                        ellipsis = "..." if len(text) > 100 else ""
                        mr.Markdown(f"- {text[:100]}{ellipsis}{trans_marker}")

            # Display results preview
            mr.Markdown("---")
            mr.Markdown("### üìä Results Preview (First 50 rows)")
            # NOTE(review): a bare expression nested inside if/try is
            # presumably not auto-rendered as cell output (only a cell's
            # last top-level expression is); confirm and show this frame
            # through an explicit display call if the preview is missing.
            df_result.head(50)

            # Download instructions
            mr.Markdown("---")
            mr.Markdown("### üì• Your Results are Ready!")
            mr.Markdown(f"**‚úÖ File successfully created:** `{output_filename}`")
            mr.Markdown("")
            mr.Markdown("### üîΩ How to Download:")
            mr.Markdown("**File Location:**")
            mr.Markdown(f"```\n{output_path}\n```")
            mr.Markdown("")
            mr.Markdown("**Download Options:**")
            mr.Markdown("1. Navigate to the file location on your server")
            mr.Markdown("2. Use your working directory if running Mercury locally")
            mr.Markdown("3. Use FTP/file browser to download from server")

        except Exception as e:
            # Top-level boundary: show the failure in the app, do not crash it.
            mr.Markdown("### ‚ùå Error processing file")
            mr.Markdown(f"**Error details:** {str(e)}")
            import traceback
            mr.Markdown(f"```\n{traceback.format_exc()}\n```")
else:
    mr.Markdown("### üëã Welcome to Advanced NLP Text Classification Dashboard")
    mr.Markdown("**‚ú® ADVANCED FEATURES:**")
    mr.Markdown("- üåç **100+ languages** supported with intelligent detection")
    mr.Markdown("- üßπ **Enhanced text cleaning** with encoding normalization")
    mr.Markdown("- üîç **Intelligent meaningless text filtering** (N/A, nil, etc.)")
    mr.Markdown("- üîÑ **Confidence-based translation** with adjustable threshold")
    mr.Markdown("- üí≠ **Advanced sentiment analysis** with multiple methods")
    mr.Markdown("- üìä **Comprehensive statistics** and sample results")
    mr.Markdown("")
    mr.Markdown("### üìã How to Use:")
    mr.Markdown("1. Upload your CSV or Excel file containing 'Conversation Id' and 'transcripts' columns")
    mr.Markdown("2. Enable/disable automatic translation")
    mr.Markdown("3. Adjust translation confidence threshold (0.0-1.0)")
    mr.Markdown("   - Higher = more selective (only translate high-confidence detections)")
    mr.Markdown("   - Lower = more aggressive (translate more texts)")
    mr.Markdown("4. Click 'Run NLP Pipeline' to start processing")
    mr.Markdown("")
    mr.Markdown("### üì¶ Required Packages:")
    # One call for the whole fenced block, matching how the output path is
    # rendered above; separate calls would leave the fence unclosed.
    mr.Markdown("```bash\npip install textblob afinn langdetect deep-translator pandas openpyxl mercury\n```")
    mr.Markdown("")
    mr.Markdown("### üì§ Output Columns:")
    mr.Markdown("- **Conversation Id**: Original identifier")
    mr.Markdown("- **Consumer_Text**: Original extracted text")
    mr.Markdown("- **Cleaned_Text**: Normalized and cleaned text")
    mr.Markdown("- **Translated_Text**: English translation (blank if English)")
    mr.Markdown("- **Category**: Main category classification")
    mr.Markdown("- **Subcategory**: Specific subcategory")
    mr.Markdown("- **Sentiment**: positive / negative / neutral / blank")