# NLP - Scalable, Accurate, Rule-Based pipeline


In [None]:
# ============================================================
# 🧩 Mercury App Configuration
# ============================================================
import mercury as mr

# Title and description passed to the Mercury app wrapper.
app = mr.App(
    title="NLP Text Classification Dashboard",
    description="Upload CSV or Excel files to classify transcripts using NLP."
)

# Input widgets: the dataset upload and the button that triggers the pipeline.
# The emoji in the labels had been mis-encoded (UTF-8 bytes decoded as
# Mac Roman) and showed up as garbage characters; they are restored here.
file = mr.File(label="📁 Upload Dataset (.csv or .xlsx)")
run_button = mr.Button(label="🚀 Run NLP Pipeline")


# ============================================================
# 📦 Imports & Configuration
# ============================================================
import os, re, time, warnings
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
from textblob import TextBlob
from afinn import Afinn
from langdetect import detect, DetectorFactory
import spacy

# Silence every warning category for the whole session.
warnings.filterwarnings("ignore")
# Fixed seed for langdetect (presumably to make detection repeatable between
# runs -- see the langdetect README).
DetectorFactory.seed = 0
# Shared AFINN scorer used by hybrid_sentiment().
af = Afinn()

# Load the spaCy model. On failure `nlp` is left as None so the UI section at
# the bottom of the file can report the missing model instead of crashing.
try:
    nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "tagger"])
except OSError:
    # The warning glyph in this message had been mis-encoded; restored here.
    print("⚠️ spaCy model 'en_core_web_sm' not found.")
    print("Please run: python -m spacy download en_core_web_sm")
    nlp = None

NUM_THREADS = 8  # Adjust based on CPU cores


# ============================================================
# 📚 Category and Subcategory Keywords
# ============================================================
# Category name -> keyword phrases that vote for that category.
# predict_category() counts how many phrases of each list occur in the
# lower-cased text, and the first category with the highest count wins, so
# the ORDER of the entries below is significant for tie-breaking.
TOPIC_KEYWORDS = {
    # --- access & identity ---
    "login issue": [
        "login", "log in", "sign in", "sign-in", "sign out", "sign-out",
        "password", "forgot password", "reset password",
        "authentication", "verify account", "verification code",
        "2fa", "two-factor", "unable to access account",
    ],
    "account issue": [
        "account", "profile", "username", "display name",
        "linked account", "merge account", "multiple accounts",
        "email change", "update details",
        "account disabled", "account locked", "deactivate account",
    ],
    # --- listening experience ---
    "playback issue": [
        "playback", "stream",
        "music not playing", "song not playing", "track skipped",
        "buffering", "lag", "pause", "stuck", "stops suddenly",
        "won't play", "audio issue", "no sound", "silence",
        "volume problem",
    ],
    "device issue": [
        "bluetooth", "speaker", "carplay", "android auto", "smart tv",
        "echo", "alexa", "chromecast", "airplay", "headphones",
        "device not showing", "device disconnected",
    ],
    "content restriction": [
        "song not available", "track unavailable",
        "region restriction", "country restriction",
        "not licensed", "greyed out", "removed song",
    ],
    "ad issue": [
        "ads", "advertisement", "too many ads", "ad volume",
        "ad playing", "premium ads", "commercials",
    ],
    "recommendation issue": [
        "recommendations", "discover weekly", "radio", "algorithm",
        "curated", "autoplay", "song suggestions", "not relevant",
    ],
    # --- app surface & feedback ---
    "ui issue": [
        "interface", "layout", "design", "dark mode",
        "buttons not working", "search not working",
        "filter not working",
    ],
    "general feedback": [
        "suggestion", "feedback", "recommend", "love spotify",
        "like app", "app improvement", "feature request",
    ],
    # --- stability & connectivity ---
    "network failure": [
        "network", "connectivity", "internet", "server",
        "connection failed", "offline", "not connecting",
        "spotify down", "timeout", "dns", "proxy", "vpn",
    ],
    "app crash": [
        "crash", "crashed", "app closed", "stopped working",
        "freeze", "freezing", "hang", "lag", "bug",
        "error message", "glitch", "slow performance", "unresponsive",
    ],
    "performance issue": [
        "slow", "lag", "delay", "performance",
    ],
    "data sync issue": [
        "sync", "not syncing", "listening history", "recently played",
        "activity feed", "spotify connect", "data lost", "missing data",
    ],
    # --- billing ---
    "subscription issue": [
        "subscription", "plan", "premium", "cancel", "renew",
        "billing", "charged", "payment", "refund", "invoice",
        "upgrade", "downgrade", "free trial",
        "family plan", "student plan",
        "gift card", "promo code", "spotify wrapped",
        "card",
    ],
}

# Category -> {subcategory -> keyword phrases}.
# predict_subcategory() returns the first subcategory (in the order written
# here) that has any phrase occurring in the lower-cased text. Categories
# that are absent from this mapping never receive a subcategory.
SUBCATEGORY_KEYWORDS = {
    "subscription issue": {
        "payment": [
            "refund",
            "charged",
            "billing",
            "invoice",
            "payment",
        ],
        "cancel": [
            "cancel",
            "unsubscribe",
            "stop subscription",
        ],
        "upgrade": [
            "upgrade",
            "family plan",
            "student plan",
            "premium",
        ],
    },
    "account issue": {
        "login": [
            "login",
            "password",
            "signin",
        ],
        "profile": [
            "profile",
            "email",
            "username",
            "display name",
        ],
    },
    "device issue": {
        "mobile": [
            "phone",
            "android",
            "iphone",
        ],
        "car": [
            "carplay",
            "android auto",
        ],
    },
}


# ============================================================
# ⚖️ Rules and Helper Functions
# ============================================================
# Terms that force a transcript into the billing/payment bucket.
_PAYMENT_TERMS = ("refund", "charged", "billing")


def _mentions_payment(txt):
    """Return True when any payment term occurs (as a substring) in *txt*."""
    lowered = txt.lower()
    return any(term in lowered for term in _PAYMENT_TERMS)


def _force_payment(preds):
    """Return a copy of *preds* relabelled as a negative payment complaint."""
    return {
        **preds,
        "category": "subscription issue",
        "subcategory": "payment",
        "sentiment": "negative",
    }


def _mentions_cancel(txt):
    """Return True when "cancel" occurs (as a substring) in *txt*."""
    return "cancel" in txt.lower()


def _force_cancel(preds):
    """Return a copy of *preds* relabelled as a cancellation request."""
    return {
        **preds,
        "category": "subscription issue",
        "subcategory": "cancel",
    }


# Ordered (condition, override) pairs. Each condition receives the raw text;
# each override receives the current prediction dict and returns a new one.
# Rules are applied in sequence, so a later matching rule overwrites the
# fields set by an earlier one.
RULE_OVERRIDES = [
    (_mentions_payment, _force_payment),
    (_mentions_cancel, _force_cancel),
]

def is_english(text):
    """Return True when langdetect labels *text* as English ("en").

    Any exception raised during detection is treated as "not English", so
    this function never raises.
    """
    try:
        language = detect(text)
    except Exception:
        return False
    return language == "en"

# A consumer turn: text after "Consumer:" up to (but excluding) a pipe that is
# followed by a YYYY-MM-DD date, a pipe followed by "Agent:", or end of input.
_CONSUMER_TURN_RE = re.compile(
    r"(?i)Consumer:\s*(.*?)(?=\s*\|\s*\d{4}-\d{2}-\d{2}|$|\s*\|\s*Agent:)"
)
# Looser fallback: a consumer turn runs until the next pipe or end of input.
_CONSUMER_TURN_FALLBACK_RE = re.compile(r"(?i)Consumer:\s*(.*?)(?=\||$)")


def extract_consumer_text(transcript):
    """Return the consumer's side of a transcript as one space-joined string.

    Non-string input yields "". The primary pattern is tried first (on the
    transcript with a trailing space appended); only if it finds nothing is
    the fallback pattern tried (on the transcript with a trailing "|"
    appended). Segments that are empty after stripping are dropped.
    """
    if not isinstance(transcript, str):
        return ""

    segments = _CONSUMER_TURN_RE.findall(transcript + " ")
    if not segments:
        segments = _CONSUMER_TURN_FALLBACK_RE.findall(transcript + "|")

    stripped = (segment.strip() for segment in segments)
    return " ".join(piece for piece in stripped if piece)

def hybrid_sentiment(text):
    """Label *text* with a five-level sentiment from a TextBlob/AFINN blend.

    Returns "" for empty input or text that is_english() rejects. Otherwise
    the blended score (60% TextBlob polarity, 40% AFINN score / 5) is mapped
    to "very negative" (<= -0.75), "negative" (<= -0.25), "positive"
    (>= 0.25), "very positive" (>= 0.75) or "neutral" (anything else).
    """
    if not text:
        return ""
    if not is_english(text):
        return ""

    polarity = TextBlob(text).sentiment.polarity
    # NOTE(review): the AFINN score is only divided by 5, not clamped. If
    # Afinn.score() sums valences over the whole text, long texts may
    # dominate the blend -- confirm this is intended.
    valence = af.score(text) / 5.0
    blended = 0.6 * polarity + 0.4 * valence

    if blended <= -0.25:
        return "very negative" if blended <= -0.75 else "negative"
    if blended >= 0.25:
        return "very positive" if blended >= 0.75 else "positive"
    return "neutral"

def predict_category(text):
    """Return the TOPIC_KEYWORDS category with the most keyword hits in *text*.

    A keyword counts as a hit when it occurs as a substring of the
    lower-cased text (so e.g. "hang" also hits inside "change"). Ties go to
    the category listed first in TOPIC_KEYWORDS; "" is returned when no
    keyword matches at all.
    """
    haystack = text.lower()
    hit_counts = {
        topic: sum(1 for phrase in phrases if phrase in haystack)
        for topic, phrases in TOPIC_KEYWORDS.items()
    }
    if not hit_counts:
        return ""

    # max() keeps the first maximal key, which preserves first-listed-wins.
    leader = max(hit_counts, key=hit_counts.get)
    return leader if hit_counts[leader] > 0 else ""

def predict_subcategory(category, text):
    """Return the first subcategory of *category* whose keywords hit *text*.

    Returns "" when *category* is empty, has no entry in
    SUBCATEGORY_KEYWORDS, or none of its keyword phrases occurs as a
    substring of the lower-cased text.
    """
    if not category:
        return ""
    if category not in SUBCATEGORY_KEYWORDS:
        return ""

    haystack = text.lower()
    candidates = (
        name
        for name, phrases in SUBCATEGORY_KEYWORDS[category].items()
        if any(phrase in haystack for phrase in phrases)
    )
    return next(candidates, "")

def apply_rules(text, preds):
    """Run every RULE_OVERRIDES rule whose condition matches *text*.

    Rules are evaluated in list order and each matching override receives
    the result of the previous one. Returns the final prediction dict
    (*preds* itself when no rule matches).
    """
    current = preds
    for matches, rewrite in RULE_OVERRIDES:
        if not matches(text):
            continue
        current = rewrite(current)
    return current


# ============================================================
# 🔄 Core Row Processing
# ============================================================
def process_row(row):
    """Classify one input record and return its output record.

    *row* is a mapping read with .get(); the keys used are "Conversation Id"
    and "transcripts". The result always has the keys "Conversation Id",
    "Consumer_Text", "Category", "Subcategory" and "Sentiment". The three
    label fields stay "" when no consumer text is found or the text is not
    detected as English.
    """
    conv_id = row.get("Conversation Id", "")
    consumer_text = extract_consumer_text(str(row.get("transcripts", "")))

    category = subcategory = sentiment = ""
    if consumer_text.strip() and is_english(consumer_text):
        labels = {
            "category": predict_category(consumer_text),
            "subcategory": "",
            "sentiment": hybrid_sentiment(consumer_text),
        }
        labels["subcategory"] = predict_subcategory(labels["category"], consumer_text)
        # Hard rules run last so they can overwrite the keyword predictions.
        labels = apply_rules(consumer_text, labels)
        category = labels["category"]
        subcategory = labels["subcategory"]
        sentiment = labels["sentiment"]

    return {
        "Conversation Id": conv_id,
        "Consumer_Text": consumer_text,
        "Category": category,
        "Subcategory": subcategory,
        "Sentiment": sentiment,
    }


# ============================================================
# 🧮 Run NLP Pipeline with Mercury Integration
# ============================================================
def _load_dataset(file_path, original_name):
    """Read *file_path* into a DataFrame, trying the CSV and Excel parsers.

    The parser suggested by *original_name*'s extension is tried first
    (CSV for ".csv", Excel otherwise) and the other one is the fallback, so
    a mislabelled upload is still readable.

    Raises:
        ValueError: if neither parser can read the file. The message lists
            every attempt's error and the last error is chained as cause.
    """
    readers = {"Excel": pd.read_excel, "CSV": pd.read_csv}
    if str(original_name).lower().endswith(".csv"):
        attempt_order = ("CSV", "Excel")
    else:
        attempt_order = ("Excel", "CSV")

    failures = []
    last_error = None
    for label in attempt_order:
        try:
            return readers[label](file_path)
        # Deliberately `Exception`, not a bare `except`: parser failures are
        # varied, but KeyboardInterrupt/SystemExit must still propagate.
        except Exception as exc:
            failures.append(f"{label} read error: {exc}")
            last_error = exc

    raise ValueError("Could not read file. " + "; ".join(failures)) from last_error


def run_pipeline(uploaded_file):
    """Classify every row of the uploaded dataset and save the result.

    Args:
        uploaded_file: upload object exposing ``filepath`` and, optionally,
            ``filename`` or ``name`` (used only to pick which parser to try
            first).

    Returns:
        A DataFrame with one row per input row, built from the dicts
        returned by process_row(). The same data is written to
        "sentiment_output.csv" in the current working directory.

    Raises:
        ValueError: if the file cannot be read as CSV or Excel, or if it
            lacks the 'Conversation Id' or 'transcripts' column.
    """
    start = time.time()

    # Mercury 4.x: use .filepath instead of .path
    file_path = uploaded_file.filepath

    # Original filename for the extension check; tolerate a missing or None
    # attribute instead of failing later on `.lower()`.
    original_name = (
        getattr(uploaded_file, "filename", "")
        or getattr(uploaded_file, "name", "")
        or ""
    )

    df = _load_dataset(file_path, original_name)

    # Validate required columns
    if "Conversation Id" not in df.columns or "transcripts" not in df.columns:
        raise ValueError("Input file must contain 'Conversation Id' and 'transcripts' columns.")

    # Parallel processing; executor.map keeps the results in input order.
    with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        results = list(executor.map(process_row, df.to_dict("records")))

    # Create output dataframe
    out_df = pd.DataFrame(results)
    out_df.to_csv("sentiment_output.csv", index=False)

    elapsed = time.time() - start
    print(f"✅ Completed in {elapsed:.2f}s. Processed {len(out_df)} rows.")

    return out_df


# ============================================================
# 🖥️ Execute & Display Results
# ============================================================
# UI flow: validate the inputs, run the pipeline, then render the outcome.
# The emoji in the messages below had been mis-encoded; they are restored.
if run_button.clicked:
    if file is None or not hasattr(file, 'filepath') or file.filepath is None:
        mr.Markdown("### ⚠️ Please upload a .csv or .xlsx file before running the pipeline.")
    elif nlp is None:
        mr.Markdown("### ❌ Error: spaCy model not installed")
        mr.Markdown("Please run in your terminal: `python -m spacy download en_core_web_sm`")
    else:
        try:
            mr.Markdown("### 🔄 Processing...")
            df_result = run_pipeline(file)

            mr.Markdown("### ✅ Processing Complete!")
            mr.Markdown(f"**Total rows processed:** {len(df_result)}")

            mr.Markdown("### 📊 Results Preview (First 100 rows)")
            # A bare expression is only auto-rendered when it is the last
            # top-level statement of a cell. Nested inside if/try its value
            # is discarded, so the preview must be displayed explicitly.
            # `display` is the builtin that IPython provides in notebooks.
            display(df_result.head(100))

            # Download link (relative path to the file run_pipeline wrote)
            mr.Markdown("### 📥 Download Results")
            mr.Markdown("[Download sentiment_output.csv](sentiment_output.csv)")

        except Exception as e:
            # Top-level boundary: report any pipeline failure in the UI.
            mr.Markdown("### ❌ Error processing file")
            mr.Markdown(f"**Error details:** {str(e)}")
else:
    mr.Markdown("### 👋 Welcome to NLP Text Classification Dashboard")
    mr.Markdown("1. Upload your CSV or Excel file containing 'Conversation Id' and 'transcripts' columns")
    mr.Markdown("2. Click the 'Run NLP Pipeline' button to start processing")