<a href="https://colab.research.google.com/github/toecm/iedi-mas/blob/main/IEDI_MAS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install -q openai-whisper rapidfuzz pandas gradio datasets transformers torchaudio torch librosa pydub ffmpeg-python jiwer google-generativeai python-dotenv requests yt-dlp soundfile

import os
import glob
import torch
import whisper
import pandas as pd
import requests
import tempfile
import yt_dlp
import random
import soundfile as sf
from pydub import AudioSegment
from rapidfuzz import process, fuzz
import google.generativeai as genai
from datasets import load_dataset, Audio
import gradio as gr
from dotenv import load_dotenv
from threading import Lock
from huggingface_hub import HfApi, hf_hub_download, upload_file

# --- CONFIGURATION ---
# Load variables from a local .env file FIRST so that every os.getenv() call
# below (HF_TOKEN included) can see them. Previously HF_TOKEN was read before
# load_dotenv() ran, so a token stored only in .env was silently ignored.
load_dotenv()

# Hugging Face dataset repository that stores the dialect CSVs and audio.
HF_REPO_ID = "toecm/IEDID"
HF_TOKEN = os.getenv("HF_TOKEN")  # Ensure this is set in your secrets!

# Local working copy of the dialect CSVs (one "<Dialect>.csv" per dialect).
DATASET_DIR = "/content/iuuy_datasets"
os.makedirs(DATASET_DIR, exist_ok=True)

# Optional integrations: each feature is skipped when its key is missing.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
PINATA_JWT = os.getenv("PINATA_JWT")

# Stays None when no Google API key is configured; consumers must handle None.
gemini_model = None
if GOOGLE_API_KEY:
    genai.configure(api_key=GOOGLE_API_KEY)
    gemini_model = genai.GenerativeModel("gemini-2.5-flash")

# --- HUGGING FACE SYNC MANAGER ---
class HFManager:
    """Keeps the local dataset directory in sync with the Hugging Face repo.

    All calls go through a single ``HfApi`` client created with ``HF_TOKEN``.
    Network failures are reported with ``print`` and never raised to the
    caller, so the app keeps running when the Hub is unreachable.
    """

    def __init__(self):
        self.api = HfApi(token=HF_TOKEN)
        # Not acquired by any method of this class; kept as a public attribute.
        self.lock = Lock()

    def pull_datasets(self):
        """Downloads all CSVs from the HF Repo to local runtime.

        Falls back to ``seed_initial_data()`` when the repo contains no CSV
        files or when listing/downloading fails.
        """
        print("‚¨áÔ∏è Pulling datasets from Hugging Face...")
        try:
            # Only the network calls live inside the try block, so an error
            # raised while seeding is not mistaken for a pull failure (which
            # would run the seeding a second time).
            files = self.api.list_repo_files(repo_id=HF_REPO_ID, repo_type="dataset")
            csv_files = [f for f in files if f.endswith(".csv")]

            for file in csv_files:
                hf_hub_download(
                    repo_id=HF_REPO_ID,
                    filename=file,
                    repo_type="dataset",
                    local_dir=DATASET_DIR,
                    token=HF_TOKEN
                )
                print(f"   -> Downloaded: {file}")
        except Exception as e:
            print(f"‚ùå HF Pull Error: {e}")
            # Fallback to seed if connection fails
            seed_initial_data()
            return

        if not csv_files:
            print("‚ö†Ô∏è No CSVs found in HF Repo. Seeding initial data...")
            seed_initial_data()  # Fallback to seeding if empty

    def push_update(self, filepath, commit_msg="Update from IEDI-MAS"):
        """Uploads a specific modified CSV back to HF.

        Args:
            filepath: Local path of the file; it is stored at the repo root
                under its base name.
            commit_msg: Commit message recorded on the Hub.

        Errors are printed, not raised.
        """
        filename = os.path.basename(filepath)
        print(f"‚¨ÜÔ∏è Pushing update to Hugging Face: {filename}...")
        try:
            self.api.upload_file(
                path_or_fileobj=filepath,
                path_in_repo=filename,
                repo_id=HF_REPO_ID,
                repo_type="dataset",
                commit_message=commit_msg
            )
            print("‚úÖ Sync Complete!")
        except Exception as e:
            print(f"‚ùå HF Push Error: {e}")

    def upload_audio_sample(self, audio_path, dialect):
        """Uploads an audio file to HF and returns its in-repo path.

        Args:
            audio_path: Local path of the audio file.
            dialect: Dialect name, used (after stripping whitespace) as the
                sub-folder under ``audio/``.

        Returns:
            The path inside the repo (``audio/<dialect>/<filename>``), or
            ``None`` when the upload fails.
        """
        clean_dialect = dialect.strip()
        filename = os.path.basename(audio_path)

        # Structure: audio/<dialect>/<filename>, e.g. audio/Nigerian English/x.wav
        hf_path = f"audio/{clean_dialect}/{filename}"

        print(f"‚¨ÜÔ∏è Uploading audio: {hf_path}...")
        try:
            self.api.upload_file(
                path_or_fileobj=audio_path,
                path_in_repo=hf_path,
                repo_id=HF_REPO_ID,
                repo_type="dataset",
                commit_message=f"Add audio sample for {clean_dialect}"
            )
            return hf_path
        except Exception as e:
            print(f"‚ùå Audio Upload Error: {e}")
            return None

# Module-level HFManager instance; seed_initial_data() and AgentTrust call it
# to push CSVs and audio samples to the Hugging Face repo.
hf_manager = HFManager()

# --- INITIAL DATA SEEDING (FULL LISTS) ---
def seed_initial_data():
    """Create the built-in dialect CSVs locally and push them to Hugging Face.

    For every dialect in the built-in table, writes ``<DATASET_DIR>/<dialect>.csv``
    with the columns ``Utterance``, ``Clarification``, ``Syntax_Pattern``,
    ``Dialect`` and an empty ``file_name`` column, then uploads the file via
    ``hf_manager.push_update``. Dialects whose CSV already exists locally are
    left untouched (neither rewritten nor pushed).
    """
    initial_data = {
        "Nigerian English": [
            {"Utterance": "I beg, let's go.", "Clarification": "Please, let's leave now.", "Syntax_Pattern": r"\bi\s?beg\b"},
            {"Utterance": "Do quick.", "Clarification": "Hurry up.", "Syntax_Pattern": r"\bdo\s?quick\b"},
            {"Utterance": "I am coming.", "Clarification": "I will be back in a moment.", "Syntax_Pattern": r"\bi\s?am\s?coming\b"},
            {"Utterance": "He is not on seat.", "Clarification": "He is not currently at his desk.", "Syntax_Pattern": r"\bon\s?seat\b"},
            {"Utterance": "Don't flash me.", "Clarification": "Don't call and hang up immediately.", "Syntax_Pattern": r"\bflash\s?me\b"},
            {"Utterance": "How far?", "Clarification": "How are you doing?", "Syntax_Pattern": r"\bhow\s?far\b"},
            {"Utterance": "I want to dash him money.", "Clarification": "I want to give him a gift/tip.", "Syntax_Pattern": r"\bdash\b"},
            {"Utterance": "The traffic was a go-slow.", "Clarification": "The traffic was moving very slowly.", "Syntax_Pattern": r"\bgo-?slow\b"},
            {"Utterance": "Have you eaten?", "Clarification": "How are you? (Greeting)", "Syntax_Pattern": r"\bhave\s?you\s?eaten\b"},
            {"Utterance": "Can you branch at the shop?", "Clarification": "Can you stop by the shop?", "Syntax_Pattern": r"\bbranch\s?at\b"},
            {"Utterance": "I will escort you out.", "Clarification": "I will walk you to the door.", "Syntax_Pattern": r"\bescort\s?you\b"},
            {"Utterance": "Stop horn-ing.", "Clarification": "Stop honking the horn.", "Syntax_Pattern": r"\bhorn(ing)?\b"},
            {"Utterance": "He is my junior brother.", "Clarification": "He is my younger brother.", "Syntax_Pattern": r"\bjunior\s?brother\b"},
            {"Utterance": "I will alight here.", "Clarification": "I will get off the bus here.", "Syntax_Pattern": r"\balight\b"},
            {"Utterance": "She has taken in.", "Clarification": "She has become pregnant.", "Syntax_Pattern": r"\btaken\s?in\b"},
            {"Utterance": "More grease to your elbow.", "Clarification": "Good job / Keep it up.", "Syntax_Pattern": r"\bmore\s?grease\b"},
            {"Utterance": "No wahala.", "Clarification": "No problem / No trouble.", "Syntax_Pattern": r"\bwahala\b"},
            {"Utterance": "You are talking rubbish.", "Clarification": "You are not making sense.", "Syntax_Pattern": r"\brubbish\b"},
            {"Utterance": "Let me gist you.", "Clarification": "Let me tell you the gossip/story.", "Syntax_Pattern": r"\bgist\b"},
            {"Utterance": "Are you tight?", "Clarification": "Are you busy?", "Syntax_Pattern": r"\bare\s?you\s?tight\b"}
        ],
        "Korean English": [
            {"Utterance": "He was living.", "Clarification": "He was alive.", "Syntax_Pattern": r"\bwas\s?living\b"},
            {"Utterance": "Please check once.", "Clarification": "Please check it briefly.", "Syntax_Pattern": r"\bcheck\s?once\b"},
            {"Utterance": "I need your sign.", "Clarification": "I need your signature/autograph.", "Syntax_Pattern": r"\byour\s?sign\b"},
            {"Utterance": "We had a meeting.", "Clarification": "We went on a group blind date.", "Syntax_Pattern": r"\ba\s?meeting\b"},
            {"Utterance": "Turn the handle.", "Clarification": "Turn the steering wheel.", "Syntax_Pattern": r"\bhandle\b"},
            {"Utterance": "This service is free.", "Clarification": "This extra item is on the house.", "Syntax_Pattern": r"\bservice\b"},
            {"Utterance": "Don't do over.", "Clarification": "Don't exaggerate.", "Syntax_Pattern": r"\bdo\s?over\b"},
            {"Utterance": "My condition is bad.", "Clarification": "I am not feeling well.", "Syntax_Pattern": r"\bcondition\s?is\s?bad\b"},
            {"Utterance": "Did you eat lunch?", "Clarification": "How are you?", "Syntax_Pattern": r"\bdid\s?you\s?eat\b"},
            {"Utterance": "Fighting!", "Clarification": "Cheer up! / You can do it!", "Syntax_Pattern": r"\bfighting!?\b"},
            {"Utterance": "He is a gag man.", "Clarification": "He is a comedian.", "Syntax_Pattern": r"\bgag\s?man\b"},
            {"Utterance": "I watched a cunning movie.", "Clarification": "I watched a cheating/affair movie.", "Syntax_Pattern": r"\bcunning\b"},
            {"Utterance": "Use the concent.", "Clarification": "Use the power outlet/socket.", "Syntax_Pattern": r"\bconcent\b"},
            {"Utterance": "Did you see the grand open?", "Clarification": "Did you see the grand opening?", "Syntax_Pattern": r"\bgrand\s?open\b"},
            {"Utterance": "Too much skinship.", "Clarification": "Too much physical affection.", "Syntax_Pattern": r"\bskinship\b"},
            {"Utterance": "I will dutch pay.", "Clarification": "Let's split the bill.", "Syntax_Pattern": r"\bdutch\s?pay\b"},
            {"Utterance": "She is glamour.", "Clarification": "She is curvaceous/voluptuous.", "Syntax_Pattern": r"\bglamour\b"},
            {"Utterance": "One shot!", "Clarification": "Bottoms up!", "Syntax_Pattern": r"\bone\s?shot\b"},
            {"Utterance": "It is hard training.", "Clarification": "It is intensive training.", "Syntax_Pattern": r"\bhard\s?training\b"},
            {"Utterance": "Keep the promise.", "Clarification": "Keep your word.", "Syntax_Pattern": r"\bkeep\s?the\s?promise\b"}
        ],
        "Indian English": [
            {"Utterance": "I will prepone the meeting.", "Clarification": "I will bring the meeting forward.", "Syntax_Pattern": r"\bprepone\b"},
            {"Utterance": "Do the needful.", "Clarification": "Please do what is required.", "Syntax_Pattern": r"\bdo\s?the\s?needful\b"},
            {"Utterance": "He passed out last year.", "Clarification": "He graduated last year.", "Syntax_Pattern": r"\bpassed\s?out\b"},
            {"Utterance": "I have a doubt.", "Clarification": "I have a question.", "Syntax_Pattern": r"\bhave\s?a\s?doubt\b"},
            {"Utterance": "Please revert back.", "Clarification": "Please reply.", "Syntax_Pattern": r"\brevert\s?back\b"},
            {"Utterance": "She is my cousin sister.", "Clarification": "She is my female cousin.", "Syntax_Pattern": r"\bcousin\s?sister\b"},
            {"Utterance": "What is your good name?", "Clarification": "What is your name.", "Syntax_Pattern": r"\bgood\s?name\b"},
            {"Utterance": "I am out of station.", "Clarification": "I am out of town.", "Syntax_Pattern": r"\bout\s?of\s?station\b"},
            {"Utterance": "Tell me only.", "Clarification": "Just tell me.", "Syntax_Pattern": r"\btell\s?me\s?only\b"},
            {"Utterance": "He is eating my head.", "Clarification": "He is annoying me.", "Syntax_Pattern": r"\beating\s?my\s?head\b"},
            {"Utterance": "Are you coming, na?", "Clarification": "Are you coming?", "Syntax_Pattern": r",\s?na\b"},
            {"Utterance": "Let us discuss about this.", "Clarification": "Let us discuss this.", "Syntax_Pattern": r"\bdiscuss\s?about\b"},
            {"Utterance": "Kindly adjust.", "Clarification": "Please make room / accommodate.", "Syntax_Pattern": r"\bkindly\s?adjust\b"},
            {"Utterance": "Timepass.", "Clarification": "Something to pass the time / trivial.", "Syntax_Pattern": r"\btimepass\b"},
            {"Utterance": "He is sitting on my head.", "Clarification": "He is pressuring me.", "Syntax_Pattern": r"\bsitting\s?on\s?my\s?head\b"},
            {"Utterance": "Today morning.", "Clarification": "This morning.", "Syntax_Pattern": r"\btoday\s?morning\b"},
            {"Utterance": "Years back.", "Clarification": "Years ago.", "Syntax_Pattern": r"\byears\s?back\b"},
            {"Utterance": "First-class.", "Clarification": "Excellent / Very good.", "Syntax_Pattern": r"\bfirst-?class\b"},
            {"Utterance": "Don't eat my brain.", "Clarification": "Stop bothering me.", "Syntax_Pattern": r"\beat\s?my\s?brain\b"},
            {"Utterance": "Shift a bit.", "Clarification": "Move over slightly.", "Syntax_Pattern": r"\bshift\s?a\s?bit\b"}
        ],
        "Malaysian English": [
            {"Utterance": "Can or not?", "Clarification": "Is that possible?", "Syntax_Pattern": r"\bcan\s?or\s?not\b"},
            {"Utterance": "I send you to airport.", "Clarification": "I will drive you to the airport.", "Syntax_Pattern": r"\bi\s?send\s?you\b"},
            {"Utterance": "Why you so like that?", "Clarification": "Why are you behaving that way?", "Syntax_Pattern": r"\bso\s?like\s?that\b"},
            {"Utterance": "Got problem ah?", "Clarification": "Is there a problem?", "Syntax_Pattern": r"\bgot\s?problem\b"},
            {"Utterance": "Where got?", "Clarification": "That's not true / I don't have that.", "Syntax_Pattern": r"\bwhere\s?got\b"},
            {"Utterance": "Don't play play.", "Clarification": "Don't fool around / Take this seriously.", "Syntax_Pattern": r"\bplay\s?play\b"},
            {"Utterance": "Open the light.", "Clarification": "Turn on the light.", "Syntax_Pattern": r"\bopen\s?the\s?light\b"},
            {"Utterance": "Close the radio.", "Clarification": "Turn off the radio.", "Syntax_Pattern": r"\bclose\s?the\s?radio\b"},
            {"Utterance": "Are you coming onot?", "Clarification": "Are you coming or not?", "Syntax_Pattern": r"\bonot\b"},
            {"Utterance": "He terror at math.", "Clarification": "He is excellent at math.", "Syntax_Pattern": r"\bterror\s?at\b"},
            {"Utterance": "Can lah.", "Clarification": "It is certainly possible.", "Syntax_Pattern": r"\bcan\s?lah\b"},
            {"Utterance": "You sit first.", "Clarification": "Please take a seat.", "Syntax_Pattern": r"\bsit\s?first\b"},
            {"Utterance": "Gostan the car.", "Clarification": "Reverse the car.", "Syntax_Pattern": r"\bgostan\b"},
            {"Utterance": "Simply say only.", "Clarification": "Just guessing / Just saying.", "Syntax_Pattern": r"\bsimply\s?say\b"},
            {"Utterance": "One two jus.", "Clarification": "Rock, paper, scissors.", "Syntax_Pattern": r"\bjus\b"},
            {"Utterance": "Also can.", "Clarification": "That works too.", "Syntax_Pattern": r"\balso\s?can\b"},
            {"Utterance": "Sure mou?", "Clarification": "Are you sure?", "Syntax_Pattern": r"\bsure\s?mou\b"},
            {"Utterance": "Action only.", "Clarification": "He is just showing off.", "Syntax_Pattern": r"\baction\s?only\b"},
            {"Utterance": "Please off the fan.", "Clarification": "Please turn off the fan.", "Syntax_Pattern": r"\bplease\s?off\b"},
            {"Utterance": "He is outstation.", "Clarification": "He is out of town.", "Syntax_Pattern": r"\boutstation\b"}
        ],
        "Indonesian English": [
            {"Utterance": "Please open your shoes.", "Clarification": "Please take off your shoes.", "Syntax_Pattern": r"\bopen\s?(your|the)?\s?shoes\b"},
            {"Utterance": "Close the light.", "Clarification": "Turn off the light.", "Syntax_Pattern": r"\bclose\s?(the)?\s?light\b"},
            {"Utterance": "Thanks before.", "Clarification": "Thanks in advance.", "Syntax_Pattern": r"\bthanks\s?before\b"},
            {"Utterance": "I will follow you.", "Clarification": "I will accompany/join you.", "Syntax_Pattern": r"\bfollow\s?you\b"},
            {"Utterance": "Join with us.", "Clarification": "Join us.", "Syntax_Pattern": r"\bjoin\s?with\s?us\b"},
            {"Utterance": "I am boring here.", "Clarification": "I am bored here.", "Syntax_Pattern": r"\bam\s?boring\b"},
            {"Utterance": "Don't be arrogant.", "Clarification": "Don't be snobbish.", "Syntax_Pattern": r"\barrogant\b"},
            {"Utterance": "Keep your bag.", "Clarification": "Put away your bag.", "Syntax_Pattern": r"\bkeep\s?your\b"},
            {"Utterance": "Have you ever go to Bali?", "Clarification": "Have you ever been to Bali?", "Syntax_Pattern": r"\bhave\s?you\s?ever\s?go\b"},
            {"Utterance": "Same same.", "Clarification": "It is the same.", "Syntax_Pattern": r"\bsame\s?same\b"},
            {"Utterance": "First first.", "Clarification": "Wait a moment / Me first.", "Syntax_Pattern": r"\bfirst\s?first\b"},
            {"Utterance": "Ask to him.", "Clarification": "Ask him.", "Syntax_Pattern": r"\bask\s?to\b"},
            {"Utterance": "I already eat.", "Clarification": "I have already eaten.", "Syntax_Pattern": r"\balready\s?eat\b"},
            {"Utterance": "Buy for me food.", "Clarification": "Buy food for me.", "Syntax_Pattern": r"\bbuy\s?for\s?me\b"},
            {"Utterance": "The food is very delicious.", "Clarification": "The food is delicious.", "Syntax_Pattern": r"\bvery\s?delicious\b"},
            {"Utterance": "Make it fast.", "Clarification": "Hurry up.", "Syntax_Pattern": r"\bmake\s?it\s?fast\b"},
            {"Utterance": "Is it?", "Clarification": "Really? / Is that so?", "Syntax_Pattern": r"\bis\s?it\??$"},
            {"Utterance": "Where you stay?", "Clarification": "Where do you live?", "Syntax_Pattern": r"\bwhere\s?you\s?stay\b"},
            {"Utterance": "Take profit.", "Clarification": "Benefit from / Take advantage.", "Syntax_Pattern": r"\btake\s?profit\b"},
            {"Utterance": "Please enter inside.", "Clarification": "Please come in.", "Syntax_Pattern": r"\benter\s?inside\b"}
        ],
        "American English": [
            {"Utterance": "I'm gonna grab some coffee.", "Clarification": "I'm going to get some coffee.", "Syntax_Pattern": r"\bgrab\s?some\b"},
            {"Utterance": "My bad.", "Clarification": "My mistake / I apologize.", "Syntax_Pattern": r"\bmy\s?bad\b"},
            {"Utterance": "I'll take a rain check.", "Clarification": "I will reschedule for another time.", "Syntax_Pattern": r"\brain\s?check\b"},
            {"Utterance": "Let's touch base later.", "Clarification": "Let's contact each other later.", "Syntax_Pattern": r"\btouch\s?base\b"},
            {"Utterance": "Shoot me an email.", "Clarification": "Send me an email.", "Syntax_Pattern": r"\bshoot\s?me\b"},
            {"Utterance": "Piece of cake.", "Clarification": "Very easy.", "Syntax_Pattern": r"\bpiece\s?of\s?cake\b"},
            {"Utterance": "Spill the beans.", "Clarification": "Reveal the secret.", "Syntax_Pattern": r"\bspill\s?the\s?beans\b"},
            {"Utterance": "Break a leg.", "Clarification": "Good luck.", "Syntax_Pattern": r"\bbreak\s?a\s?leg\b"},
            {"Utterance": "Wrap it up.", "Clarification": "Finish it.", "Syntax_Pattern": r"\bwrap\s?it\s?up\b"},
            {"Utterance": "Hang tight.", "Clarification": "Wait a moment.", "Syntax_Pattern": r"\bhang\s?tight\b"},
            {"Utterance": "Heads up.", "Clarification": "Warning / Notification.", "Syntax_Pattern": r"\bheads\s?up\b"},
            {"Utterance": "It's a no-brainer.", "Clarification": "It's an obvious decision.", "Syntax_Pattern": r"\bno-brainer\b"},
            {"Utterance": "Keep me in the loop.", "Clarification": "Keep me informed.", "Syntax_Pattern": r"\bin\s?the\s?loop\b"},
            {"Utterance": "Hit the books.", "Clarification": "Study.", "Syntax_Pattern": r"\bhit\s?the\s?books\b"},
            {"Utterance": "Pitch in.", "Clarification": "Contribute / Help out.", "Syntax_Pattern": r"\bpitch\s?in\b"},
            {"Utterance": "Cold turkey.", "Clarification": "Stopping abruptly.", "Syntax_Pattern": r"\bcold\s?turkey\b"},
            # Fixed: was r"...by\\s?ear\b", which required a literal backslash
            # and therefore could never match "play it by ear".
            {"Utterance": "Play it by ear.", "Clarification": "Decide as we go along.", "Syntax_Pattern": r"\bplay\s?it\s?by\s?ear\b"},
            {"Utterance": "Under the weather.", "Clarification": "Sick / Ill.", "Syntax_Pattern": r"\bunder\s?the\s?weather\b"},
            {"Utterance": "Ring a bell?", "Clarification": "Sound familiar?", "Syntax_Pattern": r"\bring\s?a\s?bell\b"},
            {"Utterance": "Cut to the chase.", "Clarification": "Get to the point.", "Syntax_Pattern": r"\bcut\s?to\s?the\s?chase\b"}
        ]
    }

    for dialect, rows in initial_data.items():
        filepath = os.path.join(DATASET_DIR, f"{dialect}.csv")
        if os.path.exists(filepath):
            # Never overwrite a CSV that is already present locally.
            continue

        df = pd.DataFrame(rows)
        df["Dialect"] = dialect
        # Empty placeholder column; later filled with the in-repo audio path
        # of a sample recorded for the utterance.
        df["file_name"] = ""
        df.to_csv(filepath, index=False)
        print(f"‚úÖ Created local seed: {filepath}")
        # Push seed data to HF immediately
        hf_manager.push_update(filepath, "Initial Seed")

# Load data on startup: runs at import time and fills DATASET_DIR with the
# CSVs from the Hugging Face repo. pull_datasets() falls back to
# seed_initial_data() when the repo has no CSVs or the pull fails.
hf_manager.pull_datasets()

# --- AGENT 1: INPUT (Whisper) ---
class AgentInput:
    """Agent 1: turns audio (local file or YouTube URL) into text segments."""

    def __init__(self, model_size="small"):
        """Load a Whisper model of ``model_size`` on CUDA when available, else CPU."""
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"üëÇ Agent 1 (Input) Online: Loading Whisper ({model_size}) on {device}...")
        self.model = whisper.load_model(model_size, device=device)

    def transcribe(self, audio_path, language="en"):
        """Transcribe ``audio_path`` and return one dict per segment.

        Args:
            audio_path: Path of the audio file; a falsy value yields ``[]``.
            language: Language code passed through to the model.

        Returns:
            A list of ``{"speaker", "text", "start", "end"}`` dicts, where
            ``speaker`` is always the placeholder ``"Speaker"`` and ``text``
            is stripped of surrounding whitespace.
        """
        if not audio_path:
            return []

        result = self.model.transcribe(audio_path, language=language)

        # The audio file is intentionally NOT deleted here: it may still be
        # uploaded later as a sample when the user submits feedback.
        return [
            {"speaker": "Speaker", "text": seg["text"].strip(), "start": seg["start"], "end": seg["end"]}
            for seg in result["segments"]
        ]

    def get_audio_from_youtube(self, url):
        """Download the best audio stream of ``url`` with yt-dlp.

        The file is written as ``external_audio.<ext>`` relative to the
        current working directory.

        Returns:
            A ``(local_filename, title)`` tuple; ``title`` defaults to
            ``'No Title'`` when the metadata has none.
        """
        opts = {'format': 'bestaudio', 'outtmpl': 'external_audio.%(ext)s', 'quiet': True}
        with yt_dlp.YoutubeDL(opts) as ydl:
            info = ydl.extract_info(url, download=True)
            return ydl.prepare_filename(info), info.get('title', 'No Title')

# --- AGENT 2: INTERPRETATION (Dynamic Brain) ---
class AgentInterpretation:
    """Agent 2: matches transcribed text against the dialect phrase CSVs.

    Attributes:
        df: All dialect CSVs concatenated (empty when nothing could be loaded).
        lookup_list: ``df["Utterance"]`` as a list, index-aligned with ``df``.
        gemini_model: Optional LLM client used to generate regex patterns.
    """

    def __init__(self, gemini_model_instance=None):
        self.df = pd.DataFrame()
        self.lookup_list = []
        self.gemini_model = gemini_model_instance  # Store the model instance
        print("üß† Agent 2 (Interpretation) Online: Loading Datasets...")
        self.refresh_knowledge_base()

    def refresh_knowledge_base(self):
        """Reload every ``*.csv`` in ``DATASET_DIR`` into ``df`` / ``lookup_list``.

        The ``Dialect`` column of each file is overwritten with the file's
        base name. Files that fail to load are reported and skipped.
        """
        frames = []
        for filename in glob.glob(os.path.join(DATASET_DIR, "*.csv")):
            try:
                dialect_name = os.path.splitext(os.path.basename(filename))[0]
                temp_df = pd.read_csv(filename)
                # Ensure Dialect column matches filename
                temp_df["Dialect"] = dialect_name
                frames.append(temp_df)
            except Exception as e:
                print(f"‚ö†Ô∏è Error loading {filename}: {e}")

        if not frames:
            # Reset both together so df never holds rows that lookup_list
            # no longer indexes.
            self.df = pd.DataFrame()
            self.lookup_list = []
            return

        self.df = pd.concat(frames, ignore_index=True)
        if "Utterance" in self.df.columns:
            self.lookup_list = self.df["Utterance"].tolist()
            print(f"   -> Loaded {len(self.df)} dialect phrases into Memory.")
        else:
            self.lookup_list = []

    def detect_dialect(self, text, threshold=75):
        """Fuzzy-match ``text`` against the known utterances.

        Returns:
            ``(dialect, clarification, matched_utterance)`` for the best match
            scoring at least ``threshold``; otherwise ``("Unknown", None, None)``.
        """
        if not self.lookup_list or not text:
            return "Unknown", None, None

        # Fast Fuzzy Search
        match = process.extractOne(text, self.lookup_list, scorer=fuzz.ratio)

        if match:
            best_utterance, score, index = match
            if score >= threshold:
                # lookup_list is index-aligned with df (see refresh_knowledge_base).
                row = self.df.iloc[index]
                return row["Dialect"], row["Clarification"], best_utterance

        return "Unknown", None, None

    @staticmethod
    def _fallback_pattern(utterance):
        """Build a regex for ``utterance`` without an LLM.

        Lower-cases the phrase, drops leading/trailing punctuation (a ``\\b``
        next to punctuation cannot match at the end of the text), escapes regex
        metacharacters in each word and joins the words with ``\\s?``.
        """
        import re

        text = (utterance or "").lower().strip()
        core = re.sub(r"^\W+|\W+$", "", text)
        if not core:
            # Nothing but punctuation/whitespace: match it literally.
            return re.escape(text)
        return r"\b" + r"\s?".join(re.escape(word) for word in core.split()) + r"\b"

    @staticmethod
    def _clean_llm_regex(raw):
        """Extract a compilable regex from an LLM reply, or return ``None``.

        Strips Markdown backticks/code fences and one pair of surrounding
        quotes (optionally with an ``r`` prefix), then checks that the result
        compiles.
        """
        import re

        candidate = (raw or "").strip().strip("`").strip()
        # A fenced reply may start with a language tag line; keep the last line.
        lines = [line.strip() for line in candidate.splitlines() if line.strip()]
        candidate = lines[-1] if lines else ""

        quoted = re.fullmatch(r"""r?(["'])(.*)\1""", candidate)
        if quoted:
            candidate = quoted.group(2)

        if not candidate:
            return None
        try:
            re.compile(candidate)
        except re.error:
            return None
        return candidate

    def generate_syntax_pattern(self, utterance):
        """Self-Training: Uses LLM to generate regex by studying existing data.

        Without a configured LLM, or when the LLM call fails or returns
        something that is not a valid regex, a pattern is derived directly
        from the phrase instead.

        Returns:
            A regex string for ``utterance``.
        """
        if not self.gemini_model:
            return self._fallback_pattern(utterance)

        # Few-Shot Learning from current dataset
        examples = []
        if not self.df.empty and "Syntax_Pattern" in self.df.columns:
            valid_rows = self.df.dropna(subset=["Syntax_Pattern"])
            if len(valid_rows) > 3:
                sample = valid_rows.sample(3)
                for _, row in sample.iterrows():
                    examples.append(f"Phrase: '{row['Utterance']}' -> Regex: '{row['Syntax_Pattern']}'")

        examples_text = "\n".join(examples)

        # Backslashes are doubled so the prompt contains the literal text
        # "\b" and "\s?" (a bare "\b" in a normal string is a backspace char).
        prompt = f"""
        Role: Computational Linguist.
        Task: Create a Python Regex for this dialect phrase.
        Context: Study these existing examples:
        {examples_text}

        Target Phrase: '{utterance}'

        Requirements:
        1. Use \\b boundaries.
        2. Make spaces optional (\\s?).
        3. Output ONLY the regex string.
        """
        try:
            response = self.gemini_model.generate_content(prompt)
            pattern = self._clean_llm_regex(response.text)
        except Exception as e:
            print(f"LLM Error: {e}")
            pattern = None

        if pattern is None:
            return self._fallback_pattern(utterance)
        return pattern

# --- AGENT 4: TRUST (Feedback & Ledger) ---
class AgentTrust:
    """Agent 4: records user feedback and folds accepted suggestions into the CSVs."""

    def __init__(self):
        # Serializes read-modify-write cycles on the dataset CSVs.
        self.lock = Lock()
        print("üõ°Ô∏è Agent 4 (Trust) Online: Connected to IPFS Module.")

    def log_to_ipfs(self, data):
        """Pin ``data`` as JSON via the Pinata API.

        Returns:
            The ``IpfsHash`` from the response; ``"Error"`` when the response
            has no such field; ``"Local-Log-Only"`` when ``PINATA_JWT`` is not
            configured; ``"IPFS_Fail"`` when the request or decoding fails.
        """
        if not PINATA_JWT:
            return "Local-Log-Only"

        headers = {"Authorization": f"Bearer {PINATA_JWT}"}
        try:
            res = requests.post(
                "https://api.pinata.cloud/pinning/pinJSONToIPFS",
                headers=headers, json=data,
                timeout=30,  # never block the UI forever on a stalled request
            )
            return res.json().get("IpfsHash", "Error")
        except Exception as e:
            # Best-effort audit log: report the problem and carry on.
            print(f"IPFS Error: {e}")
            return "IPFS_Fail"

    def process_feedback(self, action, original_text, dialect, clarification, brain_agent, audio_path=None):
        """Log a feedback event and, for ``"Suggest Update"``, learn from it.

        Args:
            action: Feedback action label; only ``"Suggest Update"`` changes data.
            original_text: The utterance the feedback refers to.
            dialect: Dialect name supplied by the user.
            clarification: Plain-language meaning supplied by the user.
            brain_agent: Object providing ``generate_syntax_pattern`` and
                ``refresh_knowledge_base``.
            audio_path: Optional local audio file to attach to the entry.

        Returns:
            A human-readable status message.
        """
        # 1. Log Audit Trail (IPFS)
        timestamp = pd.Timestamp.now().isoformat()
        feedback_data = {
            "original": original_text,
            "dialect": dialect,
            "clarification": clarification,
            "action": action,
            "timestamp": timestamp
        }
        cid = self.log_to_ipfs(feedback_data)
        msg = f"Feedback '{action}' logged. CID: {cid}"

        # 2. Update CSV & reload the knowledge base
        if action == "Suggest Update":
            syntax = brain_agent.generate_syntax_pattern(original_text)
            # Pass the audio path to be uploaded and saved against the text
            update_msg = self.update_dataset_csv(dialect, original_text, clarification, syntax, audio_path)
            msg += f"\n{update_msg}\nü§ñ Auto-Learned Syntax: {syntax}"

            # Immediate Learning: Reload Brain
            brain_agent.refresh_knowledge_base()

        return msg

    @staticmethod
    def _normalize_dialect(dialect):
        """Return the canonical dialect name, or ``""`` when none was given.

        The name is stripped and title-cased; ``" Dialect"`` is appended
        unless it already ends with ``"English"`` or ``"Dialect"``.
        """
        name = (dialect or "").strip().title()
        if name and not name.endswith(("English", "Dialect")):
            name += " Dialect"
        return name

    def update_dataset_csv(self, dialect, utterance, clarification, syntax, audio_path=None):
        """Insert or update ``utterance`` in the dialect's CSV and sync it to HF.

        An existing row with the same ``Utterance`` is updated in place;
        otherwise a new row is appended. When ``audio_path`` points to an
        existing file, it is renamed to a unique name, uploaded, and its
        in-repo path stored in ``file_name``.

        Returns:
            A status message (success or a description of the input error).
        """
        clean_dialect = AgentTrust._normalize_dialect(dialect)
        if not clean_dialect:
            return "‚ùå Error: No dialect specified."
        # The name becomes a file name and a repo folder; refuse anything
        # that could escape DATASET_DIR.
        if "/" in clean_dialect or "\\" in clean_dialect:
            return "‚ùå Error: Invalid dialect name."

        filepath = os.path.join(DATASET_DIR, f"{clean_dialect}.csv")
        columns = ["Utterance", "Dialect", "Clarification", "Syntax_Pattern", "file_name"]

        with self.lock:
            # Load or Create
            if os.path.exists(filepath):
                df = pd.read_csv(filepath)
            else:
                df = pd.DataFrame(columns=columns)

            # Ensure columns exist in case of legacy CSVs
            for col in columns:
                if col not in df.columns:
                    df[col] = clean_dialect if col == "Dialect" else ""
            # An all-empty column is read back as float64 (NaN); cast to
            # object so string values can be assigned without dtype errors.
            df = df.astype({col: object for col in columns})

            # 1. Upload Audio First (if provided)
            audio_ref = ""
            if audio_path and os.path.exists(audio_path):
                # Rename file to be unique before upload
                ext = os.path.splitext(audio_path)[1]
                unique_name = f"{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}_{random.randint(1000,9999)}{ext}"
                new_path = os.path.join(os.path.dirname(audio_path), unique_name)
                os.rename(audio_path, new_path)

                # Upload under the SAME normalized name used for the CSV so the
                # audio folder and the Dialect column agree; a failed upload
                # (None) is stored as an empty reference.
                audio_ref = hf_manager.upload_audio_sample(new_path, clean_dialect) or ""

            # 2. Update the matching row, or append a new one
            mask = df["Utterance"] == utterance
            if mask.any():
                df.loc[mask, "Clarification"] = clarification
                df.loc[mask, "Syntax_Pattern"] = syntax
                df.loc[mask, "Dialect"] = clean_dialect
                # Keep the previous audio reference when no new audio was uploaded.
                if audio_ref:
                    df.loc[mask, "file_name"] = audio_ref
            else:
                new_row = pd.DataFrame([
                    {
                        "Utterance": utterance,
                        "Dialect": clean_dialect,
                        "Clarification": clarification,
                        "Syntax_Pattern": syntax,
                        "file_name": audio_ref  # Save the HF audio path
                    }
                ])
                df = pd.concat([df, new_row], ignore_index=True)

            # Save Locally
            df.to_csv(filepath, index=False)

            # Sync to Hugging Face
            hf_manager.push_update(filepath, f"Updated {clean_dialect}: {utterance}")

            return f"‚úÖ Updated & Synced: {clean_dialect}.csv"

# --- AGENT 3: UX (The Interface Manager) ---
class AgentUX:
    """Agent 3: the Gradio front end that orchestrates the other agents.

    Attributes:
        input: Agent 1; must provide ``transcribe(audio_path, language)`` and
            ``get_audio_from_youtube(url)``.
        brain: Agent 2; must provide ``detect_dialect(text)`` returning a
            ``(dialect, clarification, matched_utterance)`` triple.
        trust: Agent 4; must provide ``process_feedback(...)``.
        last_audio_path: Path of the most recently analysed/imported audio,
            kept so a later "Suggest Update" can attach it to the feedback.
    """

    # Sentinel dropdown entry that reveals the free-text "new dialect" box.
    ADD_NEW_DIALECT = "+ Add New Dialect"
    # Column layout shared by the results table and the empty-result frame.
    RESULT_COLUMNS = ["Speaker", "Utterance", "Dialect", "Clarification"]

    def __init__(self, input_agent, brain_agent, trust_agent):
        """Store the collaborating agents; no UI is built until ``launch()``."""
        self.input = input_agent
        self.brain = brain_agent
        self.trust = trust_agent
        self.last_audio_path = None  # Keep track of audio for feedback
        print("üé® Agent 3 (UX) Online: Building Interface...")

    @staticmethod
    def _list_dialects(dataset_dir):
        """Return the sorted dialect names (CSV file stems) found in *dataset_dir*.

        Only the trailing extension is removed, so a file such as
        ``my.csv.backup.csv`` yields ``my.csv.backup``. A missing directory
        simply produces an empty list because ``glob`` finds nothing.
        """
        csv_paths = glob.glob(os.path.join(dataset_dir, "*.csv"))
        # glob order is filesystem dependent; sort for a stable dropdown.
        return sorted(os.path.splitext(os.path.basename(p))[0] for p in csv_paths)

    @classmethod
    def _resolve_dialect(cls, dropdown_val, new_val):
        """Work out which dialect the user means.

        Args:
            dropdown_val: Current dropdown selection (may be ``None``).
            new_val: Text typed into the "new dialect" box (may be ``None``).

        Returns:
            ``(dialect, None)`` on success or ``(None, error_message)`` when
            the user still has to pick or type a dialect.
        """
        if dropdown_val == cls.ADD_NEW_DIALECT:
            # The textbox value can be None when it was never edited.
            typed = (new_val or "").strip()
            if not typed:
                return None, "‚ùå Please type a name for the new dialect."
            return typed, None
        if not dropdown_val:
            return None, "‚ùå Please select or enter a dialect."
        return dropdown_val, None

    def automated_pipeline(self, audio_path, language="en"):
        """Orchestrates the flow: Input -> Brain -> UX

        Args:
            audio_path: Path of the audio file to analyse; falsy means
                "nothing recorded/uploaded yet".
            language: Language code forwarded to the transcriber.

        Returns:
            ``(DataFrame, status_message)`` where the frame has one row per
            transcribed segment with the columns in ``RESULT_COLUMNS``.
        """
        if not audio_path:
            return pd.DataFrame(), "Waiting for audio..."

        self.last_audio_path = audio_path  # Store for later upload

        results = []
        # Agent 1: Transcribe
        for seg in self.input.transcribe(audio_path, language):
            # Agent 2: Interpret
            dialect, clarification, matched_utterance = self.brain.detect_dialect(seg["text"])

            results.append({
                "Speaker": seg["speaker"],
                # Prefer the clean dataset utterance over the raw transcript.
                "Utterance": matched_utterance if matched_utterance else seg["text"],
                "Dialect": dialect if dialect else "Unknown",
                "Clarification": clarification if clarification else "---",
            })

        if not results:
            return pd.DataFrame(columns=list(self.RESULT_COLUMNS)), "No speech detected."

        return pd.DataFrame(results), "‚úÖ Analysis Complete"

    def launch(self):
        """Build the Gradio Blocks interface, wire its events and start the server.

        The dialect dropdown is populated once, from the CSV files present in
        ``DATASET_DIR`` at the time this method is called.
        """
        existing_dialects = self._list_dialects(DATASET_DIR)
        dropdown_choices = existing_dialects + [self.ADD_NEW_DIALECT]

        with gr.Blocks(theme=gr.themes.Soft()) as ui:
            gr.Markdown("## üåç IEDI-MAS: Active Listening & Dialect Mediator (4-Agent Architecture)")

            with gr.Row():
                with gr.Column(scale=1):
                    audio_input = gr.Audio(label="üé§ Speak or Upload", sources=["microphone", "upload"], type="filepath")

                    with gr.Accordion("Import from URL", open=False):
                        url_input = gr.Textbox(label="YouTube URL")
                        import_btn = gr.Button("Import & Analyze")

                    lang_select = gr.Dropdown(["en", "ko", "fr"], value="en", label="Language")

                    # Manual trigger, as a backup for the automatic events below.
                    analyze_btn = gr.Button("Run Analysis (Manual Trigger)")

                with gr.Column(scale=2):
                    status_box = gr.Textbox(label="Status", interactive=False)
                    results_df = gr.Dataframe(
                        headers=list(self.RESULT_COLUMNS),
                        interactive=False,
                        label="Analysis Results (Click row to Correct)",
                        type="pandas"
                    )

            gr.Markdown("### ‚úçÔ∏è Active Feedback Loop (Agent 4: Trust)")
            with gr.Row():
                orig_text_state = gr.Textbox(visible=True, label="Original Text (Edit to Correct)")

                with gr.Column():
                    # Dropdown for existing dialects
                    dialect_dropdown = gr.Dropdown(
                        choices=dropdown_choices,
                        label="Select Dialect",
                        interactive=True,
                        allow_custom_value=False  # Force selection or "+ Add New"
                    )
                    # Textbox for new dialect input (hidden by default)
                    new_dialect_input = gr.Textbox(
                        label="Enter New Dialect Name",
                        placeholder="e.g. Australian",
                        visible=False,
                        interactive=True
                    )

                edit_clarification = gr.Textbox(label="Suggest Clarification", interactive=True)

            with gr.Row():
                btn_accept = gr.Button("‚úÖ Accept (Correct)", variant="secondary")
                btn_reject = gr.Button("‚ùå Reject (Incorrect)", variant="stop")
                btn_suggest = gr.Button("üíæ Suggest Update (Auto-Learn Syntax)", variant="primary")

            feedback_out = gr.Markdown()

            # --- Show the free-text box only while "+ Add New Dialect" is selected ---
            def on_dialect_change(selected_val):
                return gr.update(visible=(selected_val == self.ADD_NEW_DIALECT))

            dialect_dropdown.change(fn=on_dialect_change, inputs=dialect_dropdown, outputs=new_dialect_input)

            # --- WIRING ---
            # 1. File Upload Trigger
            audio_input.upload(self.automated_pipeline, [audio_input, lang_select], [results_df, status_box])

            # 2. Microphone Stop Trigger (Explicitly for mic)
            audio_input.stop_recording(self.automated_pipeline, [audio_input, lang_select], [results_df, status_box])

            # 3. Manual Button Trigger (Backup)
            analyze_btn.click(self.automated_pipeline, [audio_input, lang_select], [results_df, status_box])

            # Import handler
            def process_import(url):
                path, title = self.input.get_audio_from_youtube(url)
                # Keep path for potential feedback upload
                self.last_audio_path = path
                return path, f"Imported: {title}"
            import_btn.click(process_import, [url_input], [audio_input, status_box])

            # Select row handler: copies the clicked row into the feedback form.
            def handle_selection(evt: gr.SelectData, df):
                nothing_selected = ("", "", "", gr.update(visible=False))

                # Check if dataframe is valid and not empty
                if df is None or len(df) == 0:
                    return nothing_selected

                # type='pandas' is set on the Dataframe, so df should be a
                # DataFrame; anything unexpected is logged and treated as
                # "nothing selected" instead of breaking the UI.
                try:
                    row = df.iloc[evt.index[0]]
                    utterance = row["Utterance"]
                    detected_dialect = row["Dialect"]
                    clarification = row["Clarification"]
                except Exception as e:
                    print(f"Error selecting row: {e}")
                    return nothing_selected

                # Only preselect dialects the dropdown actually offers
                # ("Unknown" or unlisted values clear the selection).
                dialect_choice = detected_dialect if detected_dialect in existing_dialects else None
                return utterance, dialect_choice, clarification, gr.update(visible=False)

            results_df.select(handle_selection, [results_df], [orig_text_state, dialect_dropdown, edit_clarification, new_dialect_input])

            # Feedback handlers
            def submit_logic(action, orig, dropdown_val, new_val, clar):
                if not orig:
                    return "‚ùå Select a row first."

                final_dialect, error = self._resolve_dialect(dropdown_val, new_val)
                if error:
                    return error

                # Agent 3 delegates feedback processing to Agent 4, passing audio path if suggestion
                audio_to_upload = self.last_audio_path if action == "Suggest Update" else None
                return self.trust.process_feedback(action, orig, final_dialect, clar, self.brain, audio_to_upload)

            def make_feedback_handler(action):
                # A factory (not a lambda in the loop) so each button keeps its own action.
                def handler(orig, dropdown_val, new_val, clar):
                    return submit_logic(action, orig, dropdown_val, new_val, clar)
                return handler

            # All three buttons share the same inputs/outputs and differ only in the action.
            for button, action in (
                (btn_accept, "Accept"),
                (btn_reject, "Reject"),
                (btn_suggest, "Suggest Update"),
            ):
                button.click(
                    make_feedback_handler(action),
                    [orig_text_state, dialect_dropdown, new_dialect_input, edit_clarification],
                    [feedback_out]
                )

        ui.launch(share=True, debug=True)

# --- SYSTEM STARTUP ---
# Build the agents first, then hand them to the UX agent, whose constructor
# takes the input, interpretation and trust agents in that order.
agent1 = AgentInput()  # Agent 1: provides transcribe() and get_audio_from_youtube() to the UI
agent2 = AgentInterpretation(gemini_model)  # Agent 2: gemini_model is None when GOOGLE_API_KEY is unset
agent4 = AgentTrust()  # Agent 4: receives user feedback via process_feedback()
agent3 = AgentUX(agent1, agent2, agent4)  # Agent 3 orchestrates the others

# Builds the Gradio interface and starts the server (launch() calls
# ui.launch(share=True, debug=True)).
agent3.launch()

  except Exception as e:


‚¨áÔ∏è Pulling datasets from Hugging Face...
   -> Downloaded: American English.csv
   -> Downloaded: Indian English.csv
   -> Downloaded: Indonesian English.csv


Korean%20English.csv:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

   -> Downloaded: Korean English.csv
   -> Downloaded: Malaysian English.csv


Nigerian%20English.csv:   0%|          | 0.00/2.43k [00:00<?, ?B/s]

   -> Downloaded: Nigerian English.csv
üëÇ Agent 1 (Input) Online: Loading Whisper (small) on cpu...
üß† Agent 2 (Interpretation) Online: Loading Datasets...
   -> Loaded 130 dialect phrases into Memory.
üõ°Ô∏è Agent 4 (Trust) Online: Connected to IPFS Module.
üé® Agent 3 (UX) Online: Building Interface...


  with gr.Blocks(theme=gr.themes.Soft()) as ui:


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://ae52e4f3826b46f2e5.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://ae52e4f3826b46f2e5.gradio.live
