<a href="https://colab.research.google.com/github/toecm/iuuy/blob/main/IUUY_beta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📦 Step 1: Install Dependencies

In [2]:
!pip install -q openai-whisper rapidfuzz pandas gradio datasets transformers torchaudio torch librosa pydub ffmpeg-python jiwer google-generativeai python-dotenv requests yt-dlp soundfile


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/803.2 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m798.7/803.2 kB[0m [31m14.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.0/176.0 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.9/73.9 kB

# 📚 Step 2: Imports & API Configs

In [3]:
import os
import torch
import whisper
import librosa
import soundfile as sf
from pydub import AudioSegment
import pandas as pd
from rapidfuzz import fuzz
import google.generativeai as genai
from datasets import load_dataset, Audio
import gradio as gr
from dotenv import load_dotenv
import yt_dlp
import tempfile

# Read a local .env file (when present) into the process environment, then
# pick up the two API credentials used by the notebook.
load_dotenv()
PINATA_JWT = os.environ.get("PINATA_JWT")
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# Authenticate the Google Generative AI client and create the shared Gemini model handle
genai.configure(api_key=GOOGLE_API_KEY)
gemini_model = genai.GenerativeModel("gemini-2.5-flash")


# 🤖 AGENT 1: Ingestion & Diarization

In [4]:
class AgentInput:
    """Audio ingestion agent: fetch audio, transcribe it with Whisper, trim clips.

    No speaker diarization is performed here; every transcribed segment is
    labelled with the placeholder speaker name "Speaker".
    """

    def __init__(self, whisper_model_size="small"):
        """Load the Whisper ASR model on the GPU when CUDA is available, else on the CPU."""
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.whisper_model = whisper.load_model(whisper_model_size, device=device)

    def get_audio_from_youtube(self, url):
        """Download the best audio stream for `url` with yt-dlp.

        Returns:
            (filename, title): path of the downloaded file and the video title
            ('No Title' when the metadata has none).
        """
        opts = {
            'format': 'bestaudio',
            # The video id is part of the file name so that importing a second
            # URL cannot be satisfied by a stale file left by an earlier import
            # (yt-dlp skips the download when the target file already exists).
            'outtmpl': 'external_audio_%(id)s.%(ext)s',
            # For watch URLs that also carry a playlist id, fetch only the video.
            'noplaylist': True,
        }
        with yt_dlp.YoutubeDL(opts) as ydl:
            info = ydl.extract_info(url, download=True)
            fn = ydl.prepare_filename(info)
        return fn, info.get('title', 'No Title')

    def sample_audio_dataset(self, dataset_name, n=3):
        """Return up to `n` shuffled (seed=42) rows of the dataset's train split.

        The "audio" column is cast to 16 kHz. `n` is clamped to the dataset
        size so that asking for more rows than exist does not raise.
        """
        ds = load_dataset(dataset_name, split="train").cast_column("audio", Audio(sampling_rate=16000))
        count = max(0, min(int(n), len(ds)))
        return ds.shuffle(seed=42).select(range(count))

    def transcribe(self, audio_path, language="en"):
        """
        Transcribe audio forcing specified language to reduce detection time.

        Returns a list of dicts with keys "speaker", "text", "start", "end",
        one per Whisper segment.
        """
        result = self.whisper_model.transcribe(audio_path, language=language)
        return [
            {"speaker": "Speaker", "text": seg["text"], "start": seg["start"], "end": seg["end"]}
            for seg in result["segments"]
        ]

    def trim_audio(self, path, s, e):
        """Export the [s, e) seconds window of `path` as a WAV file and return its path.

        Raises:
            ValueError: when no audio path or bounds are given, or when the end
                is not after the start (which would otherwise silently export
                an empty clip).
        """
        if not path:
            raise ValueError("No audio file to trim.")
        if s is None or e is None:
            raise ValueError("Both start and end seconds are required.")
        if e <= s:
            raise ValueError("End second must be greater than start second.")
        aud = AudioSegment.from_file(path)
        clip = aud[s*1000:e*1000]  # pydub slices are expressed in milliseconds
        out = f"{os.path.splitext(path)[0]}_trimmed_{s}_{e}.wav"
        clip.export(out, format="wav")
        return out

# Module-level ingestion agent. Constructing it loads the Whisper model (default size "small");
# the "Trim" button in the Gradio UI calls this instance's trim_audio.
agent_input = AgentInput()


100%|███████████████████████████████████████| 461M/461M [00:17<00:00, 27.7MiB/s]


# 🔍 🧠 AGENT 2: Interpretation (Dialect Detection + Clarification)

In [5]:
class AgentInterpretation:
    """Dialect interpretation agent backed by a small CSV of example utterances.

    Each row of the dataset has the columns "Utterance", "Dialect" and
    "Clarification". Detection is a fuzzy string match of the input text
    against the "Utterance" column.
    """

    def __init__(self, data_path="dialect_data.csv", fuzzy_threshold=80):
        """Remember the dataset path and match threshold, then load the dataset."""
        self.data_path = data_path
        self.fuzzy_threshold = fuzzy_threshold
        self._load_data()

    def _load_data(self):
        """Load the dialect dataset into `self.df`.

        Reads `self.data_path` when it exists; otherwise builds the built-in
        default dataset and writes it to `self.data_path`.
        """
        default_data = [
            {"Utterance": "I am seeing her tomorrow.", "Dialect": "Nigerian English", "Clarification": "I will meet her tomorrow."},
            {"Utterance": "I didn’t see nobody at the party.", "Dialect": "Nigerian English", "Clarification": "I didn’t see anyone at the party."},
            {"Utterance": "You are coming, abi?", "Dialect": "Nigerian English", "Clarification": "You are coming, right?"},
            {"Utterance": "I beg, let's go nau.", "Dialect": "Nigerian English", "Clarification": "Please, let’s just leave already."},
            {"Utterance": "I beg, don't be angry.", "Dialect": "Nigerian English", "Clarification": "Please, don’t be upset."},
            {"Utterance": "How are you, jare?", "Dialect": "Nigerian English", "Clarification": "How are you, dear?"},
            {"Utterance": "I go to office at 9 AM.", "Dialect": "Korean English (Konglish)", "Clarification": "I go to the office at 9 AM."},
            {"Utterance": "It’s raining? I bring umbrella?", "Dialect": "Korean English (Konglish)", "Clarification": "Is it raining? Should I bring an umbrella?"},
            {"Utterance": "She call me yesterday but I couldn’t answer.", "Dialect": "Korean English (Konglish)", "Clarification": "She called me yesterday, but I couldn’t answer."},
            {"Utterance": "I will prepone the meeting to Monday.", "Dialect": "Indian English", "Clarification": "I will move the meeting to Monday."},
            {"Utterance": "Can you give me some more time?", "Dialect": "Indian English", "Clarification": "Could you give me a bit more time?"},
            {"Utterance": "She is a very good girl only.", "Dialect": "Indian English", "Clarification": "She is a very good girl indeed."},
            {"Utterance": "No issues, yaar.", "Dialect": "Indian English", "Clarification": "It’s fine, friend."},
            {"Utterance": "No issues, we’ll manage.", "Dialect": "Indian English", "Clarification": "It’s okay, we will manage."},
            {"Utterance": "No issues, carry on.", "Dialect": "Indian English", "Clarification": "It’s fine, continue."},
            {"Utterance": "I’m gonna grab some coffee.", "Dialect": "American English", "Clarification": "I’m going to get some coffee."},
            {"Utterance": "Do you wanna catch a movie later?", "Dialect": "American English", "Clarification": "Do you want to see a movie later?"},
            {"Utterance": "He likes basketball, right?", "Dialect": "American English", "Clarification": "He likes basketball, doesn’t he?"},
            {"Utterance": "I’m going on holiday next week.", "Dialect": "UK English", "Clarification": "I’m going on vacation next week."},
            {"Utterance": "Could you pop round for tea?", "Dialect": "UK English", "Clarification": "Could you come over for tea?"},
            {"Utterance": "She’s got a new flat.", "Dialect": "UK English", "Clarification": "She has a new apartment."},
        ]
        if os.path.exists(self.data_path):
            self.df = pd.read_csv(self.data_path)
        else:
            self.df = pd.DataFrame(default_data)
            self.df.to_csv(self.data_path, index=False)

    def detect_dialect(self, text):
        """Return (dialect, clarification) of the closest known utterance.

        The comparison is case-insensitive and ignores surrounding whitespace
        (Whisper segments typically start with a space). When the best score
        is below `self.fuzzy_threshold`, or nothing can be compared, returns
        ("Unknown", "No dialect feature detected.").
        """
        no_match = ("Unknown", "No dialect feature detected.")
        if not isinstance(text, str):
            return no_match
        query = text.strip().lower()
        if not query:
            return no_match

        best, score = None, 0
        rows = zip(self.df["Utterance"], self.df["Dialect"], self.df["Clarification"])
        for utterance, dialect, clarification in rows:
            # Blank cells of a user-supplied CSV are read back as NaN (a float).
            if not isinstance(utterance, str):
                continue
            s = fuzz.ratio(query, utterance.strip().lower())
            if s > score:
                best, score = (dialect, clarification), s

        # `best` stays None when no row scored above 0, whatever the threshold.
        if best is not None and score >= self.fuzzy_threshold:
            return best
        return no_match

    def clarify(self, phrase):
        """Ask the module-level Gemini model to explain `phrase`; returns the response text."""
        return gemini_model.generate_content(f"Explain clearly: '{phrase}'").text

    def finetune_on_feedback(self, feedback):
        """Placeholder hook for an RLHF-style update; currently does nothing."""
        # placeholder for RLHF update
        pass

# Module-level interpretation agent; constructing it reads dialect_data.csv,
# or creates that file from the built-in default rows when it does not exist.
agent_interpretation = AgentInterpretation()


# 🧑‍💻 AGENT 3: Output - UX Display (via Gradio)

In [6]:
class AgentUX:
    """Empty placeholder for the UX/output agent; it defines no attributes or methods."""

# Module-level AgentUX instance (the class is currently an empty placeholder).
agent_ux = AgentUX()


# 🔐 AGENT 4: Trust (Bayesian Update + IPFS Logging)

In [7]:
class AgentTrustLearning:
    """Trust agent: per-dialect feedback counters on disk plus IPFS logging via Pinata.

    Counters live in a CSV with the columns "Dialect", "α_correct" and
    "β_incorrect". Feedback is also buffered in memory until
    `audit_performance` hands it to the interpretation agent.
    """

    BELIEF_COLUMNS = ["Dialect", "α_correct", "β_incorrect"]
    PINATA_PIN_JSON_URL = "https://api.pinata.cloud/pinning/pinJSONToIPFS"

    def __init__(self, pinata_jwt=None, belief_path="belief_scores.csv"):
        """Store the Pinata JWT and make sure the belief CSV exists.

        Args:
            pinata_jwt: bearer token for the Pinata API; None disables IPFS logging.
            belief_path: location of the belief-score CSV.
        """
        self.pinata_jwt = pinata_jwt
        self.belief_path = belief_path
        # Initialize belief file if missing
        if not os.path.exists(self.belief_path):
            pd.DataFrame(columns=self.BELIEF_COLUMNS).to_csv(self.belief_path, index=False)
        # In-memory feedback buffer for RLHF
        self.feedback_buffer = []

    def update_beliefs(self, dialect, positive: bool):
        """Increment the correct/incorrect counter of `dialect` in the belief CSV.

        A new row is appended when the dialect has not been seen before.
        """
        df = pd.read_csv(self.belief_path)
        known = df["Dialect"] == dialect
        if known.any():
            idx = df.index[known][0]
            df.at[idx, "α_correct"] += int(positive)
            df.at[idx, "β_incorrect"] += int(not positive)
        else:
            new_row = pd.DataFrame(
                [{"Dialect": dialect, "α_correct": int(positive), "β_incorrect": int(not positive)}]
            )
            # Avoid concatenating onto an empty frame (deprecated dtype inference in pandas).
            df = new_row if df.empty else pd.concat([df, new_row], ignore_index=True)
        df.to_csv(self.belief_path, index=False)

    def log_feedback(self, data: dict) -> str:
        """Pin `data` as JSON on IPFS through Pinata and return the resulting CID.

        Returns an explanatory string instead of raising: a "No CID ..." text
        when no JWT is configured or the response carries no hash, and an
        "❌ IPFS Error: ..." text when the request fails.
        """
        if not self.pinata_jwt:
            # Without a token the request could only be rejected; skip the network call.
            return "No CID (PINATA_JWT not configured)"
        headers = {"Authorization": f"Bearer {self.pinata_jwt}"}
        try:
            # Imported here so the class does not depend on a later notebook cell
            # having imported `requests` first.
            import requests

            res = requests.post(
                self.PINATA_PIN_JSON_URL, headers=headers, json=data,
                timeout=30,  # never let a stalled connection hang the UI callback
            )
            res.raise_for_status()
            return res.json().get("IpfsHash", "No CID")
        except Exception as e:
            return f"❌ IPFS Error: {e}"

    def get_belief_score(self, dialect: str) -> float:
        """Return correct / (correct + incorrect) for `dialect`, rounded to 2 decimals.

        Returns 0.5 when the dialect is unknown or has no recorded feedback.
        """
        df = pd.read_csv(self.belief_path)
        row = df[df["Dialect"] == dialect]
        if not row.empty:
            alpha = int(row["α_correct"].iloc[0])
            beta = int(row["β_incorrect"].iloc[0])
            total = alpha + beta
            return round(alpha / total, 2) if total > 0 else 0.5
        return 0.5

    def record_feedback(self, feedback: dict, positive: bool = True):
        """Buffer the feedback, update the belief counters and log it to IPFS.

        The dialect is taken from the "Dialect" key, falling back to
        "SuggestedDialect" and finally to "Unknown" (empty values are skipped).
        Returns the string produced by `log_feedback`.
        """
        # Store in buffer
        self.feedback_buffer.append((feedback, positive))
        # Update local beliefs
        dialect = feedback.get("Dialect") or feedback.get("SuggestedDialect") or "Unknown"
        self.update_beliefs(dialect, positive)
        # Log to IPFS
        return self.log_feedback(feedback)

    def audit_performance(self, interpretation_agent):
        """Pass every buffered feedback item to the agent's `finetune_on_feedback`, then clear the buffer.

        Always returns True.
        """
        for feedback, _positive in self.feedback_buffer:
            interpretation_agent.finetune_on_feedback(feedback)
        # Clear buffer after applying
        self.feedback_buffer.clear()
        return True

# 🔄 External Import Function (YouTube or HuggingFace)

In [8]:
import tempfile, requests

def import_external_audio(source_type, url, sample_count=3, progress=gr.Progress()):
    """
    Imports audio from YouTube or HuggingFace URLs and returns:
    1. audio_path (for playing)
    2. full transcript (for detailed view)
    3. state path (to feed into the pipeline)
    4. status message
    5. title

    Failures in either source are reported through the status message instead
    of being raised, so the UI callback always receives five values.
    """
    progress(0, desc="Importing...")
    url = (url or "").strip()
    if not url:
        return None, "", None, "❌ URL is required", "No Title"

    def _preview(text, limit=500):
        # Keep status messages short: first `limit` characters plus an ellipsis.
        return (text[:limit] + "...") if len(text) > limit else text

    audio_path = None
    transcript = ""
    status_msg = ""
    title = ""

    if source_type == "YouTube":
        try:
            audio_path, title = agent1.get_audio_from_youtube(url)
            if audio_path:
                progress(0.3, desc="Transcribing...")
                # Get full transcription segments
                segments = agent1.transcribe(audio_path)
                # Segment texts are stripped before joining to avoid doubled spaces.
                transcript = " ".join(seg["text"].strip() for seg in segments)
                status_msg = f"✅ Imported: {_preview(transcript)}"
            else:
                status_msg = "❌ YouTube download failed"
        except Exception as e:
            # Download/transcription errors are surfaced in the UI rather than
            # propagating out of the Gradio callback.
            audio_path, transcript = None, ""
            status_msg = f"❌ YouTube import error: {e}"
    elif source_type == "HuggingFace":
        try:
            samples = agent1.sample_audio_dataset(url, n=max(1, int(sample_count or 1)))
            first = samples[0]
            audio = first["audio"]
            # Datasets name their transcript column differently; try the usual ones.
            transcript = next(
                (
                    first[key]
                    for key in ("text", "sentence", "transcription", "transcript")
                    if key in first and isinstance(first[key], str)
                ),
                "",
            )
            # Reserve a temp file name, closing the handle before soundfile writes to it.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                temp_path = tmp.name
            sf.write(temp_path, audio["array"], audio["sampling_rate"])
            audio_path = temp_path
            status_msg = f"✅ HF Sample: {_preview(transcript)}"
            title = "HF Sample"
        except Exception as e:
            audio_path, transcript = None, ""
            status_msg = f"❌ HF Import error: {e}"
    else:
        status_msg = "❌ Unsupported source type."

    return audio_path, transcript, audio_path, status_msg, title

# View Full Transcript Helper
def show_full_transcript(full_transcript):
    """Return the transcript unchanged, or a notice when it is empty or None."""
    if full_transcript:
        return full_transcript
    return "❌ No transcript available yet."


# 🔄 MAS Pipeline Function

In [9]:
def mas_pipeline(uploaded_file, external_audio_path, language="en"):
    """
    Runs transcription and dialect interpretation on the provided audio.

    Args:
        uploaded_file: path of the uploaded/recorded audio, or None.
        external_audio_path: path of imported external audio, or None; takes
            precedence over `uploaded_file` when both are given.
        language: language code forwarded to the transcriber.

    Returns:
        A DataFrame with the columns Speaker, Utterance, Dialect and
        Clarification (one row per segment). It is empty, but still has those
        columns, when no audio was provided or nothing was transcribed.
    """
    columns = ["Speaker", "Utterance", "Dialect", "Clarification"]
    audio_path = external_audio_path or uploaded_file
    if not audio_path:
        # Nothing to analyse: hand the UI an empty table instead of crashing in the transcriber.
        return pd.DataFrame(columns=columns)

    # Transcribe with specified language
    segments = agent1.transcribe(audio_path, language=language)
    results = []
    for seg in segments:
        # Detect dialect and clarification
        dialect, clarification = agent2.detect_dialect(seg["text"])
        results.append({
            "Speaker": seg.get("speaker", "Speaker"),
            "Utterance": seg["text"],
            "Dialect": dialect,
            "Clarification": clarification
        })
    # Explicit columns keep the table schema stable even when there are no segments.
    return pd.DataFrame(results, columns=columns)

# ⚙️ Instantiate All Agents

In [10]:
# Reuse the instances created in the agent cells above instead of constructing
# new ones: a second AgentInput() would load the Whisper model a second time.
agent1 = agent_input
agent2 = agent_interpretation
agent3 = agent_ux
agent4 = AgentTrustLearning(pinata_jwt=PINATA_JWT)

Preview the first few rows of the dialect dataset:


In [11]:
# A bare last expression renders the frame as a rich table instead of plain text.
agent2.df.head()

                           Utterance           Dialect  \
0          I am seeing her tomorrow.  Nigerian English   
1  I didn’t see nobody at the party.  Nigerian English   
2               You are coming, abi?  Nigerian English   
3               I beg, let's go nau.  Nigerian English   
4             I beg, don't be angry.  Nigerian English   

                       Clarification  
0          I will meet her tomorrow.  
1  I didn’t see anyone at the party.  
2             You are coming, right?  
3  Please, let’s just leave already.  
4            Please, don’t be upset.  


# 🎛️ Gradio Interface with Import Button

In [12]:
def load_external_data(file):
    """
    Load a user-provided dialect CSV into the system.

    `file` may be a path string or an object exposing the path as `.name`.
    The CSV must contain the columns Utterance, Dialect and Clarification; it
    is validated BEFORE the current dataset file is overwritten, so a bad
    upload cannot replace a working dataset. Returns a status message.
    """
    if not file:
        return "❌ No file provided."
    required = ["Utterance", "Dialect", "Clarification"]
    try:
        csv_path = file if isinstance(file, (str, os.PathLike)) else file.name
        df = pd.read_csv(csv_path)
        missing = [col for col in required if col not in df.columns]
        if missing:
            return f"❌ Failed to load CSV: missing column(s) {', '.join(missing)}"
        df.to_csv(agent2.data_path, index=False)
        agent2._load_data()
        return "✅ Dialect dataset loaded successfully."
    except Exception as e:
        return f"❌ Failed to load CSV: {e}"

# run_feedback_loop and download_belief_scores are defined in a later cell.
# These thin wrappers resolve those names when a button is clicked rather than
# when this cell builds the UI, so the cell also runs on a fresh kernel.
def _submit_feedback(utterance, clarification_suggestion, dialect_suggestion):
    """Forward the feedback form values to run_feedback_loop."""
    return run_feedback_loop(utterance, clarification_suggestion, dialect_suggestion)


def _export_belief_scores():
    """Forward the download request to download_belief_scores."""
    return download_belief_scores()


def clear_all():
    """Return one reset value per component wired to the Clear All button (same order)."""
    empty_table = pd.DataFrame(columns=["Speaker", "Utterance", "Dialect", "Clarification"])
    return (
        None,         # audio_input
        None,         # ext_audio
        None,         # ext_state
        "",           # ext_trans
        "",           # ext_status
        "",           # ext_title
        empty_table,  # df_out
        "",           # fb_utterance
        "",           # correction
        "",           # dialect_s
        "",           # fb_status
    )


with gr.Blocks() as ui:
    gr.Markdown("## I Understand Understand You - MAS + Blockchain (Beta)")

    # External Import
    with gr.Tabs():
        with gr.TabItem("Dialect CSV"):
            csv_file = gr.File(label="Upload Dialect CSV")
            csv_msg = gr.Markdown()
            csv_file.upload(fn=load_external_data, inputs=[csv_file], outputs=[csv_msg])
        with gr.TabItem("External Audio"):
            src_type = gr.Radio(["YouTube", "HuggingFace"], label="Source Type")
            src_url = gr.Textbox(label="URL")
            sample_count = gr.Number(label="HF Samples", value=1)
            import_btn = gr.Button("Import Audio")
            ext_audio = gr.Audio(type="filepath", label="Imported Audio", interactive=False)
            ext_trans = gr.Textbox(label="Transcript Preview", lines=4)
            ext_status = gr.Markdown()
            ext_title = gr.Textbox(label="Title", interactive=False)
            # Holds the imported file path that is fed into the analysis pipeline.
            ext_state = gr.State()
            # import_external_audio returns (audio, transcript, state path, status, title):
            # each value gets its own component so the status no longer overwrites the transcript.
            import_btn.click(
                fn=import_external_audio,
                inputs=[src_type, src_url, sample_count],
                outputs=[ext_audio, ext_trans, ext_state, ext_status, ext_title]
            )

    # Upload or Record
    audio_input = gr.Audio(type="filepath", label="Upload/Record Audio")

    # Trim Audio
    with gr.Accordion("Trim Audio", open=False):
        start_sec = gr.Number(label="Start Sec", value=0)
        end_sec = gr.Number(label="End Sec", value=10)
        trim_btn = gr.Button("Trim")
        trim_btn.click(
            fn=agent_input.trim_audio,
            inputs=[audio_input, start_sec, end_sec],
            outputs=[audio_input]
        )

    # Analysis
    language_dropdown = gr.Dropdown(choices=["en", "fr", "es", "de", "ko"], value="en", label="Transcription Language")
    analyze_btn = gr.Button("Analyze")
    df_out = gr.Dataframe(headers=["Speaker","Utterance","Dialect","Clarification"], interactive=False)
    analyze_btn.click(
        fn=mas_pipeline,
        inputs=[audio_input, ext_state, language_dropdown],
        outputs=[df_out]
    )

    # Feedback
    fb_utterance = gr.Textbox(label="Utterance")
    correction = gr.Textbox(label="Suggest Clarification")
    dialect_s = gr.Textbox(label="Suggest Dialect")
    fb_btn = gr.Button("Submit Feedback")
    fb_status = gr.Markdown()
    # The utterance has its own field; previously the clarification was sent twice.
    fb_btn.click(
        fn=_submit_feedback,
        inputs=[fb_utterance, correction, dialect_s],
        outputs=[fb_status]
    )

    # Download
    dl_btn = gr.Button("Download Feedback CSV")
    dl_file = gr.File()
    dl_btn.click(
        fn=_export_belief_scores,
        outputs=[dl_file]
    )

    # Clear All Button — the outputs list must match clear_all's return values one-to-one.
    clear_btn = gr.Button("🧹 Clear All")
    clear_btn.click(fn=clear_all,
                    inputs=None,
                    outputs=[audio_input, ext_audio, ext_state, ext_trans, ext_status, ext_title,
                             df_out, fb_utterance, correction, dialect_s, fb_status])


ui.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4955be1cbf4702987c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
def run_feedback_loop(utterance, clarification_suggestion, dialect_suggestion):
    """
    Processes user feedback and updates the trust and interpretation agents.

    The suggested dialect is used as the dialect whose belief counters are
    updated ("Unknown" when the suggestion is blank). Returns a status message
    containing the value returned by the trust agent's record_feedback.
    """
    # In a real app, feedback would be tied to a specific utterance/segment;
    # for this demo the suggested dialect stands in for the identified one.
    dialect_identified = (dialect_suggestion or "").strip() or "Unknown"
    feedback_data = {
        "Utterance": utterance,
        "SuggestedClarification": clarification_suggestion,
        "SuggestedDialect": dialect_suggestion,
        # record_feedback reads the "Dialect" key; without it every belief
        # update would be attributed to "Unknown".
        "Dialect": dialect_identified,
    }
    is_positive = True # Assume positive feedback for now, could be refined

    cid = agent4.record_feedback(feedback_data, positive=is_positive)
    agent4.audit_performance(agent2) # Trigger RLHF update with the feedback

    return f"✅ Feedback recorded. IPFS CID: {cid}"

def download_belief_scores():
    """
    Creates a temporary CSV copy of the belief scores for download.

    The scores are read from the trust agent's belief CSV (agent4.belief_path).
    Returns the path of the temporary file, or None when the export fails.
    """
    try:
        beliefs = pd.read_csv(agent4.belief_path)
        # Reserve a temp file name and close the handle before pandas writes to it.
        with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as handle:
            export_path = handle.name
        beliefs.to_csv(export_path, index=False)
        return export_path
    except Exception as e:
        print(f"Error creating download file: {e}")
        return None

In [None]:
# Deploy the app to Hugging Face Spaces with the Gradio CLI.
# NOTE: the command prompts interactively for a Hugging Face token with 'write' access.
!gradio deploy

Need [32m'write'[0m access token to create a Spaces repo.

    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

Enter your token (input will not be visible): 