<a href="https://colab.research.google.com/github/nattaran/HealthTequity-LLM/blob/main/Generate_Spanish_Audio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q deep-translator|

/bin/bash: -c: line 2: syntax error: unexpected end of file


# ============================================================
# üéß Generate Spanish Audio for HealthTequity Case Study
# ============================================================
# Author: Nasrin Attaran
# Description:
#   This script translates a list of English blood pressure‚Äìrelated questions
#   into Spanish, generates Spanish audio files using Google Text-to-Speech (gTTS),
#   and saves both `generated_questions.csv` and `ground_truth.csv` for later
#   ASR evaluation and model testing.



#Text -> Spanish Audio Generator (Clean Folder Each Run)


In [1]:
!pip install gTTS deep-translator pandas



In [9]:
from gtts import gTTS
from deep_translator import GoogleTranslator
import os
import pandas as pd


In [14]:
from google.colab import drive
drive.mount('/content/drive')

PROJECT_ROOT = "/content/drive/MyDrive/HealthTequity-LLM"
AUDIO_OUTPUT_FOLDER = os.path.join(PROJECT_ROOT, "data/Spanish_audio")
GROUNDTRUTH_FOLDER  = os.path.join(PROJECT_ROOT, "data/synthetic_csv")

os.makedirs(AUDIO_OUTPUT_FOLDER, exist_ok=True)
os.makedirs(GROUNDTRUTH_FOLDER, exist_ok=True)

print(f"üéß Audio files path: {AUDIO_OUTPUT_FOLDER}")
print(f"üìÑ Ground truth path: {GROUNDTRUTH_FOLDER}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
üéß Audio files path: /content/drive/MyDrive/HealthTequity-LLM/data/Spanish_audio
üìÑ Ground truth path: /content/drive/MyDrive/HealthTequity-LLM/data/synthetic_csv


In [13]:
ls

[0m[01;34mdata[0m/


In [19]:
# ============================================================
# üéß Generate Spanish Audio for HealthTequity Case Study
# ============================================================
# Author: Nasrin Attaran
# Description:
#   Translates English questions into Spanish, generates Spanish
#   audio (.wav), and saves a single ground_truth.csv file in
#   /data/synthetic_csv for ASR evaluation.
# ============================================================

!pip install gTTS deep-translator pandas

from gtts import gTTS
from deep_translator import GoogleTranslator
import os, pandas as pd

# ------------------------------------------------------------
# ‚öôÔ∏è Function Definition
# ------------------------------------------------------------
def generate_spanish_audio_from_english(
    english_questions: list[str],
    audio_output_folder: str,
    groundtruth_folder: str,
    prefix: str = "q"
) -> pd.DataFrame:
    """
    Translates English questions into Spanish, generates Spanish audio,
    and saves only 'ground_truth.csv' (no generated_questions.csv).

    Args:
        english_questions: List of English question strings.
        audio_output_folder: Folder to save generated .wav files.
        groundtruth_folder: Folder to save ground_truth.csv.
        prefix: Prefix for generated audio files (default 'q').

    Returns:
        DataFrame with Spanish transcriptions and filenames.
    """

    # ============================================================
    # üßπ Step 1 ‚Äì Prepare output folders
    # ============================================================
    os.makedirs(audio_output_folder, exist_ok=True)
    os.makedirs(groundtruth_folder, exist_ok=True)

    # Clean only .wav files in audio folder
    for f in os.listdir(audio_output_folder):
        if f.endswith(".wav"):
            os.remove(os.path.join(audio_output_folder, f))
    print(f"üìÅ Audio folder ready: {audio_output_folder}")
    print(f"üìÅ Ground truth folder ready: {groundtruth_folder}")

    # ============================================================
    # üåç Step 2 ‚Äì Translate English ‚Üí Spanish & generate audio
    # ============================================================
    translator = GoogleTranslator(source="en", target="es")
    results = []

    for i, q_en in enumerate(english_questions, 1):
        try:
            q_es = translator.translate(q_en)
            audio_file = f"{prefix}{i}_es.wav"
            audio_path = os.path.join(audio_output_folder, audio_file)

            tts = gTTS(text=q_es, lang="es")
            tts.save(audio_path)

            results.append({"audio_file": audio_file, "ground_truth": q_es})
            print(f"üéß {audio_file} ‚Üí {q_es}")

        except Exception as e:
            print(f"‚ö†Ô∏è Error generating audio for question {i}: {e}")

    # ============================================================
    # üßæ Step 3 ‚Äì Save Ground Truth Only
    # ============================================================
    gt_csv = os.path.join(groundtruth_folder, "ground_truth.csv")
    pd.DataFrame(results).to_csv(gt_csv, index=False, encoding="utf-8-sig")

    # ============================================================
    # ‚úÖ Step 4 ‚Äì Summary
    # ============================================================
    print(f"\n‚úÖ {len(results)} Spanish audio files generated.")
    print(f"üìÑ Ground truth CSV saved to: {gt_csv}")

    return pd.DataFrame(results)





In [20]:
english_questions = [
    "What are my systolic and diastolic blood pressures today?",
    "What were my blood pressure readings over the last week?",
    "What is the overall trend of my blood pressure this month?",
    "What are the normal blood pressure ranges for someone my age?",
    "What was my blood pressure on October 10th?",
    "What was my systolic blood pressure on October 12th?"
    "What was my diastolic blood pressure on September 30th? "
    "On which day did my systolic pressure exceed 140 mm Hg?",
    "Compare my average blood pressure from the first week to the last week of this month.",
    "What was the lowest diastolic blood pressure recorded this month?",
    "Has my blood pressure improved compared to last month?",
    "Show me the highest and lowest systolic values recorded so far."
    "What is the max and min of diastolic and systolic blood pressure this month?"
]


In [21]:
audio_summary_df = generate_spanish_audio_from_english(
    english_questions,
    audio_output_folder=AUDIO_OUTPUT_FOLDER,
    groundtruth_folder=GROUNDTRUTH_FOLDER
)

audio_summary_df



üìÅ Audio folder ready: /content/drive/MyDrive/HealthTequity-LLM/data/Spanish_audio
üìÅ Ground truth folder ready: /content/drive/MyDrive/HealthTequity-LLM/data/synthetic_csv
üéß q1_es.wav ‚Üí ¬øCu√°les son mis presiones arteriales sist√≥lica y diast√≥lica hoy?
üéß q2_es.wav ‚Üí ¬øCu√°les fueron mis lecturas de presi√≥n arterial durante la √∫ltima semana?
üéß q3_es.wav ‚Üí ¬øCu√°l es la tendencia general de mi presi√≥n arterial este mes?
üéß q4_es.wav ‚Üí ¬øCu√°les son los rangos normales de presi√≥n arterial para una persona de mi edad?
üéß q5_es.wav ‚Üí ¬øCu√°l era mi presi√≥n arterial el 10 de octubre?
üéß q6_es.wav ‚Üí ¬øCu√°l fue mi presi√≥n arterial sist√≥lica el 12 de octubre? ¬øCu√°l fue mi presi√≥n arterial diast√≥lica el 30 de septiembre? ¬øEn qu√© d√≠a mi presi√≥n sist√≥lica super√≥ los 140 mm Hg?
üéß q7_es.wav ‚Üí Compare mi presi√≥n arterial promedio desde la primera semana hasta la √∫ltima semana de este mes.
üéß q8_es.wav ‚Üí ¬øCu√°l fue la presi√≥n arterial di

Unnamed: 0,audio_file,ground_truth
0,q1_es.wav,¬øCu√°les son mis presiones arteriales sist√≥lica...
1,q2_es.wav,¬øCu√°les fueron mis lecturas de presi√≥n arteria...
2,q3_es.wav,¬øCu√°l es la tendencia general de mi presi√≥n ar...
3,q4_es.wav,¬øCu√°les son los rangos normales de presi√≥n art...
4,q5_es.wav,¬øCu√°l era mi presi√≥n arterial el 10 de octubre?
5,q6_es.wav,¬øCu√°l fue mi presi√≥n arterial sist√≥lica el 12 ...
6,q7_es.wav,Compare mi presi√≥n arterial promedio desde la ...
7,q8_es.wav,¬øCu√°l fue la presi√≥n arterial diast√≥lica m√°s b...
8,q9_es.wav,¬øHa mejorado mi presi√≥n arterial en comparaci√≥...
9,q10_es.wav,Mu√©strame los valores sist√≥licos m√°s altos y m...
