# Carga de audios, transcripción, limpieza y almacenamiento

In [None]:
import os
import mysql.connector
import whisper
import re

# Paths
# Directory scanned for input audio and directory that receives the .txt
# transcripts; both are relative to the current working directory.
audio_folder = "data/audio/"
transcription_folder = "data/transcriptions/"

# Connect to MySQL
# Connection settings may be supplied through environment variables so that
# real credentials never have to be written into this file. When a variable
# is unset, the original placeholder value is used unchanged.
conn = mysql.connector.connect(
    host=os.environ.get("MYSQL_HOST", "your-mysql-server"),
    user=os.environ.get("MYSQL_USER", "your-username"),
    password=os.environ.get("MYSQL_PASSWORD", "your-password"),
    database=os.environ.get("MYSQL_DATABASE", "your-database"),
)
cursor = conn.cursor()

# Load Whisper model
# Loaded once here and reused for every file transcribed later in the script.
model = whisper.load_model("medium")

# Function to clean transcription text
def clean_transcription(text, filler_words=None):
    """
    Clean a transcription by removing filler words and tidying the result.

    Steps, in order:
      1. Remove filler words/phrases (case-insensitive, whole words only).
      2. Collapse every run of whitespace into a single space.
      3. Repair punctuation orphaned by the removed fillers (space before
         punctuation, doubled commas, a comma directly before ``.``/``?``/``!``,
         and leading commas/semicolons/colons).
      4. Collapse repeated ``.``, ``?`` or ``!`` into one (e.g. "Hello...." -> "Hello.").
      5. Strip leading/trailing whitespace.

    Args:
        text: The raw transcription string. An empty or ``None`` value
            yields an empty string.
        filler_words: Optional iterable of filler words/phrases to remove.
            When omitted, a built-in English list is used.

    Returns:
        The cleaned transcription string.
    """
    if not text:
        return ""

    if filler_words is None:
        # Define common filler words (can be extended)
        filler_words = (
            "uh", "um", "you know", "like", "so", "actually", "basically",
            "kind of", "sort of",
        )

    # One regex alternative per filler. Each word is escaped so it is matched
    # literally, and the words of a multi-word filler may be separated by any
    # amount of whitespace (whitespace is only normalised afterwards).
    # Longest phrases first so a longer filler wins over a shorter prefix.
    alternatives = [
        r'\s+'.join(re.escape(word) for word in phrase.split())
        for phrase in sorted(filler_words, key=len, reverse=True)
        if phrase.strip()
    ]

    # Remove filler words (skipped when the list is empty, because an empty
    # alternation would match at every word boundary)
    if alternatives:
        filler_pattern = r'\b(?:' + '|'.join(alternatives) + r')\b'
        text = re.sub(filler_pattern, '', text, flags=re.IGNORECASE)

    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)

    # Removing a filler can leave "word , word" or "word ." behind:
    # pull punctuation back against the preceding word.
    text = re.sub(r'\s+([,.;:?!])', r'\1', text)

    # "I, um, think" -> "I,, think" after the step above; keep a single comma.
    text = re.sub(r',{2,}', ',', text)

    # "It works, you know." -> "It works,." ; drop the dangling comma.
    text = re.sub(r',+(?=[.?!])', '', text)

    # Remove unnecessary punctuation repetition (e.g., "Hello...." → "Hello.")
    text = re.sub(r'([.?!])\1+', r'\1', text)

    # "Um, hello" -> ", hello" ; drop separators left at the very start.
    text = re.sub(r'^[\s,;:]+', '', text)

    # Strip leading/trailing spaces
    return text.strip()

# Transcribe every supported audio file in audio_folder, write the cleaned
# text next to it in transcription_folder, and insert one row per file into
# the `transcriptions` table.
supported_extensions = (".wav", ".mp3")

# Parameterized statement: values are passed separately to cursor.execute.
sql = """
        INSERT INTO transcriptions (call_id, transcription, status)
        VALUES (%s, %s, %s)
        """

# Make sure the output directory exists before the first transcript is written.
os.makedirs(transcription_folder, exist_ok=True)

try:
    # Sorted so files are processed in a deterministic order
    # (os.listdir returns entries in arbitrary order).
    for filename in sorted(os.listdir(audio_folder)):
        # Split off only the real extension; str.replace would also rewrite
        # ".wav"/".mp3" appearing in the middle of a file name.
        call_id, extension = os.path.splitext(filename)
        if extension.lower() not in supported_extensions:
            continue

        audio_path = os.path.join(audio_folder, filename)
        transcription_path = os.path.join(transcription_folder, call_id + ".txt")

        # Transcribe audio
        result = model.transcribe(audio_path)
        raw_text = result["text"]

        # Clean transcription text
        cleaned_text = clean_transcription(raw_text)

        # Save cleaned transcription to a .txt file
        with open(transcription_path, "w", encoding="utf-8") as f:
            f.write(cleaned_text)

        # Insert cleaned transcription into MySQL; the file name without its
        # extension is used as the call_id, and new rows start as "Pending".
        values = (call_id, cleaned_text, "Pending")

        cursor.execute(sql, values)
        # Commit per file so already-processed files stay stored if a later one fails.
        conn.commit()

        print(f"✅ Saved cleaned transcription: {transcription_path} & stored in database.")
finally:
    # Always release the database resources, even when a file fails.
    cursor.close()
    conn.close()
