In [1]:
!pip install azure-cognitiveservices-speech

Collecting azure-cognitiveservices-speech
  Downloading azure_cognitiveservices_speech-1.48.2-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting azure-core>=1.33.0 (from azure-cognitiveservices-speech)
  Downloading azure_core-1.38.2-py3-none-any.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.1/48.1 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Downloading azure_cognitiveservices_speech-1.48.2-py3-none-manylinux1_x86_64.whl (35.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.4/35.4 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading azure_core-1.38.2-py3-none-any.whl (217 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m218.0/218.0 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: azure-core, azure-cognitiveservices-speech
Successfully installed azure-cognitiveservices-speech-1.48.2 azure-core-1.38.2


In [2]:
import getpass
import os
import time

import azure.cognitiveservices.speech as speechsdk
import pandas as pd

In [23]:
# Secret keys: never write a real key into the notebook source or its saved outputs.
# Use the key already exported in the environment; otherwise prompt for it with
# hidden input so it is not echoed into the cell output.
if not os.environ.get("AZURE_SPEECH_KEY"):
    os.environ["AZURE_SPEECH_KEY"] = getpass.getpass("Azure Speech key: ")

# Keep a region that was exported beforehand; fall back to the notebook's default.
os.environ.setdefault("AZURE_SPEECH_REGION", "centralindia")

In [21]:
# Locale code -> voice name assigned to the speech config before each synthesis.
VOICE_MAP = dict(
    [
        ("hi-IN", "hi-IN-AaravNeural"),
        ("pa-IN", "pa-IN-OjasNeural"),
        ("bn-IN", "bn-IN-TanishaaNeural"),
        ("gu-IN", "gu-IN-DhwaniNeural"),
        ("ta-IN", "ta-IN-PallaviNeural"),
    ]
)

In [22]:
# Test cases for the synthesis loop. Each entry is a 5-tuple, unpacked downstream as
#   (locale, lang, script, test_id, text)
# - locale:  key into VOICE_MAP, i.e. which voice speaks the text
# - lang, script, test_id: labels that end up in the output file name and the results table
# - text:    the sentence that is synthesized
# The "numbers" / "roman_numbers" cases embed the same date (27/02/2026) and the same
# 10-digit phone number, once in native script and once in Roman transliteration.
TESTS = [
    # Hindi
    ("hi-IN", "hi", "native", "general", "आज मौसम साफ है और हम खेत में काम कर रहे हैं।"),
    ("hi-IN", "hi", "native", "agri", "गेहूं की फसल में पीले धब्बे दिख रहे हैं, क्या दवा डालनी चाहिए?"),
    ("hi-IN", "hi", "native", "numbers", "आज की तारीख 27/02/2026 है और मेरा मोबाइल नंबर 9876543210 है।"),
    ("hi-IN", "hi", "roman",  "roman_numbers", "Aaj ki tareekh 27/02/2026 hai aur mera mobile number 9876543210 hai."),

    # Punjabi
    ("pa-IN", "pa", "native", "general", "ਅੱਜ ਮੌਸਮ ਚੰਗਾ ਹੈ ਅਤੇ ਅਸੀਂ ਖੇਤ ਵਿੱਚ ਕੰਮ ਕਰ ਰਹੇ ਹਾਂ।"),
    ("pa-IN", "pa", "native", "numbers", "ਅੱਜ ਦੀ ਤਾਰੀਖ 27/02/2026 ਹੈ ਤੇ ਮੇਰਾ ਮੋਬਾਈਲ ਨੰਬਰ 9876543210 ਹੈ।"),
    ("pa-IN", "pa", "roman",  "roman_numbers", "Ajj di tareekh 27/02/2026 ae te mera mobile number 9876543210 ae."),

    # Bengali
    ("bn-IN", "bn", "native", "general", "আজ আবহাওয়া ভালো এবং আমরা মাঠে কাজ করছি।"),
    ("bn-IN", "bn", "native", "numbers", "আজকের তারিখ 27/02/2026 এবং আমার মোবাইল নম্বর 9876543210।"),
    ("bn-IN", "bn", "roman",  "roman_numbers", "Aajker tarikh 27/02/2026 ebong amar mobile number 9876543210."),

    # Gujarati
    ("gu-IN", "gu", "native", "general", "આજે હવામાન સારું છે અને અમે ખેતરમાં કામ કરી રહ્યા છીએ।"),
    ("gu-IN", "gu", "native", "numbers", "આજની તારીખ 27/02/2026 છે અને મારો મોબાઇલ નંબર 9876543210 છે।"),
    ("gu-IN", "gu", "roman",  "roman_numbers", "Aajni tarik 27/02/2026 chhe ane maro mobile number 9876543210 chhe."),

    # Tamil
    ("ta-IN", "ta", "native", "general", "இன்று வானிலை நல்லதாக உள்ளது, நாங்கள் வயலில் வேலை செய்கிறோம்."),
    ("ta-IN", "ta", "native", "numbers", "இன்றைய தேதி 27/02/2026 மற்றும் என் கைபேசி எண் 9876543210."),
    ("ta-IN", "ta", "roman",  "roman_numbers", "Inraiya thethi 27/02/2026; en mobile number 9876543210."),

    # Mixed / Hinglish (force Hindi voice for fairness)
    # The "mix" lang label has no locale of its own, so the case is routed through "hi-IN".
    ("hi-IN", "mix", "mixed", "hinglish", "Kal mandi rate check karna hai, please update kar dena."),
]

In [24]:
# Shared Azure Speech configuration, built from the credentials held in the environment.
speech_config = speechsdk.SpeechConfig(subscription=os.environ["AZURE_SPEECH_KEY"],
                                       region=os.environ["AZURE_SPEECH_REGION"])

# Every synthesis made with this config uses the Riff16Khz16BitMonoPcm output format.
output_format = speechsdk.SpeechSynthesisOutputFormat.Riff16Khz16BitMonoPcm
speech_config.set_speech_synthesis_output_format(output_format)

def synthesize_azure(locale, text, out_path):
    """Synthesize `text` with the voice mapped to `locale` and write the audio to `out_path`.

    Args:
        locale: Key into VOICE_MAP selecting the voice.
        text: Plain text to speak.
        out_path: File name handed to the SDK's audio output config.

    Returns:
        A tuple ``(ok, latency, reason)``:
        - ok: True when the result reason is SynthesizingAudioCompleted.
        - latency: Seconds spent in ``speak_text_async(text).get()`` only;
          building the synthesizer is not included.
        - reason: ``str(result.reason)``; for a canceled request the cancellation
          reason (and the error details, when there are any) is appended so the
          cause of a failure is not lost.

    Raises:
        KeyError: If `locale` has no entry in VOICE_MAP.
    """
    try:
        voice_name = VOICE_MAP[locale]
    except KeyError:
        raise KeyError(
            f"No voice configured for locale {locale!r}; known locales: {sorted(VOICE_MAP)}"
        ) from None

    # Force correct voice. Note this mutates the shared module-level speech_config,
    # so the voice stays set to the last locale used.
    speech_config.speech_synthesis_voice_name = voice_name

    audio_config = speechsdk.audio.AudioOutputConfig(filename=out_path)
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)

    # perf_counter is monotonic, so the measurement cannot be skewed by clock adjustments.
    t0 = time.perf_counter()
    result = synthesizer.speak_text_async(text).get()
    latency = time.perf_counter() - t0

    ok = (result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted)

    reason = str(result.reason)
    if result.reason == speechsdk.ResultReason.Canceled:
        # A bare "Canceled" says nothing about the cause (bad key, wrong region, quota, ...).
        details = result.cancellation_details
        reason = f"{reason}: {details.reason}"
        if details.reason == speechsdk.CancellationReason.Error and details.error_details:
            reason = f"{reason} - {details.error_details}"

    return ok, latency, reason

In [25]:
MODEL_NAME = "azure_speech_tts"
os.makedirs("outputs", exist_ok=True)

# Run every test case through the synthesizer: one WAV file and one result tuple per case.
rows = []
for case in TESTS:
    locale, lang, script, tid, text = case
    fn = f"outputs/{MODEL_NAME}_{lang}_{script}_{tid}.wav"
    ok, latency, reason = synthesize_azure(locale, text, fn)

    # Round once so the stored row and the progress line report the same figure.
    latency_s = round(latency, 3)
    status = "OK" if ok else "FAIL"

    rows.append((locale, lang, script, tid, ok, latency_s, reason, fn))
    print(locale, lang, script, tid, status, "latency:", latency_s)

hi-IN hi native general OK latency: 1.784
hi-IN hi native agri OK latency: 1.802
hi-IN hi native numbers OK latency: 1.814
hi-IN hi roman roman_numbers OK latency: 1.937
pa-IN pa native general OK latency: 1.956
pa-IN pa native numbers OK latency: 2.451
pa-IN pa roman roman_numbers OK latency: 2.137
bn-IN bn native general OK latency: 1.708
bn-IN bn native numbers OK latency: 2.087
bn-IN bn roman roman_numbers OK latency: 2.046
gu-IN gu native general OK latency: 1.917
gu-IN gu native numbers OK latency: 2.527
gu-IN gu roman roman_numbers OK latency: 2.646
ta-IN ta native general OK latency: 1.69
ta-IN ta native numbers OK latency: 2.029
ta-IN ta roman roman_numbers OK latency: 1.941
hi-IN mix mixed hinglish OK latency: 1.643


In [26]:
# Results table: one row per test case. pandas (`pd`) is imported in the
# notebook's top import cell rather than here, so all dependencies sit in one place.
df = pd.DataFrame(
    rows,
    columns=["locale", "lang", "script", "test_id", "ok", "latency_s", "reason", "file"],
)
df

Unnamed: 0,locale,lang,script,test_id,ok,latency_s,reason,file
0,hi-IN,hi,native,general,True,1.784,ResultReason.SynthesizingAudioCompleted,outputs/azure_speech_tts_hi_native_general.wav
1,hi-IN,hi,native,agri,True,1.802,ResultReason.SynthesizingAudioCompleted,outputs/azure_speech_tts_hi_native_agri.wav
2,hi-IN,hi,native,numbers,True,1.814,ResultReason.SynthesizingAudioCompleted,outputs/azure_speech_tts_hi_native_numbers.wav
3,hi-IN,hi,roman,roman_numbers,True,1.937,ResultReason.SynthesizingAudioCompleted,outputs/azure_speech_tts_hi_roman_roman_number...
4,pa-IN,pa,native,general,True,1.956,ResultReason.SynthesizingAudioCompleted,outputs/azure_speech_tts_pa_native_general.wav
5,pa-IN,pa,native,numbers,True,2.451,ResultReason.SynthesizingAudioCompleted,outputs/azure_speech_tts_pa_native_numbers.wav
6,pa-IN,pa,roman,roman_numbers,True,2.137,ResultReason.SynthesizingAudioCompleted,outputs/azure_speech_tts_pa_roman_roman_number...
7,bn-IN,bn,native,general,True,1.708,ResultReason.SynthesizingAudioCompleted,outputs/azure_speech_tts_bn_native_general.wav
8,bn-IN,bn,native,numbers,True,2.087,ResultReason.SynthesizingAudioCompleted,outputs/azure_speech_tts_bn_native_numbers.wav
9,bn-IN,bn,roman,roman_numbers,True,2.046,ResultReason.SynthesizingAudioCompleted,outputs/azure_speech_tts_bn_roman_roman_number...
