<a href="https://colab.research.google.com/github/vyomakesh0728/telugu_tts/blob/main/telugu_emotion_speech_transcription.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# 1. Install dependencies
!uv pip install -q kagglehub openai-whisper polars tqdm

In [3]:
# 2. Download dataset
import kagglehub, pathlib, glob, polars as pl, tqdm, os, shutil

ds_path = pathlib.Path(
    kagglehub.dataset_download("jettysowmith/telugu-emotion-speech")
)
wav_files = sorted(glob.glob(str(ds_path / "**/*.wav"), recursive=True))
print(wav_files[:5])

Downloading from https://www.kaggle.com/api/v1/datasets/download/jettysowmith/telugu-emotion-speech?dataset_version_number=1...


100%|██████████| 201M/201M [00:01<00:00, 193MB/s]

Extracting files...





['/root/.cache/kagglehub/datasets/jettysowmith/telugu-emotion-speech/versions/1/telugu/angry/S11_ALA_C01_G1_D02_SPKM04_V1_AN5_MMM.wav', '/root/.cache/kagglehub/datasets/jettysowmith/telugu-emotion-speech/versions/1/telugu/angry/S11_ALA_C05_G1_D06_SPKF03_V1_AN5_MMM.wav', '/root/.cache/kagglehub/datasets/jettysowmith/telugu-emotion-speech/versions/1/telugu/angry/S11_ALA_C06_G2_D03_SPKF03_V1_AN5_MMM.wav', '/root/.cache/kagglehub/datasets/jettysowmith/telugu-emotion-speech/versions/1/telugu/angry/S11_ALA_C10_G1_D01_SPKF03_V1_AN5_MMM.wav', '/root/.cache/kagglehub/datasets/jettysowmith/telugu-emotion-speech/versions/1/telugu/angry/S11_ALA_C14_G1_D05_SPKM05_V2_AN5_MMM.wav']


In [7]:
# 3. Prepare output dir for reconstructed wavs
out_dir = "/content/drive/MyDrive/wav_bytes_reconstructed"
os.makedirs(out_dir, exist_ok=True)


In [5]:
# 4. Load checkpoint if exists, else start new
checkpoint_path = "/content/drive/MyDrive/telugu_emotion_partial_with_wav.parquet"
rows = []
done = set()
try:
    prev_df = pl.read_parquet(checkpoint_path)
    rows = list(prev_df.iter_rows())
    done = set(prev_df["audio_path"])
    print(f"Resuming from checkpoint: {len(rows)} files already processed.")
except Exception as e:
    print("No checkpoint found. Starting fresh.")


No checkpoint found. Starting fresh.


In [6]:
# 5. Load Whisper (large-v3)
import whisper, torch

device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if device == "cuda" else "int8"

model = whisper.load_model("large-v3", device=device)






100%|█████████████████████████████████████| 2.88G/2.88G [01:04<00:00, 48.2MiB/s]
  checkpoint = torch.load(fp, map_location=device)


In [8]:
# 6. Loop over all .wav files; write results after each
labels = (
    ["<angry>"] * 96
    + ["<happy>"] * 94
    + ["<neutral>"] * 103
    + ["<sad>"] * 80
    + ["<surprised>"] * 85
)
assert len(labels) == len(wav_files), "Labels list must match wav_files!"

rows = []
checkpoint_path = "/content/drive/MyDrive/telugu_emotion_partial_with_wav.parquet"

for idx, w in enumerate(tqdm.tqdm(wav_files)):
    filename = f"audio_{idx+1:05d}.wav"
    new_wav_path = os.path.join(out_dir, filename)
    if new_wav_path in done:
        continue
    try:
        # Read audio bytes and reconstruct into out_dir
        with open(w, "rb") as f:
            wav_bytes = f.read()
        with open(new_wav_path, "wb") as f:
            f.write(wav_bytes)
        # Transcribe with Whisper
        result = model.transcribe(new_wav_path, language="te", fp16=True)
        text = result["text"].strip()
        # Assign emotion label based on index
        emotion = labels[idx]
        # Save result immediately to rows and to checkpoint
        rows.append((wav_bytes, text, emotion))
        pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
    except Exception as e:
        print(f"Error processing {w}: {e}")
        continue

  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  1%|          | 3/458 [00:17<40:20,  5.32s/it]

Error processing /root/.cache/kagglehub/datasets/jettysowmith/telugu-emotion-speech/versions/1/telugu/angry/S11_ALA_C06_G2_D03_SPKF03_V1_AN5_MMM.wav: unexpected value while building Series of type Binary; found value of type String: "ఏయ్ గన్ను నాదగరు ఉంటే కోషలు నువ్ ఏస్తావేంట్ రా ఇరిటేట్ చేకు ఇరిటేట్ చేస్తే చంపేస్తా ఓకే"

Hint: Try setting `strict=False` to allow passing data with mixed types.


  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataF

Error processing /root/.cache/kagglehub/datasets/jettysowmith/telugu-emotion-speech/versions/1/telugu/sad/S45_SRI_C01_G2_D04_SPKF21_V1_SA4_MMM.wav: Failed to load audio: ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnap

  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(checkpoint_path)
  pl.DataF

In [None]:
# 6. Loop over all .wav files; write results after each
for idx, w in enumerate(tqdm.tqdm(wav_files)):
    # Use only files not already processed
    filename = f"audio_{idx+1:05d}.wav"
    new_wav_path = os.path.join(out_dir, filename)
    if new_wav_path in done:
        continue
    try:
        # Read audio bytes and reconstruct into out_dir
        with open(w, "rb") as f:
            wav_bytes = f.read()
        with open(new_wav_path, "wb") as f:
            f.write(wav_bytes)
        # Transcribe with Whisper
        result = model.transcribe(new_wav_path, language="te", fp16=True)
        text = result["text"].strip()
        # Save result immediately to rows and to checkpoint
        rows.append((wav_bytes, text))
        pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
    except Exception as e:
        print(f"Error processing {w}: {e}")
        continue

  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  0%|          | 2/458 [00:12<43:16,  5.69s/it]  

Error processing /root/.cache/kagglehub/datasets/jettysowmith/telugu-emotion-speech/versions/1/telugu/angry/S11_ALA_C05_G1_D06_SPKF03_V1_AN5_MMM.wav: unexpected value while building Series of type Binary; found value of type String: "ఏయ్ గన్ను నాదగరు ఉంటే కోషలు నువ్ ఏస్తావేంట్ రా ఇరిటేట్ చేకు ఇరిటేట్ చేస్తే చంపేస్తా ఓకే"

Hint: Try setting `strict=False` to allow passing data with mixed types.


  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(che

Error processing /root/.cache/kagglehub/datasets/jettysowmith/telugu-emotion-speech/versions/1/telugu/sad/S45_SRI_C01_G2_D04_SPKF21_V1_SA4_MMM.wav: Failed to load audio: ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnap

  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(checkpoint_path)
  pl.DataFrame(rows, schema=["audio", "text"]).write_parquet(che

In [9]:

# 7. Final save
final_path = "telugu_emotion_speech_allfiles_with_wav.parquet"
pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(final_path)
print(f"Done! Total processed: {len(rows)} files. Output saved to {final_path}")

  pl.DataFrame(rows, schema=["audio", "text", "emotions"]).write_parquet(final_path)


Done! Total processed: 456 files. Output saved to telugu_emotion_speech_allfiles_with_wav.parquet


In [10]:
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `edoti` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `edoti`


In [12]:
!uv pip install -q datasets

In [13]:
from datasets import Dataset, Features, Audio, Value
from pathlib import Path
import polars as pl

# -----------------
# 1. Build a Dataset object
# -----------------
df = pl.read_parquet("telugu_emotion_speech_allfiles_with_wav.parquet")        # or build the list of dicts yourself
features = Features({"audio": Audio(), "text": Value("string"), "emotions":Value("string")})

hf_ds = Dataset.from_dict(df.to_dict(as_series=False), features=features)

# -----------------
# 2. Push to the Hub
#    – option A: keep everything inside Parquet
# -----------------
hf_ds.push_to_hub(
    "dvyomkesh/telugu_es_transcription",
    embed_external_files=True,   # default; safe to omit
    private=False                # or True
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/456 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/dvyomkesh/telugu_es_transcription/commit/c8c8eaa477baf43e76f91f49a92e12f7bfa9a490', commit_message='Upload dataset', commit_description='', oid='c8c8eaa477baf43e76f91f49a92e12f7bfa9a490', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/dvyomkesh/telugu_es_transcription', endpoint='https://huggingface.co', repo_type='dataset', repo_id='dvyomkesh/telugu_es_transcription'), pr_revision=None, pr_num=None)