<a href="https://colab.research.google.com/github/vyomakesh0728/telugu_tts/blob/main/Orpheus_speaks_telugu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!uv pip install -q --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
!uv pip install -q sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
!uv pip install -q transformers==4.51.3
!uv pip install -q --no-deps unsloth
!uv pip install -q snac

In [3]:
from datasets import load_dataset, Audio
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from unsloth import FastLanguageModel, is_bfloat16_supported
from accelerate import infer_auto_device_map, load_checkpoint_and_dispatch
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
import torchaudio.transforms as T, torch, os
from snac import SNAC


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel, is_bfloat16_supported


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
BASE_MODEL      = "canopylabs/orpheus-3b-0.1-pretrained"  # HF ID :contentReference[oaicite:2]{index=2}
TARGET_SR       = 24_000
MAX_SEQ_LEN     = 8192                                     # fits A100-40 GB


In [5]:
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `edoti` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `edoti`


In [9]:
import os, torch
from accelerate import infer_auto_device_map, load_checkpoint_and_dispatch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# ── 0. Fragmentation guard (optional) ───────────────────────────────────
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:512"

# ── 1. Prepare BitsAndBytes 4-bit config ───────────────────────────────
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# ── 2. Load & quantize model directly from Hub ──────────────────────────
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True,
    quantization_config=bnb_cfg,
    device_map="auto",             # let Accelerate infer
    offload_folder="offload",      # where to put CPU-offloaded weights
    offload_state_dict=True,       # enable spilling
)

# ── 4. Attach LoRA adapters as before ──────────────────────────────────
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
lora_cfg = LoraConfig(
    r=16, lora_alpha=32,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","down_proj","up_proj"],
    bias="none",
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

trainable params: 24,313,856 || all params: 3,325,177,856 || trainable%: 0.7312


In [10]:
from datasets import load_dataset, Audio
from transformers import AutoTokenizer
import torch
import torchaudio.transforms as T
import torch.nn.functional as F
from snac import SNAC

# ─── Config ───────────────────────────────────────────────────────────────
TARGET_SR = 24_000
BATCH_SZ  = 16
MODEL_ID  = "hubertsiuzdak/snac_24khz"  # or whatever tokenizer backbone you need

# ─── 1. Load map‐style Dataset (no streaming) ──────────────────────────────
raw_ds = load_dataset("dvyomkesh/Indic-r", split="train")

# ─── 2. Keep only text/audio/gender & filter for female speakers ──────────
raw_ds = (
    raw_ds
    .select_columns(["text", "audio", "gender"])
    .filter(lambda ex: ex["gender"].strip().lower() == "female")
    .cast_column("audio", Audio(sampling_rate=TARGET_SR))
)

# ─── 3. Prepare SNAC codec & tokenizer on GPU ──────────────────────────────
snac      = SNAC.from_pretrained(MODEL_ID).eval().to("cuda")
resampler = T.Resample(orig_freq=TARGET_SR, new_freq=TARGET_SR).to("cuda")

# ─── 4. Batch preprocessing function ───────────────────────────────────────
def preprocess_batch(batch):
    # — Load & pad waveforms to the same length
    wavs = [torch.from_numpy(a["array"]).float().to("cuda") for a in batch["audio"]]
    max_len = max(w.shape[-1] for w in wavs)
    padded = [F.pad(w, (0, max_len - w.shape[-1])) for w in wavs]
    wavs_tensor = torch.stack(padded).unsqueeze(1)  # [B,1,T]

    # — SNAC encode all at once
    with torch.no_grad():
        codes = snac.encode(wavs_tensor)  # [B, levels, frames]

    # — Build input_ids & labels
    input_ids, label_ids = [], []
    for text, code in zip(batch["text"], codes):
        txt_ids   = tokenizer(text, add_special_tokens=False)["input_ids"]
        audio_ids = [
            tokenizer.convert_tokens_to_ids(f"<A_{c}>")
            for c in torch.flatten(code.transpose(0,1)).tolist()
        ]
        seq = [tokenizer.bos_token_id] + txt_ids + [tokenizer.eos_token_id] + audio_ids
        input_ids.append(seq)
        label_ids.append(seq)

    return {"input_ids": input_ids, "labels": label_ids}

# ─── 5. Map & batch preprocess ─────────────────────────────────────────────
proc_ds = raw_ds.map(
    preprocess_batch,
    batched=True,
    batch_size=BATCH_SZ,
    remove_columns=["text", "audio", "gender"],
    desc="SNAC encoding on GPU"
)

# ─── 6. Quick sanity check ────────────────────────────────────────────────
print(f"Processed {len(proc_ds)} examples")
print(proc_ds.features)
print("Example 0 lengths:", len(proc_ds[0]["input_ids"]), len(proc_ds[0]["labels"]))


Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/33 [00:00<?, ?it/s]



SNAC encoding on GPU:   0%|          | 0/5216 [00:00<?, ? examples/s]

Processed 978 examples
{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}
Example 0 lengths: 5550 5550


In [26]:
proc_ds = proc_ds.filter(
    lambda ex: ex["input_ids"] is not None and ex["labels"] is not None
)
print("After dropping None:", len(proc_ds))

Filter:   0%|          | 0/978 [00:00<?, ? examples/s]

After dropping None: 978


In [27]:
for idx in range(1):
    ex = proc_ds[idx]
    print(f"Example {idx}:")
    print("  input_ids:", type(ex["input_ids"]), len(ex["input_ids"]), ex["input_ids"])
    print("  labels:   ", type(ex["labels"]),    len(ex["labels"]),    ex["labels"])

Example 0:
  input_ids: <class 'list'> 5550 [128000, 32405, 230, 32405, 103, 32405, 123, 32405, 236, 32405, 104, 53898, 235, 32405, 241, 32405, 97, 53898, 233, 94355, 101, 32405, 106, 53898, 233, 32405, 99, 53898, 230, 32405, 101, 94355, 237, 32405, 94, 53898, 223, 94355, 101, 32405, 122, 32405, 110, 53898, 223, 32405, 245, 53898, 223, 94355, 106, 53898, 224, 32405, 94, 53898, 223, 94355, 101, 32405, 122, 32405, 110, 53898, 223, 32405, 245, 53898, 223, 94355, 108, 53898, 228, 32405, 224, 32405, 94, 53898, 223, 94355, 108, 53898, 228, 32405, 224, 32405, 94, 53898, 223, 94355, 101, 32405, 122, 32405, 110, 53898, 223, 32405, 245, 53898, 223, 94355, 236, 32405, 101, 32405, 123, 32405, 106, 32405, 123, 32405, 99, 32405, 123, 94355, 238, 32405, 99, 53898, 223, 94355, 101, 32405, 122, 32405, 110, 53898, 223, 32405, 245, 53898, 223, 94355, 101, 32405, 122, 32405, 110, 53898, 223, 32405, 245, 53898, 223, 94355, 104, 53898, 233, 32405, 101, 53898, 235, 94355, 101, 32405, 224, 32405, 105, 32405, 

In [18]:
import torch

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
pad_id = tokenizer.pad_token_id

def collate_fn(batch):
    # ── 1. Filter invalid examples ──────────────────────────
    good = []
    for ex in batch:
        ids, labs = ex.get("input_ids"), ex.get("labels")
        if isinstance(ids, list) and all(isinstance(x, int) for x in ids) \
        and isinstance(labs, list) and all(isinstance(x, int) for x in labs):
            good.append(ex)
        else:
            print("[collate_fn] Dropping invalid example:", ex)
    if not good:
        raise ValueError("No valid examples in this batch!")

    # ── 2. Extract and pad ─────────────────────────────────
    input_lists = [ex["input_ids"] for ex in good]
    label_lists = [ex["labels"]    for ex in good]
    max_len = max(len(ids) for ids in input_lists)

    padded_inputs, padded_labels, attn_masks = [], [], []
    for inp, lab in zip(input_lists, label_lists):
        pad_len = max_len - len(inp)
        padded_inputs.append( inp + [pad_id] * pad_len )
        padded_labels.append( lab  + [pad_id] * pad_len )
        attn_masks.append( [1]*len(inp) + [0]*pad_len )

    # ── 3. Convert to tensors ───────────────────────────────
    input_ids      = torch.tensor(padded_inputs,   dtype=torch.long)
    attention_mask = torch.tensor(attn_masks,      dtype=torch.long)
    labels         = torch.tensor(padded_labels,   dtype=torch.long)

    # ── 4. Mask pad tokens in labels ────────────────────────
    labels[labels == pad_id] = -100

    return {
        "input_ids":      input_ids,
        "attention_mask": attention_mask,
        "labels":         labels,
    }


In [14]:
from torch.utils.data import DataLoader

# 3. Wrap in PyTorch DataLoader with your collator
train_loader = DataLoader(proc_ds, batch_size=None, collate_fn=collate_fn)  # batch inside preprocess


In [25]:
from datasets import load_dataset

raw = load_dataset("dvyomkesh/Indic-r", split="train")
females = raw.filter(lambda ex: ex["gender"].strip().lower() == "female")
print("Female count:", len(females))   # should print 978

# Look at a few entries:
for i in range(3):
    ex = females[i]
    print(i, ex["text"], ex["gender"], ex["audio"]["array"].shape)

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/33 [00:00<?, ?it/s]

Filter:   0%|          | 0/10488 [00:00<?, ? examples/s]

Female count: 5216
0 ఈపిఎఫ్ఓతో నమోదైన ఏడు నాలుగు మూడు నాలుగు రెండు రెండు నాలుగు ఎనిమిది ఐదు నాలుగు నాలుగు ఫోన్ నంబర్కి వచ్చిన ఆరు అంకెల ఓటీపీ చెప్తావా Female (560400,)
1 కొంతమంది నన్ను అడిగారు అయితే మా ఆఫీసు లో సవాళ్లతో కూడిన ప్రాజెక్ట్లు ఎన్నో వచ్చాయి అయితే ఆ ప్రాజెక్ట్లు ఎలా ఉన్నాయంటే Female (551376,)
2 పక్క కంపెనీ వాళ్ళకి మాకు పోటీగా ఉండేది అయితే ఆ కంపెనీ వాళ్ళ కన్నా Female (256512,)


In [24]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    output_dir                  = "/content/drive/MyDrive/orpheus-telugu-emo",
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4,
    max_steps                   = 800,
    learning_rate               = 2e-4,
    warmup_ratio                = 0.1,
    logging_steps               = 10,
    save_steps                  = 200,
    fp16                        = True,
    optim                       = "adamw_torch",
    report_to                   = "none",
    seed                        = 3407,
)

trainer = Trainer(
    model           = model,
    args            = args,
    train_dataset   = proc_ds,
    data_collator   = collate_fn,    # our custom collator
)

trainer.train()


TypeError: 'NoneType' object cannot be interpreted as an integer

In [None]:
trainer.save_model(args.output_dir)         # full-merged weights if you disabled 4-bit
tokenizer.save_pretrained(args.output_dir)

# LoRA-only:  model.save_pretrained("orpheus-telugu-emo-lora")
#             model.push_to_hub("your-handle/orpheus-telugu-emo-lora", private=True)


In [None]:
from snac import SNAC
from scipy.io.wavfile import write

model.eval(); snac = SNAC.from_pretrained(SNAC_MODEL_ID).to("cuda")
prompt = "<HAPPY>  నేను మిమ్మల్ని చూసి చాలా సంతోషంగా ఉన్నాను!"  # Telugu text
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
with torch.no_grad():
    gen_ids = model.generate(**inputs, max_new_tokens=1200)[0]   # text + audio tokens
audio_tokens = [id for id in gen_ids.tolist() if tokenizer.decode([id]).startswith("<A_")]
codes = torch.tensor([int(tok[3:-1]) for tok in tokenizer.batch_decode(audio_tokens)])
waveform = snac.decode(codes.unsqueeze(0))[0].cpu().numpy()
write("output.wav", TARGET_SR, waveform)
