<a href="https://colab.research.google.com/github/vyomakesh0728/telugu_tts/blob/main/sarvam_got_emotions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

!pip install -q --upgrade \
    "transformers[torch]>=4.40" peft bitsandbytes accelerate \
    datasets polars ninja



Mounted at /content/drive
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.2/40.2 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m101.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.1/362.1 kB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.3/36.3 MB[0m [31m66.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m422.8/422.8 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m

In [50]:
import transformers, inspect, sys, os
print("Transformers version →", transformers.__version__)
print("Loaded from          →", os.path.abspath(transformers.__file__))


Transformers version → 4.52.3
Loaded from          → /usr/local/lib/python3.11/dist-packages/transformers/__init__.py


In [51]:
import inspect, transformers
print("Transformers =", transformers.__version__)
print("evaluation_strategy" in inspect.signature(transformers.TrainingArguments.__init__).parameters)


Transformers = 4.52.3
False


In [21]:
import torch, os, random, time
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForCausalLM,
                          BitsAndBytesConfig, TrainingArguments, Trainer,
                          TrainerCallback, set_seed)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import gc


import torch
if torch.cuda.is_available():
    torch.cuda.set_device(0)


def try_load_model(model_id, bnb_cfg, tokenizer, emotion_tokens, lora_cfg, max_retries=3):
    for attempt in range(max_retries):
        try:
            print(f"\n[Attempt {attempt+1}] Loading model...")
            # --- CLEAR VRAM/RAM ---
            torch.cuda.empty_cache()
            gc.collect()
            time.sleep(2)

            # --- Load tokenizer + resize for extra tokens ---
            tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

            if tokenizer.pad_token_id is None:
                tokenizer.pad_token = tokenizer.eos_token
            if tokenizer.add_tokens(emotion_tokens) > 0:
                print("Added emotion tokens to vocab")


            # --- Try loading model ---
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                torch_dtype=torch.bfloat16,
                quantization_config=bnb_cfg,
                device_map="auto",  # auto-partition across GPU/CPU
            )

            model = prepare_model_for_kbit_training(
                model,
                use_gradient_checkpointing=True     # keeps VRAM low
            )

            model.resize_token_embeddings(len(tokenizer))
            model.gradient_checkpointing_enable()
            print("Model loaded: modules are split across GPU/CPU as needed.")

            # --- Attach LoRA adapters ---
            model = get_peft_model(model, lora_cfg)
            n_requires_grad = sum(p.requires_grad for p in model.parameters())
            n_total = sum(1 for _ in model.parameters())
            print(f"{n_requires_grad} / {n_total} parameters require grad")
            assert n_requires_grad > 0, "No parameters require grad! Something is frozen or LoRA not set up."
            print("LoRA adapters attached.")

            for n, p in model.named_parameters():
              if p.requires_grad:
                print("Trainable:", n, p.shape)
            assert any(p.requires_grad for p in model.parameters()), "No parameters require grad!"


            return model, tokenizer

        except RuntimeError as e:
            if "out of memory" in str(e) or "CUDA out of memory" in str(e):
                print("[WARN] CUDA OOM! Attempting to clear memory and offload more to CPU...")
                torch.cuda.empty_cache()
                gc.collect()
                time.sleep(2)
                continue  # retry up to max_retries
            else:
                raise e  # any other runtime error should be surfaced
    raise RuntimeError("Failed to load model after multiple attempts due to insufficient GPU memory.")





In [22]:
# --- CONFIGURATION ---
model_id = "sarvamai/sarvam-m"
emotion_tokens = ["<angry>", "<happy>", "<neutral>", "<sad>", "<surprised>"]

bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True,  # <== CRUCIAL for hybrid CPU/GPU
)

lora_cfg = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05,
    task_type="CAUSAL_LM",
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)


# --- SMART MODEL LOADING ---
model, tokenizer = try_load_model(model_id, bnb_cfg, tokenizer, emotion_tokens, lora_cfg)



[Attempt 1] Loading model...
Added emotion tokens to vocab


Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

Model loaded: modules are split across GPU/CPU as needed.
320 / 683 parameters require grad
LoRA adapters attached.
Trainable: base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight torch.Size([16, 5120])
Trainable: base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 16])
Trainable: base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight torch.Size([16, 5120])
Trainable: base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight torch.Size([1024, 16])
Trainable: base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight torch.Size([16, 5120])
Trainable: base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight torch.Size([1024, 16])
Trainable: base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight torch.Size([16, 4096])
Trainable: base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight torch.Size([5120, 16])
Trainable: base_model.model.model.layers.1.s

In [23]:
print("Tokenizer type:", type(tokenizer))
print("Vocab size:", tokenizer.vocab_size)
print("Special tokens:", tokenizer.special_tokens_map)
print("All added tokens:", tokenizer.additional_special_tokens)

for token in emotion_tokens:
    print(f"Token '{token}':", tokenizer.convert_tokens_to_ids(token))


test_text = "Emotion: <happy>\nText: This is a test."
tok_out = tokenizer(test_text)
print("Input IDs:", tok_out["input_ids"])
print("Decoded:", tokenizer.decode(tok_out["input_ids"]))


print("Tokenizer files:", tokenizer.pretrained_vocab_files_map)
print("Tokenizer name or path:", tokenizer.name_or_path)

tokenizer.padding_side = "right"
if tokenizer.pad_token is None:
    # Check if <pad> is in vocab, otherwise use eos_token
    if "<pad>" in tokenizer.get_vocab():
        tokenizer.pad_token = "<pad>"
    else:
        tokenizer.pad_token = tokenizer.eos_token

tok_out = tokenizer("Hello world!", max_length=16, padding="max_length", truncation=True)
print(tok_out)



Tokenizer type: <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>
Vocab size: 131072
Special tokens: {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>', 'additional_special_tokens': ['<unk>', '<s>', '</s>', '[INST]', '[/INST]', '[AVAILABLE_TOOLS]', '[/AVAILABLE_TOOLS]', '[TOOL_RESULTS]', '[/TOOL_RESULTS]', '[TOOL_CALLS]', '[IMG]', '<pad>', '[IMG_BREAK]', '[IMG_END]', '[PREFIX]', '[MIDDLE]', '[SUFFIX]', '[SYSTEM_PROMPT]', '[/SYSTEM_PROMPT]', '[TOOL_CONTENT]', '<SPECIAL_20>', '<SPECIAL_21>', '<SPECIAL_22>', '<SPECIAL_23>', '<SPECIAL_24>', '<SPECIAL_25>', '<SPECIAL_26>', '<SPECIAL_27>', '<SPECIAL_28>', '<SPECIAL_29>', '<SPECIAL_30>', '<SPECIAL_31>', '<SPECIAL_32>', '<SPECIAL_33>', '<SPECIAL_34>', '<SPECIAL_35>', '<SPECIAL_36>', '<SPECIAL_37>', '<SPECIAL_38>', '<SPECIAL_39>', '<SPECIAL_40>', '<SPECIAL_41>', '<SPECIAL_42>', '<SPECIAL_43>', '<SPECIAL_44>', '<SPECIAL_45>', '<SPECIAL_46>', '<SPECIAL_47>', '<SPECIAL_48>', '<SPECIAL_4

In [39]:
# add identifiers for the dataset, ik this is a very bad habbit but just for now :)
def idx2emotion(i: int) -> str:
    if   i < 95:   return "angry"
    elif i < 189:  return "happy"
    elif i < 292:  return "neutral"
    elif i < 372:  return "sad"
    else:          return "surprised"

def add_emotion(example, idx):
    example["emotion"] = idx2emotion(idx)
    return example

raw = load_dataset("dvyomkesh/telugu_es_transcription", split="train")          \
        .map(add_emotion, with_indices=True, num_proc=4)

# now split
data = raw.train_test_split(test_size=0.1, seed=42)
train_ds, val_ds = data["train"], data["test"]

# tokenise (no with_indices needed any more)
def tokenise(example):
    text = f"Emotion: <{example['emotion']}>\nText: {example['text']}"
    tok  = tokenizer(text, max_length=256, truncation=True,
                     padding="max_length", return_attention_mask=True)
    example["input_ids"]      = tok["input_ids"]
    example["attention_mask"] = tok["attention_mask"]
    example["labels"]         = tok["input_ids"]
    return example

ds_train = train_ds.map(tokenise, num_proc=4).shuffle(seed=42)
ds_val   = val_ds.map(tokenise, num_proc=4).shuffle(seed=42)
print(ds_train[0])
print(ds_val[5])




Map (num_proc=4):   0%|          | 0/456 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/410 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/46 [00:00<?, ? examples/s]

{'audio': {'path': None, 'array': array([-1.98364258e-04, -7.62939453e-05, -1.83105469e-04, ...,
       -3.05175781e-05, -9.15527344e-05,  1.22070312e-04]), 'sampling_rate': 44100}, 'text': 'ఇక్కరు స్టూడింస్ కొట్టుకు చస్తుంటే ఇది కోలేజ్ ప్రాబ్లుమ్ ఎల్లా హోతుంది లోన్ అడా ప్రాబ్లుమ్ హోతుంది', 'emotion': 'neutral', 'input_ids': [1, 5969, 15984, 1058, 1032, 131074, 1010, 3210, 1058, 18364, 28220, 18504, 57726, 26967, 11036, 1184, 2040, 34460, 52024, 41453, 13047, 33031, 2404, 59251, 101612, 48344, 6241, 5027, 21594, 2040, 36552, 1172, 10778, 85194, 2040, 29332, 75714, 2205, 29632, 3776, 7279, 23347, 27880, 16588, 8914, 7577, 2205, 36552, 1172, 10778, 85194, 2040, 29632, 3776, 7279, 23347, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

In [40]:
print(ds_train[0]['input_ids'])
print(ds_val[0]['input_ids'])

[1, 5969, 15984, 1058, 1032, 131074, 1010, 3210, 1058, 18364, 28220, 18504, 57726, 26967, 11036, 1184, 2040, 34460, 52024, 41453, 13047, 33031, 2404, 59251, 101612, 48344, 6241, 5027, 21594, 2040, 36552, 1172, 10778, 85194, 2040, 29332, 75714, 2205, 29632, 3776, 7279, 23347, 27880, 16588, 8914, 7577, 2205, 36552, 1172, 10778, 85194, 2040, 29632, 3776, 7279, 23347, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
[1, 5969, 15984, 1058, 1032, 1310

In [41]:
# Data collator that keeps the 'emotion' field
def collate(features):
    input_ids      = torch.tensor([f["input_ids"]      for f in features], dtype=torch.long)
    attention_mask = torch.tensor([f["attention_mask"] for f in features], dtype=torch.long)
    labels         = torch.tensor([f["labels"]         for f in features], dtype=torch.long)
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }



In [27]:
# callbacks for live loss printing
class StepLossPrinter(TrainerCallback):
    """Print raw loss + EMA every optimisation step."""
    def __init__(self, beta=0.98):
        self.beta, self.ema = beta, None
    def on_log(self, args, state, control, logs=None, **kw):
        if logs and "loss" in logs:
            loss = logs["loss"]
            self.ema = loss if self.ema is None else self.beta*self.ema + (1-self.beta)*loss
            print(f"step {state.global_step:6} │ loss {loss:6.4f} │ smoothed {self.ema:6.4f}")

class EmotionLossTracker(TrainerCallback):
    """Bucket losses by emotion for a quick post-mortem."""
    def __init__(self, emotions=emotion_tokens):
        self.buckets = {e.strip("<>"): [] for e in emotions}
    def on_train_batch_end(self, args, state, control, **kw):
        emolist = kw["inputs"]["emotion"]
        loss    = kw["outputs"].loss.item()
        for e in emolist:                       # same loss for every sample in batch
            self.buckets[e].append(loss)
    def on_train_end(self, args, state, control, **kw):
        print("\n=== average loss by emotion ===")
        for e, v in self.buckets.items():
            if v:
                print(f"{e:10}: {sum(v)/len(v):6.4f}")


In [52]:
checkpoint_dir = "/content/drive/MyDrive/sarvam-m-emo-qlora"
ARGS = TrainingArguments(
    output_dir        = checkpoint_dir,
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 4,       # effective 32
    num_train_epochs  = 5,
    learning_rate     = 2e-4,
    lr_scheduler_type = "cosine",
    warmup_ratio      = 0.03,
    bf16              = True,
    logging_steps     = 1,                 # so StepLossPrinter fires each step
    save_steps        = 500,
    save_total_limit  = 3,
    report_to         = "none",
    gradient_checkpointing = True,
)


In [43]:
# Inspect a batch from your collator
sample_batch = collate([ds_train[0]])
print("Batch keys:", sample_batch.keys())
print("input_ids dtype:", sample_batch['input_ids'].dtype)
print("labels dtype:", sample_batch['labels'].dtype)
print("Shapes:", sample_batch['input_ids'].shape, sample_batch['labels'].shape)
print("First few input_ids:", sample_batch['input_ids'][0][:10])
print("First few labels:", sample_batch['labels'][0][:10])
print("labels present in ds[0]:", "labels" in ds_train[0], "Shape:", len(ds_train[0]['labels']))


Batch keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
input_ids dtype: torch.int64
labels dtype: torch.int64
Shapes: torch.Size([1, 256]) torch.Size([1, 256])
First few input_ids: tensor([     1,   5969,  15984,   1058,   1032, 131074,   1010,   3210,   1058,
         18364])
First few labels: tensor([     1,   5969,  15984,   1058,   1032, 131074,   1010,   3210,   1058,
         18364])
labels present in ds[0]: True Shape: 256


In [53]:
import torch

# Sample batch from dataset
sample = ds_train[0]
# If your collate fn returns a batch, do this:
batch = {
    "input_ids": torch.tensor([sample["input_ids"]], dtype=torch.long),
    "attention_mask": torch.tensor([sample["attention_mask"]], dtype=torch.long),
    "labels": torch.tensor([sample["labels"]], dtype=torch.long),
}
# Move batch to the model's device
device = next(model.parameters()).device
batch = {k: v.to(device) for k, v in batch.items()}

model.train()
outputs = model(
    input_ids=batch["input_ids"],
    attention_mask=batch["attention_mask"],
    labels=batch["labels"]
)
loss = outputs.loss
print("Loss:", loss.item())
loss.backward()  # <--- Should NOT throw!
print("Backward pass succeeded!")


Loss: 0.5140984058380127
Backward pass succeeded!


In [59]:
last_chkpt = None
if os.path.isdir(checkpoint_dir):
    ckpts = [d for d in os.listdir(checkpoint_dir) if d.startswith("checkpoint-")]
    if ckpts:
        last_chkpt = os.path.join(checkpoint_dir, sorted(ckpts, key=lambda x:int(x.split('-')[-1]))[-1])
        print(f"Resuming from {last_chkpt}")


def eval(eval_pred):
    """
    eval_pred is a transformers.EvalPrediction
    eval_pred.predictions → logits (np.ndarray)
    eval_pred.label_ids   → labels  (np.ndarray)
    Return a dict of metric_name: value pairs.
    """
    # example: simple accuracy
    import numpy as np
    preds = np.argmax(eval_pred.predictions, axis=-1)
    labels = eval_pred.label_ids
    acc = (preds == labels).mean()
    return {"accuracy": acc}

trainer = Trainer(
    model           = model,
    args            = ARGS,
    train_dataset   = ds_train,
    eval_dataset     = ds_val,
    data_collator   = collate,
    tokenizer       = tokenizer,
    callbacks       = [StepLossPrinter(), EmotionLossTracker()],
    compute_metrics=eval,
)

trainer.train(resume_from_checkpoint=last_chkpt)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
1,0.4528
2,0.4058
3,0.4091
4,0.4269
5,0.5843
6,0.4151
7,0.3649
8,0.3991
9,0.3921
10,0.4437


step      1 │ loss 0.4528 │ smoothed 0.4528
step      2 │ loss 0.4058 │ smoothed 0.4519
step      3 │ loss 0.4091 │ smoothed 0.4510
step      4 │ loss 0.4269 │ smoothed 0.4505
step      5 │ loss 0.5843 │ smoothed 0.4532
step      6 │ loss 0.4151 │ smoothed 0.4524
step      7 │ loss 0.3649 │ smoothed 0.4507
step      8 │ loss 0.3991 │ smoothed 0.4497
step      9 │ loss 0.3921 │ smoothed 0.4485
step     10 │ loss 0.4437 │ smoothed 0.4484
step     11 │ loss 0.3111 │ smoothed 0.4457
step     12 │ loss 0.4988 │ smoothed 0.4467
step     13 │ loss 0.4284 │ smoothed 0.4464
step     14 │ loss 0.4119 │ smoothed 0.4457
step     15 │ loss 0.4406 │ smoothed 0.4456
step     16 │ loss 0.3097 │ smoothed 0.4428
step     17 │ loss 0.3497 │ smoothed 0.4410
step     18 │ loss 0.3465 │ smoothed 0.4391
step     19 │ loss 0.4725 │ smoothed 0.4398
step     20 │ loss 0.3453 │ smoothed 0.4379
step     21 │ loss 0.3911 │ smoothed 0.4369
step     22 │ loss 0.3027 │ smoothed 0.4343
step     23 │ loss 0.3154 │ smoo




=== average loss by emotion ===


TrainOutput(global_step=65, training_loss=0.32102955648532283, metrics={'train_runtime': 665.3174, 'train_samples_per_second': 3.081, 'train_steps_per_second': 0.098, 'total_flos': 7.2173647822848e+16, 'train_loss': 0.32102955648532283, 'epoch': 5.0})

In [60]:
metrics = trainer.evaluate()   # runs on ds_val once, prints & returns dict
print(metrics)

{'eval_loss': 0.39822474122047424, 'eval_accuracy': 0.875594429347826, 'eval_runtime': 10.0426, 'eval_samples_per_second': 4.58, 'eval_steps_per_second': 0.597, 'epoch': 5.0}


In [61]:
!pip -q install gradio

Collecting gradio
  Downloading gradio-5.31.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.

In [None]:
import gradio as gr
import torch, transformers, peft, bitsandbytes
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

base_id    = "sarvamai/sarvam-m"
adapter_id = "sarvamai/sarvam-m-emo-qlora"

# 4-bit load config (same as training)
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit       = True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16,
    bnb_4bit_use_double_quant = True,
)
tok = AutoTokenizer.from_pretrained(adapter_id, use_fast=True)

model = AutoModelForCausalLM.from_pretrained(
    base_id,
    quantization_config=bnb_cfg,
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_id)  # attach QLoRA
model.eval()

def chat(inp, temperature, top_p):
    prompt = inp if inp.strip().startswith("Emotion:") else f"Emotion: <neutral>\nText: {inp}"
    ids = tok(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(
            **ids,
            max_new_tokens=64,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=1.1,
        )
    reply = tok.decode(out[0][ids.input_ids.size(1):], skip_special_tokens=True)
    return reply.strip()

demo = gr.Interface(
    fn=chat,
    inputs=[
        gr.Textbox(lines=4, label="Input (prepend Emotion: <angry>|<happy>|<sad>|<surprised> if desired)"),
        gr.Slider(0.1, 1.0, 0.7, step=0.05, label="Temperature"),
        gr.Slider(0.5, 1.0, 0.9, step=0.05, label="Top-p"),
    ],
    outputs="text",
    title="Sarvam-M Emotional QLoRA (Telugu)",
    description="Small demo – generation is 4-bit on a single GPU/CPU",
)

demo.launch(share=True)   # share=True gives a public link
