In [None]:
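# To silence pip’s output, move the `%%capture` line below to the very first line
# of this cell and uncomment it (a cell magic only works as the first line of a cell).
# %%capture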

import importlib
import subprocess
import sys

def ensure_installed(pkg_spec: str, import_name=None):
    """
    Import a package if it’s already available; otherwise install it with pip
    so the current kernel can immediately import it.

    Args:
        pkg_spec: A pip requirement string, e.g. ``"trl"``,
            ``"xformers==0.0.29.post3"`` or ``"datasets>=3.4.1,<4.0.0"``.
            It is passed to pip unchanged.
        import_name: Optional module name to probe when it differs from the
            distribution name. When omitted, the name is derived from
            ``pkg_spec`` (see below).

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits non-zero.
    """
    import re  # local import: only this helper needs it

    # Distributions whose importable module is not named after the distribution.
    import_aliases = {"protobuf": "google.protobuf"}

    if import_name is None:
        # Keep only the leading distribution name: everything before the first
        # extras bracket, version operator (==, >=, <=, ~=, !=, <, >), marker
        # separator, comma or whitespace.
        dist_name = re.split(r"[\[<>=!~;,\s]", pkg_spec.strip(), maxsplit=1)[0]
        import_name = import_aliases.get(dist_name, dist_name.replace("-", "_"))
    try:
        importlib.import_module(import_name)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg_spec])
        # Drop stale finder caches so the freshly installed package can be
        # imported without restarting the kernel.
        importlib.invalidate_caches()

# ── Core libraries ────────────────────────────────────────────────────────────
# Requirement strings are handed to `ensure_installed` one at a time, in order.
REQUIRED_PACKAGES = (
    "unsloth",     # main library
    "snac",        # your extra requirement

    # ─ Optional / helper deps used by Unsloth examples  ─
    # Comment out anything you don’t need or already have
    "bitsandbytes",
    "accelerate",
    "xformers==0.0.29.post3",
    "peft",
    "trl",
    "triton",
    "cut_cross_entropy",
    "unsloth_zoo",
    "sentencepiece",
    "protobuf",
    "datasets>=3.4.1,<4.0.0",
    "huggingface_hub>=0.34.0",
    "hf_transfer",
)

for package in REQUIRED_PACKAGES:
    ensure_installed(package)

print("✅ All required libraries are installed and ready!")


In [None]:
from unsloth import FastLanguageModel
import torch

# Load the Hindi Orpheus checkpoint for *full* fine-tuning (every weight is trained).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "canopylabs/3b-hi-ft-research_release",
    # "/home/user/voice/Orpheus-TTS/finetune/hf_cache/datasets--telecmiusa--tts-hi-data/snapshots/d564239b4542d4e25ee213660bf0104e700858ac/model_3000_steps_4096_vllm",#"canopylabs/3b-hi-ft-research_release",
    max_seq_length = 4096, # Choose any for long context!
    dtype = None, # Select None for auto detection
    # 4-bit loading is a QLoRA setting and contradicts full_finetuning=True, so the
    # weights are requested unquantised explicitly.
    # NOTE(review): Unsloth appears to disable 4-bit itself in this combination — confirm
    # against the installed version.
    load_in_4bit = False,
    full_finetuning = True,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [None]:
# Dump the tokenizer's special-token configuration (uses `tokenizer` from the load cell).
# Each label printed is the attribute name followed by a colon.
for attr in (
    "special_tokens_map",             # dict of special roles → tokens
    "all_special_tokens",             # list of every special token
    "all_special_ids",                # their IDs
    "additional_special_tokens",
    "additional_special_tokens_ids",
):
    print(f"{attr}:", getattr(tokenizer, attr))

# Tokens added on top of the base vocabulary: how many, and the first 20 entries.
added_vocab = tokenizer.get_added_vocab()
print("added_vocab_size:", len(added_vocab))
print("added_vocab_sample:", list(added_vocab.items())[:20])


In [None]:
# Look up the id the tokenizer currently assigns to each expressive tag.
TAGS = ["<pause>", "<breath>", "<sigh>", "<laugh>", "<gasp>", "<chuckle>", "<hmm..>"]
tag_ids = {}
for tag in TAGS:
    tag_ids[tag] = tokenizer.convert_tokens_to_ids(tag)
print(tag_ids)
# A tag that comes back as None or as the unk id below is presumably not in the
# vocabulary yet (i.e. not registered as a token).
print("unk_token_id:", tokenizer.unk_token_id)


In [None]:
# Show the token string and id configured for each of the pad / bos / eos roles.
for role in ("pad", "bos", "eos"):
    print(f"{role}:", getattr(tokenizer, f"{role}_token"), getattr(tokenizer, f"{role}_token_id"))

In [None]:
# Register the expressive tags as special tokens so each one maps to a single id.
EMOTION_TAGS = ["<pause>", "<breath>", "<sigh>", "<laugh>", "<gasp>", "<hmm..>"]

# Test vocabulary membership directly. `convert_tokens_to_ids(t) is None` only detects
# an unknown tag when the tokenizer has no unk token; tokenizers that define one return
# the unk id instead, and the tag would silently never be added.
known_tokens = tokenizer.get_vocab()
missing = [t for t in EMOTION_TAGS if t not in known_tokens]
if missing:
    tokenizer.add_special_tokens({"additional_special_tokens": missing})
    # Grow the embedding matrix so the new ids have rows to train.
    model.resize_token_embeddings(len(tokenizer))

In [None]:
# LoRA adapter configuration, collected in one place and passed through unchanged.
# NOTE(review): the model is loaded with full_finetuning=True earlier in this notebook —
# confirm whether Unsloth still attaches adapters here or returns the model untouched.
lora_config = dict(
    r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    lora_alpha = 64,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    modules_to_save = ["lm_head", "embed_tokens"],
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)
model = FastLanguageModel.get_peft_model(model, **lora_config)

In [None]:
import os
from datasets import load_dataset, Audio

# Work from inside the dataset snapshot: the CSV is opened by its bare file name, so the
# current working directory has to be the data directory.
dataset_root = "/home/user/voice/Orpheus-TTS/finetune/hf_cache/datasets--telecmiusa--tts-hi-data/snapshots/d564239b4542d4e25ee213660bf0104e700858ac"
os.chdir(dataset_root)

# Read the metadata CSV, then decode the PATH column as audio.
dataset = load_dataset(
    "csv",
    data_files="metadata_with_tags.csv",
    split="train",
).cast_column("PATH", Audio())

print(dataset[0]["PATH"])  # Now will work since CWD == data_dir
print(dataset)

In [None]:
import os
# Sanity-check the dataset layout; every line printed should be True.
data_dir = "/home/user/voice/Orpheus-TTS/finetune/hf_cache/datasets--telecmiusa--tts-hi-data/snapshots/d564239b4542d4e25ee213660bf0104e700858ac"
expected_layout = (
    (os.path.isfile, "metadata_with_tags.csv"),
    (os.path.isdir, "SPEECHRIV"),
    (os.path.isfile, "SPEECHRIV/1_AGENT.wav"),
)
for path_check, relative_path in expected_layout:
    print(path_check(os.path.join(data_dir, relative_path)))

In [None]:
import os
from datasets import load_dataset, Audio
import locale
import torchaudio.transforms as T
import torch
from snac import SNAC

# Dataset loading: switch into the snapshot directory, read the CSV and decode PATH as audio.
os.chdir("/home/user/voice/Orpheus-TTS/finetune/hf_cache/datasets--telecmiusa--tts-hi-data/snapshots/d564239b4542d4e25ee213660bf0104e700858ac")
dataset = load_dataset(
    "csv", data_files="metadata_with_tags.csv", split="train"
).cast_column("PATH", Audio())
print(f"Dataset loaded: {len(dataset)} samples")
print(f"First sample keys: {dataset[0].keys()}")

# SNAC model setup
# Force the preferred encoding reported to callers to UTF-8.
locale.getpreferredencoding = lambda: "UTF-8"
# The sampling rate of the first row is taken as the rate of the whole dataset.
ds_sample_rate = dataset[0]["PATH"]["sampling_rate"]
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cuda")

# Audio tokenisation. Both variants encode a clip with SNAC and flatten the three code
# levels into frames of 7 codes; they differ only in whether id offsets are applied.

_AUDIO_TOKEN_BASE = 128266   # first id of the audio-code range (the decode cell subtracts the same value)
_SNAC_CODEBOOK_SIZE = 4096   # width of the id range reserved for each of the 7 frame slots


def _snac_frames(waveform):
    """Encode one clip with SNAC and return its raw codes grouped into 7-code frames.

    Args:
        waveform: NumPy array holding the clip's samples at ``ds_sample_rate``.
            # assumes a 1-D mono array — TODO confirm for multi-channel data

    Returns:
        A list of frames. Frame ``i`` holds, in order: level-0 code ``i``, level-1 code
        ``2i``, level-2 codes ``4i`` and ``4i+1``, level-1 code ``2i+1``, level-2 codes
        ``4i+2`` and ``4i+3``.
    """
    audio = torch.from_numpy(waveform).unsqueeze(0).to(dtype=torch.float32)
    audio = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)(audio)
    audio = audio.unsqueeze(0).to("cuda")

    with torch.inference_mode():
        codes = snac_model.encode(audio)

    # One device-to-host copy per level instead of one `.item()` sync per code.
    level_0, level_1, level_2 = (level[0].tolist() for level in codes)

    frames = []
    for i in range(len(level_0)):
        frames.append([
            level_0[i],
            level_1[2 * i],
            level_2[4 * i],
            level_2[4 * i + 1],
            level_1[2 * i + 1],
            level_2[4 * i + 2],
            level_2[4 * i + 3],
        ])
    return frames


# APPROACH 1: Original Orpheus approach with hardcoded offsets
# This is what the original code uses
def tokenise_audio_original_approach(waveform):
    """Original approach with hardcoded offsets.

    Slot ``k`` (0-6) of every frame is shifted by ``128266 + k * 4096`` so that each
    slot occupies its own id range.

    Returns:
        A flat list of token ids, 7 per frame.
    """
    return [
        code + _AUDIO_TOKEN_BASE + slot * _SNAC_CODEBOOK_SIZE
        for frame in _snac_frames(waveform)
        for slot, code in enumerate(frame)
    ]


# APPROACH 2: Dynamic tokenizer approach (your attempted modification)
def tokenise_audio_dynamic_approach(waveform):
    """Your approach - raw codes without offsets.

    Returns:
        A flat list of raw SNAC codes, 7 per frame, in the same slot order as the
        original approach.
    """
    return [code for frame in _snac_frames(waveform) for code in frame]

# CHOOSE YOUR APPROACH HERE
USE_ORIGINAL_APPROACH = True  # Set to False to use dynamic approach

if USE_ORIGINAL_APPROACH:
    print("Using ORIGINAL Orpheus approach with hardcoded offsets")
    tokenise_audio = tokenise_audio_original_approach

    # Seven frame slots of 4096 ids each start at id 128266, so the vocabulary needs at
    # least this many entries (the largest id produced is MAX_TOKEN_ID - 1).
    MAX_TOKEN_ID = 128266 + (7 * 4096)

    # Resize model embeddings if needed
    if len(tokenizer) < MAX_TOKEN_ID:
        print(f"Resizing tokenizer from {len(tokenizer)} to {MAX_TOKEN_ID}")
        # Add dummy tokens to reach the required size
        num_to_add = MAX_TOKEN_ID - len(tokenizer)
        dummy_tokens = [f"<dummy_{i}>" for i in range(num_to_add)]
        tokenizer.add_tokens(dummy_tokens)
        model.resize_token_embeddings(len(tokenizer))

    # No need to set AUDIO_TOKENS_START for original approach

else:
    print("Using DYNAMIC approach with tokenizer-added audio tokens")
    tokenise_audio = tokenise_audio_dynamic_approach

    # Reserve one token per possible code value. Sizing the range from the largest code
    # seen in a single clip under-allocates: any other clip containing a higher code
    # would map past the end of the vocabulary.
    SNAC_CODEBOOK_SIZE = 4096  # codes per SNAC codebook

    # Tokenise one sample as a sanity check on the expected code range.
    test_audio = dataset[0]["PATH"]["array"]
    test_codes = tokenise_audio(test_audio)
    max_code = max(test_codes)
    print(f"Max code value found: {max_code}")
    num_audio_codes = max(SNAC_CODEBOOK_SIZE, max_code + 1)

    # Add audio tokens to tokenizer
    audio_tokens = [f"<audio_{i}>" for i in range(num_audio_codes)]
    tokenizer.add_tokens(audio_tokens)
    model.resize_token_embeddings(len(tokenizer))
    # Look the start id up instead of deriving it from len(tokenizer), so re-running the
    # cell (when the tokens already exist) still yields the right offset.
    AUDIO_TOKENS_START = tokenizer.convert_tokens_to_ids("<audio_0>")
    print(f"AUDIO_TOKENS_START: {AUDIO_TOKENS_START}")

# Process dataset
def add_codes(example):
    """Attach the audio token list for one dataset row under ``"codes_list"``.

    The row's ``"PATH"`` entry is expected to be a mapping with an ``"array"`` key;
    that array is passed to ``tokenise_audio``. Rows without usable audio, or rows
    whose tokenisation raises, get ``codes_list = None`` (the error is printed and
    swallowed so one bad row does not stop the whole map).

    Returns:
        The same ``example`` with ``"codes_list"`` set.
    """
    tokenised = None
    try:
        audio = example.get("PATH")
        has_waveform = bool(audio) and "array" in audio
        if has_waveform:
            tokenised = tokenise_audio(audio["array"])
    except Exception as e:
        print(f"Skipping row due to error: {e}")
    example["codes_list"] = tokenised
    return example

# Tokenise every clip (dropping the decoded audio column), then keep only rows that
# actually produced codes: neither None nor an empty list.
dataset = dataset.map(add_codes, remove_columns=["PATH"])
dataset = dataset.filter(
    lambda row: row["codes_list"] is not None and len(row["codes_list"]) > 0
)

# Remove duplicate frames
def remove_duplicate_frames(example):
    """Drop 7-code frames whose first code repeats that of the last kept frame.

    Args:
        example: Row with a ``"codes_list"`` whose length is a multiple of 7.

    Returns:
        The same ``example`` with ``"codes_list"`` replaced by the filtered list.

    Raises:
        ValueError: If the list length is not divisible by 7.
    """
    codes = example["codes_list"]
    if len(codes) % 7:
        raise ValueError("Input list length must be divisible by 7")

    frames = [codes[start:start + 7] for start in range(0, len(codes), 7)]
    kept = list(frames[0]) if frames else []
    for frame in frames[1:]:
        # kept[-7] is the first code of the most recently kept frame.
        if frame[0] != kept[-7]:
            kept += frame

    example["codes_list"] = kept
    return example

dataset = dataset.map(remove_duplicate_frames)

# Define special tokens
tokeniser_length = 128256
start_of_text, end_of_text = 128000, 128009

# The seven control ids sit directly after the base vocabulary, in this order
# (tokeniser_length + 1 ... tokeniser_length + 7).
(
    start_of_speech,
    end_of_speech,
    start_of_human,
    end_of_human,
    start_of_ai,
    end_of_ai,
    pad_token,
) = range(tokeniser_length + 1, tokeniser_length + 8)

# Set tokenizer pad token
tokenizer.padding_side = "right"
tokenizer.pad_token_id = pad_token

# Create input IDs
def create_input_ids(example):
    """Build the training sequence for one row.

    Layout: ``start_of_human, text ids, end_of_text, end_of_human, start_of_ai,
    start_of_speech, audio ids, end_of_speech, end_of_ai``.

    The text comes from the ``"TEXT"`` column (or ``"text"`` when ``"TEXT"`` is
    absent) and is prefixed with ``"<source>: "`` when the row has a ``"source"``
    field. Labels are a copy of the input ids and the attention mask is all ones.
    No truncation is applied.

    Returns:
        The same ``example`` with ``input_ids``, ``labels`` and ``attention_mask`` set.
    """
    prompt = example["TEXT"] if "TEXT" in example else example["text"]

    # Multi-speaker data: prefix the prompt with the speaker name.
    if "source" in example:
        prompt = f"{example['source']}: {prompt}"

    text_ids = tokenizer.encode(prompt, add_special_tokens=True) + [end_of_text]

    if USE_ORIGINAL_APPROACH:
        # Codes already carry their id offsets from tokenise_audio.
        audio_token_ids = example["codes_list"]
    else:
        # Raw codes: shift them into the tokenizer-added audio range here.
        audio_token_ids = [AUDIO_TOKENS_START + code for code in example["codes_list"]]

    input_ids = [
        start_of_human,
        *text_ids,
        end_of_human,
        start_of_ai,
        start_of_speech,
        *audio_token_ids,
        end_of_speech,
        end_of_ai,
    ]

    example["input_ids"] = input_ids
    example["labels"] = list(input_ids)
    example["attention_mask"] = [1] * len(input_ids)
    return example

# Map and clean dataset
text_column = "TEXT" if "TEXT" in dataset.column_names else "text"
remove_cols = [text_column, "codes_list"]
if "source" in dataset.column_names:
    remove_cols.append("source")

dataset = dataset.map(create_input_ids, remove_columns=remove_cols)

# Keep only necessary columns
columns_to_keep = ["input_ids", "labels", "attention_mask"]
columns_to_remove = [col for col in dataset.column_names if col not in columns_to_keep]
dataset = dataset.remove_columns(columns_to_remove)

print(f"Final dataset size: {len(dataset)}")
print(f"Sample input_ids length: {len(dataset[0]['input_ids'])}")

# Scan the dataset once and reuse the result for both the report and the check below
# (it was previously computed twice, i.e. two full passes over every row).
max_token_in_data = max(max(sample['input_ids']) for sample in dataset)
print(f"Max token ID in dataset: {max_token_in_data}")

# Verify token IDs are within vocabulary
if max_token_in_data >= len(tokenizer):
    print(f"ERROR: Max token ID {max_token_in_data} >= vocab size {len(tokenizer)}")
    print("Need to resize tokenizer/model embeddings!")
else:
    print(f"✓ All token IDs within vocabulary (max: {max_token_in_data}, vocab: {len(tokenizer)})")

In [None]:
# Re-check the tag ids now that the tokenizer may have been extended.
TAGS = ["<pause>","<breath>","<sigh>", "<hmm..>", "<laugh>"]
tag_ids = dict(zip(TAGS, (tokenizer.convert_tokens_to_ids(tag) for tag in TAGS)))
print(tag_ids)

In [None]:
# Eyeball the first processed row, one model input column per line.
for column in ("input_ids", "labels", "attention_mask"):
    print(dataset[0][column])

In [None]:
# Make the embedding matrix match the (possibly extended) tokenizer ...
model.resize_token_embeddings(len(tokenizer))
# ... and optionally cast the input embedding weights to the model's dtype.
input_embeddings = model.get_input_embeddings()
input_embeddings.weight.data = input_embeddings.weight.data.to(model.dtype)


In [None]:
# Training setup
import os
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq
from transformers import DataCollatorForLanguageModeling
# Disable Unsloth's problematic optimization
# os.environ["UNSLOTH_USE_FUSED_CROSS_ENTROPY"] = "0"


# Train in bfloat16 throughout.
model = model.to(dtype=torch.bfloat16)

# Hyper-parameters are kept in their own object so they can be inspected before training.
training_args = TrainingArguments(
    output_dir = "outputs_700",
    seed = 3407,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4,
    # NOTE(review): per the TrainingArguments docs a positive max_steps takes precedence
    # over num_train_epochs — confirm which of the two limits is intended.
    num_train_epochs = 3, # Set this for 1 full training run.
    max_steps = 4000,
    learning_rate = 1.5e-4,
    lr_scheduler_type = "linear",
    warmup_steps = 5,
    weight_decay = 0.01,
    optim = "adamw_8bit",
    bf16 = True,
    fp16 = False,
    logging_steps = 1,
    report_to = "none", # Use this for WandB etc
)

# Create trainer
trainer = Trainer(
    model = model,
    train_dataset = dataset,
    args = training_args,
    # data_collator=data_collator,
)


In [None]:
# @title Show current memory stats
def _bytes_to_gb(num_bytes):
    """Convert a byte count to GB (1024-based), rounded to 3 decimals."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)

# Baseline taken before training; the final-stats cell subtracts it from the peak.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = _bytes_to_gb(torch.cuda.max_memory_reserved())
max_memory = _bytes_to_gb(gpu_stats.total_memory)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run the fine-tuning loop. The returned object is kept because the stats cell below
# reads `trainer_stats.metrics['train_runtime']` from it.
trainer_stats = trainer.train()

In [None]:
# @title Show final memory and time stats
# Peak reserved GPU memory overall, and the part attributable to training
# (peak minus the baseline recorded before training started).
train_runtime = trainer_stats.metrics['train_runtime']
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

report_lines = (
    f"{train_runtime} seconds used for training.",
    f"{round(train_runtime/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
)
for report_line in report_lines:
    print(report_line)

In [None]:
# Prompts to synthesise in the inference cell; only uncommented entries are used.
# When enabling more than one entry, end each with a comma — adjacent string literals
# without a comma are concatenated into a single prompt.
prompts = [
    # "ठीक है, मैं कल आपके लिए demo arrange कर दूँगा क्या आपके पास और सवाल हैं?",
    # "मुझे quick demo चाहिए, <breath> बस 2 minute का, right now." 
#     ''' 
# बिल्कुल, पहले ऐप खोलें और 'Add New Beneficiary' चुनें। बच्चे का नाम, उम्र (महीनों में) <hmm..>, 


# '''
# "हाँ, <hmm..> मैंने पहली बार आपके product के बारे में एक webinar में सुना था मैं समझना चाहता हूँ कि यह हमारी existing systems के साथ कैसे integrate कर सकता है " 
# "ज़रूर <happy> हमारी pricing flexible plans पर आधारित है, <hmm..> ताकि आपके business needs के अनुसार best fit हो सके "
"Delhi की एक retail chain ने हमारे solutions से अपनी sales में 30% तक वृद्धि देखी है। <breath> उनका feedback बहुत encouraging रहा है ।"
# "ये तो बहुत flexible है। <hmm..> क्या मैं इसमें social media links भी जोड़ सकता हूँ?"
]

# When set, the inference cell prefixes every prompt with "<chosen_voice>: ".
chosen_voice = None # None for single-speaker

In [None]:
#@title Run Inference

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Moving snac_model cuda to cpu
snac_model.to("cpu")

# Optional speaker prefix, then tokenise every prompt on its own.
prompts_ = [(f"{chosen_voice}: " + p) if chosen_voice else p for p in prompts]
all_input_ids = [tokenizer(prompt, return_tensors="pt").input_ids for prompt in prompts_]

start_token = torch.tensor([[ 128259]], dtype=torch.int64) # Start of human
end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64) # End of text, End of human

# SOH SOT Text EOT EOH
all_modified_input_ids = [
    torch.cat([start_token, ids, end_tokens], dim=1) for ids in all_input_ids
]

# Left-pad every sequence with id 128263 up to the longest one; padded positions get
# attention 0, real positions attention 1.
max_length = max(seq.shape[1] for seq in all_modified_input_ids)
all_padded_tensors = []
all_attention_masks = []
for modified_input_ids in all_modified_input_ids:
    seq_len = modified_input_ids.shape[1]
    padding = max_length - seq_len
    pad_block = torch.full((1, padding), 128263, dtype=torch.int64)
    all_padded_tensors.append(torch.cat([pad_block, modified_input_ids], dim=1))
    all_attention_masks.append(
        torch.cat(
            [
                torch.zeros((1, padding), dtype=torch.int64),
                torch.ones((1, seq_len), dtype=torch.int64),
            ],
            dim=1,
        )
    )

all_padded_tensors = torch.cat(all_padded_tensors, dim=0)
all_attention_masks = torch.cat(all_attention_masks, dim=0)

input_ids = all_padded_tensors.to("cuda")
attention_mask = all_attention_masks.to("cuda")
generated_ids = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=2000,
    do_sample=True,
    temperature=0.4,
    top_p=0.9,
    repetition_penalty=1.1,
    num_return_sequences=1,
    eos_token_id=128258,
    use_cache = True
)

token_to_find = 128257    # generation is cropped to what follows the last occurrence of this id
token_to_remove = 128258  # this id is stripped from every row afterwards

token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
if len(token_indices[1]) > 0:
    last_occurrence_idx = token_indices[1][-1].item()
    cropped_tensor = generated_ids[:, last_occurrence_idx+1:]
else:
    cropped_tensor = generated_ids

mask = cropped_tensor != token_to_remove

processed_rows = [row[row != token_to_remove] for row in cropped_tensor]

# Keep whole 7-code frames only and shift ids back by 128266.
code_lists = []
for row in processed_rows:
    usable_length = (row.size(0) // 7) * 7
    code_lists.append([t - 128266 for t in row[:usable_length]])


def redistribute_codes(code_list):
  """Split a flat list of 7-code frames into SNAC's three code levels and decode it.

  Within each frame, slot 0 goes to level 1, slots 1 and 4 to level 2, and slots
  2, 3, 5 and 6 to level 3. Slot ``k`` has ``k * 4096`` subtracted first.

  Args:
    code_list: Sequence of codes with the 128266 base already removed, 7 per frame.
      Trailing codes that do not fill a whole frame are ignored.

  Returns:
    Whatever ``snac_model.decode`` returns for the three level tensors.
  """
  layer_1 = []
  layer_2 = []
  layer_3 = []
  # Iterate over complete frames only. The previous bound, (len + 1) // 7, counted one
  # frame too many when len % 7 == 6 and then indexed past the end of the list.
  for i in range(len(code_list) // 7):
    base = 7 * i
    layer_1.append(code_list[base])
    layer_2.append(code_list[base+1]-4096)
    layer_3.append(code_list[base+2]-(2*4096))
    layer_3.append(code_list[base+3]-(3*4096))
    layer_2.append(code_list[base+4]-(4*4096))
    layer_3.append(code_list[base+5]-(5*4096))
    layer_3.append(code_list[base+6]-(6*4096))
  # Add a leading dimension of size 1 to each level before decoding.
  codes = [torch.tensor(layer_1).unsqueeze(0),
         torch.tensor(layer_2).unsqueeze(0),
         torch.tensor(layer_3).unsqueeze(0)]

  # codes = [c.to("cuda") for c in codes]
  audio_hat = snac_model.decode(codes)
  return audio_hat

from IPython.display import display, Audio

# Decode every generated code list into a waveform.
my_samples = [redistribute_codes(code_list) for code_list in code_lists]

if len(prompts) != len(my_samples):
  raise Exception("Number of prompts and samples do not match")

# Print each prompt followed by an audio player for its clip (24 kHz).
for prompt_text, samples in zip(prompts, my_samples):
  print(prompt_text)
  display(Audio(samples.detach().squeeze().to("cpu").numpy(), rate=24000))

# Clean up to save RAM
del my_samples, samples

In [None]:
from transformers import AutoTokenizer

# Write the extended tokenizer into the checkpoint folder (the next cell loads the
# tokenizer back from this same folder).
checkpoint_dir = "/home/user/voice/Orpheus-TTS/finetune/hf_cache/datasets--telecmiusa--tts-hi-data/snapshots/d564239b4542d4e25ee213660bf0104e700858ac/outputs_700/checkpoint-4000"
tokenizer.save_pretrained(checkpoint_dir)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Reload the step-4000 checkpoint on CPU in bfloat16 and re-export model + tokenizer as
# safetensors into `model_4000_fft` (relative to the current working directory).
checkpoint_dir = "/home/user/voice/Orpheus-TTS/finetune/hf_cache/datasets--telecmiusa--tts-hi-data/snapshots/d564239b4542d4e25ee213660bf0104e700858ac/outputs_700/checkpoint-4000"
export_dir = "model_4000_fft"

model = AutoModelForCausalLM.from_pretrained(
    checkpoint_dir,
    torch_dtype="bfloat16",
    device_map="cpu",
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

model.save_pretrained(export_dir, safe_serialization=True)
tokenizer.save_pretrained(export_dir)


In [None]:
# Optional exports. Both switches are off, so nothing in this cell runs as written.
SAVE_MERGED_16BIT = False
SAVE_LORA_ADAPTERS = False

# Merge to 16bit
if SAVE_MERGED_16BIT:
    model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
# if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
# if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
# if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if SAVE_LORA_ADAPTERS:
    model.save_pretrained("model")
    tokenizer.save_pretrained("model")
# if False:
#     model.push_to_hub("hf/model", token = "")
#     tokenizer.push_to_hub("hf/model", token = "")

In [None]:
# model.save_pretrained_merged("model_5000", tokenizer, save_method="merged_16bit")  # does not work because this is a full fine-tune


In [None]:
# from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
# from safetensors import safe_open
# import json, os

# SRC = "/home/user/voice/Orpheus-TTS/finetune/hf_cache/datasets--telecmiusa--tts-hi-data/snapshots/d564239b4542d4e25ee213660bf0104e700858ac/model_5000_steps_4096"          # your folder (screenshot)
# DST = "./model_5000_steps_4096_vllm"     # output fixed folder

# # 1) Find the *actual* rows in the checkpoint’s embed matrix
# idx = json.load(open(os.path.join(SRC, "model.safetensors.index.json")))
# embed_shard = idx["weight_map"]["model.embed_tokens.weight"]
# with safe_open(os.path.join(SRC, embed_shard), framework="pt") as f:
#     OLD_ROWS = f.get_tensor("model.embed_tokens.weight").shape[0]   # -> 156940

# # 2) Required vocab from tokenizer (includes your new tags)
# tok = AutoTokenizer.from_pretrained(SRC)
# REQUIRED = max(tok.get_vocab().values()) + 1                        # -> 156946

# # 3) Load with a temp config that MATCHES the checkpoint (so weights load)
# cfg = AutoConfig.from_pretrained(SRC)
# cfg.vocab_size = OLD_ROWS
# model = AutoModelForCausalLM.from_pretrained(
#     SRC, config=cfg, device_map="cpu", low_cpu_mem_usage=True
# )

# # 4) Now grow embeddings to cover the new tags (preserves old rows)
# if model.get_input_embeddings().weight.shape[0] != REQUIRED:
#     model.resize_token_embeddings(REQUIRED)
# model.config.vocab_size = REQUIRED

# # 5) Keep pad/eos sane
# model.config.pad_token_id = tok.pad_token_id
# try:
#     model.generation_config.pad_token_id = tok.pad_token_id
#     model.generation_config.eos_token_id = tok.eos_token_id
# except Exception:
#     pass

# # 6) Save a vLLM-ready export
# os.makedirs(DST, exist_ok=True)
# model.save_pretrained(DST, safe_serialization=True)
# tok.save_pretrained(DST)

# # 7) Verify
# check = AutoModelForCausalLM.from_pretrained(DST, device_map="cpu")
# rows = check.get_input_embeddings().weight.shape[0]
# print("rows=", rows, "vocab_size=", check.config.vocab_size)  # both must be 156946


In [None]:
from orpheus_tts import OrpheusModel
import wave
import time
import os 
from IPython.display import Audio
# Use the path to your local folder (relative or absolute)
orpheus_model_dir = "/home/user/voice/Orpheus-TTS/finetune/hf_cache/datasets--telecmiusa--tts-hi-data/snapshots/d564239b4542d4e25ee213660bf0104e700858ac/model_4000_fft"
# Note: this rebinds `model` from the transformers model to the Orpheus wrapper.
model = OrpheusModel(model_name=orpheus_model_dir)

In [None]:
# Text to synthesise in this cell. Exactly one `prompt = ...` line should be active;
# the commented lines are alternative prompts kept for quick switching.
# prompt = "हाँ, <hmm..> मैंने पहली बार आपके product के बारे में एक webi-nar में सुना था मैं समझना चाहता हूँ कि यह हमारी existing systems के साथ कैसे integrate कर सकता है " 
prompt ="Delhi की एक retail chain ने हमारे solutions से अपनी sales में 30% तक वृद्धि देखी है। <hmm..> उनका feedback बहुत encouraging रहा है ।"
 
# prompt = '''Absolutely <happy> हमारे system में team performance के लिए भी कई analytics tools मौजूद हैं <pause> आप individual 
# और team performance both track कर सकते हैं और rewards और training strategies accordingly plan कर सकते हैं <hmm..> 
# क्या आप training और support options के बारे में कुछ जानना चाहेंगे'''
# prompt = prompt + " " + "<chuckle"


# Stream the synthesised audio into a mono, 16-bit, 24 kHz WAV file and time the run.
filename = "prompt_8.wav"
start_time = time.monotonic()
syn_tokens = model.generate_speech(prompt=prompt, voice=None)

with wave.open(filename, "wb") as wf:
    wf.setnchannels(1)
    wf.setsampwidth(2)
    wf.setframerate(24000)

    # One frame is sample width × channel count bytes.
    bytes_per_frame = wf.getsampwidth() * wf.getnchannels()
    total_frames = 0
    chunk_counter = 0
    for chunk_counter, audio_chunk in enumerate(syn_tokens, start=1):  # output streaming
        total_frames += len(audio_chunk) // bytes_per_frame
        wf.writeframes(audio_chunk)
    duration = total_frames / wf.getframerate()

end_time = time.monotonic()
print(f"It took {end_time - start_time} seconds to generate {duration:.2f} seconds of audio")
Audio(filename)
