<a href="https://colab.research.google.com/github/weedge/doraemon-nb/blob/main/Orpheus_0_1_Finetune_Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# install

In [10]:
!pip install -q snac

# download

In [3]:
# Fetch the Hugging Face access token stored under the name "HF_TOKEN" in Colab userdata.
from google.colab import userdata

HF_TOKEN = userdata.get("HF_TOKEN")

In [None]:
# Log in to the Hugging Face Hub through the Python API instead of
# `!huggingface-cli login --token $HF_TOKEN`, so the secret is never expanded
# into a shell command line. The token is also stored as a git credential,
# matching the former `--add-to-git-credential` flag.
from huggingface_hub import login

login(token=HF_TOKEN, add_to_git_credential=True)


In [5]:
!huggingface-cli download hubertsiuzdak/snac_24khz --quie --local-dir /content/models/hubertsiuzdak/snac_24khz


/content/models/hubertsiuzdak/snac_24khz


In [9]:
# Download canopylabs/orpheus-3b-0.1-ft in two passes into the same local directory:
# first the files matching *.json, then the files matching *.safetensors.
!huggingface-cli download canopylabs/orpheus-3b-0.1-ft --quiet --include "*.json" --local-dir /content/models/canopylabs/orpheus-3b-0.1-ft
!huggingface-cli download canopylabs/orpheus-3b-0.1-ft --quiet --include "*.safetensors" --local-dir /content/models/canopylabs/orpheus-3b-0.1-ft


/content/models/canopylabs/orpheus-3b-0.1-ft
/content/models/canopylabs/orpheus-3b-0.1-ft


# inference

Batch prompts to TTS

In [21]:
# All imports for this cell, grouped once (torch was previously imported twice).
import numpy as np
import torch
from huggingface_hub import snapshot_download
from snac import SNAC
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer

# Load the SNAC model from the local download directory and keep it on the CPU.
snac_model = SNAC.from_pretrained("/content/models/hubertsiuzdak/snac_24khz")
snac_model = snac_model.to("cpu")

print("We have loaded the tokeniser/detokeniser model to the cpu, to use vram - use the gpu for faster inference")

model_dir = "/content/models/canopylabs/orpheus-3b-0.1-ft"

# Load the causal LM weights in bfloat16, move the model to the GPU,
# and load the tokenizer from the same directory.
model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=torch.bfloat16)
model.cuda()
tokenizer = AutoTokenizer.from_pretrained(model_dir)


We have loaded the tokeniser/detokeniser model to the cpu, to use vram - use the gpu for faster inference


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [30]:
#### CHANGE THIS ####

# Texts to synthesize; the cells below produce one audio sample per entry.
raw_prompts = [
    "Hey there my name is Tara, <chuckle> and I'm a speech generation model that can sound like a person.",
    "I've also been taught to understand and produce paralinguistic things like sighing, or chuckling, or yawning!",
    "I live in San Francisco, and have, uhm let's see, 3 billion 7 hundred ... well, lets just say a lot of parameters.",
]

# Voice name; it is prepended to every prompt as "<voice>: <text>" when the prompts are formatted.
chosen_voice = "tara" # see github for other voices

print("*** See our github for tips on prompting the model for cleaning, humanlike generations.")

*** See our github for tips on prompting the model for cleaning, humanlike generations.


# Format prompts into correct template

In [43]:
# Decode a handful of high token ids to see which tokenizer strings they correspond to.
tokenizer.decode([128255,128256,128257,128258,128259,128260,128263,128008,128009])

'<|reserved_special_token_247|><custom_token_0><custom_token_1><custom_token_2><custom_token_3><custom_token_4><custom_token_7><|eom_id|><|eot_id|>'

In [33]:
# Prefix every raw prompt with the chosen voice: "<voice>: <text>".
prompts = [f"{chosen_voice}: {text}" for text in raw_prompts]
print(prompts)

# Tokenize each prompt separately; every entry is a (1, seq_len) tensor of token ids.
all_input_ids = [tokenizer(text, return_tensors="pt").input_ids for text in prompts]

start_token = torch.tensor([[128259]], dtype=torch.int64) # Start of human
end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64) # End of text, End of human

# Wrap each tokenized prompt: SOH SOT Text EOT EOH
all_modified_input_ids = [
  torch.cat([start_token, ids, end_tokens], dim=1) for ids in all_input_ids
]

max_length = max(ids.shape[1] for ids in all_modified_input_ids)

# Left-pad every sequence to max_length with token 128263 and build the matching
# attention mask (0 over the padding, 1 over the real tokens).
all_padded_tensors = []
all_attention_masks = []
for ids in all_modified_input_ids:
  seq_len = ids.shape[1]
  pad_len = max_length - seq_len
  pad_block = torch.full((1, pad_len), 128263, dtype=torch.int64)
  all_padded_tensors.append(torch.cat([pad_block, ids], dim=1))
  mask_zeros = torch.zeros((1, pad_len), dtype=torch.int64)
  mask_ones = torch.ones((1, seq_len), dtype=torch.int64)
  all_attention_masks.append(torch.cat([mask_zeros, mask_ones], dim=1))

print(all_padded_tensors)
print(all_attention_masks)

# Stack the per-prompt rows into one batch and move both tensors to the GPU.
all_padded_tensors = torch.cat(all_padded_tensors, dim=0)
all_attention_masks = torch.cat(all_attention_masks, dim=0)

input_ids = all_padded_tensors.to("cuda")
attention_mask = all_attention_masks.to("cuda")

# Show the final text form of every padded row.
for padded_row in input_ids:
  print(tokenizer.decode(padded_row))

["tara: Hey there my name is Tara, <chuckle> and I'm a speech generation model that can sound like a person.", "tara: I've also been taught to understand and produce paralinguistic things like sighing, or chuckling, or yawning!", "tara: I live in San Francisco, and have, uhm let's see, 3 billion 7 hundred ... well, lets just say a lot of parameters."]
[tensor([[128263, 128263, 128263, 128263, 128263, 128263, 128263, 128259, 128000,
             83,   5169,     25,  28653,   1070,    856,    836,    374,  70797,
             11,    366,    331,  57075,     29,    323,    358,   2846,    264,
           8982,   9659,   1646,    430,    649,   5222,   1093,    264,   1732,
             13, 128009, 128260]]), tensor([[128263, 128263, 128263, 128263, 128263, 128263, 128259, 128000,     83,
           5169,     25,    358,   3077,   1101,   1027,  15972,    311,   3619,
            323,   8356,   1370,   6260,     84,   4633,   2574,   1093,  31238,
            287,     11,    477,  43560,  

In [34]:
#@title Generate Output
print("*** Model.generate is slow - see vllm implementation on github for realtime streaming and inference")
print("*** Increase/decrease inference params for more expressive less stable generations")

# Sample a continuation for every prompt in the batch; gradients are not needed for inference.
with torch.no_grad():
  generated_ids = model.generate(
      input_ids=input_ids,
      attention_mask=attention_mask,
      max_new_tokens=1200,
      do_sample=True,
      temperature=0.6,
      top_p=0.95,
      repetition_penalty=1.1,
      num_return_sequences=1,
      eos_token_id=128258,
      # Rows that finish early are filled with this token. Passing it explicitly keeps the
      # value generate() would otherwise fall back to (the EOS id) and silences its warning;
      # the parsing cells strip token 128258 from the output.
      pad_token_id=128258,
  )

Setting `pad_token_id` to `eos_token_id`:128258 for open-end generation.


*** Model.generate is slow - see vllm implementation on github for realtime streaming and inference
*** Increase/decrease inference params for more expressive less stable generations


In [36]:
# Inspect the raw generated token ids (one row per prompt, prompt tokens included).
print(generated_ids)

tensor([[128263, 128263, 128263,  ..., 128258, 128258, 128258],
        [128263, 128263, 128263,  ..., 128258, 128258, 128258],
        [128259, 128000,     83,  ..., 149167, 154670, 128258]],
       device='cuda:0')


In [37]:
# Print the decoded text form of every generated sequence, one per prompt.
for generated_row in generated_ids:
  print(tokenizer.decode(generated_row))


<custom_token_7><custom_token_7><custom_token_7><custom_token_7><custom_token_7><custom_token_7><custom_token_7><custom_token_3><|begin_of_text|>tara: Hey there my name is Tara, <chuckle> and I'm a speech generation model that can sound like a person.<|eot_id|><custom_token_4><custom_token_5><custom_token_1><custom_token_958><custom_token_6094><custom_token_9937><custom_token_14362><custom_token_18382><custom_token_20888><custom_token_24631><custom_token_987><custom_token_7291><custom_token_11128><custom_token_14033><custom_token_16496><custom_token_23208><custom_token_25889><custom_token_2622><custom_token_4149><custom_token_11084><custom_token_12702><custom_token_19894><custom_token_23602><custom_token_28662><custom_token_2788><custom_token_5881><custom_token_11774><custom_token_13729><custom_token_18169><custom_token_24500><custom_token_24939><custom_token_1345><custom_token_7430><custom_token_11817><custom_token_15410><custom_token_19718><custom_token_24182><custom_token_27255><cus

## Parse output as speech


In [56]:
# token_to_find: marker whose position decides where the output is cropped in the next step.
# token_to_remove: token that is stripped from the cropped output afterwards.
token_to_find = 128257
token_to_remove = 128258
for special_id in (token_to_find, token_to_remove):
    print(tokenizer.decode(special_id))

# Tuple of (row indices, column indices) for every position holding token_to_find.
token_indices = torch.nonzero(generated_ids == token_to_find, as_tuple=True)
print(token_indices)

<custom_token_1>
<custom_token_2>
(tensor([0, 1, 2], device='cuda:0'), tensor([40, 40, 40], device='cuda:0'))


In [54]:
# Keep only what follows the marker token; if no marker was found, keep the tensor as is.
marker_columns = token_indices[1]
if marker_columns.numel() == 0:
    cropped_tensor = generated_ids
else:
    # NOTE(review): one column (taken from the last match) is applied to every row, which
    # relies on the marker sitting at the same column in each row — confirm for other inputs.
    last_occurrence_idx = int(marker_columns[-1])
    cropped_tensor = generated_ids[:, last_occurrence_idx + 1:]

print(cropped_tensor)

# Boolean view of which positions are NOT the token that gets removed later.
mask = cropped_tensor.ne(token_to_remove)
print(mask)


tensor([[129214, 134350, 138193,  ..., 128258, 128258, 128258],
        [129243, 135043, 138853,  ..., 128258, 128258, 128258],
        [131044, 134350, 138033,  ..., 149167, 154670, 128258]],
       device='cuda:0')
tensor([[ True,  True,  True,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False],
        [ True,  True,  True,  ...,  True,  True, False]], device='cuda:0')


In [49]:
# Strip every occurrence of token_to_remove from each row. Rows are handled one at a time
# because they can end up with different lengths.
processed_rows = []

for sequence in cropped_tensor:
    keep = sequence.ne(token_to_remove)
    masked_row = sequence[keep]
    print(masked_row.shape)
    processed_rows.append(masked_row)



torch.Size([539])
torch.Size([595])
torch.Size([1050])


In [52]:
code_lists = []

for row in processed_rows:
    # Keep only whole groups of 7 tokens; a trailing partial group is dropped.
    row_length = row.size(0)
    new_length = (row_length // 7) * 7
    # Shift the ids down by 128266 in one tensor operation and convert to plain Python ints
    # once, instead of creating a separate 0-d tensor per element by iterating the row.
    trimmed_row = (row[:new_length] - 128266).tolist()
    print(len(trimmed_row))
    code_lists.append(trimmed_row)


539
595
1050


In [58]:
def redistribute_codes(code_list):
  """Decode a flat list of audio codes into a waveform with ``snac_model``.

  ``code_list`` is consumed in frames of 7 consecutive codes. Within a frame,
  position 0 goes to layer 1, positions 1 and 4 go to layer 2, and positions
  2, 3, 5 and 6 go to layer 3. The code at position ``k`` has ``k * 4096``
  subtracted before it is stored. A trailing partial frame is ignored.

  Args:
    code_list: Indexable sequence of integer codes (the caller has already
      subtracted the 128266 token-id offset).

  Returns:
    Whatever ``snac_model.decode`` returns for the three layer tensors.
  """
  layer_1 = []
  layer_2 = []
  layer_3 = []
  # Iterate over complete frames only. The previous bound, (len + 1) // 7, ran one
  # frame too far and raised IndexError whenever len(code_list) % 7 == 6.
  for i in range(len(code_list)//7):
    layer_1.append(code_list[7*i])
    layer_2.append(code_list[7*i+1]-4096)
    layer_3.append(code_list[7*i+2]-(2*4096))
    layer_3.append(code_list[7*i+3]-(3*4096))
    layer_2.append(code_list[7*i+4]-(4*4096))
    layer_3.append(code_list[7*i+5]-(5*4096))
    layer_3.append(code_list[7*i+6]-(6*4096))
  # Each layer becomes a tensor with a leading batch dimension of 1.
  codes = [torch.tensor(layer_1).unsqueeze(0),
         torch.tensor(layer_2).unsqueeze(0),
         torch.tensor(layer_3).unsqueeze(0)]
  # Decoding is inference only, so no autograd graph needs to be recorded.
  with torch.no_grad():
    audio_hat = snac_model.decode(codes)
  return audio_hat

# Decode the codes of every prompt into audio, printing each result's shape along the way.
my_samples = []
for prompt_codes in code_lists:
  decoded_audio = redistribute_codes(prompt_codes)
  print(decoded_audio.shape)
  my_samples.append(decoded_audio)


torch.Size([1, 1, 157696])
torch.Size([1, 1, 174080])
torch.Size([1, 1, 307200])


## Display audio


In [60]:
from IPython.display import display, Audio

# Every prompt must have exactly one decoded sample before they are paired up.
if len(prompts) != len(my_samples):
  raise Exception("Number of prompts and samples do not match")

# Show each prompt followed by an audio player for its sample (24000 Hz playback rate).
for prompt_text, samples in zip(prompts, my_samples):
  print(prompt_text)
  waveform = samples.detach().squeeze().to("cpu").numpy()
  display(Audio(waveform, rate=24000))


tara: Hey there my name is Tara, <chuckle> and I'm a speech generation model that can sound like a person.


tara: I've also been taught to understand and produce paralinguistic things like sighing, or chuckling, or yawning!


tara: I live in San Francisco, and have, uhm let's see, 3 billion 7 hundred ... well, lets just say a lot of parameters.


# Streaming Inference with vllm and orpheus-speech lib

- https://github.com/canopyai/Orpheus-Speech-PyPi

In [None]:
!pip install -q orpheus-speech

In [2]:
!pip install -q vllm==0.7.3

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.6/264.6 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.5/96.5 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m906.5/906.5 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m96.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m117.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.7/16.7 MB[0m [31m96.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m396.9/396.9 kB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
!pip install -q -U snac

In [1]:
# A100
from orpheus_tts import OrpheusModel
import wave
import time
import torch

model_dir = "canopylabs/orpheus-3b-0.1-ft"
model = OrpheusModel(model_name = model_dir,dtype=torch.bfloat16)
prompt = '''Man, the way social media has, um, completely changed how we interact is just wild, right? Like, we're all connected 24/7 but somehow people feel more alone than ever. And don't even get me started on how it's messing with kids' self-esteem and mental health and whatnot.'''

start_time = time.monotonic()
syn_tokens = model.generate_speech(
   prompt=prompt,
   voice="tara",
   )

with wave.open("output.wav", "wb") as wf:
   wf.setnchannels(1)
   wf.setsampwidth(2)
   wf.setframerate(24000)

   total_frames = 0
   chunk_counter = 0
   for audio_chunk in syn_tokens: # output streaming
      chunk_counter += 1
      frame_count = len(audio_chunk) // (wf.getsampwidth() * wf.getnchannels())
      total_frames += frame_count
      wf.writeframes(audio_chunk)
   duration = total_frames / wf.getframerate()

   end_time = time.monotonic()
   print(f"It took {end_time - start_time} seconds to generate {duration:.2f} seconds of audio")

  state_dict = torch.load(model_path, map_location="cpu")


INFO 03-22 08:28:34 __init__.py:207] Automatically detected platform cuda.


config.json:   0%|          | 0.00/898 [00:00<?, ?B/s]

INFO 03-22 08:28:36 config.py:2444] Downcasting torch.float32 to torch.bfloat16.
INFO 03-22 08:28:48 config.py:549] This model supports multiple tasks: {'classify', 'embed', 'score', 'reward', 'generate'}. Defaulting to 'generate'.
INFO 03-22 08:28:48 config.py:1555] Chunked prefill is enabled with max_num_batched_tokens=2048.
INFO 03-22 08:28:48 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='canopylabs/orpheus-3b-0.1-ft', speculative_config=None, tokenizer='canopylabs/orpheus-3b-0.1-ft', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observabili

tokenizer_config.json:   0%|          | 0.00/5.41M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/22.8M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/180 [00:00<?, ?B/s]

INFO 03-22 08:29:04 cuda.py:229] Using Flash Attention backend.
INFO 03-22 08:29:04 model_runner.py:1110] Starting to load model canopylabs/orpheus-3b-0.1-ft...
INFO 03-22 08:29:05 weight_utils.py:254] Using model weights format ['*.safetensors']


model-00001-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.32G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

INFO 03-22 08:29:44 weight_utils.py:270] Time spent downloading weights for canopylabs/orpheus-3b-0.1-ft: 39.409467 seconds


model.safetensors.index.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 03-22 08:29:48 model_runner.py:1115] Loading model weights took 6.1801 GB
INFO 03-22 08:29:50 worker.py:267] Memory profiling takes 0.75 seconds
INFO 03-22 08:29:50 worker.py:267] the current vLLM instance can use total_gpu_memory (39.56GiB) x gpu_memory_utilization (0.90) = 35.60GiB
INFO 03-22 08:29:50 worker.py:267] model weights take 6.18GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.44GiB; the rest of the memory reserved for KV Cache is 27.89GiB.
INFO 03-22 08:29:50 executor_base.py:111] # cuda blocks: 16317, # CPU blocks: 2340
INFO 03-22 08:29:50 executor_base.py:116] Maximum concurrency for 131072 tokens per request: 1.99x
INFO 03-22 08:29:54 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_u

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:27<00:00,  1.29it/s]

INFO 03-22 08:30:21 model_runner.py:1562] Graph capturing finished in 27 secs, took 0.21 GiB
INFO 03-22 08:30:21 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 32.33 seconds





Man, the way social media has, um, completely changed how we interact is just wild, right? Like, we're all connected 24/7 but somehow people feel more alone than ever. And don't even get me started on how it's messing with kids' self-esteem and mental health and whatnot.
INFO 03-22 08:30:31 async_llm_engine.py:211] Added request req-001.
INFO 03-22 08:30:31 metrics.py:455] Avg prompt throughput: 7.2 tokens/s, Avg generation throughput: 0.1 tokens/s, Running: 1 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%.
INFO 03-22 08:30:36 metrics.py:455] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 101.2 tokens/s, Running: 1 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.2%, CPU KV cache usage: 0.0%.
INFO 03-22 08:30:41 metrics.py:455] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 99.9 tokens/s, Running: 1 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.4%, CPU KV cache usage: 0.0%.
INFO 0

In [2]:
# Play back the WAV file written by the streaming cell; playback starts automatically.
from IPython.display import display, Audio
display(Audio("output.wav", autoplay=True))

# streaming inference with transformers