# Load Model

In [None]:
%cd src/metavoice-src-main

import os

import shutil
import tempfile
import time
from pathlib import Path

import librosa
import torch
from huggingface_hub import snapshot_download

from fam.llm.adapters import FlattenedInterleavedEncodec2Codebook
from fam.llm.decoders import EncodecDecoder
from fam.llm.fast_inference_utils import build_model, main
from fam.llm.inference import (
    EncodecDecoder,
    InferenceConfig,
    Model,
    TiltedEncodec,
    TrainedBPETokeniser,
    get_cached_embedding,
    get_cached_file,    
    get_enhancer,
)
from fam.llm.utils import (
    check_audio_file,
    get_default_dtype,
    get_device,
    normalize_text,
)

# ---------------------------------------------------------------------------
# Configuration shared by every cell below.
# ---------------------------------------------------------------------------
model_name = "metavoiceio/metavoice-1B-v0.1"
seed = 1337
output_dir = "outputs"  # generated audio is written here
_dtype = get_default_dtype()
_device = 'cuda:0'  # NOTE: hard-coded to the first CUDA device

# Fetch the model snapshot from the Hugging Face Hub; the returned value is
# used below as the local directory that holds the checkpoints.
_model_dir = snapshot_download(repo_id=model_name)
first_stage_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=1024)
os.makedirs(output_dir, exist_ok=True)

# ---------------------------------------------------------------------------
# Second stage: built from `second_stage.pt` inside the snapshot directory.
# ---------------------------------------------------------------------------
second_stage_ckpt_path = f"{_model_dir}/second_stage.pt"
config_second_stage = InferenceConfig(
    ckpt_path=second_stage_ckpt_path,
    num_samples=1,
    seed=seed,
    device=_device,
    dtype=_dtype,
    compile=False,
    init_from="resume",
    output_dir=output_dir,
)
data_adapter_second_stage = TiltedEncodec(end_of_audio_token=1024)
llm_second_stage = Model(
    config_second_stage,
    TrainedBPETokeniser,
    EncodecDecoder,
    data_adapter_fn=data_adapter_second_stage.decode,
)
enhancer = get_enhancer("df")

# ---------------------------------------------------------------------------
# First stage: built from `first_stage.pt`, together with the speaker encoder
# (`smodel`) that later cells use to embed reference audio.
# ---------------------------------------------------------------------------
# `_dtype` must be "float16" or "bfloat16"; any other value raises KeyError.
precision = {"float16": torch.float16, "bfloat16": torch.bfloat16}[_dtype]
model, tokenizer, smodel, model_size = build_model(
    precision=precision,
    checkpoint_path=Path(_model_dir) / "first_stage.pt",
    spk_emb_ckpt_path=Path(_model_dir) / "speaker_encoder.pt",
    device=_device,
    compile=True,
    compile_prefill=True,
)

# Obtain emotion direction

Option 1: provide your own audio files as (neutral, empathetic) pairs

In [None]:
# Put in your own audio files as (neutral, empathetic) pairs.
# Each tuple is (path to neutral recording, path to emotional recording).
audio_pairs = [
    ('/proj/afosr/metavoice/misc_audio_files/neutral_oprah.wav', '/proj/afosr/metavoice/misc_audio_files/oprah_empathetic_concatenated.wav'),
    ('/proj/afosr/metavoice/misc_audio_files/neutral_vt2NjqXKzyA.wav', '/proj/afosr/metavoice/misc_audio_files/vt2NjqXKzyA_empathetic_concatenated.wav'),
]

# TODO: replace this placeholder with the audio file of the voice to clone.
source_speaker_audio_path = "path/to/source_speaker.wav"


def _embed(audio_path):
    """Return the speaker embedding of *audio_path* on `_device` in `precision`."""
    return get_cached_embedding(audio_path, smodel).to(device=_device, dtype=precision)


source_emb = _embed(source_speaker_audio_path)

# One (neutral embedding, emotional embedding) tuple per audio pair.
speaker_pair_embs = [
    (_embed(neutral_audio_path), _embed(emotional_audio_path))
    for neutral_audio_path, emotional_audio_path in audio_pairs
]

# Per-pair emotion direction: emotional minus neutral, scaled to unit length
# along the last dimension.
emo_dirs = [emotional_emb - neutral_emb for neutral_emb, emotional_emb in speaker_pair_embs]
emo_dirs = [emo_dir / torch.linalg.norm(emo_dir, dim=-1, keepdim=True) for emo_dir in emo_dirs]

# Average the unit directions over all pairs, then re-normalise the mean so
# the final direction has unit length again.
emo_dir = sum(emo_dirs) / len(emo_dirs)
emo_dir = emo_dir / torch.linalg.norm(emo_dir, dim=-1, keepdim=True)

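Option 2: use our pre-computed emotion directions (this only provides `emo_dir`; you still need to compute `source_emb` for the speaker you want to clone, as shown in Option 1)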

In [2]:
import pickle

# Load the pre-computed emotion directions, keyed by emotion name.
# NOTE(security): unpickling can execute arbitrary code -- only load this file
# from a source you trust.
# NOTE: the generation cell also needs `source_emb`, which this cell does not
# define.
with open('../all_emo_dirs.pkl', 'rb') as emo_dirs_file:
    emo_dirs = pickle.load(emo_dirs_file)

print(f"available emotions: {emo_dirs.keys()}")
emo_dir = emo_dirs['happy']  # pick any key printed above

  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


/proj/afosr/metavoice/emoknob/src/metavoice-src-main
available emotions: dict_keys(['charisma', 'empathetic', 'angry', 'contempt', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'desire', 'doubt', 'empathic pain', 'envy', 'joy', 'neutral', 'romance', 'sarcasm', 'tiredness', 'triump'])


# Generate samples

In [None]:
strength = 0.3  # set strength of emotion control
# `emo_dir` may already be a tensor (Option 1) or whatever the pickle holds
# (Option 2); `torch.as_tensor` accepts both without the copy-construct
# warning that `torch.tensor(<tensor>)` emits.
edited_emb = source_emb + strength * torch.as_tensor(emo_dir, device=_device, dtype=precision)

# Sampling settings for the first stage.
top_p = 0.95
guidance_scale = 3.0
temperature = 1.0

# Text to synthesise -- replace with your own sentence.
text = "This is a demonstration of emotion control for speech synthesis."
text = normalize_text(text)

start = time.time()
# first stage LLM
tokens = main(
    model=model,
    tokenizer=tokenizer,
    model_size=model_size,
    prompt=text,
    spk_emb=edited_emb,
    top_p=torch.tensor(top_p, device=_device, dtype=precision),
    guidance_scale=torch.tensor(guidance_scale, device=_device, dtype=precision),
    temperature=torch.tensor(temperature, device=_device, dtype=precision),
)
text_ids, extracted_audio_ids = first_stage_adapter.decode([tokens])

# The second stage is called with batch_size=1, so add a leading dimension.
b_speaker_embs = edited_emb.unsqueeze(0)

# second stage LLM + multi-band diffusion model
wav_files = llm_second_stage(
    texts=[text],
    encodec_tokens=[torch.tensor(extracted_audio_ids, dtype=torch.int32, device=_device).unsqueeze(0)],
    speaker_embs=b_speaker_embs,
    batch_size=1,
    guidance_scale=None,
    top_p=None,
    top_k=200,
    temperature=1.0,
    max_new_tokens=None,
)

# Run the enhancer on "<wav_file>.wav", writing to a temporary file, then copy
# the enhanced audio back over the original path.
wav_file = wav_files[0]
output_path = str(wav_file) + ".wav"
with tempfile.NamedTemporaryFile(suffix=".wav") as enhanced_tmp:
    enhancer(output_path, enhanced_tmp.name)
    shutil.copy2(enhanced_tmp.name, output_path)
    print(f"\nSaved audio to {wav_file}.wav")

print(f"Synthesis took {time.time() - start:.2f}s")


# Display the generated audio
from IPython.display import Audio, display

display(Audio(output_path))
