### Before running further you will need ClearSpeak from speech-rule-engine (node package)
To install it locally, run the following from the `tortoise-tts` directory:
```
npm init -y
npm install speech-rule-engine
```

In [1]:
%env CUDA_VISIBLE_DEVICES=0
%env HF_HOME=/mnt/LLM
%env OMP_NUM_THREADS=16

import os
import torch
import torchaudio
import time
from datetime import datetime
from tortoise.api import TextToSpeech
from tortoise.utils.text import split_and_recombine_text
from tortoise.utils.audio import load_audio, load_voice, load_voices

import re, latex2mathml.converter, subprocess
import numpy as np

import IPython.display as ipd

from pathlib import Path
import json

env: CUDA_VISIBLE_DEVICES=0
env: HF_HOME=/mnt/LLM
env: OMP_NUM_THREADS=16


  from .autonotebook import tqdm as notebook_tqdm
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


In [2]:
# !!! If this cell runs for more than about a minute, interrupt it and rerun this cell only.
# The cause of the occasional hang is unknown, but a rerun has worked in practice.

# Shared TTS engine used by `inference` below.
tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)

def inference(
    text,
    script,
    voice,
    voice_b,
    seed,
    split_by_newline,
):
    """Synthesize `text` chunk by chunk, yielding one audio array per chunk.

    This is a generator: nothing (including voice loading) runs until the
    first item is requested.

    Args:
        text: Text to speak. When None or blank, the content is read from
            `script` instead.
        script: Object with a `.name` attribute holding a file path; only
            consulted when `text` is blank. May be None when `text` is given.
        voice: Name of the primary voice passed to `load_voice`/`load_voices`.
        voice_b: Name of a second voice, or the string "disabled" to use
            `voice` alone.
        seed: Accepted for interface compatibility; currently not forwarded
            to the TTS engine.
        split_by_newline: The string "Yes" splits `text` on newlines and
            drops blank lines; any other value delegates chunking to
            `split_and_recombine_text`.

    Yields:
        `(24000, audio)` tuples, where `audio` is the NumPy concatenation of
        all frames produced by the engine for one text chunk.

    Raises:
        ValueError: If neither `text` nor the `script` file provides any
            non-blank content.
    """
    if text is None or text.strip() == "":
        # Guard against script=None: there is nothing to fall back to.
        if script is None:
            raise ValueError("Please provide either text or script file with content.")
        with open(script.name) as f:
            text = f.read()
        if text.strip() == "":
            raise ValueError("Please provide either text or script file with content.")

    if split_by_newline == "Yes":
        # Blank lines are dropped, so the number of yielded items can be
        # smaller than the number of lines in `text`.
        chunks = [line for line in text.split("\n") if line.strip() != ""]
    else:
        chunks = split_and_recombine_text(text)

    voices = [voice]
    if voice_b != "disabled":
        voices.append(voice_b)

    if len(voices) == 1:
        voice_samples, conditioning_latents = load_voice(voice)
    else:
        voice_samples, conditioning_latents = load_voices(voices)

    for chunk in chunks:
        # Collect every frame the engine emits for this chunk so the caller
        # receives exactly one array per chunk.
        chunk_frames = [
            audio_frame.cpu().detach().numpy()
            for audio_frame in tts.tts_with_preset(
                chunk,
                voice_samples=voice_samples,
                conditioning_latents=conditioning_latents,
                preset="ultra_fast",
                k=1
            )
        ]
        # 24000 is the sample rate reported to the caller
        # (presumably the engine's output rate - TODO confirm).
        yield (24000, np.concatenate(chunk_frames, axis=0))

GPT2InferenceModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.




  WeightNorm.apply(module, name, dim)


In [3]:
def clearspeak(mathml: str) -> str:
    """Convert a MathML string to spoken text via the local `sre` CLI.

    The MathML is written to the standard input of
    `./node_modules/.bin/sre` (resolved relative to the current working
    directory) and the program's standard output is returned unchanged,
    including any trailing newline.

    NOTE(review): no rule-set/domain option is passed, so the CLI's default
    rule set is used - confirm that this is the intended ClearSpeak output.

    Raises:
        subprocess.CalledProcessError: If the CLI exits with a non-zero status.
    """
    sre_command = ["./node_modules/.bin/sre"]
    completed = subprocess.run(
        sre_command,
        input=mathml,
        text=True,
        capture_output=True,
        check=True,
    )
    return completed.stdout

def convert_markdown_with_latex(text: str) -> str:
    """Replace every LaTeX formula in `text` with its spoken-text rendering.

    Formulas delimited by `$$...$$` or `$...$` are converted to MathML with
    `latex2mathml` and then to speech text with `clearspeak`. Text outside
    the delimiters is returned unchanged.

    Args:
        text: Markdown-like text that may contain LaTeX formulas.

    Returns:
        `text` with each formula substituted by its spoken form.
    """
    def replace_math(match):
        # Exactly one of the two groups matches: display ($$) or inline ($).
        latex = (match.group(1) or match.group(2)).strip()
        mathml = latex2mathml.converter.convert(latex)
        # Strip only trailing line breaks. Slicing off the last character
        # unconditionally would drop real text whenever the converter's
        # output does not end with a newline.
        return clearspeak(mathml).rstrip("\n")

    pattern = re.compile(r"\$\$([^$]+)\$\$|\$([^$]+)\$")
    return pattern.sub(replace_math, text)


# !!! Test: check that ClearSpeak works in your environment before continuing.
# The second print should show both formulas replaced by spoken text, while the
# surrounding text (including the real newline and the literal "\n") is unchanged.
markdown_text = "Test $$\\frac{1}{2}$$ and also $x^2 + y^2$. And here is \n\\n."
print(markdown_text)
print(convert_markdown_with_latex(markdown_text))

Test $$\frac{1}{2}$$ and also $x^2 + y^2$. And here is 
\n.
Test one half and also x squared plus y squared. And here is 
\n.


In [4]:
# Load the recorded generation steps. Each entry is indexed below as
# el[1] (a text fragment) and el[2] (its generation time).
# NOTE(review): unit and time origin of el[2] are not visible here - confirm.
token_times = json.loads(Path("../evals_dir/step_times.json").read_text())

# texts = ["".join([el[1] for el in ch]).replace("\n", "") for ch in chunk(token_times, 5)]
# gen_times = [ch[-1][2] - ch[0][2] for ch in chunk(token_times, 5)]
# text = "\n".join(texts)

# One line per step: newlines inside a fragment are removed so that the
# newline-joined text can later be split back into one chunk per step.
texts = [el[1].replace("\n", "") for el in token_times]
gen_times = [el[2] for el in token_times]
text = convert_markdown_with_latex("\n".join(texts))

print("Input texts: ", *texts, sep="\n--> ")
print("ClearSpeach: ", text)


# !!! Sometimes this code fails due to a matrix dimension mismatch (apparently an issue inside tortoise-tts). Just rerun the cell.

# frames_srate collects (audio, sample_rate) per chunk; spk_times collects the
# wall-clock time spent waiting for each chunk (synthesis time, not audio length).
frames_srate = []
spk_times = []

# The first interval also covers the work `inference` does before its first
# yield (e.g. voice loading), because a generator body starts on the first request.
# NOTE: with split_by_newline="Yes", `inference` drops blank lines, so a blank
# entry in `texts` would make spk_times shorter than gen_times.
t0 = time.perf_counter()
for sample_rate, frame in inference(
    text=text,
    script=None,
    voice="freeman",
    voice_b="disabled",
    seed=42,
    split_by_newline="Yes",
):
    t1 = time.perf_counter()
    spk_times.append(t1 - t0)
    frames_srate.append((frame, sample_rate))
    # Restart the clock so bookkeeping above is not charged to the next chunk.
    t0 = time.perf_counter()
# NOTE(review): `flag` is not read by any cell visible here - confirm it is still needed.
flag = False
# Remove the progress output produced during synthesis.
ipd.clear_output()

In [5]:
print("Combined Audio:")
# Join the per-chunk waveforms end to end and play them as a single clip,
# using the sample rate recorded for the first chunk.
total_frame = np.concatenate(
    [waveform for waveform, _ in frames_srate],
    axis=0,
)
ipd.display(
    ipd.Audio(total_frame, rate=frames_srate[0][1])
)

# print("Fractured Audio:")
# for frame, rate in frames_srate:
#     ipd.display(ipd.Audio(frame, rate=rate))

Combined Audio:


In [6]:
# Every generation step must have exactly one matching synthesis measurement.
assert len(gen_times) == len(spk_times), f"{len(gen_times)}, {len(spk_times)}"

# Playback model: chunks are processed back to back, but a chunk cannot start
# before its text was generated. Whenever the text arrives later than the
# moment the previous chunks finished, the gap is counted as a delay and all
# later chunks are shifted by it.
# NOTE(review): assumes gen_times and spk_times use the same unit and time
# origin - confirm, since the recorded delay is far larger than the speech time.
delays = []
total_delay = 0.0
speech_no_delay = sum(spk_times)
speech_with_delay = 0.0
shift = 0.0

for t_gen, t_speak in zip(gen_times, spk_times):
    # speech_with_delay is the running total of all previous chunk times, so
    # it doubles as the prefix sum (no need to re-sum the list every step).
    ideal_start = speech_with_delay + shift
    if t_gen > ideal_start:
        delay = t_gen - ideal_start
        delays.append(delay)
        shift += delay
        total_delay += delay
    speech_with_delay += t_speak

print(f"Total delay: {total_delay:.3f}s")
print(f"Speech time (no delay): {speech_no_delay:.3f}s")
print(f"Speech time (with delay): {speech_with_delay + total_delay:.3f}s")

Total delay: 2587.410s
Speech time (no delay): 28.843s
Speech time (with delay): 2616.254s
