In [1]:
"""modified from https://gist.github.com/endes0/0967d7c5bb1877559c4ae84be05e036c"""
from tika import parser

import torchaudio
import argparse
from sanitize_filename import sanitize
import re
from pathlib import Path
from tqdm.auto import tqdm
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice, load_voices
from tortoise.utils.tokenizer import VoiceBpeTokenizer

import torch
import json
from dataclasses import dataclass
# import pysbd
from typing import List
from loguru import logger
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

@dataclass
class Writer:
    """Save audio chapters into ``out_dir`` and list them in an M3U playlist.

    Chapters are written as ``1.ogg``, ``2.ogg``, ... and each saved file gets
    one line in ``out_dir / 'playlist.m3u'``.  The playlist is opened when the
    instance is created and stays open until :meth:`close` is called.  The
    instance can also be used as a context manager (``with Writer(d) as w:``)
    so the playlist is closed even if synthesis fails part-way through.
    """
    # Folder that receives the chapter files and the playlist; created if missing.
    out_dir: Path
    # tts: TTS

    def __post_init__(self):
        # Accept plain strings as well as Path objects.
        self.out_dir = Path(self.out_dir)
        self.out_dir.mkdir(parents=True, exist_ok=True)
        self.m3u = open(self.out_dir / 'playlist.m3u', 'w')
        self.m3u.write('#EXTM3U\n')
        # Number used for the next chapter's file name.
        self.chapter = 1

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        # Never swallow the exception that ended the ``with`` block.
        return False

    def write_chapter(self, waveforms: torch.Tensor, SAMPLE_RATE=24000):
        """Save one chapter and append it to the playlist.

        Args:
            waveforms: a single tensor (not a list of tensors); it is moved to
                the CPU and handed to ``torchaudio.save`` unchanged.
            SAMPLE_RATE: sample rate passed to ``torchaudio.save``.

        Returns:
            Path of the audio file that was written.
        """
        wav_f = self.out_dir / f'{self.chapter}.ogg'
        torchaudio.save(wav_f, waveforms.cpu(), SAMPLE_RATE)
        self.m3u.write(f'{wav_f}\n')
        # Flush right away: a run can take hours, and a crash or kernel restart
        # should not lose the playlist entries of chapters already on disk.
        self.m3u.flush()
        self.chapter += 1
        return wav_f

    def close(self):
        """Close the playlist file; safe to call more than once."""
        if not self.m3u.closed:
            self.m3u.close()

def split_into_sentences(text, tokenizer, limit: int = 200) -> List[str]:
    """Split ``text`` into chunks of at most ``limit`` tokens each.

    Chunk length is measured as ``len(tokenizer.encode(chunk))``, so the limit
    is expressed in tokens of the given tokenizer, not in characters.

    Args:
        text: the text to split; ``None``, empty or whitespace-only text yields
            an empty list.
        tokenizer: any object with an ``encode(str)`` method whose result
            supports ``len()``.
        limit: maximum chunk length in tokens.

    Returns:
        The chunks, in document order.

    Raises:
        AssertionError: if the splitter produced a chunk longer than ``limit``.
    """
    if not text or not text.strip():
        return []

    def token_len(chunk: str) -> int:
        return len(tokenizer.encode(chunk))

    splitter = RecursiveCharacterTextSplitter(
        length_function=token_len,
        chunk_size=limit,
        chunk_overlap=0,
        keep_separator=True,
        strip_whitespace=True,
        separators=[
            "\n\n", "\n", "\xa0", '<div>', '<p>', '<br>', "\r", ".",  "!", "?",
            '"', "'", "‘", "’", "“", "”", "„", "‟",
            "(", ")", "[", "]", "{", "}",
            # NOTE: the comma after "   " matters. Without it Python joins the
            # adjacent literals "   " and " " into one four-space string, and the
            # single-space separator silently disappears from the list.
            "…", ":", ";", "—", "   ",
            " ", '',  # these ensure that there is always something to split by so chunks are always at limit
        ],
    )
    texts = splitter.split_text(text)
    if not texts:
        return []
    ls = [token_len(x) for x in texts]
    logger.debug(f'split lengths {ls}. max={max(ls)} chunk_limit={limit}')
    assert all(l <= limit for l in ls), 'all senteces should be below limit'
    return texts


In [3]:
# Hard-code this notebook's path as __file__ so the project root can be derived
# the same way a script would derive it. The path is relative, so the result
# depends on the current working directory.
__file__ = '../01_epub_tortise.ipynb'
notebook_path = Path(__file__)
# Project root = the folder that contains the notebook path above.
root_dir = notebook_path.resolve().absolute().parent
root_dir


PosixPath('/media/wassname/SGIronWolf/projects5/tts-ai/use-tts-mjc')

In [4]:
# Get the command line arguments
# The options are declared argparse-style, but parse_args([]) below is given an
# empty list, so inside the notebook every option takes its default value.
parser2 = argparse.ArgumentParser()
parser2.add_argument('--epub', type=Path, 
                     default=root_dir/'data/A Short Guide to the Inner Citadel - Massimo Pigliucci.epub',
                    #  default=root_dir/'data/golden_saying_of_epictetus.epub',
                    help='EPUB file to read')
parser2.add_argument('-o', '--out', type=Path, default=None, help='Output folder')
parser2.add_argument('-f', '--force', action='store_true', default=False, help='Overwrite')
parser2.add_argument('-t', '--test', action='store_true', default=False,
                    help='Quick test run: only the first 1000 characters of the book are used')
# NOTE(review): --limit and --model are not read anywhere in the cells visible
# here; they look like leftovers from the gist this was adapted from. Confirm
# before relying on them.
parser2.add_argument('-l', '--limit', type=int, default=400,
                    help='Maximum number of characters to synthesize at once')
parser2.add_argument('-m', '--model', type=str, 
                    default="tts_models/multilingual/multi-dataset/xtts_v1",
                    # default='facebook/fastspeech2-en-ljspeech',
                    help='fairseq model to use from HuggingFace Hub')
parser2.add_argument('-s', '--speaker', type=Path, default=root_dir / "data/speakers/donaldrobertson.wav",
                    help='Speaker wav to use from the model')
args = parser2.parse_args([])

if args.out is None:
    from datetime import datetime, timezone
    # Timezone-aware replacement for the deprecated datetime.utcnow(); the
    # formatted UTC timestamp is the same.
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H-%M-%S')
    args.out = root_dir / 'out' / (sanitize(args.epub.stem).replace(' ', '_').lower() + timestamp)

# load epub
parsed = parser.from_file(str(args.epub))
text = parsed.get("content")
if not text or not text.strip():
    # Fail here with a clear message rather than later inside the splitter.
    raise ValueError(f'No text could be extracted from {args.epub}')
if args.test:
    text = text[:1000]


# make output directory
out_dir = Path(args.out)
if out_dir.exists():
    if not args.force:
        logger.warning('Output folder already exists. Use -f to overwrite.')
        # `exit` is an interactive helper, not a reliable way to stop a
        # script or a notebook cell; raise SystemExit explicitly instead.
        raise SystemExit(1)
    import shutil
    # rmtree also removes sub-directories, which unlink()+rmdir() could not.
    shutil.rmtree(out_dir)
# parents=True: the default output lives under root_dir/'out', which does not
# exist on a first run.
out_dir.mkdir(parents=True)
logger.info(f'Output folder: {out_dir}')


[32m2023-10-08 11:19:11.630[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m43[0m - [1mOutput folder: /media/wassname/SGIronWolf/projects5/tts-ai/use-tts-mjc/out/a_short_guide_to_the_inner_citadel_-_massimo_pigliucci20231008_03-19-11[0m


In [5]:


# write metadata to dir
from json_tricks import dump, dumps, load, loads, strip_comments
f_metadata = out_dir / 'metadata.json'
with open(f_metadata, 'w') as fo:
    dump(dict(
        epub_metadata=parsed['metadata'],
        args=vars(args),
    ), fo, indent=4)

# Reference clips should be torch tensors containing 22.05kHz waveform data.
# see https://github.com/neonbjb/tortoise-tts/blob/5bbb0e0b97ea2f62c12e90402e8ad4faee55e697/tortoise/api.py#L365C82-L365C140
TORTOISE_INPUT_SAMPLE_RATE = 22050
# torchaudio.load returns the file at its native sample rate and does not resample.
ref, INPUT_SAMPLE_RATE = torchaudio.load(args.speaker)
# Keep only the first channel so a stereo file becomes a (1, samples) tensor.
# NOTE(review): tortoise's own load_audio helper appears to do the same - confirm.
ref = ref[:1]
# Take just the tail of the recording: the last 400000 samples, i.e.
# 400000 / INPUT_SAMPLE_RATE seconds of audio.
ref = ref[..., -400000:]
if INPUT_SAMPLE_RATE != TORTOISE_INPUT_SAMPLE_RATE:
    # Without this a speaker file recorded at another rate is interpreted at
    # the wrong speed and pitch.
    ref = torchaudio.functional.resample(ref, INPUT_SAMPLE_RATE, TORTOISE_INPUT_SAMPLE_RATE)
reference_clips = [ref]

# load model
use_cuda = False if args.test else torch.cuda.is_available()
logger.info(f'use_cuda {use_cuda}')

# Pass use_cuda to the model; previously it was only logged. DeepSpeed and half
# precision are switched off together with CUDA.
# NOTE(review): assumes the installed TextToSpeech accepts `device` - confirm.
tts = TextToSpeech(
    use_deepspeed=use_cuda,
    kv_cache=True,
    half=use_cuda,
    device=torch.device('cuda' if use_cuda else 'cpu'),
)


[32m2023-10-08 11:19:11.774[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1muse_cuda True[0m


[2023-10-08 11:19:20,690] [INFO] [logging.py:93:log_dist] [Rank -1] DeepSpeed info: version=0.8.3, git-hash=unknown, git-branch=unknown
[2023-10-08 11:19:20,692] [INFO] [logging.py:93:log_dist] [Rank -1] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1
Installed CUDA version 11.5 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination


Using /home/wassname/.cache/torch_extensions/py310_cu117 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/wassname/.cache/torch_extensions/py310_cu117/transformer_inference/build.ninja...
Building extension module transformer_inference...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


ninja: no work to do.
Time to load transformer_inference op: 0.051093101501464844 seconds
[2023-10-08 11:19:21,249] [INFO] [logging.py:93:log_dist] [Rank -1] DeepSpeed-Inference config: {'layer_id': 0, 'hidden_size': 1024, 'intermediate_size': 4096, 'heads': 16, 'num_hidden_layers': -1, 'fp16': True, 'pre_layer_norm': True, 'local_rank': -1, 'stochastic_mode': False, 'epsilon': 1e-05, 'mp_size': 1, 'q_int8': False, 'scale_attention': True, 'triangular_masking': True, 'local_attention': False, 'window_size': 1, 'rotary_dim': -1, 'rotate_half': False, 'rotate_every_two': True, 'return_tuple': True, 'mlp_after_attn': True, 'mlp_act_func_type': <ActivationFuncType.GELU: 1>, 'specialized_mode': False, 'training_mp_size': 1, 'bigscience_bloom': False, 'max_out_tokens': 1024, 'scale_attn_by_inverse_layer_idx': False, 'enable_qkv_quantization': False, 'use_mup': False, 'return_single_tuple': False}
Installed CUDA version 11.5 does not match the version torch was compiled with 11.7 but since th

Loading extension module transformer_inference...
Using /home/wassname/.cache/torch_extensions/py310_cu117 as PyTorch extensions root...
No modifications detected for re-loaded extension module transformer_inference, skipping build step...
Loading extension module transformer_inference...


In [6]:
# Sample rate of the speaker file, as returned by torchaudio.load in the cell
# above. This bare expression is not the cell's last line, so nothing is
# displayed; it only raises NameError if that cell has not been run.
INPUT_SAMPLE_RATE
# Sample rate passed to Writer.write_chapter when chapters are saved.
# NOTE(review): presumably the rate of the audio tortoise generates - confirm.
OUTPUT_SAMPLE_RATE = 24000


In [7]:
# Write a chapter once more than this many samples are buffered.
# 2,500,000 samples is about 104 s of audio at 24 kHz.
# (Original author's note: ~20G of RAM, ~2 minutes of audio output, ~7 minutes to generate.)
CHAPTER_SAMPLES = 10000000 // 4


def _write_buffered(writer, clips, sample_rate):
    """Join the buffered clips along the time axis and save them as one chapter."""
    wavs = torch.concat(clips, dim=-1).cpu().squeeze(0)
    return writer.write_chapter(wavs, sample_rate)


tokenizer = tts.tokenizer
segs = split_into_sentences(text, tokenizer)
waveforms = []  # clips generated since the last chapter was written
len_wav = 0     # total samples currently held in `waveforms`
writer = Writer(out_dir)
try:
    for i, t in enumerate(tqdm(segs, desc='chunks')):
        t = t.replace('\n', ' ').strip()
        # Skip empty text
        if not t:
            continue
        # check if contains words or numbers
        if not re.search('[a-zA-Z0-9]', t):
            logger.debug(f'Skipping text without words or numbers `{t}`')
            continue
        logger.debug(f'current sentence `{t}`')

        wav_t = tts.tts_with_preset(t, voice_samples=reference_clips, preset='fast', verbose=i==0) # ultra_fast, fast, standard
        wav = wav_t.cpu()
        waveforms.append(wav)
        # Running total, so the whole buffer is not re-summed for every chunk.
        len_wav += wav.shape[-1]

        if len_wav > CHAPTER_SAMPLES:
            wav_f = _write_buffered(writer, waveforms, OUTPUT_SAMPLE_RATE)
            logger.warning(f"wrote chapter {wav_f}")
            waveforms = []
            len_wav = 0
finally:
    # Runs on normal completion and on errors or interrupts: save whatever is
    # still buffered, so finished audio is not lost, then always close the
    # playlist. The last chapter was previously passed as a Python list, which
    # made write_chapter fail and left the playlist open.
    try:
        if waveforms:
            wav_f = _write_buffered(writer, waveforms, OUTPUT_SAMPLE_RATE)
            logger.warning(f"wrote chapter {wav_f}")
            waveforms = []
            len_wav = 0
    finally:
        writer.close()


[32m2023-10-08 11:19:24.549[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36msplit_into_sentences[0m:[36m41[0m - [34m[1msplit lengths [22, 185, 169, 140, 123, 147, 18, 172, 30, 137, 112, 157, 174, 157, 104, 124, 125, 138, 23, 194, 120, 171, 197, 5, 126, 124, 134, 129, 166, 135, 174, 186, 130, 75, 75, 115, 97, 73, 73, 163, 114, 93, 143, 94, 147, 130, 24, 169, 142, 133, 73, 192, 136, 134, 73, 131, 89, 31, 60, 165, 1, 175, 120, 162, 96, 1, 189, 174, 59, 92, 163, 46, 150, 86, 176, 25, 196, 18, 124, 177, 139, 143, 96, 170, 51, 175, 191, 156, 186, 171, 99, 108, 17, 189, 39, 20, 144, 140, 161, 96, 82, 123, 187, 106, 116, 84, 194, 191, 110, 117, 184, 104, 140, 102, 155, 1, 197, 80, 95, 198, 191, 129, 193, 177, 113, 116, 144, 143, 158, 118, 124, 32, 190, 171, 158, 78, 148, 58, 152, 102, 135, 55, 177, 136, 138, 182, 24, 76, 158, 121, 154, 165, 172, 67, 104, 119, 123, 157, 189, 105, 43, 170, 58, 168, 190, 137, 199, 163, 41, 111, 17, 186, 112, 199, 170, 183, 149, 156, 131, 88, 160, 163, 

Generating autoregressive samples..




------------------------------------------------------
Free memory : 8.120789 (GigaBytes)  
Total memory: 10.731750 (GigaBytes)  
Requested memory: 1.687500 (GigaBytes) 
Setting maximum total tokens (input + output) to 1024 
------------------------------------------------------


100%|██████████| 12/12 [00:03<00:00,  3.42it/s]


Computing best candidates using CLVP


100%|██████████| 12/12 [00:01<00:00,  9.13it/s]


Transforming autoregressive outputs into audio..


100%|██████████| 80/80 [00:02<00:00, 27.28it/s]
chunks:   0%|          | 1/389 [00:10<1:09:21, 10.72s/it][32m2023-10-08 11:19:35.276[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [34m[1mcurrent sentence `A Short Guide to The Inner Citadel On Pierre Hadot’s Classic Analysis of Marcus Aurelius’ Meditations By Massimo Pigliucci   © Massimo Pigliucci, 2021   A Short Guide to The Inner Citadel — On Pierre Hadot’s Classic Analysis of Marcus Aurelius’ Meditations`[0m
chunks:   1%|          | 2/389 [01:17<4:41:40, 43.67s/it][32m2023-10-08 11:20:42.010[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [34m[1mcurrent sentence `By Massimo Pigliucci, K.D. Irani Professor of Philosophy, the City College of New York    Stoa Nova Publications   Cover: Pierre Hadot, Wikipedia   If you like this free booklet, please consider supporting my writings at Patreon or Medium figsinwinter.blog`[0m
chunks:   1%|          | 3/389 [02:17<5:27

In [None]:
# Test
# Manual recovery step: save any clips still buffered in `waveforms` as one
# more chapter, then close the playlist.

len_wav = sum([w.shape[-1] for w in waveforms])
print(len_wav)

if waveforms:
    # torch.concat raises on an empty list, so only write when clips remain.
    wavs = torch.concat(waveforms, dim=-1).cpu().squeeze(0)
    writer.write_chapter(wavs, OUTPUT_SAMPLE_RATE)
    # Clear the buffer so re-running this cell cannot write a duplicate chapter.
    waveforms = []
# Closing also flushes the playlist entries to disk.
writer.close()
