In [39]:
import os

os.environ["CUDA_VISIBLE_DEVICES"]=""

In [40]:
import yaml
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

from tensorflow_tts.inference import AutoConfig
from tensorflow_tts.inference import TFAutoModel
from tensorflow_tts.inference import AutoProcessor

import IPython.display as ipd

from textprocer.stress_modules import TextProcer
from pydub import AudioSegment
from pydub.effects import normalize


def get_normalized_text(text, max_text_length, max_sentence_length, lang, pc=0.0):
    """Return normalized and cleaned sentences"""
    sc = 1.0
    if lang == "en":
        sc = 1.0
    procers = {
        "ru": TextProcer(lang="ru"),
        "en": TextProcer(lang="en")
    }

    params = {
        'yo_chance': 1.0,
        'stress_chance': sc,
        'phoneme_chance': pc,
        'max_split_length': max_sentence_length,
        'expand_numbers': True,
        'to_lower': True,
        'sentences': [text],
    }

    result = procers[lang](**params)
    return " ".join(result['phoneme_sentences']).replace("*", "")

In [41]:
hifi_config = AutoConfig.from_pretrained('/home/newton/projects/TensorFlowTTS/examples/hifigan/conf/hifigan.v1.yaml')
hifi_gan = TFAutoModel.from_pretrained(
    config=hifi_config,
#     pretrained_path="/home/vera/projects/tts_v2/tts_2/TensorFlowTTS/multiband_melgan.v1_24k.yaml",
    pretrained_path="/home/newton/projects/TensorFlowTTS/examples/hifigan/exp/train.hifigan.v1/checkpoints/generator-960000.h5",
)

from tensorflow_tts.processor import RuslanProcessor
from tensorflow_tts.processor.ruslan import RUSLAN_SYMBOLS


processor = RuslanProcessor(
    "/home/newton/datasets/new_ruslan/ruslan_data",
    symbols=RUSLAN_SYMBOLS,
    cleaner_names="basic_cleaners",
)
config = AutoConfig.from_pretrained("../examples/fastspeech2/conf/fastspeech2.v1.yaml")
fast_speech2 = TFAutoModel.from_pretrained(
     config=config,
     pretrained_path="/home/newton/checkpoints/fs_narcos/checkpoints/model-160000.h5",
#      pretrained_path="/home/newton/checkpoints/fs_new_urgant/checkpoints/model-110000.h5",
     enable_tflite_convertible=True
)

In [53]:
def convert_to_tflite(model, name="model", quantize=True):
    # Concrete Function
    concrete_function = model.inference_tflite.get_concrete_function()
    converter = tf.lite.TFLiteConverter.from_concrete_functions(
      [concrete_function]
    )
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS,
                                          ]
#                                            tf.lite.OpsSet.SELECT_TF_OPS]
#     converter.target_spec.supported_types = [tf.float16] 
    # it has bug if you use tf.float16, see https://github.com/TensorSpeech/TensorFlowTTS/issues/346#issuecomment-728656417
    # This colab doesn't care about the latency, so it compressed the model with quantization. 8 bit run on desktop will slow.
    
    
    if not quantize:
        converter.target_spec.supported_types = [tf.float32]

    tflite_model = converter.convert()

    saved_path = name + '_quan.tflite' if quantize else name + '.tflite'

    # Save the TF Lite model.
    with open(saved_path, 'wb') as f:
    # with open('fastspeech_quant.tflite', 'wb') as f:
        f.write(tflite_model)

    print('Model: %s size is %f MBs.' % (name, len(tflite_model) / 1024 / 1024.0) )

    return saved_path

In [54]:
fastspeech2_tflite_path = convert_to_tflite(fast_speech2, "fastspeech2")
hifi_gan_tflite_path = convert_to_tflite(hifi_gan, "hifi_gan")

# fastspeech2_tflite_path = convert_to_tflite(fast_speech2, "fastspeech2", quantize=False)
# hifi_gan_tflite_path = convert_to_tflite(hifi_gan, "hifi_gan", quantize=False)

2022-01-05 10:39:54.043547: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2022-01-05 10:39:54.043698: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2022-01-05 10:39:54.073485: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1137] Optimization results for grappler item: graph_to_optimize
  function_optimizer: function_optimizer did nothing. time = 0.014ms.
  function_optimizer: function_optimizer did nothing. time = 0.001ms.

2022-01-05 10:39:56.758493: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:351] Ignored output_format.
2022-01-05 10:39:56.758528: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:354] Ignored drop_control_dependency.
loc(callsite("dropout_67/dropout/random_uniform/RandomUniform"("/home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/keras/layers/core.py":212:0) at callsite("/home/newton/anaconda3/envs/tts1/lib

ConverterError: /home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/keras/layers/core.py:212:0: error: 'tf.RandomUniform' op is neither a custom op nor a flex op
/home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/keras/engine/base_layer.py:1037:0: note: called from
/home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/tensorflow_tts/models/fastspeech2.py:257:0: note: called from
/home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py:983:0: note: called from
/home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py:668:0: note: called from
/home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py:1007:0: note: called from
/home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/tensorflow/python/eager/function.py:3298:0: note: called from
/home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/tensorflow/python/eager/function.py:3463:0: note: called from
/home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/tensorflow/python/eager/function.py:3066:0: note: called from
/home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py:759:0: note: called from
/home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/keras/layers/core.py:212:0: note: Error code: ERROR_NEEDS_FLEX_OPS
/home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/keras/layers/core.py:212:0: error: 'tf.RandomUniform' op is neither a custom op nor a flex op
/home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/keras/engine/base_layer.py:1037:0: note: called from
/home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/tensorflow_tts/models/fastspeech2.py:254:0: note: called from
/home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py:983:0: note: called from
/home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py:668:0: note: called from
/home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py:1007:0: note: called from
/home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/tensorflow/python/eager/function.py:3298:0: note: called from
/home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/tensorflow/python/eager/function.py:3463:0: note: called from
/home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/tensorflow/python/eager/function.py:3066:0: note: called from
/home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py:759:0: note: called from
/home/newton/anaconda3/envs/tts1/lib/python3.8/site-packages/keras/layers/core.py:212:0: note: Error code: ERROR_NEEDS_FLEX_OPS
<unknown>:0: error: failed while converting: 'main': 
Some ops are not supported by the native TFLite runtime, you can enable TF kernels fallback using TF Select. See instructions: https://www.tensorflow.org/lite/guide/ops_select 
TF Select ops: RandomUniform
Details:
	tf.RandomUniform(tensor<3xi32>) -> (tensor<?x?x384xf32>) : {device = "", seed = 0 : i64, seed2 = 0 : i64}



# INFER

In [45]:
# Load the TFLite model and allocate tensors.
import tflite_runtime.interpreter as tflite

from tensorflow_tts.processor import RuslanProcessor
from tensorflow_tts.processor.ruslan import RUSLAN_SYMBOLS


processor = RuslanProcessor(
    "/home/newton/datasets/new_ruslan/ruslan_data",
    symbols=RUSLAN_SYMBOLS,
    cleaner_names="basic_cleaners",
)

def prepare_input(input_ids):
    input_ids = np.expand_dims(np.array(input_ids, np.int32), 0)
    return (input_ids,
            np.array([0], np.int32),
            np.array([1.0], np.float32),
            np.array([1.0], np.float32),
            np.array([1.0], np.float32),)


# Test the model on random input data.
def infer(input_ids, interpreter):
    # Get input and output tensors.
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    interpreter.resize_tensor_input(input_details[0]['index'], [1, len(input_ids)])
    interpreter.resize_tensor_input(input_details[1]['index'], [1])
    interpreter.resize_tensor_input(input_details[2]['index'], [1])
    interpreter.resize_tensor_input(input_details[3]['index'], [1])
    interpreter.resize_tensor_input(input_details[4]['index'], [1])


    interpreter.allocate_tensors()

    # input_data = prepare_input(input_ids)
    input_data = prepare_input(input_ids)
    for i, detail in enumerate(input_details):
        interpreter.set_tensor(detail['index'], input_data[i])

    interpreter.invoke()

    # The function `get_tensor()` returns a copy of the tensor data.
    # Use `tensor()` in order to get a pointer to the tensor.
    return (interpreter.get_tensor(output_details[0]['index']),
            interpreter.get_tensor(output_details[1]['index']))
  
def vocoder_infer(mel):
    # Get input and output tensors.
    v_input_details = vocoder_interpreter.get_input_details()
    v_output_details = vocoder_interpreter.get_output_details()

    vocoder_interpreter.resize_tensor_input(v_input_details[0]['index'], mel.shape)
    vocoder_interpreter.allocate_tensors()

    vocoder_interpreter.set_tensor(v_input_details[0]['index'], mel)

    vocoder_interpreter.invoke()

    return vocoder_interpreter.get_tensor(v_output_details[0]['index'])

In [50]:
text = get_normalized_text("Мокер попытался выровнить направление бес+еды", 10000, 100, "ru")
input_ids = processor.text_to_sequence(text)

In [51]:
input_ids

[26,
 28,
 24,
 18,
 30,
 12,
 29,
 28,
 29,
 41,
 32,
 2,
 13,
 25,
 31,
 45,
 12,
 15,
 41,
 30,
 28,
 15,
 27,
 22,
 32,
 42,
 12,
 27,
 13,
 29,
 30,
 13,
 15,
 25,
 2,
 18,
 27,
 22,
 18,
 12,
 14,
 18,
 31,
 2,
 18,
 17,
 41,
 8,
 46]

In [47]:
fs_interpreter = tf.lite.Interpreter(model_path="fastspeech2_quan.tflite")
vocoder_interpreter = tf.lite.Interpreter(model_path="hifi_gan_quan.tflite")

# fs_interpreter = tflite.Interpreter(model_path="fastspeech2.tflite")
# vocoder_interpreter = tflite.Interpreter(model_path="hifi_gan.tflite")

INFO: TfLiteFlexDelegate delegate: 2 nodes delegated out of 923 nodes with 1 partitions.



In [48]:
%%time
decoder_output_tflite, mel_output_tflite = infer(input_ids, fs_interpreter) 
# audio_before_tflite = vocoder_infer(decoder_output_tflite)[0, :, 0]
audio_after_tflite = vocoder_infer(mel_output_tflite)[0, :, 0]
ipd.Audio(data=audio_after_tflite, rate=22050)

CPU times: user 2min 29s, sys: 118 ms, total: 2min 29s
Wall time: 2min 29s


In [19]:
fs_interpreter = tf.lite.Interpreter(model_path="fastspeech2.tflite")
vocoder_interpreter = tf.lite.Interpreter(model_path="hifi_gan.tflite")

INFO: TfLiteFlexDelegate delegate: 2 nodes delegated out of 921 nodes with 1 partitions.



In [20]:
decoder_output_tflite, mel_output_tflite = infer(input_ids, fs_interpreter) 
# audio_before_tflite = vocoder_infer(decoder_output_tflite)[0, :, 0]
audio_after_tflite = vocoder_infer(mel_output_tflite)[0, :, 0]
ipd.Audio(data=audio_after_tflite, rate=22050)

# AWS

In [None]:
fs_interpreter = tflite.Interpreter(model_path="fastspeech2_quan.tflite")
vocoder_interpreter = tflite.Interpreter(model_path="hifi_gan_quan.tflite")

In [None]:
def infer(input_ids, interpreter):
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    interpreter.resize_tensor_input(input_details[0]['index'], [1, len(input_ids)])
    interpreter.resize_tensor_input(input_details[1]['index'], [1])
    interpreter.resize_tensor_input(input_details[2]['index'], [1])
    interpreter.resize_tensor_input(input_details[3]['index'], [1])
    interpreter.resize_tensor_input(input_details[4]['index'], [1])
    interpreter.allocate_tensors()
    input_data = prepare_input(input_ids)
    for i, detail in enumerate(input_details):
        interpreter.set_tensor(detail['index'], input_data[i])
    interpreter.invoke()
    return (interpreter.get_tensor(output_details[0]['index']),
            interpreter.get_tensor(output_details[1]['index']))
def vocoder_infer(mel):
    v_input_details = vocoder_interpreter.get_input_details()
    v_output_details = vocoder_interpreter.get_output_details()
    vocoder_interpreter.resize_tensor_input(v_input_details[0]['index'], mel.shape)
    vocoder_interpreter.allocate_tensors()
    vocoder_interpreter.set_tensor(v_input_details[0]['index'], mel)
    vocoder_interpreter.invoke()
    return vocoder_interpreter.get_tensor(v_output_details[0]['index'])
def prepare_input(input_ids):
    input_ids = np.expand_dims(np.array(input_ids, np.int32), 0)
    return (input_ids,
            np.array([0], np.int32),
            np.array([1.0], np.float32),
            np.array([1.0], np.float32),
            np.array([1.0], np.float32),)

In [None]:
def prepare_input(input_ids):
    input_ids = np.expand_dims(np.array(input_ids, np.int32), 0)
    return (input_ids,
            np.array([0], np.int32),
            np.array([1.0], np.float32),
            np.array([1.0], np.float32),
            np.array([1.0], np.float32),)