In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import sys
import json
import numpy as np
import tensorflow as tf
from absl import logging
from athena.models.tts_transformer import TTSTransformer
#from athena.solver import BaseSolver
from athena.utils.checkpoint import Checkpoint
from athena.utils.learning_rate import WarmUpLearningSchedule, WarmUpAdam
from athena.utils.hparam import HParams, register_and_parse_hparams
from athena.utils.metric_check import MetricChecker
from athena.utils.misc import validate_seqs
from athena.metrics import CharactorAccuracy
from athena.data.datasets.speech_synthesis import SpeechSynthesisDatasetBuilder

In [None]:
!pip install kenlm

In [None]:
import pandas as pd
import functools
import librosa

In [None]:
# Top-level experiment defaults. parse_config / parse_jsonfile pass this dict to
# register_and_parse_hparams together with the user-supplied config.
# NOTE(review): the string defaults "asr", "speech_transformer" and
# "speech_recognition_dataset" name ASR components, while this notebook builds a
# TTSTransformer directly; none of the cells shown here read those three keys.
DEFAULT_CONFIGS = {
    # Run schedule. The training loop reads batch_size and num_epochs, and calls
    # batch_wise_shuffle on the training builder once epoch >= sorta_epoch.
    "batch_size": 32,
    "num_epochs": 20,
    "sorta_epoch": 1,
    # Checkpoint / summary locations (not read by the cells shown here).
    "ckpt": None,
    "summary_dir": None,
    # Solver selection and its config (solver_config is passed to BaseSolver).
    "solver_type": "asr",
    "solver_gpu": [0],
    "solver_config": None,
    # Model selection and its config (model_config is passed to TTSTransformer).
    "model": "speech_transformer",
    "num_classes": None,
    "model_config": None,
    "pretrained_model": None,
    "teacher_model": None,
    # Optimizer selection and its config (optimizer_config is passed to WarmUpAdam).
    "optimizer": "warmup_adam",
    "optimizer_config": None,
    "convert_config": None,
    # Data pipeline: num_data_threads is passed to as_dataset; trainset_config and
    # testset_config are passed to SpeechSynthesisDatasetBuilder.
    "num_data_threads": 1,
    "dataset_builder": "speech_recognition_dataset",
    "dev_dataset_builder": None,
    "trainset_config": None,
    "devset_config": None,
    "testset_config": None,
    "inference_config": None
}
# Default dataset options (feature type, vocabulary, CMVN, length filters,
# augmentation). Not referenced by any other cell shown in this notebook.
Data_default_config = {
    "audio_config": {"type": "Fbank"},
    "text_config": {"type": "vocab", "model": "athena/utils/vocabs/ch-en.vocab"},
    "num_cmvn_workers": 1,
    "cmvn_file": None,
    "remove_unk": True,
    "input_length_range": [20, 50000],
    "output_length_range": [1, 10000],
    "speed_permutation": [1.0],
    "spectral_augmentation": None,
    "data_csv": None,
    "words": None,
}
def parse_config(config):
    """Parse an already-loaded config against DEFAULT_CONFIGS.

    The config is handed to register_and_parse_hparams (registered under
    cls="main"); the parsed result is logged at INFO level and returned.
    """
    hparams = register_and_parse_hparams(DEFAULT_CONFIGS, config, cls="main")
    message = "hparams: {}".format(hparams)
    logging.info(message)
    return hparams

def parse_jsonfile(jsonfile):
    """Load a JSON config file and parse it against DEFAULT_CONFIGS.

    Args:
        jsonfile: path of the JSON file to read.

    Returns:
        The parsed hyper-parameters, exactly as produced by parse_config
        (which also logs them at INFO level).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # JSON text is UTF-8; read it as such instead of relying on the
    # platform-default encoding.
    with open(jsonfile, encoding="utf-8") as file:
        config = json.load(file)
    # Reuse parse_config so registration and logging live in one place.
    return parse_config(config)


In [None]:
class BaseSolver(tf.keras.Model):
    """Training/evaluation driver that pairs a model with an optimizer.

    The solver provides a single optimisation step (`train_step`), a single
    forward-only step (`evaluate_step`) and the per-batch loops around them.
    When `enable_tf_function` is set, the step functions are wrapped with
    `tf.function` using the sample signatures supplied at construction time.

    Config keys (see `default_config`):
        clip_norm: limit passed to `tf.clip_by_norm` for every gradient;
            values <= 0 disable clipping.
        log_interval: a progress line is printed every this many batches.
        enable_tf_function: wrap the step functions with `tf.function`.
    """
    default_config = {
        "clip_norm": 100.0,
        "log_interval": 10,
        "enable_tf_function": True
    }

    def __init__(self, model, optimizer, sample_signature, eval_sample_signature=None,
                 config=None, **kwargs):
        """Store the collaborators and parse the solver config.

        Args:
            model: object called as `model(samples, training=...)` that also
                provides `get_loss`, `prepare_samples`, `reset_metrics` and
                `trainable_variables`.
            optimizer: optimizer whose `apply_gradients` is used by `train_step`.
            sample_signature: `input_signature` for the compiled train step.
            eval_sample_signature: `input_signature` for the compiled evaluate step.
            config: overrides for `default_config` (may be None).
            **kwargs: forwarded to `tf.keras.Model.__init__`.
        """
        super().__init__(**kwargs)
        self.model = model
        self.optimizer = optimizer
        self.metric_checker = MetricChecker(self.optimizer)
        self.sample_signature = sample_signature
        self.eval_sample_signature = eval_sample_signature
        self.hparams = register_and_parse_hparams(self.default_config, config, cls=self.__class__)

    @staticmethod
    def initialize_devices(solver_gpus=None):
        """Enable memory growth on all GPUs and select the visible GPU(s).

        Should be called before any other TensorFlow work.

        Args:
            solver_gpus: indices into the list of physical GPUs. None or an
                empty list selects GPU 0. The argument is not modified.

        Raises:
            AssertionError: if more indices are requested than GPUs exist.
        """
        gpus = tf.config.experimental.list_physical_devices("GPU")
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        if not gpus:
            # CPU-only run: nothing to select.
            return
        # Fix: the default None used to crash on len(None), and an empty list
        # passed by the caller was mutated in place. Work on a private copy.
        selected = list(solver_gpus) if solver_gpus else [0]
        assert len(gpus) >= len(selected)
        # NOTE(review): set_visible_devices is called once per index with a single
        # device; if each call replaces the visible set, only the last index stays
        # visible. Behaviour kept as-is; confirm the intent for multi-GPU lists.
        for idx in selected:
            tf.config.experimental.set_visible_devices(gpus[idx], "GPU")

    @staticmethod
    def clip_by_norm(grads, norm):
        """Clip every gradient independently with `tf.clip_by_norm`.

        Args:
            grads: iterable of gradients; None entries are passed through.
            norm: clipping limit; when <= 0 `grads` is returned unchanged.

        Returns:
            `grads` itself when clipping is disabled, otherwise a new list.
        """
        if norm <= 0:
            return grads
        return [
            None if gradient is None else tf.clip_by_norm(gradient, norm)
            for gradient in grads
        ]

    def _wrap_step_fn(self, step_fn, signature):
        """Return `step_fn` wrapped with tf.function when enabled, else unchanged."""
        if not self.hparams.enable_tf_function:
            return step_fn
        print("please be patient, enable tf.function, it takes time ...")
        return tf.function(step_fn, input_signature=signature)

    def _train_batches(self, dataset, total_batches):
        """Run `train_step` over `dataset.take(total_batches)` with periodic logging."""
        train_step = self._wrap_step_fn(self.train_step, self.sample_signature)
        for batch, samples in enumerate(dataset.take(total_batches)):
            # train 1 step
            samples = self.model.prepare_samples(samples)
            loss, metrics = train_step(samples)
            if batch % self.hparams.log_interval == 0:
                print(self.metric_checker(loss, metrics), end='\r')
                self.model.reset_metrics()

    def train_step(self, samples):
        """Run one forward/backward pass and apply the clipped gradients.

        Returns:
            `(loss, metrics)` exactly as returned by `model.get_loss`; when
            `loss` is a dict its values are summed for back-propagation only.
        """
        with tf.GradientTape() as tape:
            # outputs of a forward run of model, potentially contains more than one item
            outputs = self.model(samples, training=True)
            loss, metrics = self.model.get_loss(outputs, samples, training=True)
            total_loss = sum(list(loss.values())) if isinstance(loss, dict) else loss
        grads = tape.gradient(total_loss, self.model.trainable_variables)
        grads = self.clip_by_norm(grads, self.hparams.clip_norm)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss, metrics

    def train(self, dataset, total_batches=-1):
        """Update the model over one pass of `dataset`.

        Args:
            dataset: object providing `take(n)` that yields sample batches.
            total_batches: forwarded to `dataset.take`; the default -1 is
                presumably "no limit" per tf.data semantics.
        """
        self._train_batches(dataset, total_batches)

    def train_and_eval(self, dataset, total_batches=-1, total_epoch=50):
        """Run one pass of training batches; currently identical to `train`.

        No evaluation is performed here. `total_epoch` is accepted for interface
        compatibility but is not used.
        """
        self._train_batches(dataset, total_batches)

    def evaluate_step(self, samples):
        """Run one forward-only pass and return `(loss, metrics)` from `model.get_loss`."""
        # outputs of a forward run of model, potentially contains more than one item
        outputs = self.model(samples, training=False)
        loss, metrics = self.model.get_loss(outputs, samples, training=False)
        return loss, metrics

    def evaluate(self, dataset, epoch):
        """Evaluate the model over `dataset`.

        Args:
            dataset: iterable of sample batches.
            epoch: passed to the metric checker as `evaluate_epoch` for the
                final summary line.

        Returns:
            `(mean_loss, metrics)`: the mean over batches of the (summed) loss,
            and the metrics returned by the last evaluate step (None when the
            dataset yields no batches).
        """
        loss_metric = tf.keras.metrics.Mean(name="AverageLoss")
        loss, metrics = None, None
        evaluate_step = self._wrap_step_fn(self.evaluate_step, self.eval_sample_signature)
        self.model.reset_metrics()  # init metric.result() with 0
        for batch, samples in enumerate(dataset):
            samples = self.model.prepare_samples(samples)
            loss, metrics = evaluate_step(samples)
            if batch % self.hparams.log_interval == 0:
                # third positional argument (-2): meaning is defined by MetricChecker
                print(self.metric_checker(loss, metrics, -2), end='\r')
            total_loss = sum(list(loss.values())) if isinstance(loss, dict) else loss
            loss_metric.update_state(total_loss)
        print(self.metric_checker(loss_metric.result(), metrics, evaluate_epoch=epoch))
        self.model.reset_metrics()
        return loss_metric.result(), metrics

In [None]:
# Experiment configuration for the TTS transformer, read from the JSON file below.
jsonfile = '/content/drive/MyDrive/AI/DATA/Donglinh/tts_transformer.json'
p = parse_jsonfile(jsonfile)

In [None]:
!pip install --ignore-installed /content/drive/MyDrive/AI/DATA/Donglinh/athena/dist/athena-0.1.0*.whl

Processing ./drive/MyDrive/AI/DATA/Donglinh/athena/dist/athena-0.1.0-cp37-cp37m-linux_x86_64.whl
Installing collected packages: athena
Successfully installed athena-0.1.0


In [None]:
# Dataset builder configured from p.testset_config; its sample_signature is
# later passed to the solver as eval_sample_signature.
testset_builder = SpeechSynthesisDatasetBuilder(p.testset_config)

In [None]:
# Dataset builder configured from p.trainset_config; it is also passed to the
# model as data_descriptions and supplies the solver's sample_signature.
trainset_builder = SpeechSynthesisDatasetBuilder(p.trainset_config)

In [None]:
# Slow step: the recorded run processed 11726 items in about 1h15m.
# NOTE(review): the positional True presumably forces the CMVN computation —
# confirm against SpeechSynthesisDatasetBuilder.compute_cmvn_if_necessary.
trainset_builder.compute_cmvn_if_necessary(True)

100%|██████████| 11726/11726 [1:15:43<00:00,  2.58it/s]


<athena.data.datasets.speech_synthesis.SpeechSynthesisDatasetBuilder at 0x7f59a4d9c3d0>

In [None]:
# Single-worker run: this process is worker 0 out of 1.
rank, rank_size = 0, 1

In [None]:
# Shard the training builder for this worker (rank_size == 1, rank == 0 here).
# The call returns the builder itself, which is what the cell output shows.
trainset_builder.shard(rank_size, rank)

<athena.data.datasets.speech_synthesis.SpeechSynthesisDatasetBuilder at 0x7f59a4d9c3d0>

In [None]:
# Build the TTS transformer; the training-set builder supplies the data descriptions.
model = TTSTransformer(data_descriptions=trainset_builder, config=p.model_config)

Model: "enc"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 512)         69120     
_________________________________________________________________
conv1d (Conv1D)              (None, None, 512)         1310720   
_________________________________________________________________
re_lu (ReLU)                 (None, None, 512)         0         
_________________________________________________________________
batch_normalization (BatchNo (None, None, 512)         2048      
_________________________________________________________________
dropout (Dropout)            (None, None, 512)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 512)         1310720 

In [None]:
# Optimizer configured from the parsed hparams, plus a checkpointer that is
# given both the model and the optimizer.
optimizer = WarmUpAdam(p.optimizer_config)
checkpointer = Checkpoint(
    checkpoint_directory='/content/drive/MyDrive/AI/DATA/Donglinh/ckpt',
    model=model,
    optimizer=optimizer,
)

In [None]:
# Solver wiring: training steps use the train-set signature, evaluation steps
# use the test-set signature.
solver = BaseSolver(
    model,
    optimizer,
    sample_signature=trainset_builder.sample_signature,
    eval_sample_signature=testset_builder.sample_signature,
    config=p.solver_config,
)

In [None]:
# Starting epoch for the training loop, taken from the checkpointer's save_counter.
epoch = int(checkpointer.save_counter)

In [None]:
# Epoch loop: train one pass, evaluate, then let the checkpointer record the result.
# `epoch` starts from the checkpointer's save counter (set in the previous cell).
while epoch < p.num_epochs:
    if rank == 0:
        logging.info(">>>>> start training in epoch %d", epoch)
    # From `sorta_epoch` onwards, call batch_wise_shuffle before each training pass.
    if epoch >= p.sorta_epoch:
        trainset_builder.batch_wise_shuffle(p.batch_size)
    solver.train(trainset_builder.as_dataset(p.batch_size, p.num_data_threads))

    if rank == 0:
        logging.info(">>>>> start evaluate in epoch %d", epoch)
    # Fix: `devset_builder` is never created in this notebook, so the original
    # line raised NameError. Evaluate on `testset_builder`, the builder whose
    # sample_signature the solver was given as eval_sample_signature.
    # NOTE(review): checkpoints are therefore scored on the test split; build a
    # separate builder from p.devset_config if a held-out dev split exists.
    devset = testset_builder.as_dataset(p.batch_size, p.num_data_threads)
    loss, metrics = solver.evaluate(devset, epoch)

    if rank == 0:
        checkpointer(loss, metrics)

    epoch += 1



In [7]:
# Clone the athena fork into the current working directory (the trailing ".").
!git clone https://github.com/vlinhd11/athena .

Cloning into '.'...
remote: Enumerating objects: 4271, done.[K
remote: Counting objects: 100% (376/376), done.[K
remote: Compressing objects: 100% (259/259), done.[K
remote: Total 4271 (delta 198), reused 195 (delta 101), pack-reused 3895[K
Receiving objects: 100% (4271/4271), 7.09 MiB | 7.02 MiB/s, done.
Resolving deltas: 100% (2704/2704), done.
Checking out files: 100% (264/264), done.


In [None]:
# Build a wheel and a source distribution from the setup.py in the current directory.
# NOTE(review): an earlier cell installs athena/dist/athena-0.1.0*.whl, presumably
# the artefact of this step — on a fresh runtime the order would need to be
# clone -> build -> install -> import.
!python setup.py bdist_wheel sdist