Full pipeline

In [None]:
from google.colab import drive, files
import zipfile
import os
import wave
import shutil
import datetime

# Mount Google Drive so the dataset stored there becomes reachable
drive.mount('/content/drive')

# Location of the dataset on Drive (zip archive or folder)
drive_path = "/content/drive/MyDrive/cases_wav_formatted.zip"

# Empty the target directory (the directory itself is kept)
if os.path.exists('/content/TTS-TT2/wavs'):
    for entry in os.listdir('/content/TTS-TT2/wavs'):
        entry_path = os.path.join('/content/TTS-TT2/wavs', entry)
        # Real directories are removed recursively; symlinks (even ones that
        # point at a directory) and regular files are simply unlinked.
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        elif os.path.islink(entry_path) or os.path.isfile(entry_path):
            os.unlink(entry_path)

# Create a script for audio processing using ffmpeg.
# For every *.wav in the dataset folder it resamples to 22050 Hz into a scratch
# file, re-muxes that with bitexact flags, and then replaces the original with
# the result. The original is only replaced when BOTH ffmpeg steps succeed;
# a failed conversion leaves the source file untouched instead of deleting it.
with open('/content/audios.sh', 'w') as rsh:
    rsh.write('''\
for file in /content/TTS-TT2/wavs/*.wav
do
    # When nothing matches, the glob is passed through literally; skip it.
    [ -e "$file" ] || continue
    if ffmpeg -y -i "$file" -ar 22050 /content/tempwav/srtmp.wav -loglevel error &&
       ffmpeg -y -i /content/tempwav/srtmp.wav -c copy -fflags +bitexact -flags:v +bitexact -flags:a +bitexact -ar 22050 /content/tempwav/poop.wav -loglevel error
    then
        mv -f /content/tempwav/poop.wav "$file"
    else
        echo "[ERROR] ffmpeg failed for $file; original left untouched" >&2
    fi
    rm -f /content/tempwav/*
done
''')

# Change to the directory where the audios will be stored
os.makedirs('/content/TTS-TT2/wavs', exist_ok=True)
os.chdir('/content/TTS-TT2/wavs')

# Handle audio import
drive_path = drive_path.strip()
if drive_path:
    if os.path.exists(drive_path):
        print(f"\n\033[34m\033[1mAudio imported from Drive.\n\033[90m")
        if zipfile.is_zipfile(drive_path):
            !unzip -o -q -j "$drive_path" -d /content/TTS-TT2/wavs
        else:
            fp = drive_path + "/."
            !cp -a "$fp" "/content/TTS-TT2/wavs"
    else:
        print(f"\n\033[33m\033[1m[NOTICE] The path {drive_path} is not found, check for errors and try again.")
        print(f"\n\033[34m\033[1mUpload your dataset(audios)...")
        uploaded = files.upload()
else:
    print(f"\n\033[34m\033[1mUpload your dataset(audios)...")
    uploaded = files.upload()
    for fn in uploaded.keys():
        if zipfile.is_zipfile(fn):
            !unzip -o -q -j "$fn" -d /content/TTS-TT2/wavs
            !rm "$fn"

# Adjust directory if necessary
if os.path.exists("/content/TTS-TT2/wavs/wavs"):
    for file in os.listdir("/content/TTS-TT2/wavs/wavs"):
        !mv /content/TTS-TT2/wavs/wavs/"$file"  /content/TTS-TT2/wavs/"$file"

# Clear or create temporary directory
if os.path.exists('/content/tempwav'):
    shutil.rmtree('/content/tempwav')
os.mkdir('/content/tempwav')

# Process audio if required
if audio_processing:
    print(f"\n\033[37mMetadata removal and audio verification...")
    !bash /content/audios.sh

def wav_duration_seconds(path):
    """Return the length in seconds of the WAV file at ``path``.

    Raises:
        wave.Error: the file is not a WAV file the ``wave`` module can read,
            or it declares a non-positive frame rate.
        EOFError: the file is empty or truncated before the header ends.
    """
    with wave.open(path, "rb") as wave_file:
        frames = wave_file.getnframes()
        rate = wave_file.getframerate()
    if rate <= 0:
        raise wave.Error(f"invalid frame rate: {rate}")
    return frames / float(rate)


# Analyze audio files in the current working directory
totalduration = 0
wav_count = 0  # number of files that could actually be read as WAV
wav_files = [x for x in os.listdir() if os.path.isfile(x) and not x.startswith('._')]
for file_name in wav_files:
    try:
        duration = wav_duration_seconds(file_name)
    except (wave.Error, EOFError) as e:
        # EOFError is what wave.open raises for empty/truncated files; without
        # catching it a single bad file aborted the whole cell.
        print(f"\n\033[31m\033[1m[ERROR] {file_name} is not a valid WAV file: {e}")
        continue

    wav_count += 1
    totalduration += duration

    if duration >= 12:
        print(f"\n\033[33m\033[1m[NOTICE] {file_name} is longer than 12 seconds. Lack of RAM can occur in a large batch size!")

# Summary (only files that were successfully read are counted)
print(f"\n{wav_count} processed audios. Total duration: {str(datetime.timedelta(seconds=round(totalduration, 0)))}\n")

# Print final message
print("\n\033[32m\033[1mAll set, please proceed.")



In [None]:
import os
import torch

# Define model parameters
model_filename = 'test'
# Relative path: resolved against the kernel's current working directory.
Training_file = "filelists/list.txt"
output_directory = '/content/drive/MyDrive/colab/outdir'
log_directory = '/content/TTS-TT2/logs'
# Uses the same "MyDrive" spelling as every other Drive path in this notebook.
log_directory2 = '/content/drive/MyDrive/colab/logs'
checkpoint_path = os.path.join(output_directory, model_filename)

# Hyperparameters
hparams = {
    # The same filelist is used for training and validation.
    'training_files': Training_file,
    'validation_files': Training_file,
    'p_attention_dropout': 0.1,
    'p_decoder_dropout': 0.1,
    # Learning-rate schedule used by train():
    #   lr = A_                                   while iteration < decay_start
    #   lr = A_ * exp(-(it - decay_start)/B_) + C_  afterwards
    # and never below min_learning_rate.
    'decay_start': 15000,
    'A_': 3e-4,
    'B_': 8000,
    'C_': 0,
    'min_learning_rate': 1e-5,
    'batch_size': 5,
    'load_mel_from_disk': True,
    'ignore_layers': [],
    'epochs': 250,
    'cudnn_enabled': True,
    'cudnn_benchmark': True,
    'text_cleaners': ["english_cleaners"],
    'show_alignments': True,
    # Keys that train() reads but that were missing here (KeyError at run time).
    # NOTE(review): values follow the NVIDIA Tacotron 2 reference defaults —
    # confirm they suit this project.
    'distributed_run': False,
    'fp16_run': False,
    'seed': 1234,
    'learning_rate': 1e-3,
    'weight_decay': 1e-6,
    'use_saved_learning_rate': False,
}

# Optionally add CMUDict cleaners
use_cmudict = True
if use_cmudict:
    hparams['text_cleaners'].append("cmudict_cleaners")

# Ensure CUDA is configured correctly
torch.backends.cudnn.enabled = hparams['cudnn_enabled']
torch.backends.cudnn.benchmark = hparams['cudnn_benchmark']

# Display configuration
print("Model Filename:", model_filename)
print("Training File:", Training_file)
print("Hyperparameters:", hparams)
print("Output Directory:", output_directory)
print("Log Directory:", log_directory)
print("Checkpoint Path:", checkpoint_path)

# Note: Further training steps would be required here, such as initializing the model,
# loading data, and starting the training loop.


In [None]:
import os
import numpy as np
import librosa
import shutil

# Function to remove `._` files
def remove_dot_underscore_files(directory):
    """Delete every entry directly inside ``directory`` whose name starts with ``._``.

    Only the top level of ``directory`` is examined. Each removed path is
    reported on stdout.
    """
    flagged = [name for name in os.listdir(directory) if name.startswith('._')]
    for name in flagged:
        file_path = os.path.join(directory, name)
        os.remove(file_path)
        print(f"Removed {file_path}")

# Function to create Mel spectrograms from .WAV files
def create_mels(wav_dir='/content/TTS-TT2/wavs', mel_dir='/content/TTS-TT2/mels'):
    """Convert every ``*.wav`` in ``wav_dir`` to a Mel spectrogram ``.npy`` in ``mel_dir``.

    Args:
        wav_dir: Folder scanned (top level only) for files ending in ``.wav``.
            Names starting with ``._`` are ignored.
        mel_dir: Output folder; created if it does not exist. Each output keeps
            the input's base name with the extension replaced by ``.npy``.

    A file that fails to load or convert is reported on stdout and skipped;
    the remaining files are still processed.
    """
    os.makedirs(mel_dir, exist_ok=True)

    for wav_file in os.listdir(wav_dir):
        if not wav_file.endswith('.wav') or wav_file.startswith('._'):
            continue

        wav_path = os.path.join(wav_dir, wav_file)
        # Swap only the extension; str.replace would also rewrite any other
        # ".wav" occurring inside the file name.
        mel_path = os.path.join(mel_dir, os.path.splitext(wav_file)[0] + '.npy')

        try:
            # Load the audio file, resampled to 22050 Hz
            y, sr = librosa.load(wav_path, sr=22050)
            # Compute an 80-band Mel power spectrogram and convert it to dB
            # relative to its own maximum.
            # NOTE(review): these are librosa's default STFT settings and a
            # per-file dB scale; confirm they match what the model's data
            # loader / pretrained checkpoint expect before training on them.
            mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=80)
            mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)

            # Save the Mel spectrogram as a .npy file
            np.save(mel_path, mel_spectrogram_db)
            print(f"Converted {wav_file} to Mel spectrogram and saved as {mel_path}")
        except Exception as e:
            # Best effort: one bad file must not stop the batch.
            print(f"Failed to process {wav_file}: {e}")

# Function to check the dataset for missing files
def check_dataset(hparams):
    """Report filelist entries whose spectrogram file does not exist.

    Args:
        hparams: Mapping with ``'training_files'`` and ``'validation_files'``,
            each the path of a ``|``-separated filelist whose first field is
            a file path.

    Every distinct path is checked once, even if it appears several times or
    both hparams point at the same filelist; blank lines are ignored. The
    result is printed, nothing is returned.

    Raises:
        OSError: a filelist itself cannot be opened.
    """
    missing_files = []
    seen = set()

    def check_filelist(filelist):
        with open(filelist, 'r', encoding='utf-8') as f:
            for line in f:
                mel_file = line.strip().split('|')[0]
                # Blank lines yield '' which would otherwise be reported as a
                # missing file; repeated paths would be reported repeatedly.
                if not mel_file or mel_file in seen:
                    continue
                seen.add(mel_file)
                if not os.path.exists(mel_file):
                    missing_files.append(mel_file)

    # dict.fromkeys keeps order while dropping a duplicate filelist path.
    for filelist in dict.fromkeys((hparams['training_files'], hparams['validation_files'])):
        check_filelist(filelist)

    if missing_files:
        print(f"Missing Mel spectrogram files: {missing_files}")
    else:
        print("All Mel spectrogram files are present.")

# Parameters
generate_mels = True

# Remove `._` files
remove_dot_underscore_files('/content/TTS-TT2/wavs')

# Convert .WAV files to Mel spectrograms if required
if generate_mels:
    create_mels()

print("Checking for missing files")

# Replace .wav with .npy in filelists.
# Done in Python on raw bytes: the match is the literal text ".wav|" (the
# previous sed pattern's unescaped dot matched any character), the file's
# encoding and line endings are left untouched, and a filelist shared by
# training and validation is rewritten only once.
# NOTE(review): the filelist paths are relative, so they resolve against the
# kernel's current working directory — confirm that is the intended folder.
for filelist_path in dict.fromkeys((hparams['training_files'], hparams['validation_files'])):
    with open(filelist_path, 'rb') as filelist_in:
        filelist_bytes = filelist_in.read()
    with open(filelist_path, 'wb') as filelist_out:
        filelist_out.write(filelist_bytes.replace(b'.wav|', b'.npy|'))

# Check the dataset
check_dataset(hparams)


In [None]:
import os
import time
import math
from tqdm import tqdm
import torch



def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams, log_directory2, save_interval, backup_interval):
    """Training and validation logging results to tensorboard and stdout
    
    """
    if hparams['distributed_run']:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams['seed'])
    torch.cuda.manual_seed(hparams['seed'])

    model = load_model(hparams)
    learning_rate = hparams['learning_rate']
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=hparams['weight_decay'])

    if hparams['fp16_run']:
        from apex import amp
        model, optimizer = amp.initialize(
            model, optimizer, opt_level='O2')

    if hparams['distributed_run']:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(
        output_directory, log_directory, rank)

    train_loader, valset, collate_fn = prepare_dataloaders(hparams)

    # Load checkpoint 
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None and os.path.isfile(checkpoint_path):
        if warm_start:
            model = warm_start_model(
                checkpoint_path, model, hparams['ignore_layers'])
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams['use_saved_learning_rate']:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))
    else:
        pretrained_model_path = "/content/TTS-TT2/pretrained_model"
        if not os.path.isfile(pretrained_model_path):
            !/content/TTS-TT2/megadown.sh https://mega.nz/#!WXY3RILA!KyoGHtfB_sdhmLFoykG2lKWhh0GFdwMkk7OwAjpQHRo --o pretrained_model
        model = warm_start_model(pretrained_model_path, model, hparams['ignore_layers'])
        # download LJSpeech pretrained model 

    start_eposh = time.perf_counter()
    learning_rate = 0.0
    model.train()
    is_overflow = False
    
    for epoch in tqdm(range(epoch_offset, hparams['epochs'])):
        print("\nStarting Epoch: {} Iteration: {}".format(epoch, iteration))
        start_eposh = time.perf_counter() # eposh is russian, not a typo
        for i, batch in tqdm(enumerate(train_loader), total=len(train_loader)):
            start = time.perf_counter()
            if iteration < hparams['decay_start']: learning_rate = hparams['A_']
            else: iteration_adjusted = iteration - hparams['decay_start']; learning_rate = (hparams['A_']*(math.exp(-iteration_adjusted/hparams['B_']))) + hparams['C_']
            learning_rate = max(hparams['min_learning_rate'], learning_rate) # output the largest number
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            model.zero_grad


Sources used for the code, models, and ideas implemented: NVIDIA NeMo guides, the Pony Preservation Project, and NVIDIA Deep Learning resources.