In [6]:
# default_exp exec.run_editts

In [41]:
# export
import argparse
import sys
import librosa  # NOTE(zach): importing torch before librosa causes LLVM issues for some unknown reason.
import torch
import csv
import re
import json

import matplotlib.pyplot as plt
import soundfile as sf
from scipy.io.wavfile import write, read
import numpy as np

%matplotlib inline

from uberduck_ml_dev.vendor.tfcompat.hparam import HParams
from uberduck_ml_dev.models.common import MelSTFT
from uberduck_ml_dev.models.gradtts import GradTTS, DEFAULTS as GRADTTS_DEFAULTS
from uberduck_ml_dev.vocoders.hifigan import HiFiGanGenerator
from uberduck_ml_dev.utils.plot import plot_spectrogram
from uberduck_ml_dev.utils.audio import (
    overlay_stereo,
    stereo_to_mono,
    mono_to_stereo,
    to_int16,
    resample,
)


def parse_args(args):
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="Path to JSON config")
    args = parser.parse_args(args)
    return args

In [3]:
def _rms(audio):
    return np.sqrt(np.mean(audio ** 2))


def _dbfs(audio):
    return 20 * np.log10(_rms(audio))


def _db_to_float(float):
    return 10 ** (float / 20)

In [4]:
CUSTOMIZATIONS = {
    "first_name": {
        "start_time": 1.25,
        "end_time": 1.70,
        "match_start": False,
    },
    "yearly_prizes": {
        "start_time": 9.768,
        "end_time": 10.970,
    },
    "big_bank_years": {
        "start_time": 12.965,
        "end_time": 13.983,
    },
    "lucky_number": {
        "start_time": 18.661,
        "end_time": 19.315,
    },
    "join_time": {
        "start_time": 29.655,
        "end_time": 30.108,
    },
    "best_month": {
        "start_time": 57.074,
        "end_time": 57.558,
        "match_start": False,
    },
    "best_month_apy": {
        "start_time": 57.814,
        "end_time": 58.413,
    },
    "number_of_referrals": {
        "start_time": 63.215,
        "end_time": 64.135,
    },
    "total_tickets": {
        "start_time": 71.247,
        "end_time": 72.564,
    },
}

In [8]:
# export
def run(hparams):
    # Create model
    model = GradTTS(hparams)
    model.load_state_dict(torch.load(hparams.checkpoint))
    model = model.cuda()

    stft = MelSTFT(
        filter_length=hparams.filter_length,
        hop_length=hparams.hop_length,
        win_length=hparams.win_length,
        n_mel_channels=hparams.n_feats,
        sampling_rate=hparams.sampling_rate,
        mel_fmin=hparams.mel_fmin,
        mel_fmax=hparams.mel_fmax,
        padding=(hparams.filter_length - hparams.hop_length) // 2,
    )

    hifigan = HiFiGanGenerator(
        config=hparams.hifigan_config,
        checkpoint=hparams.hifigan_checkpoint,
        cudnn_enabled=True,
    )

    # Convert vocals to 22kHz stereo audio for EdiTTS
    vocal_data, vocal_sample_rate = librosa.load(
        hparams.reference_vocals, sr=22050, mono=True
    )

    # We want to read in the beats as original 44kHz stereo audio
    beats_data, beats_sample_rate = librosa.load(
        hparams.reference_beats, sr=44100, mono=False
    )
    vocal_data = vocal_data[: 5 * 22050]
    beats_data = beats_data[:, : 5 * 44100]

    vocal_data = to_int16(vocal_data)

    audio_norm = torch.FloatTensor(vocal_data) / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    melspec_original = stft.mel_spectrogram(audio_norm).cuda()

    with open(hparams.customizations, "r") as csvfile:
        datareader = csv.reader(csvfile)
        for i, row in enumerate(datareader):
            regex = r"\|(.*?)\|"
            substitution = f"| {row[0]} |"
            new_transcription = re.sub(regex, substitution, hparams.transcription)
            print(new_transcription)
            y_dec1, y_dec2, y_dec_edit, y_dec_cat = model.infer_editts_edit_content(
                hparams.transcription,
                new_transcription,
                n_timesteps=10,
                symbol_set="gradtts",
                mel1=melspec_original.cuda(),
                i1=int(
                    CUSTOMIZATIONS["first_name"]["start_time"]
                    * hparams.sampling_rate
                    / hparams.hop_length
                ),
                j1=int(
                    CUSTOMIZATIONS["first_name"]["end_time"]
                    * hparams.sampling_rate
                    / hparams.hop_length
                ),
                desired_time=CUSTOMIZATIONS["first_name"]["end_time"]
                - CUSTOMIZATIONS["first_name"]["start_time"],
            )

            personalized_vocals = hifigan.infer(y_dec_edit)

            #   VOLUME MASK AND MULTIPLIER
            volume_multiplier = np.ones(personalized_vocals.shape[-1], dtype=np.int16)
            volume_boundary_0 = int(
                CUSTOMIZATIONS["first_name"]["start_time"] * hparams.sampling_rate
            )
            volume_boundary_1 = int(
                CUSTOMIZATIONS["first_name"]["end_time"] * hparams.sampling_rate
            )
            #             max_volume = personalized_vocals[volume_boundary_0:volume_boundary_1].max()
            #             print(audio_norm.squeeze().shape)
            #             print(personalized_vocals.shape)

            target_dbfs = _dbfs(personalized_vocals[:volume_boundary_0])
            current_dbfs = _dbfs(
                personalized_vocals[volume_boundary_0:volume_boundary_1]
            )
            coeff = _db_to_float(target_dbfs - current_dbfs)
            print(f"coeff {coeff}")
            #             personalized_vocals = personalized_vocals * coeff
            # #             print(f"max volume: {max_volume}")
            #             print(f"rms_volume_synthetic: {rms_volume_synthetic}")
            #             print(f"rms_volume_target: {rms_volume_target}")

            # #             volume_scale = hparams.max_wav_value / max_volume
            #             volume_scale = rms_volume_target / rms_volume_synthetic
            #             print(f"volume_scale: {volume_scale}")
            volume_multiplier[volume_boundary_0:volume_boundary_1] = coeff

            personalized_vocals = personalized_vocals * volume_multiplier
            #             print(f"after: {personalized_vocals.shape}")

            personalized_vocals = personalized_vocals / hparams.max_wav_value
            #             personalized_vocals = resample(
            #                 personalized_vocals, hparams.sampling_rate, 44100
            #             )
            #             personalized_vocals = mono_to_stereo(personalized_vocals)

            #             final_audio = overlay_stereo(personalized_vocals, beats_data)
            #             sf.write(f"{hparams.log_dir}/edited_{i}.wav", final_audio.T, 44100)
            sf.write(f"{hparams.log_dir}/edited_{i}.wav", personalized_vocals, 22050)

        plt.show()

In [9]:
# export
try:
    from nbdev.imports import IN_NOTEBOOK
except:
    IN_NOTEBOOK = False
if __name__ == "__main__" and not IN_NOTEBOOK:
    args = parse_args(sys.argv[1:])
    config = GRADTTS_DEFAULTS.values()
    if args.config:
        with open(args.config) as f:
            config.update(json.load(f))
    hparams = HParams(**config)
    run(hparams)

In [62]:
config = GRADTTS_DEFAULTS.values()
with open("../configs/editts.json") as f:
    config.update(json.load(f))
hparams = HParams(**config)
run(hparams)

This is a sample toast for | Trevor | doing a rap thing
w_ceil.shape: torch.Size([91])
Emphases: [[50, 61]]
w_slice: tensor([2., 2., 3., 3., 4., 3., 6., 6., 2., 5., 2.], device='cuda:0')
desired_time: 0.44999999999999996
self.sampling_rate: 22050
self.hop_length: 256
TS: 1.019993782043457
coeff 0.8176887822348796
This is a sample toast for | David | doing a rap thing
w_ceil.shape: torch.Size([91])
Emphases: [[50, 61]]
w_slice: tensor([4., 5., 6., 4., 4., 2., 4., 2., 2., 3., 2.], device='cuda:0')
desired_time: 0.44999999999999996
self.sampling_rate: 22050
self.hop_length: 256
TS: 1.019993782043457
weight norm already removed
coeff 0.9064555322703504
This is a sample toast for | Adam | doing a rap thing
w_ceil.shape: torch.Size([89])
Emphases: [[50, 59]]
w_slice: tensor([4., 3., 3., 3., 2., 2., 2., 2., 2.], device='cuda:0')
desired_time: 0.44999999999999996
self.sampling_rate: 22050
self.hop_length: 256
TS: 1.6852072477340698
weight norm already removed
coeff 2.109860974662672
This is a 

  


ValueError: cannot convert float NaN to integer

In [None]:
# def overlay(audio1, audio2):
#     audio1_padded = np.pad(audio1, (0,max(0,len(audio2)-len(audio1))))
#     audio2_padded = np.pad(audio2, (0,max(0,len(audio1)-len(audio2))))
#     return audio1_padded + audio2_padded

In [None]:
from pydub import AudioSegment
import numpy as np

wav1, s1 = read("../data/144.wav")
print(s1.min())
print(s1.max())
# wav2, s2 = librosa.load("../data/vocal_trim.wav", sr=22050)

assert s1 == s2, "sample rates must be equal"
overlay_wav = overlay(wav1, wav2)

In [None]:
import IPython.display as ipd

ipd.Audio(overlay_wav, rate=22050)  # load a local WAV file

In [11]:
ipd.Audio(wav1, rate=22050)  # load a local WAV file
#

NameError: name 'ipd' is not defined

In [12]:
ipd.Audio(wav2, rate=22050)  # load a local WAV file

NameError: name 'ipd' is not defined

In [None]:
x = np.array([1, 1, 1])
y = np.array([2, 2, 2, 2, 2, 2])

x = np.pad(x, (0, len(y) - len(x)))
y = np.pad(y, (0, len(x) - len(y)))

print(x, y)
print(z.shape)

print(z + y)