# TalkNet Training (offline)

To train a 22KHz TalkNet, run the cells below and follow the instructions.

The notebook will automatically resume training any models from the last saved checkpoint. If you're resuming from a new session, always re-run step 1 first.

**Step 1:** Configure training data paths. Create the following and change the paths below:
* A dataset of .wav files, packaged as a .zip or .tar file
* Training and validation filelists, in LJSpeech format with relative paths (note: ARPABET transcripts are not supported)
* An output path for checkpoints

In [1]:
### CHANGE THESE ###

dataset_path = "/home/beast/Documents/GitHub/AutomaticTikTalk/example/"
train_filelist = "/home/beast/Documents/GitHub/AutomaticTikTalk/example/train_filelist.txt"
val_filelist = "/home/beast/Documents/GitHub/AutomaticTikTalk/example/val_filelist.txt"
output_dir = "/home/beast/Documents/GitHub/AutomaticTikTalk/"

### ------------ ###

import os
import gdown
import platform

assert os.path.exists(dataset_path), "Cannot find dataset"
assert os.path.exists(train_filelist), "Cannot find training filelist"
assert os.path.exists(val_filelist), "Cannot find validation filelist"
if not os.path.exists(output_dir):
   os.makedirs(output_dir)

if os.getcwd() != output_dir:
    cwd = os.getcwd()
else:
    os.chdir(cwd)
    
# Download pre-trained models
if not os.path.exists("talknet_spect.nemo"):
    print("Downloading pre-trained models...")
    zip_path = "TalkNet_Pretrained.zip"
    if not os.path.exists(zip_path) or os.stat(zip_path).st_size < 100:
        d = 'https://drive.google.com/uc?id='
        gdown.download(d+"19wSym9mNEnmzLS9XdPlfNAW9_u-mP1hR", zip_path, quiet=False)
    print(platform.system())
    if "Windows" in platform.system():
        !tar -xf {zip_path}
    else:
        !unzip -qo {zip_path}
    os.remove(zip_path)

print("OK")

OK


**Step 2:** Dataset processing, part 1.

If this step fails, try the following:
* Make sure your filelists are correct. They should have relative 
paths that match the contents of the archive.

In [2]:

def fix_transcripts(inpath):
    found_arpabet = False
    found_grapheme = False
    with open(inpath, "r", encoding="utf8") as f:
        lines = f.readlines()
    with open(inpath, "w", encoding="utf8") as f:
        for l in lines:
            if l.strip() == "":
                continue
            if "{" in l:
                if not found_arpabet:
                    print("Warning: Skipping ARPABET lines (not supported).")
                    found_arpabet = True
            else:
                f.write(l)
                found_grapheme = True
    assert found_grapheme, "No non-ARPABET lines found in " + inpath

def generate_json(inpath, outpath):
    output = ""
    sample_rate = 22050
    with open(inpath, "r", encoding="utf8") as f:
        for l in f.readlines():
            lpath = l.split("|")[0].strip() + ".wav"
            if lpath[:5] != "wavs/":
                lpath = "wavs/" + lpath
            lpath = os.path.join(dataset_path, lpath)
            size = os.stat(lpath).st_size
            x = {
                "audio_filepath": lpath,
                "duration": size / (sample_rate * 2),
                "text": l.split("|")[1].strip(),
            }
            output += json.dumps(x) + "\n"
        with open(outpath, "w", encoding="utf8") as w:
            w.write(output)

def convert_to_22k(inpath):
    print("converting " + inpath)
    if inpath.strip()[-4:].lower() != ".wav":
        print("Warning: " + inpath.strip() + " is not a .wav file!")
        return
    ffmpeg.input(inpath).output(
        inpath + ".wav",
        ar="22050",
        ac="1",
        acodec="pcm_s16le",
        map_metadata="-1",
        fflags="+bitexact",
    ).overwrite_output().run(quiet=True)
    os.rename(inpath + "_22k.wav", inpath)


In [3]:
import json
import os
import shutil
# Extract dataset
os.chdir(output_dir)

if os.path.exists(os.path.join(dataset_path, "wavs", "wavs")):
    shutil.move(
        os.path.join(dataset_path, "wavs", "wavs"), 
        os.path.join(dataset_path, "tempwavs")
    )
    shutil.rmtree(os.path.join(dataset_path, "wavs"))
    shutil.move(
        os.path.join(dataset_path, "tempwavs"), 
        os.path.join(dataset_path, "wavs")
    )
# Filelist for preprocessing
os.chdir(output_dir)
shutil.copy(train_filelist, "trainfiles.txt")
shutil.copy(val_filelist, "valfiles.txt")
fix_transcripts("trainfiles.txt")
fix_transcripts("valfiles.txt")
seen_files = []
with open("trainfiles.txt") as f:
    t = f.read().split("\n")
with open("valfiles.txt") as f:
    v = f.read().split("\n")
    all_filelist = t[:] + v[:]
with open("allfiles.txt", "w") as f:
    for x in all_filelist:
        if x.strip() == "":
            continue
        if x.split("|")[0] not in seen_files:
            seen_files.append(x.split("|")[0])
            f.write(x.strip() + "\n")


In [4]:
from tqdm.notebook import tqdm

# Ensure audio is 22k
print("Converting audio...")
for r, _, f in os.walk(os.path.join("wavs")):
    for name in tqdm(f):
        convert_to_22k(os.path.join(r, name))

# Convert to JSON
generate_json("trainfiles.txt", "trainfiles.json")
generate_json("valfiles.txt", "valfiles.json")
generate_json("allfiles.txt", "allfiles.json")

print("OK")

Converting audio...
OK


**Step 3:** Dataset processing, part 2. This takes a while, but
you only have to run this once per dataset (results are saved to Drive).

If this step fails, try the following:
* Make sure your dataset only contains WAV files.

In [5]:
# Extract phoneme duration
import sys
import json
import torch
import torchaudio
import numpy as np
#from pysptk import sptk
from pathlib import Path
import ffmpeg
from nemo.collections.asr.models import EncDecCTCModel

asr_model = EncDecCTCModel.from_pretrained(model_name="asr_talknet_aligner").cpu().eval()

[NeMo W 2022-07-06 03:06:50 optimizers:47] Apex was not found. Using the lamb optimizer will error out.
    
[NeMo W 2022-07-06 03:06:52 experimental:27] Module <class 'nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.


[NeMo I 2022-07-06 03:06:52 cloud:56] Found existing object /home/beast/.cache/torch/NeMo/NeMo_1.0.2/qn5x5_libri_tts_phonemes/656c7439dd3a0d614978529371be498b/qn5x5_libri_tts_phonemes.nemo.
[NeMo I 2022-07-06 03:06:52 cloud:62] Re-using file from: /home/beast/.cache/torch/NeMo/NeMo_1.0.2/qn5x5_libri_tts_phonemes/656c7439dd3a0d614978529371be498b/qn5x5_libri_tts_phonemes.nemo
[NeMo I 2022-07-06 03:06:52 common:676] Instantiating model from pre-trained checkpoint


[NeMo W 2022-07-06 03:06:52 features:229] Using torch_stft is deprecated and will be removed in 1.1.0. Please set stft_conv and stft_exact_pad to False for FilterbankFeatures and AudioToMelSpectrogramPreprocessor. Please set exact_pad to True as needed.


[NeMo I 2022-07-06 03:06:52 features:252] PADDING: 1
[NeMo I 2022-07-06 03:06:52 features:262] STFT using conv


      fft_window = pad_center(fft_window, filter_length)
    
      librosa.filters.mel(sample_rate, self.n_fft, n_mels=nfilt, fmin=lowfreq, fmax=highfreq), dtype=torch.float
    


[NeMo I 2022-07-06 03:06:55 modelPT:439] Model EncDecCTCModel was successfully restored from /home/beast/.cache/torch/NeMo/NeMo_1.0.2/qn5x5_libri_tts_phonemes/656c7439dd3a0d614978529371be498b/qn5x5_libri_tts_phonemes.nemo.


In [6]:
import nemo
def forward_extractor(tokens, log_probs, blank):
    """Computes states f and p."""
    n, m = len(tokens), log_probs.shape[0]
    # `f[s, t]` -- max sum of log probs for `s` first codes
    # with `t` first timesteps with ending in `tokens[s]`.
    f = np.empty((n + 1, m + 1), dtype=float)
    f.fill(-(10 ** 9))
    p = np.empty((n + 1, m + 1), dtype=int)
    f[0, 0] = 0.0  # Start
    for s in range(1, n + 1):
        c = tokens[s - 1]
        for t in range((s + 1) // 2, m + 1):
            f[s, t] = log_probs[t - 1, c]
            # Option #1: prev char is equal to current one.
            if s == 1 or c == blank or c == tokens[s - 3]:
                options = f[s : (s - 2 if s > 1 else None) : -1, t - 1]
            else:  # Is not equal to current one.
                options = f[s : (s - 3 if s > 2 else None) : -1, t - 1]
            f[s, t] += np.max(options)
            p[s, t] = np.argmax(options)
    return f, p


def backward_extractor(f, p):
    """Computes durs from f and p."""
    n, m = f.shape
    n -= 1
    m -= 1
    durs = np.zeros(n, dtype=int)
    if f[-1, -1] >= f[-2, -1]:
        s, t = n, m
    else:
        s, t = n - 1, m
    while s > 0:
        durs[s - 1] += 1
        s -= p[s, t]
        t -= 1
    assert durs.shape[0] == n
    assert np.sum(durs) == m
    assert np.all(durs[1::2] > 0)
    return durs

def preprocess_tokens(tokens, blank):
    new_tokens = [blank]
    for c in tokens:
        new_tokens.extend([c, blank])
    tokens = new_tokens
    return tokens

data_config = {
    'manifest_filepath': "allfiles.json",
    'sample_rate': 22050,
    'labels': asr_model.decoder.vocabulary,
    'batch_size': 1,
}

parser = nemo.collections.asr.data.audio_to_text.AudioToCharWithDursF0Dataset.make_vocab(
    notation='phonemes', punct=True, spaces=True, stresses=False, add_blank_at="last"
)

dataset = nemo.collections.asr.data.audio_to_text._AudioTextDataset(
    manifest_filepath=data_config['manifest_filepath'], sample_rate=data_config['sample_rate'], parser=parser,
)

dl = torch.utils.data.DataLoader(
    dataset=dataset, batch_size=data_config['batch_size'], collate_fn=dataset.collate_fn, shuffle=False,
)

blank_id = asr_model.decoder.num_classes_with_blank - 1

if os.path.exists(os.path.join(output_dir, "durations.pt")):
    print("durations.pt already exists; skipping")
else:
    dur_data = {}
    for sample_idx, test_sample in tqdm(enumerate(dl), total=len(dl)):
        log_probs, _, greedy_predictions = asr_model(
            input_signal=test_sample[0], input_signal_length=test_sample[1]
        )

        log_probs = log_probs[0].cpu().detach().numpy()
        seq_ids = test_sample[2][0].cpu().detach().numpy()

        target_tokens = preprocess_tokens(seq_ids, blank_id)

        f, p = forward_extractor(target_tokens, log_probs, blank_id)
        durs = backward_extractor(f, p)

        print ("dl.dataset is")
        print (vars(dl.dataset))
        print ("dl is")
        print (vars(dl))

        dur_key = Path(dl.dataset.collection[sample_idx].audio_file).stem
        dur_data[dur_key] = {
            'blanks': torch.tensor(durs[::2], dtype=torch.long).cpu().detach(), 
            'tokens': torch.tensor(durs[1::2], dtype=torch.long).cpu().detach()
        }

        del test_sample

    torch.save(dur_data, os.path.join(output_dir, "durations.pt"))


[NeMo I 2022-07-06 03:06:56 collections:173] Dataset loaded with 95 files totalling 0.29 hours
[NeMo I 2022-07-06 03:06:56 collections:174] 0 files were filtered totalling 0.00 hours


  0%|          | 0/95 [00:00<?, ?it/s]

dl.dataset is
{'parser': <nemo.collections.asr.data.vocabs.Phonemes object at 0x7f76099fc040>, 'collection': [AudioTextEntity(id=0, audio_file='/home/beast/Documents/GitHub/AutomaticTikTalk/example/wavs/example-753.wav', duration=10.363219954648526, text_tokens=[34, 18, 16, 0, 12, 36, 0, 28, 15, 3, 27, 12, 31, 15, 35, 0, 15, 27, 5, 0, 8, 31, 11, 0, 5, 15, 26, 8, 27, 10, 0, 16, 32, 5, 27, 16, 0, 10, 33, 32, 0, 1, 10, 26, 9, 0, 20, 33, 12, 34, 13], offset=None, text_raw="it's no ordinary rough gem fragile surface layer black veining", speaker=None, orig_sr=None), AudioTextEntity(id=1, audio_file='/home/beast/Documents/GitHub/AutomaticTikTalk/example/wavs/example-167.wav', duration=11.974104308390023, text_tokens=[27, 0, 9, 21, 31, 16, 2, 27, 12, 0, 3, 39, 0, 22, 39, 0, 7, 26, 20, 0, 31, 12, 35, 0, 11, 28, 15, 27, 0, 25, 12, 0, 22, 39], offset=None, text_raw='a question do you have any mora on you', speaker=None, orig_sr=None), AudioTextEntity(id=2, audio_file='/home/beast/Documents/GitHu

In [7]:

#Extract F0 (pitch)
import crepe
from scipy.io import wavfile

def crepe_f0(audio_file, hop_length=256):
    sr, audio = wavfile.read(audio_file)
    audio_x = np.arange(0, len(audio)) / 22050.0
    time, frequency, confidence, activation = crepe.predict(audio, sr, viterbi=True)

    x = np.arange(0, len(audio), hop_length) / 22050.0
    freq_interp = np.interp(x, time, frequency)
    conf_interp = np.interp(x, time, confidence)
    audio_interp = np.interp(x, audio_x, np.absolute(audio)) / 32768.0
    weights = [0.5, 0.25, 0.25]
    audio_smooth = np.convolve(audio_interp, np.array(weights)[::-1], "same")

    conf_threshold = 0.25
    audio_threshold = 0.0005
    for i in range(len(freq_interp)):
        if conf_interp[i] < conf_threshold:
            freq_interp[i] = 0.0
        if audio_smooth[i] < audio_threshold:
            freq_interp[i] = 0.0

    # Hack to make f0 and mel lengths equal
    if len(audio) % hop_length == 0:
        freq_interp = np.pad(freq_interp, pad_width=[0, 1])
    return torch.from_numpy(freq_interp.astype(np.float32))

if os.path.exists(os.path.join(output_dir, "f0s.pt")):
    print("f0s.pt already exists; skipping")
else:
    f0_data = {}
    with open("allfiles.json") as f:
        for i, l in enumerate(f.readlines()):
            print(str(i))
            audio_path = json.loads(l)["audio_filepath"]
            f0_data[Path(audio_path).stem] = crepe_f0(audio_path)

    # calculate f0 stats (mean & std) only for train set
    with open("trainfiles.json") as f:
        train_ids = {Path(json.loads(l)["audio_filepath"]).stem for l in f}
    all_f0 = torch.cat([f0[f0 >= 1e-5] for f0_id, f0 in f0_data.items() if f0_id in train_ids])

    F0_MEAN, F0_STD = all_f0.mean().item(), all_f0.std().item()        
    print("F0_MEAN: " + str(F0_MEAN) + ", F0_STD: " + str(F0_STD))
    torch.save(f0_data, os.path.join(output_dir, "f0s.pt"))
    with open(os.path.join(output_dir, "f0_info.json"), "w") as f:
        f.write(json.dumps({"FO_MEAN": F0_MEAN, "F0_STD": F0_STD}))

print("OK")

0


2022-07-06 03:07:56.005604: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-06 03:07:56.005628: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-07-06 03:07:57.026002: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-07-06 03:07:57.026119: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2022-07-06 03:07:57.026293: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-06 03:07:57.026463: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35


FileNotFoundError: [Errno 2] No such file or directory: '/home/beast/Documents/GitHub/AutomaticTikTalk/example/wavs/example-447.wav'

**Step 4:** Train duration predictor.

If CUDA runs out of memory, try the following:
* Click on Kernel -> Restart, re-run step 1, and try again.
* If that doesn't help, reduce the batch size (default 64).

In [None]:
batch_size = 64

import os
from hydra.experimental import compose, initialize
from hydra.core.global_hydra import GlobalHydra
from omegaconf import OmegaConf
import pytorch_lightning as pl
from nemo.collections.common.callbacks import LogEpochTimeCallback
from nemo.collections.tts.models import TalkNetDursModel
from nemo.core.config import hydra_runner

epochs = 20
learning_rate = 1e-3
min_learning_rate = 3e-6
load_checkpoints = True

def train(cfg):
    cfg.sample_rate = 22050
    cfg.train_dataset = os.path.join(output_dir, "trainfiles.json")
    cfg.validation_datasets = os.path.join(output_dir, "valfiles.json")
    cfg.durs_file = os.path.join(output_dir, "durations.pt")
    cfg.f0_file = os.path.join(output_dir, "f0s.pt")
    cfg.trainer.accelerator = "dp"
    cfg.trainer.max_epochs = epochs
    cfg.trainer.check_val_every_n_epoch = 5
    cfg.model.train_ds.dataloader_params.batch_size = batch_size
    cfg.model.validation_ds.dataloader_params.batch_size = batch_size
    cfg.model.optim.lr = learning_rate
    cfg.model.optim.sched.min_lr = min_learning_rate
    cfg.exp_manager.exp_dir = output_dir

    # Find checkpoints
    ckpt_path = ""
    if load_checkpoints:
      path0 = os.path.join(output_dir, "TalkNetDurs")
      if os.path.exists(path0):
          path1 = sorted(os.listdir(path0))
          for i in range(len(path1)):
              path2 = os.path.join(path0, path1[-(1+i)], "checkpoints")
              if os.path.exists(path2):
                  match = [x for x in os.listdir(path2) if "last.ckpt" in x]
                  if len(match) > 0:
                      ckpt_path = os.path.join(path2, match[0])
                      print("Resuming training from " + match[0])
                      break
    
    if ckpt_path != "":
        trainer = pl.Trainer(**cfg.trainer, resume_from_checkpoint = ckpt_path)
        model = TalkNetDursModel(cfg=cfg.model, trainer=trainer)
    else:
        warmstart_path = os.path.join(cwd, "talknet_durs.nemo")
        trainer = pl.Trainer(**cfg.trainer)
        model = TalkNetDursModel.restore_from(warmstart_path, override_config_path=cfg)
        model.set_trainer(trainer)
        model.setup_training_data(cfg.model.train_ds)
        model.setup_validation_data(cfg.model.validation_ds)
        model.setup_optimization(cfg.model.optim)
        print("Warm-starting from " + warmstart_path)
    exp_manager(trainer, cfg.get('exp_manager', None))
    trainer.callbacks.extend([pl.callbacks.LearningRateMonitor(), LogEpochTimeCallback()])  # noqa
    trainer.fit(model)

os.chdir(cwd)
GlobalHydra().clear()
initialize(config_path="conf")
cfg = compose(config_name="talknet-durs")
train(cfg)


    
    The version_base parameter is not specified.
    Please specify a compatability version level, or None.
    Will assume defaults for version 1.1
      self.delegate = real_initialize(
    
    
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
[NeMo W 2022-07-06 02:42:57 modelPT:138] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.asr.data.audio_to_text.AudioToCharWithDursF0Dataset
      manifest_filepath: /home/beast/Documents/GitHub/AutomaticTikTalk/trainfiles.json
      max_duration: null
      min_duration: 0.1
      int_values: false
      load_audio: false
      normalize: false
      sample_rate: 22050
      trim: false
      durs_file: /home/beast/Documents/GitHub/AutomaticTikTalk/durations.pt
      f0_file: /home/beast/Documents/GitHub/AutomaticTikTalk/f0s.pt
  

[NeMo I 2022-07-06 02:42:58 modelPT:439] Model TalkNetDursModel was successfully restored from /home/beast/Documents/GitHub/AutomaticTikTalk/talknet_durs.nemo.
[NeMo I 2022-07-06 02:42:58 collections:173] Dataset loaded with 87 files totalling 0.27 hours
[NeMo I 2022-07-06 02:42:58 collections:174] 0 files were filtered totalling 0.00 hours
[NeMo I 2022-07-06 02:42:58 collections:173] Dataset loaded with 8 files totalling 0.02 hours
[NeMo I 2022-07-06 02:42:58 collections:174] 0 files were filtered totalling 0.00 hours


[NeMo W 2022-07-06 02:42:58 modelPT:660] The lightning trainer received accelerator: dp. We recommend to use 'ddp' instead.


[NeMo I 2022-07-06 02:42:58 modelPT:751] Optimizer config = Adam (
    Parameter Group 0
        amsgrad: False
        betas: (0.9, 0.999)
        eps: 1e-08
        lr: 0.001
        weight_decay: 1e-06
    )
[NeMo I 2022-07-06 02:42:58 lr_scheduler:621] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7f7961c6dfa0>" 
    will be used during training (effective maximum steps = 40) - 
    Parameters : 
    (min_lr: 3.0e-06
    warmup_ratio: 0.02
    max_steps: 40
    )
Warm-starting from /home/beast/Documents/GitHub/AutomaticTikTalk/talknet_durs.nemo
[NeMo I 2022-07-06 02:42:58 exp_manager:216] Experiments will be logged at /home/beast/Documents/GitHub/AutomaticTikTalk/TalkNetDurs/2022-07-06_02-30-09
[NeMo I 2022-07-06 02:42:58 exp_manager:563] TensorboardLogger has been set up


      rank_zero_deprecation(
    


FileNotFoundError: [Errno 2] No such file or directory: 'git'

**Step 5:** Train pitch predictor.

If CUDA runs out of memory, try the following:
* Click on Kernel -> Restart, re-run step 1, and try again.
* If that doesn't help, reduce the batch size (default 64).

In [None]:
batch_size = 64
epochs = 50

import json

with open(os.path.join(output_dir, "f0_info.json"), "r") as f:
    f0_info = json.load(f)
    f0_mean = f0_info["FO_MEAN"]
    f0_std = f0_info["F0_STD"]

learning_rate = 1e-3
min_learning_rate = 3e-6
load_checkpoints = True

import os
from hydra.experimental import compose, initialize
from hydra.core.global_hydra import GlobalHydra
from omegaconf import OmegaConf
import pytorch_lightning as pl
from nemo.collections.common.callbacks import LogEpochTimeCallback
from nemo.collections.tts.models import TalkNetPitchModel
from nemo.core.config import hydra_runner

def train(cfg):
    cfg.sample_rate = 22050
    cfg.train_dataset = os.path.join(output_dir, "trainfiles.json")
    cfg.validation_datasets = os.path.join(output_dir, "valfiles.json")
    cfg.durs_file = os.path.join(output_dir, "durations.pt")
    cfg.f0_file = os.path.join(output_dir, "f0s.pt")
    cfg.trainer.accelerator = "dp"
    cfg.trainer.max_epochs = epochs
    cfg.trainer.check_val_every_n_epoch = 5
    cfg.model.f0_mean=f0_mean
    cfg.model.f0_std=f0_std
    cfg.model.train_ds.dataloader_params.batch_size = batch_size
    cfg.model.validation_ds.dataloader_params.batch_size = batch_size
    cfg.model.optim.lr = learning_rate
    cfg.model.optim.sched.min_lr = min_learning_rate
    cfg.exp_manager.exp_dir = output_dir

    # Find checkpoints
    ckpt_path = ""
    if load_checkpoints:
      path0 = os.path.join(output_dir, "TalkNetPitch")
      if os.path.exists(path0):
          path1 = sorted(os.listdir(path0))
          for i in range(len(path1)):
              path2 = os.path.join(path0, path1[-(1+i)], "checkpoints")
              if os.path.exists(path2):
                  match = [x for x in os.listdir(path2) if "last.ckpt" in x]
                  if len(match) > 0:
                      ckpt_path = os.path.join(path2, match[0])
                      print("Resuming training from " + match[0])
                      break
    
    if ckpt_path != "":
        trainer = pl.Trainer(**cfg.trainer, resume_from_checkpoint = ckpt_path)
        model = TalkNetPitchModel(cfg=cfg.model, trainer=trainer)
    else:
        warmstart_path = os.path.join(cwd, "talknet_pitch.nemo")
        trainer = pl.Trainer(**cfg.trainer)
        model = TalkNetPitchModel.restore_from(warmstart_path, override_config_path=cfg)
        model.set_trainer(trainer)
        model.setup_training_data(cfg.model.train_ds)
        model.setup_validation_data(cfg.model.validation_ds)
        model.setup_optimization(cfg.model.optim)
        print("Warm-starting from " + warmstart_path)
    exp_manager(trainer, cfg.get('exp_manager', None))
    trainer.callbacks.extend([pl.callbacks.LearningRateMonitor(), LogEpochTimeCallback()])  # noqa
    trainer.fit(model)

os.chdir(cwd)
GlobalHydra().clear()
initialize(config_path="conf")
cfg = compose(config_name="talknet-pitch")
train(cfg)

In [None]:
# Hack to get exp_manager working
import os
import subprocess
import sys
import time
from copy import deepcopy
from dataclasses import dataclass
from pathlib import Path
from shutil import copy, move
from typing import Any, Dict, List, Optional, Union

import torch
from hydra.core.hydra_config import HydraConfig
from hydra.utils import get_original_cwd
from omegaconf import DictConfig, OmegaConf, open_dict
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import LoggerCollection as _LoggerCollection
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
from pytorch_lightning.utilities import rank_zero_only

from nemo.constants import NEMO_ENV_VARNAME_VERSION
from nemo.utils import app_state, logging
from nemo.utils.app_state import AppState
from nemo.utils.exceptions import NeMoBaseException
from nemo.utils.get_rank import is_global_rank_zero
from nemo.utils.lightning_logger_patch import add_filehandlers_to_pl_logger


class NotFoundError(NeMoBaseException):
    """ Raised when a file or folder is not found"""


class LoggerMisconfigurationError(NeMoBaseException):
    """ Raised when a mismatch between trainer.logger and exp_manager occurs"""

    def __init__(self, message):
        message = (
            message
            + " You can disable lighning's trainer from creating a logger by passing logger=False to its constructor."
        )
        super().__init__(message)


class CheckpointMisconfigurationError(NeMoBaseException):
    """ Raised when a mismatch between trainer.callbacks and exp_manager occurs"""


@dataclass
class CallbackParams:
    filepath: Optional[str] = None  # Deprecated
    dirpath: Optional[str] = None  # If None, exp_manager will attempt to handle the filepath
    filename: Optional[str] = None  # If None, exp_manager will attempt to handle the filepath
    monitor: Optional[str] = "val_loss"
    verbose: Optional[bool] = True
    save_last: Optional[bool] = True
    save_top_k: Optional[int] = 3
    save_weights_only: Optional[bool] = False
    mode: Optional[str] = "min"
    period: Optional[int] = 1
    prefix: Optional[str] = None  # If None, exp_manager will attempt to handle the filepath
    postfix: str = ".nemo"
    save_best_model: bool = False
    always_save_nemo: bool = False


@dataclass
class ExpManagerConfig:
    # Log dir creation parameters
    explicit_log_dir: Optional[str] = None
    exp_dir: Optional[str] = None
    name: Optional[str] = None
    version: Optional[str] = None
    use_datetime_version: Optional[bool] = True
    resume_if_exists: Optional[bool] = False
    resume_past_end: Optional[bool] = False
    resume_ignore_no_checkpoint: Optional[bool] = False
    # Logging parameters
    create_tensorboard_logger: Optional[bool] = True
    summary_writer_kwargs: Optional[Dict[Any, Any]] = None
    create_wandb_logger: Optional[bool] = False
    wandb_logger_kwargs: Optional[Dict[Any, Any]] = None
    # Checkpointing parameters
    create_checkpoint_callback: Optional[bool] = True
    checkpoint_callback_params: Optional[CallbackParams] = CallbackParams()
    # Additional exp_manager arguments
    files_to_copy: Optional[List[str]] = None


def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictConfig, Dict]] = None) -> Path:
    """
    exp_manager is a helper function used to manage folders for experiments. It follows the pytorch lightning paradigm
    of exp_dir/model_or_experiment_name/version. If the lightning trainer has a logger, exp_manager will get exp_dir,
    name, and version from the logger. Otherwise it will use the exp_dir and name arguments to create the logging
    directory. exp_manager also allows for explicit folder creation via explicit_log_dir.
    The version can be a datetime string or an integer. Datestime version can be disabled if use_datetime_version is set
     to False. It optionally creates TensorBoardLogger, WandBLogger, ModelCheckpoint objects from pytorch lightning.
    It copies sys.argv, and git information if available to the logging directory. It creates a log file for each
    process to log their output into.
    exp_manager additionally has a resume feature (resume_if_exists) which can be used to continuing training from
    the constructed log_dir. When you need to continue the training repeatedly (like on a cluster which you need
    multiple consecutive jobs), you need to avoid creating the version folders. Therefore from v1.0.0, when
    resume_if_exists is set to True, creating the version folders is ignored.
    Args:
        trainer (pytorch_lightning.Trainer): The lightning trainer.
        cfg (DictConfig, dict): Can have the following keys:
            - explicit_log_dir (str, Path): Can be used to override exp_dir/name/version folder creation. Defaults to
                None, which will use exp_dir, name, and version to construct the logging directory.
            - exp_dir (str, Path): The base directory to create the logging directory. Defaults to None, which logs to
                ./nemo_experiments.
            - name (str): The name of the experiment. Defaults to None which turns into "default" via name = name or
                "default".
            - version (str): The version of the experiment. Defaults to None which uses either a datetime string or
                lightning's TensorboardLogger system of using version_{int}.
            - use_datetime_version (bool): Whether to use a datetime string for version. Defaults to True.
            - resume_if_exists (bool): Whether this experiment is resuming from a previous run. If True, it sets
                trainer.resume_from_checkpoint so that the trainer should auto-resume. exp_manager will move files
                under log_dir to log_dir/run_{int}. Defaults to False. From v1.0.0, when resume_if_exists is True,
                we would not create version folders to make it easier to find the log folder for next runs.
            - resume_past_end (bool): exp_manager errors out if resume_if_exists is True and a checkpoint matching
                *end.ckpt indicating a previous training run fully completed. This behaviour can be disabled, in which
                case the *end.ckpt will be loaded by setting resume_past_end to True. Defaults to False.
            - resume_ignore_no_checkpoint (bool): exp_manager errors out if resume_if_exists is True and no checkpoint
                could be found. This behaviour can be disabled, in which case exp_manager will print a message and
                continue without restoring, by setting resume_ignore_no_checkpoint to True. Defaults to False.
            - create_tensorboard_logger (bool): Whether to create a tensorboard logger and attach it to the pytorch
                lightning trainer. Defaults to True.
            - summary_writer_kwargs (dict): A dictionary of kwargs that can be passed to lightning's TensorboardLogger
                class. Note that log_dir is passed by exp_manager and cannot exist in this dict. Defaults to None.
            - create_wandb_logger (bool): Whether to create a Weights and Baises logger and attach it to the pytorch
                lightning trainer. Defaults to False.
            - wandb_logger_kwargs (dict): A dictionary of kwargs that can be passed to lightning's WandBLogger
                class. Note that name and project are required parameters if create_wandb_logger is True.
                Defaults to None.
            - create_checkpoint_callback (bool): Whether to create a ModelCheckpoint callback and attach it to the
                pytorch lightning trainer. The ModelCheckpoint saves the top 3 models with the best "val_loss", the most
                recent checkpoint under *last.ckpt, and the final checkpoint after training completes under *end.ckpt.
                Defaults to True.
            - files_to_copy (list): A list of files to copy to the experiment logging directory. Defaults to None which
                copies no files.
    returns:
        log_dir (Path): The final logging directory where logging files are saved. Usually the concatenation of
            exp_dir, name, and version.
    """
    # Add rank information to logger
    # Note: trainer.global_rank and trainer.is_global_zero are not set until trainer.fit, so have to hack around it
    global_rank = trainer.node_rank * trainer.num_gpus + int(os.environ.get("LOCAL_RANK", 0))
    logging.rank = global_rank

    if cfg is None:
        logging.error("exp_manager did not receive a cfg argument. It will be disabled.")
        return
    if trainer.fast_dev_run:
        logging.info("Trainer was called with fast_dev_run. exp_manager will return without any functionality.")
        return

    # Ensure passed cfg is compliant with ExpManagerConfig
    schema = OmegaConf.structured(ExpManagerConfig)
    if isinstance(cfg, dict):
        cfg = OmegaConf.create(cfg)
    elif not isinstance(cfg, DictConfig):
        raise ValueError(f"cfg was type: {type(cfg)}. Expected either a dict or a DictConfig")
    cfg = OmegaConf.create(OmegaConf.to_container(cfg, resolve=True))
    cfg = OmegaConf.merge(schema, cfg)

    error_checks(trainer, cfg)  # Ensures that trainer options are compliant with NeMo and exp_manager arguments

    log_dir, exp_dir, name, version = get_log_dir(
        trainer=trainer,
        exp_dir=cfg.exp_dir,
        name=cfg.name,
        version=cfg.version,
        explicit_log_dir=cfg.explicit_log_dir,
        use_datetime_version=cfg.use_datetime_version,
        resume_if_exists=cfg.resume_if_exists,
    )

    if cfg.resume_if_exists:
        check_resume(trainer, log_dir, cfg.resume_past_end, cfg.resume_ignore_no_checkpoint)

    checkpoint_name = name
    # If name returned from get_log_dir is "", use cfg.name for checkpointing
    if checkpoint_name is None or checkpoint_name == '':
        checkpoint_name = cfg.name or "default"
    cfg.name = name  # Used for configure_loggers so that the log_dir is properly set even if name is ""
    cfg.version = version

    # update app_state with log_dir, exp_dir, etc
    app_state = AppState()
    app_state.log_dir = log_dir
    app_state.exp_dir = exp_dir
    app_state.name = name
    app_state.version = version
    app_state.checkpoint_name = checkpoint_name
    app_state.create_checkpoint_callback = cfg.create_checkpoint_callback
    app_state.checkpoint_callback_params = cfg.checkpoint_callback_params

    # Create the logging directory if it does not exist
    os.makedirs(log_dir, exist_ok=True)  # Cannot limit creation to global zero as all ranks write to own log file
    logging.info(f'Experiments will be logged at {log_dir}')
    trainer._default_root_dir = log_dir

    # Handle Loggers by creating file and handle DEBUG statements
    log_file = log_dir / f'nemo_log_globalrank-{global_rank}_localrank-{int(os.environ.get("LOCAL_RANK", 0))}.txt'
    logging.add_file_handler(log_file)

    # For some reason, LearningRateLogger requires trainer to have a logger. Safer to create logger on all ranks
    # not just global rank 0.
    if cfg.create_tensorboard_logger or cfg.create_wandb_logger:
        configure_loggers(
            trainer,
            exp_dir,
            cfg.name,
            cfg.version,
            cfg.create_tensorboard_logger,
            cfg.summary_writer_kwargs,
            cfg.create_wandb_logger,
            cfg.wandb_logger_kwargs,
        )

    if cfg.create_checkpoint_callback:
        configure_checkpointing(trainer, log_dir, checkpoint_name, cfg.checkpoint_callback_params)

    if is_global_rank_zero():
        # Move files_to_copy to folder and add git information if present
        if cfg.files_to_copy:
            for _file in cfg.files_to_copy:
                copy(Path(_file), log_dir)

        # Create files for cmd args and git info
        with open(log_dir / 'cmd-args.log', 'w') as _file:
            _file.write(" ".join(sys.argv))

        # Add err_file logging to global_rank zero
        logging.add_err_file_handler(log_dir / 'nemo_error_log.txt')

        # Add lightning file logging to global_rank zero
        add_filehandlers_to_pl_logger(log_dir / 'lightning_logs.txt', log_dir / 'nemo_error_log.txt')

    return log_dir


def error_checks(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictConfig, Dict]] = None):
    """
    Checks that the passed trainer is compliant with NeMo and exp_manager's passed configuration. Checks that:
        - Throws error when hydra has changed the working directory. This causes issues with lightning's DDP
        - Throws error when trainer has loggers defined but create_tensorboard_logger or create_WandB_logger is True
        - Prints error messages when 1) run on multi-node and not Slurm, and 2) run on multi-gpu without DDP
    """
    if HydraConfig.initialized() and get_original_cwd() != os.getcwd():
        raise ValueError(
            "Hydra changed the working directory. This interferes with ExpManger's functionality. Please pass "
            "hydra.run.dir=. to your python script."
        )
    if trainer.logger is not None and (cfg.create_tensorboard_logger or cfg.create_wandb_logger):
        raise LoggerMisconfigurationError(
            "The pytorch lightning trainer that was passed to exp_manager contained a logger, and either "
            f"create_tensorboard_logger: {cfg.create_tensorboard_logger} or create_wandb_logger: "
            f"{cfg.create_wandb_logger} was set to True. These can only be used if trainer does not already have a"
            " logger."
        )
    if trainer.num_nodes > 1 and not check_slurm(trainer):
        logging.error(
            "You are running multi-node training without SLURM handling the processes."
            " Please note that this is not tested in NeMo and could result in errors."
        )
    if trainer.num_gpus > 1 and not trainer.use_ddp:
        logging.error(
            "You are running multi-gpu without ddp.Please note that this is not tested in NeMo and could result in "
            "errors."
        )


def check_resume(
    trainer: 'pytorch_lightning.Trainer',
    log_dir: str,
    resume_past_end: bool = False,
    resume_ignore_no_checkpoint: bool = False,
):
    """Checks that resume=True was used correctly with the arguments pass to exp_manager. Sets
    trainer.resume_from_checkpoint as necessary.
    Returns:
        log_dir (Path): the log_dir
        exp_dir (str): the base exp_dir without name nor version
        name (str): The name of the experiment
        version (str): The version of the experiment
    Raises:
        NotFoundError: If resume is True, resume_ignore_no_checkpoint is False, and checkpoints could not be found.
        ValueError: If resume is True, and there were more than 1 checkpoint could found.
    """
    if not log_dir:
        raise ValueError(f"Resuming requires the log_dir {log_dir} to be passed to exp_manager")

    checkpoint_dir = Path(Path(log_dir) / "checkpoints")

    checkpoint = None
    end_checkpoints = list(checkpoint_dir.rglob("*end.ckpt"))
    last_checkpoints = list(checkpoint_dir.rglob("*last.ckpt"))
    if not checkpoint_dir.exists():
        if resume_ignore_no_checkpoint:
            logging.warning(
                f"There was no checkpoint folder at checkpoint_dir :{checkpoint_dir}. Training from scratch."
            )
            return
        else:
            raise NotFoundError(f"There was no checkpoint folder at checkpoint_dir :{checkpoint_dir}. Cannot resume.")
    elif len(end_checkpoints) > 0:
        if resume_past_end:
            if len(end_checkpoints) > 1:
                if 'mp_rank' in str(end_checkpoints[0]):
                    checkpoint = end_checkpoints[0]
                else:
                    raise ValueError(f"Multiple checkpoints {end_checkpoints} that matches *end.ckpt.")
            logging.info(f"Resuming from {end_checkpoints[0]}")
        else:
            raise ValueError(
                f"Found {end_checkpoints[0]} indicating that the last training run has already completed."
            )
    elif not len(last_checkpoints) > 0:
        if resume_ignore_no_checkpoint:
            logging.warning(f"There were no checkpoints found in {checkpoint_dir}. Training from scratch.")
            return
        else:
            raise NotFoundError(f"There were no checkpoints found in {checkpoint_dir}. Cannot resume.")
    elif len(last_checkpoints) > 1:
        if 'mp_rank' in str(last_checkpoints[0]):
            checkpoint = last_checkpoints[0]
        else:
            raise ValueError(f"Multiple checkpoints {last_checkpoints} that matches *last.ckpt.")
    else:
        logging.info(f"Resuming from {last_checkpoints[0]}")
        checkpoint = last_checkpoints[0]

    trainer.resume_from_checkpoint = str(checkpoint)

    if is_global_rank_zero():
        # Check to see if any files exist that need to be moved
        files_to_move = []
        for child in Path(log_dir).iterdir():
            if child.is_file():
                files_to_move.append(child)

        if len(files_to_move) > 0:
            # Move old files to a new folder
            other_run_dirs = Path(log_dir).glob("run_*")
            run_count = 0
            for fold in other_run_dirs:
                if fold.is_dir():
                    run_count += 1
            new_run_dir = Path(Path(log_dir) / f"run_{run_count}")
            new_run_dir.mkdir()
            for _file in files_to_move:
                move(str(_file), str(new_run_dir))


def check_explicit_log_dir(
    trainer: 'pytorch_lightning.Trainer', explicit_log_dir: [Path, str], exp_dir: str, name: str, version: str
) -> (Path, str, str, str):
    """ Checks that the passed arguments are compatible with explicit_log_dir.
    Returns:
        log_dir (Path): the log_dir
        exp_dir (str): the base exp_dir without name nor version
        name (str): The name of the experiment
        version (str): The version of the experiment
    Raise:
        LoggerMisconfigurationError
    """
    if trainer.logger is not None:
        raise LoggerMisconfigurationError(
            "The pytorch lightning trainer that was passed to exp_manager contained a logger and explicit_log_dir: "
            f"{explicit_log_dir} was pass to exp_manager. Please remove the logger from the lightning trainer."
        )
    # Checking only (explicit_log_dir) vs (exp_dir and version).
    # The `name` will be used as the actual name of checkpoint/archive.
    if exp_dir or version:
        logging.error(
            f"exp_manager received explicit_log_dir: {explicit_log_dir} and at least one of exp_dir: {exp_dir}, "
            f"or version: {version}. Please note that exp_dir, name, and version will be ignored."
        )
    if is_global_rank_zero() and Path(explicit_log_dir).exists():
        logging.warning(f"Exp_manager is logging to {explicit_log_dir}, but it already exists.")
    return Path(explicit_log_dir), str(explicit_log_dir), "", ""


def get_log_dir(
    trainer: 'pytorch_lightning.Trainer',
    exp_dir: str = None,
    name: str = None,
    version: str = None,
    explicit_log_dir: str = None,
    use_datetime_version: bool = True,
    resume_if_exists: bool = False,
) -> (Path, str, str, str):
    """
    Obtains the log_dir used for exp_manager.
    Returns:
        log_dir (Path): the log_dir
        exp_dir (str): the base exp_dir without name nor version
        name (str): The name of the experiment
        version (str): The version of the experiment
        explicit_log_dir (str): The explicit path to the log folder. Defaults to False.
        use_datetime_version (bool): Uses date and time as the version of the log folder. Defaults to True.
        resume_if_exists (bool): if resume_if_exists of the exp_manager's config is enabled or not. When enabled, the
            version folders would not get created.
    Raise:
        LoggerMisconfigurationError: If trainer is incompatible with arguments
        NotFoundError: If resume is True, resume_ignore_no_checkpoint is False, and checkpoints could not be found.
        ValueError: If resume is True, and there were more than 1 checkpoint could found.
    """
    if explicit_log_dir:  # If explicit log_dir was passed, short circuit
        return check_explicit_log_dir(trainer, explicit_log_dir, exp_dir, name, version)

    # Default exp_dir to ./nemo_experiments if None was passed
    _exp_dir = exp_dir
    if exp_dir is None:
        _exp_dir = str(Path.cwd() / 'nemo_experiments')

    # If the user has already defined a logger for the trainer, use the logger defaults for logging directory
    if trainer.logger is not None:
        if trainer.logger.save_dir:
            if exp_dir:
                raise LoggerMisconfigurationError(
                    "The pytorch lightning trainer that was passed to exp_manager contained a logger, the logger's "
                    f"save_dir was not None, and exp_dir ({exp_dir}) was not None. If trainer.logger.save_dir "
                    "exists, exp_manager will use trainer.logger.save_dir as the logging directory and exp_dir "
                    "must be None."
                )
            _exp_dir = trainer.logger.save_dir
        if name:
            raise LoggerMisconfigurationError(
                "The pytorch lightning trainer that was passed to exp_manager contained a logger, and name: "
                f"{name} was also passed to exp_manager. If the trainer contains a "
                "logger, exp_manager will use trainer.logger.name, and name passed to exp_manager must be None."
            )
        name = trainer.logger.name
        version = f"version_{trainer.logger.version}"
    # Use user-defined exp_dir, project_name, exp_name, and versioning options
    else:
        name = name or "default"
        version = version or os.environ.get(NEMO_ENV_VARNAME_VERSION, None)

        if not version:
            if resume_if_exists:
                logging.warning(
                    "No version folders would be created under the log folder as 'resume_if_exists' is enabled."
                )
                version = None
            elif is_global_rank_zero():
                if use_datetime_version:
                    version = time.strftime('%Y-%m-%d_%H-%M-%S')
                else:
                    tensorboard_logger = TensorBoardLogger(save_dir=Path(_exp_dir), name=name, version=version)
                    version = f"version_{tensorboard_logger.version}"
                os.environ[NEMO_ENV_VARNAME_VERSION] = "" if version is None else version

    log_dir = Path(_exp_dir) / Path(str(name)) / Path("" if version is None else str(version))
    return log_dir, str(_exp_dir), name, version


def get_git_hash():
    """
    Helper function that tries to get the commit hash if running inside a git folder
    returns:
        Bool: Whether the git subprocess ran without error
        str: git subprocess output or error message
    """
    try:
        return (
            True,
            subprocess.check_output(['git', 'rev-parse', 'HEAD'], stderr=subprocess.STDOUT).decode(),
        )
    except subprocess.CalledProcessError as err:
        return False, "{}\n".format(err.output.decode("utf-8"))


def get_git_diff():
    """
    Helper function that tries to get the git diff if running inside a git folder
    returns:
        Bool: Whether the git subprocess ran without error
        str: git subprocess output or error message
    """
    try:
        return subprocess.check_output(['git', 'diff'], stderr=subprocess.STDOUT).decode()
    except subprocess.CalledProcessError as err:
        return "{}\n".format(err.output.decode("utf-8"))


class LoggerList(_LoggerCollection):
    """ A thin wrapper on Lightning's LoggerCollection such that name and version are better aligned with exp_manager
    """

    def __init__(self, _logger_iterable, nemo_name=None, nemo_version=""):
        super().__init__(_logger_iterable)
        self._nemo_name = nemo_name
        self._nemo_version = nemo_version

    @property
    def name(self) -> str:
        return self._nemo_name

    @property
    def version(self) -> str:
        return self._nemo_version


def configure_loggers(
    trainer: 'pytorch_lightning.Trainer',
    exp_dir: [Path, str],
    name: str,
    version: str,
    create_tensorboard_logger: bool,
    summary_writer_kwargs: dict,
    create_wandb_logger: bool,
    wandb_kwargs: dict,
):
    """ Creates TensorboardLogger and/or WandBLogger and attach them to trainer. Raises ValueError if
    summary_writer_kwargs or wandb_kwargs are misconfigured.
    """
    # Potentially create tensorboard logger and/or WandBLogger
    logger_list = []
    if create_tensorboard_logger:
        if summary_writer_kwargs is None:
            summary_writer_kwargs = {}
        elif "log_dir" in summary_writer_kwargs:
            raise ValueError(
                "You cannot pass `log_dir` as part of `summary_writer_kwargs`. `log_dir` is handled by lightning's "
                "TensorBoardLogger logger."
            )
        tensorboard_logger = TensorBoardLogger(save_dir=exp_dir, name=name, version=version, **summary_writer_kwargs)
        logger_list.append(tensorboard_logger)
        logging.info("TensorboardLogger has been set up")

    if create_wandb_logger:
        if wandb_kwargs is None:
            wandb_kwargs = {}
        if "name" not in wandb_kwargs and "project" not in wandb_kwargs:
            raise ValueError("name and project are required for wandb_logger")
        wandb_logger = WandbLogger(save_dir=exp_dir, version=version, **wandb_kwargs)

        logger_list.append(wandb_logger)
        logging.info("WandBLogger has been set up")

    logger_list = (
        LoggerList(logger_list, nemo_name=name, nemo_version=version) if len(logger_list) > 1 else logger_list[0]
    )
    trainer.logger_connector.configure_logger(logger_list)


class NeMoModelCheckpoint(ModelCheckpoint):
    """ Light wrapper around Lightning's ModelCheckpoint to force a saved checkpoint on train_end
    """

    def __init__(self, always_save_nemo=False, save_best_model=False, postfix=".nemo", **kwargs):
        # Parse and store "extended" parameters: save_best model and postfix.
        self.always_save_nemo = always_save_nemo
        self.save_best_model = save_best_model
        self.postfix = postfix
        self.previous_best_path = ""

        # `prefix` is deprecated
        if 'prefix' in kwargs:
            self.prefix = kwargs.pop('prefix')
        else:
            self.prefix = ""

        # Call the parent class constructor with the remaining kwargs.
        super().__init__(**kwargs)

    @rank_zero_only
    def on_save_checkpoint(self, trainer, pl_module, checkpoint):
        output = super().on_save_checkpoint(trainer, pl_module, checkpoint)

        if not self.always_save_nemo:
            return output

        # Load the best model and then re-save it
        app_state = AppState()
        # since we are creating tarfile artifacts we need to update .nemo path
        app_state.model_restore_path = os.path.abspath(
            os.path.expanduser(os.path.join(self.dirpath, self.prefix + self.postfix))
        )
        if self.save_best_model:
            if not os.path.exists(self.best_model_path):
                return output

            if self.best_model_path == self.previous_best_path:
                return output

            self.previous_model_path = self.best_model_path
            old_state_dict = deepcopy(pl_module.state_dict())
            checkpoint = torch.load(self.best_model_path, map_location='cpu')
            if 'state_dict' in checkpoint:
                checkpoint = checkpoint['state_dict']
            # get a new instanace of the model
            pl_module.load_state_dict(checkpoint, strict=True)
            pl_module.save_to(save_path=app_state.model_restore_path)
            pl_module.load_state_dict(old_state_dict, strict=True)
        else:
            pl_module.save_to(save_path=app_state.model_restore_path)
        return output

    @rank_zero_only
    def on_train_end(self, trainer, pl_module):
        if trainer.fast_dev_run:
            return None
        app_state = AppState()
        if app_state.model_parallel_size is not None:
            return None

        # TODO: make this work for model parallel, need to call on data parallel rank 0 and update best_model_path
        # Load the best model and then re-save it
        if self.save_best_model:
            trainer.checkpoint_connector.restore(self.best_model_path, on_gpu=trainer.on_gpu)
        pl_module.save_to(save_path=os.path.join(self.dirpath, self.prefix + self.postfix))


def configure_checkpointing(trainer: 'pytorch_lightning.Trainer', log_dir: Path, name: str, params: 'DictConfig'):
    """ Adds ModelCheckpoint to trainer. Raises CheckpointMisconfigurationError if trainer already has a ModelCheckpoint
    callback or if trainer.weights_save_path was passed to Trainer.
    """
    for callback in trainer.callbacks:
        if isinstance(callback, ModelCheckpoint):
            raise CheckpointMisconfigurationError(
                "The pytorch lightning trainer that was passed to exp_manager contained a ModelCheckpoint "
                "and create_checkpoint_callback was set to True. Please either set create_checkpoint_callback "
                "to False, or remove ModelCheckpoint from the lightning trainer"
            )
    if Path(trainer.weights_save_path) != Path.cwd():
        raise CheckpointMisconfigurationError(
            "The pytorch lightning was passed weights_save_path. This variable is ignored by exp_manager"
        )

    # Create the callback and attach it to trainer
    if "filepath" in params:
        if params.filepath is not None:
            logging.warning("filepath is deprecated. Please switch to dirpath and filename instead")
            if params.dirpath is None:
                params.dirpath = Path(params.filepath).parent
            if params.filename is None:
                params.filename = Path(params.filepath).name
        with open_dict(params):
            del params["filepath"]
    if params.dirpath is None:
        params.dirpath = Path(log_dir / 'checkpoints')
    if params.filename is None:
        params.filename = f'{name}--{{{params.monitor}:.2f}}-{{epoch}}'
    if params.prefix is None:
        params.prefix = name
    NeMoModelCheckpoint.CHECKPOINT_NAME_LAST = params.filename + '-last'

    logging.debug(params.dirpath)
    logging.debug(params.filename)
    logging.debug(params.prefix)

    if "val" in params.monitor:
        if (
            trainer.max_epochs is not None
            and trainer.max_epochs != -1
            and trainer.max_epochs < trainer.check_val_every_n_epoch
        ):
            logging.error(
                "The checkpoint callback was told to monitor a validation value but trainer.max_epochs("
                f"{trainer.max_epochs}) was less than trainer.check_val_every_n_epoch({trainer.check_val_every_n_epoch}"
                f"). It is very likely this run will fail with ModelCheckpoint(monitor='{params.monitor}') not found "
                "in the returned metrics. Please ensure that validation is run within trainer.max_epochs."
            )
        elif trainer.max_steps is not None:
            logging.warning(
                "The checkpoint callback was told to monitor a validation value and trainer's max_steps was set to "
                f"{trainer.max_steps}. Please ensure that max_steps will run for at least "
                f"{trainer.check_val_every_n_epoch} epochs to ensure that checkpointing will not error out."
            )

    checkpoint_callback = NeMoModelCheckpoint(**params)
    checkpoint_callback.last_model_path = trainer.resume_from_checkpoint or ""
    trainer.callbacks.append(checkpoint_callback)


def check_slurm(trainer):
    try:
        return trainer.accelerator_connector.is_slurm_managing_tasks
    except AttributeError:
        return False

**Step 6:** Train spectrogram generator. 200+ epochs are recommended. 

This is the slowest of the three models to train, and the hardest to
get good results from. If your character sounds noisy or robotic,
try improving the dataset, or adjusting the epochs and learning rate.

If CUDA runs out of memory, try the following:
* Click on Kernel -> Restart, re-run step 1, and try again.
* If that doesn't help, reduce the batch size (default 32).

In [None]:
epochs = 200
batch_size = 32

# Advanced settings. You can probably leave these at their defaults (1e-3, 3e-6, empty, checked).
learning_rate = 1e-3
min_learning_rate = 3e-6
pretrained_path = ""
load_checkpoints = True

import os
from hydra.experimental import compose, initialize
from hydra.core.global_hydra import GlobalHydra
from omegaconf import OmegaConf
import pytorch_lightning as pl
from nemo.collections.common.callbacks import LogEpochTimeCallback
from nemo.collections.tts.models import TalkNetSpectModel
from nemo.core.config import hydra_runner

def train(cfg):
    cfg.sample_rate = 22050
    cfg.train_dataset = os.path.join(output_dir, "trainfiles.json")
    cfg.validation_datasets = os.path.join(output_dir, "valfiles.json")
    cfg.durs_file = os.path.join(output_dir, "durations.pt")
    cfg.f0_file = os.path.join(output_dir, "f0s.pt")
    cfg.trainer.accelerator = "dp"
    cfg.trainer.max_epochs = epochs
    cfg.trainer.check_val_every_n_epoch = 5
    cfg.model.train_ds.dataloader_params.batch_size = batch_size
    cfg.model.validation_ds.dataloader_params.batch_size = batch_size
    cfg.model.optim.lr = learning_rate
    cfg.model.optim.sched.min_lr = min_learning_rate
    cfg.exp_manager.exp_dir = output_dir

    # Find checkpoints
    ckpt_path = ""
    if load_checkpoints:
      path0 = os.path.join(output_dir, "TalkNetSpect")
      if os.path.exists(path0):
          path1 = sorted(os.listdir(path0))
          for i in range(len(path1)):
              path2 = os.path.join(path0, path1[-(1+i)], "checkpoints")
              if os.path.exists(path2):
                  match = [x for x in os.listdir(path2) if "last.ckpt" in x]
                  if len(match) > 0:
                      ckpt_path = os.path.join(path2, match[0])
                      print("Resuming training from " + match[0])
                      break
    
    if ckpt_path != "":
        trainer = pl.Trainer(**cfg.trainer, resume_from_checkpoint = ckpt_path)
        model = TalkNetSpectModel(cfg=cfg.model, trainer=trainer)
    else:
        if pretrained_path != "":
            warmstart_path = pretrained_path
        else:
            warmstart_path = os.path.join(cwd, "talknet_spect.nemo")
        trainer = pl.Trainer(**cfg.trainer)
        model = TalkNetSpectModel.restore_from(warmstart_path, override_config_path=cfg)
        model.set_trainer(trainer)
        model.setup_training_data(cfg.model.train_ds)
        model.setup_validation_data(cfg.model.validation_ds)
        model.setup_optimization(cfg.model.optim)
        print("Warm-starting from " + warmstart_path)
    exp_manager(trainer, cfg.get('exp_manager', None))
    trainer.callbacks.extend([pl.callbacks.LearningRateMonitor(), LogEpochTimeCallback()])  # noqa
    trainer.fit(model)

os.chdir(cwd)
GlobalHydra().clear()
initialize(config_path="conf")
cfg = compose(config_name="talknet-spect")
train(cfg)

    
    The version_base parameter is not specified.
    Please specify a compatability version level, or None.
    Will assume defaults for version 1.1
      self.delegate = real_initialize(
    
    
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
[NeMo W 2022-07-06 02:46:38 modelPT:138] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.asr.data.audio_to_text.AudioToCharWithDursF0Dataset
      manifest_filepath: /home/beast/Documents/GitHub/AutomaticTikTalk/trainfiles.json
      max_duration: null
      min_duration: 0.1
      int_values: false
      load_audio: true
      normalize: false
      sample_rate: 22050
      trim: false
      durs_file: /home/beast/Documents/GitHub/AutomaticTikTalk/durations.pt
      f0_file: /home/beast/Documents/GitHub/AutomaticTikTalk/f0s.pt
   

[NeMo I 2022-07-06 02:46:38 features:252] PADDING: 1
[NeMo I 2022-07-06 02:46:38 features:269] STFT using torch


      librosa.filters.mel(sample_rate, self.n_fft, n_mels=nfilt, fmin=lowfreq, fmax=highfreq), dtype=torch.float
    


[NeMo I 2022-07-06 02:46:39 modelPT:439] Model TalkNetSpectModel was successfully restored from /home/beast/Documents/GitHub/AutomaticTikTalk/talknet_spect.nemo.
[NeMo I 2022-07-06 02:46:39 collections:173] Dataset loaded with 87 files totalling 0.27 hours
[NeMo I 2022-07-06 02:46:39 collections:174] 0 files were filtered totalling 0.00 hours
[NeMo I 2022-07-06 02:46:39 collections:173] Dataset loaded with 8 files totalling 0.02 hours
[NeMo I 2022-07-06 02:46:39 collections:174] 0 files were filtered totalling 0.00 hours


[NeMo W 2022-07-06 02:46:39 modelPT:660] The lightning trainer received accelerator: dp. We recommend to use 'ddp' instead.


[NeMo I 2022-07-06 02:46:39 modelPT:751] Optimizer config = Adam (
    Parameter Group 0
        amsgrad: False
        betas: (0.9, 0.999)
        eps: 1e-08
        lr: 0.001
        weight_decay: 1e-06
    )
[NeMo I 2022-07-06 02:46:39 lr_scheduler:621] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7f7961c004f0>" 
    will be used during training (effective maximum steps = 600) - 
    Parameters : 
    (min_lr: 3.0e-06
    warmup_ratio: 0.02
    max_steps: 600
    )
Warm-starting from /home/beast/Documents/GitHub/AutomaticTikTalk/talknet_spect.nemo
[NeMo I 2022-07-06 02:46:39 3183045548:199] Experiments will be logged at /home/beast/Documents/GitHub/AutomaticTikTalk/TalkNetSpect/2022-07-06_02-30-09
[NeMo I 2022-07-06 02:46:39 3183045548:531] TensorboardLogger has been set up


      rank_zero_deprecation(
    
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
[NeMo W 2022-07-06 02:46:39 modelPT:660] The lightning trainer received accelerator: dp. We recommend to use 'ddp' instead.


[NeMo I 2022-07-06 02:46:39 modelPT:751] Optimizer config = Adam (
    Parameter Group 0
        amsgrad: False
        betas: (0.9, 0.999)
        eps: 1e-08
        lr: 0.001
        weight_decay: 1e-06
    )
[NeMo I 2022-07-06 02:46:39 lr_scheduler:621] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7f7961bdf670>" 
    will be used during training (effective maximum steps = 600) - 
    Parameters : 
    (min_lr: 3.0e-06
    warmup_ratio: 0.02
    max_steps: 600
    )



  | Name         | Type                              | Params
-------------------------------------------------------------------
0 | preprocessor | AudioToMelSpectrogramPreprocessor | 0     
1 | embed        | GaussianEmbedding                 | 7.6 K 
2 | norm_f0      | MaskedInstanceNorm1d              | 0     
3 | res_f0       | StyleResidual                     | 512   
4 | model        | ConvASREncoder                    | 8.7 M 
5 | proj         | Conv1d                            | 82.0 K
-------------------------------------------------------------------
8.7 M     Trainable params
0         Non-trainable params
8.7 M     Total params
34.986    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

      rank_zero_warn(
    
      value = torch.tensor(value, device=device, dtype=torch.float)
    


Training: 0it [00:00, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 74.00 MiB (GPU 0; 7.92 GiB total capacity; 5.91 GiB already allocated; 94.81 MiB free; 6.17 GiB reserved in total by PyTorch)

**Step 7:** Generate GTA spectrograms. This will help HiFi-GAN learn what your TalkNet model sounds like.

If this step fails, make sure you've finished training the spectrogram generator.

In [None]:
import sys
import os
import torch
import numpy as np
from tqdm import tqdm
from nemo.collections.tts.models import TalkNetSpectModel
import shutil

def fix_paths(inpath):
    output = ""
    with open(inpath, "r", encoding="utf8") as f:
        for l in f.readlines():
            if l[:5].lower() != "wavs/":
                output += "wavs/" + l
            else:
                output += l
    with open(inpath, "w", encoding="utf8") as w:
        w.write(output)

shutil.copyfile(train_filelist, os.path.join(cwd, "hifi-gan", "training.txt"))
shutil.copyfile(val_filelist, os.path.join(cwd, "hifi-gan", "validation.txt"))
fix_paths(os.path.join(cwd, "hifi-gan", "training.txt"))
fix_paths(os.path.join(cwd, "hifi-gan", "validation.txt"))
fix_paths(os.path.join(output_dir, "allfiles.txt"))

os.chdir(cwd)
outdir = os.path.join(dataset_path, "wavs")
if not os.path.exists(outdir):
    os.mkdir(outdir)

model_path = output_dir + "talknet_spect.nemo"
print("model_path"+ model_path)

dur_path = os.path.join(output_dir, "durations.pt")
f0_path = os.path.join(output_dir, "f0s.pt")

model = TalkNetSpectModel.restore_from(model_path)
model.eval()
with open(os.path.join(output_dir, "allfiles.txt"), "r", encoding="utf-8") as f:
    dataset = f.readlines()
durs = torch.load(dur_path)
f0s = torch.load(f0_path)

for x in tqdm(dataset):
    x_name = os.path.splitext(os.path.basename(x.split("|")[0].strip()))[0]
    x_tokens = model.parse(text=x.split("|")[1].strip())
    x_durs = (
        torch.stack(
            (
                durs[x_name]["blanks"],
                torch.cat((durs[x_name]["tokens"], torch.zeros(1).int())),
            ),
            dim=1,
        )
        .view(-1)[:-1]
        .view(1, -1)
        .to("cuda:0")
    )
    x_f0s = f0s[x_name].view(1, -1).to("cuda:0")
    x_spect = model.force_spectrogram(tokens=x_tokens, durs=x_durs, f0=x_f0s)
    rel_path = os.path.splitext(x.split("|")[0].strip())[0][5:]
    abs_dir = os.path.join(outdir, os.path.dirname(rel_path))
    if abs_dir != "" and not os.path.exists(abs_dir):
        os.makedirs(abs_dir, exist_ok=True)
    np.save(os.path.join(outdir, rel_path + ".npy"), x_spect.detach().cpu().numpy())


model_path/home/beast/Documents/GitHub/AutomaticTikTalk/talknet_spect.nemo


[NeMo W 2022-07-06 02:54:02 model_utils:391] Skipped conversion for config/subconfig:
    {'_target_': 'nemo.collections.asr.data.audio_to_text.AudioToCharWithDursF0Dataset', 'manifest_filepath': '???', 'max_duration': None, 'min_duration': 0.1, 'int_values': False, 'load_audio': True, 'normalize': False, 'sample_rate': 22050, 'trim': False, 'durs_file': '???', 'f0_file': '???', 'blanking': True, 'vocab': {'notation': 'phonemes', 'punct': True, 'spaces': True, 'stresses': False, 'add_blank_at': 'last'}}
     Reason: Missing mandatory value: train_ds.dataset.manifest_filepath
        full_key: train_ds.dataset.manifest_filepath
        object_type=dict.
[NeMo W 2022-07-06 02:54:02 model_utils:391] Skipped conversion for config/subconfig:
    {'_target_': 'nemo.collections.asr.data.audio_to_text.AudioToCharWithDursF0Dataset', 'manifest_filepath': '???', 'max_duration': None, 'min_duration': 0.1, 'int_values': False, 'load_audio': True, 'normalize': False, 'sample_rate': 22050, 'trim': Fa

[NeMo I 2022-07-06 02:54:02 features:252] PADDING: 1
[NeMo I 2022-07-06 02:54:02 features:269] STFT using torch


      librosa.filters.mel(sample_rate, self.n_fft, n_mels=nfilt, fmin=lowfreq, fmax=highfreq), dtype=torch.float
    


RuntimeError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 7.92 GiB total capacity; 5.94 GiB already allocated; 54.31 MiB free; 6.19 GiB reserved in total by PyTorch)

**Step 8:** Train HiFi-GAN. 2,000+ steps are recommended.
Stop this cell to finish training the model.

* Click on Kernel -> Restart, re-run step 1, and try again.
If this step still fails to start, make sure step 7 finished successfully.

Note: If the training process starts at step 2500000, delete the HiFiGAN folder and try again.

In [None]:
import gdown
d = 'https://drive.google.com/uc?id='

os.chdir(os.path.join(cwd, "hifi-gan"))

hifi_train = os.path.join(cwd, "hifi-gan", "training.txt")
hifi_val = os.path.join(cwd, "hifi-gan", "validation.txt")
hifi_wavs = os.path.join(dataset_path)

if not os.path.exists(os.path.join(output_dir, "HiFiGAN")):
    os.makedirs(os.path.join(output_dir, "HiFiGAN"))
if not os.path.exists(os.path.join(output_dir, "HiFiGAN", "do_00000000")):
    print("Downloading universal model...")
    gdown.download(d+"1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW", os.path.join(output_dir, "HiFiGAN", "g_00000000"), quiet=False)
    gdown.download(d+"1O63eHZR9t1haCdRHQcEgMfMNxiOciSru", os.path.join(output_dir, "HiFiGAN", "do_00000000"), quiet=False)
    start_from_universal = "--warm_start True "
else:
    start_from_universal = ""

!python train.py --fine_tuning True --config config_v1b.json \
{start_from_universal} \
--checkpoint_interval 250 --checkpoint_path "{os.path.join(output_dir, 'HiFiGAN')}" \
--input_training_file "{hifi_train}" \
--input_validation_file "{hifi_val}" \
--input_wavs_dir "{hifi_wavs}"


Initializing Training Process..
Batch size per GPU : 8
Traceback (most recent call last):
  File "train.py", line 280, in <module>
    main()
  File "train.py", line 275, in main
    mp.spawn(train, nprocs=h.num_gpus, args=(a, h, a.warm_start,))
  File "/home/beast/miniconda3/envs/ctnemo/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 230, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/home/beast/miniconda3/envs/ctnemo/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 188, in start_processes
    while not context.join():
  File "/home/beast/miniconda3/envs/ctnemo/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 150, in join
    raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException: 

-- Process 0 terminated with the following error:
Traceback (most recent call last):
  File "/home/beast/miniconda3/envs/ctnemo/lib/python3.

**Step 9:** Package the models. They'll be saved to the output directory as [character_name]_TalkNet.zip.

When done, upload it to Google Drive, with permissions set to "Anyone with the link". 
You can then use it with TalkNet by selecting "Custom model" as your character. 

This cell will also delete the training checkpoints and logs.
That should free up roughly 2 GB of space.
If you wish to keep them, set delete_checkpoints to False.

In [None]:
character_name = "Character"

import shutil
from zipfile import ZipFile

def find_talknet(model_dir):
    ckpt_path = ""
    path0 = os.path.join(output_dir, model_dir)
    if os.path.exists(path0):
        path1 = sorted(os.listdir(path0))
        for i in range(len(path1)):
            path2 = os.path.join(path0, path1[-(1+i)], "checkpoints")
            if os.path.exists(path2):
                match = [x for x in os.listdir(path2) if ".nemo" in x]
                if len(match) > 0:
                    ckpt_path = os.path.join(path2, match[0])
                    break
    assert ckpt_path != "", "Couldn't find " + model_dir
    return ckpt_path

assert os.path.exists(os.path.join(output_dir, "HiFiGAN", "g_00000000")), "Couldn't find HiFi-GAN"

zip = ZipFile(os.path.join(output_dir, character_name + "_TalkNet.zip"), 'w')
zip.write(output_dir, "talknet_durs.nemo")
zip.write(output_dir, "talknet_pitch.nemo")
zip.write(output_dir, "talknet_spect.nemo")
zip.write(os.path.join(output_dir, "HiFiGAN", "g_00000000"), "hifiganmodel")
zip.write(os.path.join(output_dir, "HiFiGAN", "config.json"), "config.json")
zip.write(os.path.join(output_dir, "f0_info.json"), "f0_info.json")
zip.close()
print("Archived model to " + os.path.join(output_dir, character_name + "_TalkNet.zip"))


Archived model to /home/beast/Documents/GitHub/AutomaticTikTalk/Character_TalkNet.zip
