In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../src')

In [None]:
from ponysynth.models.label_embeddings import *

# if we have excess gpu memory and compute, virtualize the gpu to increase the batch size
# if we have excess gpu compute, run multiple steps per batch

# treat gradient descent as a beam search
# if we have excess gpu memory, increase the beam size
# if we have excess gpu compute, increase the beam depth

archive_fn = '/home/celestia/data/audio-tfrecord/*'

# strategy = tf.distribute.OneDeviceStrategy(device='/gpu:0')
#strategy = tf.distribute.MirroredStrategy()
gpus = tf.config.experimental.list_logical_devices('GPU')
strategy = snt.distribute.Replicator(
    [f"/device:GPU:{i}" for i in range(len(gpus))],
    tf.distribute.ReductionToOneDevice("GPU:0")
)
print(f'using {len(gpus)} gpus')

replica_batch_size = 64
global_batch_size = replica_batch_size * len(gpus)


dataset = input_db(archive_fn, global_batch_size)
dataset = strategy.experimental_distribute_dataset(dataset)


with strategy.scope():
    opt = snt.optimizers.Adam(learning_rate=0.0004)
    model = VAEModel(SimpleVAE(30))
    
    def train_step(ids, weights):
        features = tf.sparse.to_dense(weights)
        features.set_shape((replica_batch_size, index_end - index_start))
        loss, grads, params = model.gradients(features)
        opt.apply(grads, params)
        return loss
        
    @tf.function
    def distributed_train_step(ids, weights):    
        losses = strategy.experimental_run_v2(train_step, args=(ids, weights))
        return strategy.reduce(tf.distribute.ReduceOp.SUM, losses, axis=None) / global_batch_size
        
    for ids, weights in dataset:
        loss = distributed_train_step(ids, weights)
        print(loss)

# todo: test this with tpus
# todo: add loss mask for unknown values
# todo: train with bert-like mask
# todo: figure out what structure to use... convolution?
# todo: figure out how to generate embeddings
# todo: create a encoding -> 2d differential net for speech
# todo: write custom optimizer for metalearning
# THEN improve with xlnet, al-bert, etc



In [178]:
from ponysynth.corpus import *
from ponysynth.clipbot import ClipBot
from datapipes.audiorecord_out import *
from IPython import display as ipd
import numpy as np
import scipy

archive = ClipperArchive('/home/celestia/data/clipper-preproc/audio-tar/Burnt-Oak.tar')
extra = InfoArchive('/home/celestia/data/clipper-preproc/audio-info/Burnt-Oak.txz')

def find_cutoff_times(label, info):
#     return label['words'][0]['interval'][0], label['words'][-1]['interval'][-1]
    intensities = [x for x in info['intensity']]
    th = threshold([float(x['volume.db']) for x in intensities])
    
    for start in intensities:
        if float(start['volume.db']) >= th:
            break
    
    for end in intensities[::-1]:
        if float(end['volume.db']) >= th:
            break
    
    return float(start['time.sec']), float(end['time.sec'])


def fade(indexes):
    return indexes * 0

def threshold(intensities):
    mean = np.mean(intensities)
    median = np.median(intensities)
    reference = None

    if median > mean:
        reference = median
    else:
        reference = np.median(intensity[intensities > mean])

    return reference - 13




def apply_faded_trim(key):
    audio = archive.read_audio(key)
    samples, rate = librosa.core.load(audio, sr=None)
    label = archive.read_label(key)
    info = extra.read_info(key)
    
    clipbot = ClipBot(samples, rate)
    start_time, end_time = find_cutoff_times(label, info)
    print(start_time, end_time)
    
    start, mid, end = clipbot.split(start_time, end_time)
    start.mod.crossfade(pre=fade)
    end.mod.crossfade(post=fade)
    
    return clipbot.get_samples(), rate

import matplotlib.pyplot as plt

#for key in archive.sample(k=10):
for key in ('Burnt-Oak-outtakes:s7e13-2266.283789-2268.209568',):
    print(key)
    clean_samples, rate = apply_faded_trim(key)    
    unclean_file = archive.read_audio(key)
    unclean_samples, _ = librosa.core.load(unclean_file, sr=None)
    
    trim_diff = clean_samples - unclean_samples
    label = archive.read_label(key)
    
    trimbot = ClipBot(clean_samples, rate).first_word(label)
    diffbot = ClipBot(trim_diff, rate).first_word(label)
    clipbot = ClipBot(unclean_samples, rate).first_word(label)
    
    trimbot.draw.audio()
    
    clipbot.draw.figure()
    clipbot.draw.samples()
    clipbot.draw.show()

    
    diffbot.draw.figure()
    diffbot.draw.samples()
    diffbot.draw.show()

    
# TODO: move ClipDrawBot to DrawBot

Burnt-Oak-outtakes:s7e13-2266.283789-2268.209568


IndexError: boolean index did not match indexed array along dimension 0; dimension is 414 but corresponding boolean dimension is 233

In [165]:
%load_ext py_d3


UsageError: Line magic function `%%d3` not found.


In [167]:
%%d3

# edge cases:
# Burnt-Oak-outtakes:s7e13-2111.012246-2112.833525
# Burnt-Oak-outtakes:s7e13-2000.182908-2003.228327