In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchaudio
import IPython
import sys
sys.path.append("..")
from dataset_generation.randomDataset import RandomDataset

from utils.cnn import CNNNetwork
from utils import LABELS
from utils.lstm import LSTMNetwork

ModuleNotFoundError: No module named 'torch'

In [None]:
## LOAD IN MODELS HERE
# NOTE: `sparams` is handed to torch.load() below, so this string is used as
# the checkpoint's path on disk (it appears to encode the hyper-parameters).
sparams = "7, input_shape=(1, 80, 313), layers=[16, 32, 64, 128], stride=(1,2), kernel=(5, 5), nfft=512, nmel=80"
style_model = CNNNetwork(7, input_shape=(1, 80, 313), layers=[16, 32, 64, 128], stride=(1,2), kernel=(5, 5))
# Warm-up forward pass before the weights are loaded. The dummy batch now has
# 80 mel bins so it agrees with `input_shape` above and `n_mels=80` below
# (it previously used 60). No autograd graph is needed for it.
with torch.no_grad():
    style_model(torch.rand(64, 1, 80, 313))
# map_location keeps the load working on machines without the device the
# checkpoint was saved from.
style_model.load_state_dict(torch.load(sparams, map_location="cpu"))

vgg = models.vgg11(pretrained=True)
# torchvision's VGG has only three direct children (features, avgpool,
# classifier), so `list(vgg.children())[:-4]` is an empty list and the
# resulting Sequential is an identity map. Truncate the convolutional
# `features` stack instead so the content model really extracts features.
content_model = torch.nn.Sequential(*list(vgg.features.children())[:-4])
style_model.eval()
content_model.eval()
# Both networks are fixed feature extractors here: only the spectrogram is
# optimised, so do not track or accumulate gradients for their weights.
for frozen_net in (style_model, content_model):
    frozen_net.requires_grad_(False)

melSpec = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, n_mels=80)
src = '../dataset_generation/denoised_speeches'
dataSet = RandomDataset(src, 16000, 10000, melSpec, 6, 5)

## Initialize our input
# Work on a detached copy so `spec` is a leaf tensor the optimiser can update
# and the tensor returned by the dataset is left untouched.
# assumes dataSet[i][0] is a (1, n_mels, frames) mel spectrogram -- TODO confirm
spec = dataSet[4][0].detach().clone().requires_grad_(True)

# Fixed reference for the content loss. It is detached so it never changes
# while `spec` is being optimised. The single channel is repeated three times
# because the VGG-based content model expects a 3-channel input.
spec_original = spec.detach().clone()
spec_original = torch.cat((spec_original, spec_original, spec_original)).unsqueeze(0)
with torch.no_grad():
    original_content = content_model(spec_original).squeeze()

voice = "Amos"
target_style = torch.zeros(1, 7)
target_style[0][LABELS.index(voice)] = 1
# CrossEntropyLoss takes the class index, not the one-hot row.
target_class = target_style.argmax(dim=1)

c_loss_fn = torch.nn.MSELoss()
s_loss_fn = torch.nn.CrossEntropyLoss()
optim = torch.optim.Adam([spec], lr=0.01)
num_iterations = 1000
for i in range(num_iterations):
    # Zero the gradients
    optim.zero_grad()

    # Content features of the current spectrogram from the VGG-based model.
    # This must NOT run under torch.no_grad(): the content loss has to
    # back-propagate into `spec`, otherwise only the style term drives the
    # update and the content weight below has no effect.
    inp = torch.cat((spec, spec, spec)).unsqueeze(0)
    content_output = content_model(inp).squeeze()

    # Calculate the content loss
    content_loss = c_loss_fn(content_output, original_content)

    # Style output from the speaker model, scored against the target speaker
    # with the classification loss (`s_loss_fn` was defined but never used;
    # the style term was previously computed with the MSE content loss).
    current_style = style_model(spec.unsqueeze(0))
    style_loss = s_loss_fn(current_style, target_class)
    loss = 0.2*content_loss + 0.8*style_loss

    # Backpropagate the gradients
    loss.backward()

    # Update the input spectrogram
    optim.step()
    if i%10 == 0:
        # Log every 10th step only, as plain floats, to keep the output short.
        print(content_loss.item(), style_loss.item())
        print(f"Iteration {i + 1}/{num_iterations}: Loss = {loss.item()}")

In [None]:
## Decode the spectrogram
bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
vocoder = bundle.get_vocoder()

# The vocoder takes (specgrams, lengths) where `lengths` is a tensor holding
# the number of valid time frames for each batch item. The previous call
# passed `spec.shape[1]`, a plain int taken from the frequency axis.
# assumes `spec` is (batch, n_mels, frames) -- TODO confirm
spec_lengths = torch.tensor([spec.shape[-1]] * spec.shape[0])

# NOTE(review): this pretrained vocoder may expect a different mel
# representation (scaling / sample rate) than the 16 kHz MelSpectrogram used
# to build `spec`; verify against the bundle documentation before trusting
# the audio quality.
with torch.inference_mode():
    waveforms, lengths = vocoder(spec.detach(), spec_lengths)


IPython.display.Audio(waveforms[0:1].cpu(), rate=vocoder.sample_rate)