In [1]:
# Mount Google Drive at /models (Colab only).
from google.colab import drive
drive.mount('/models')

Mounted at /models


In [2]:
import os
# Work from the AutoVC folder on the mounted Drive; the relative paths used
# by later cells resolve against this directory.
os.chdir('/models/MyDrive/AutoVC')

In [3]:
import pickle
import torch
import numpy as np
import soundfile as sf
from math import ceil
from collections import OrderedDict
from numpy.random import RandomState
from scipy import signal
from scipy.signal import get_window
from librosa.filters import mel

from model_vc import Generator
from model_bl import D_VECTOR

def butter_highpass(cutoff, fs, order=5):
    """Design a digital Butterworth high-pass filter.

    Args:
        cutoff: Cutoff frequency, in the same units as ``fs``.
        fs: Sampling rate of the signal that will be filtered.
        order: Filter order handed to ``scipy.signal.butter``.

    Returns:
        The ``(b, a)`` numerator and denominator coefficient arrays.
    """
    # scipy expects the cutoff as a fraction of the Nyquist frequency (fs / 2).
    nyquist = 0.5 * fs
    return signal.butter(order, cutoff / nyquist, btype='high', analog=False)
    
    
def pySTFT(x, fft_length=1024, hop_length=256):
    """Return the magnitude short-time Fourier transform of ``x``.

    The signal is reflect-padded by ``fft_length // 2`` samples, sliced into
    overlapping frames of ``fft_length`` samples spaced ``hop_length`` apart,
    multiplied by a Hann window and transformed with a real FFT.

    Args:
        x: Input samples (framing is done along the last axis).
        fft_length: Frame length and FFT size.
        hop_length: Step between the starts of consecutive frames.

    Returns:
        Magnitudes; for 1-D input the shape is
        ``(fft_length // 2 + 1, n_frames)``.
    """
    half_window = int(fft_length // 2)
    padded = np.pad(x, half_window, mode='reflect')

    # Build a strided view in which each row is one frame; consecutive rows
    # start hop_length samples apart, so no sample data is copied here.
    overlap = fft_length - hop_length
    n_frames = (padded.shape[-1] - overlap) // hop_length
    sample_stride = padded.strides[-1]
    frames = np.lib.stride_tricks.as_strided(
        padded,
        shape=padded.shape[:-1] + (n_frames, fft_length),
        strides=padded.strides[:-1] + (hop_length * sample_stride, sample_stride),
    )

    window = get_window('hann', fft_length, fftbins=True)
    # Transpose so frequency bins come first, then take magnitudes.
    spectrum = np.fft.rfft(window * frames, n=fft_length).T
    return np.abs(spectrum)

def pad_seq(x, base=32):
    """Zero-pad a 2-D array along axis 0 up to the next multiple of ``base``.

    Args:
        x: 2-D array; rows are appended at the end of axis 0.
        base: The padded row count is the smallest multiple of ``base`` that
            is not less than ``x.shape[0]``.

    Returns:
        ``(padded, len_pad)`` where ``len_pad`` is the number of rows added.
    """
    n_rows = x.shape[0]
    target_rows = int(base * ceil(float(n_rows) / base))
    len_pad = target_rows - n_rows
    assert len_pad >= 0
    padded = np.pad(x, ((0, len_pad), (0, 0)), 'constant')
    return padded, len_pad


In [4]:
# Folder that is walked for per-speaker sub-folders of audio files.
wavsDir = './wavs'
# Folder that receives the mel-spectrograms, one sub-folder per speaker.
spmelDir = './spmel'

Make Mel-Spectrograms from WAVs

In [5]:
# Mel filterbank, transposed so that (frames x bins) magnitude spectrograms
# can be right-multiplied by it. `sr` and `n_fft` are passed by keyword
# because librosa >= 0.10 no longer accepts them positionally.
mel_basis = mel(sr=16000, n_fft=1024, fmin=90, fmax=7600, n_mels=80).T
# Amplitude floor equal to 10 ** (-100 / 20), i.e. -100 dB.
min_level = np.exp(-100 / 20 * np.log(10))
b, a = butter_highpass(30, 16000, order=5)

dirName, subdirList, _ = next(os.walk(wavsDir))
for subdir in sorted(subdirList):
    print(subdir)
    os.makedirs(os.path.join(spmelDir, subdir), exist_ok=True)
    _, _, fileList = next(os.walk(os.path.join(dirName, subdir)))
    # Seed the noise generator from the digits after the first character of
    # the folder name (e.g. 'p225' -> 225) so each speaker's noise is repeatable.
    prng = RandomState(int(subdir[1:]))
    for fileName in sorted(fileList):
        # Read audio file
        # NOTE(review): `fs` is not checked against the 16000 Hz that the
        # filter and mel basis above were designed for.
        x, fs = sf.read(os.path.join(dirName, subdir, fileName))
        # Remove drifting noise
        y = signal.filtfilt(b, a, x)
        # Add a little random noise for model robustness
        wav = y * 0.96 + (prng.rand(y.shape[0]) - 0.5) * 1e-06
        # Compute spect
        D = pySTFT(wav).T
        # Convert to mel and normalize
        D_mel = np.dot(D, mel_basis)
        D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16
        S = np.clip((D_db + 100) / 100, 0, 1)
        # Save spect; splitext drops the extension whatever its length
        # (np.save appends '.npy' itself).
        np.save(os.path.join(spmelDir, subdir, os.path.splitext(fileName)[0]),
                S.astype(np.float32), allow_pickle=False)

p225
p226
p227
p228


Make Metadata

In [6]:
# Build the speaker encoder, switch it to eval mode and move it to the GPU.
C = D_VECTOR(dim_input=80, dim_cell=768, dim_emb=256).eval().cuda()
c_checkpoint = torch.load('3000000-BL.ckpt')
new_state_dict = OrderedDict()
for key, val in c_checkpoint['model_b'].items():
    # Drop the first 7 characters of every parameter name
    # (presumably a 'module.' prefix left by DataParallel -- TODO confirm).
    new_key = key[7:]
    new_state_dict[new_key] = val
C.load_state_dict(new_state_dict)

num_uttrs = 10   # number of utterances averaged into one speaker embedding
len_crop = 128   # number of frames in each crop fed to the encoder

# Each entry of `speakers` is a list:
#   [speaker_name, mean_embedding, relative_path_1, relative_path_2, ...]
# NOTE(review): np.random is not seeded here, so the utterances and crops
# that are sampled differ from run to run.
dirName, subdirList, _ = next(os.walk(spmelDir))
speakers = []
for speaker in sorted(subdirList):
    print('Processing speaker: %s' % speaker)
    utterances = []
    utterances.append(speaker)
    _, _, fileList = next(os.walk(os.path.join(dirName, speaker)))

    # make speaker embedding
    assert len(fileList) >= num_uttrs
    idx_uttrs = np.random.choice(len(fileList), size=num_uttrs, replace=False)
    embs = []
    for i in range(num_uttrs):
        tmp = np.load(os.path.join(dirName, speaker, fileList[idx_uttrs[i]]))
        # Replacement pool: every utterance that was not drawn in idx_uttrs.
        candidates = np.delete(np.arange(len(fileList)), idx_uttrs)
        # choose another utterance if the current one is too short
        while tmp.shape[0] < len_crop:
            if candidates.size == 0:
                raise ValueError(
                    'Speaker %s has no utterance with at least %d frames left to sample'
                    % (speaker, len_crop))
            idx_alt = np.random.choice(candidates)
            tmp = np.load(os.path.join(dirName, speaker, fileList[idx_alt]))
            candidates = np.delete(candidates, np.argwhere(candidates == idx_alt))
        # randint's upper bound is exclusive: the +1 makes the last valid
        # offset reachable and avoids randint(0, 0) raising ValueError when
        # the utterance is exactly len_crop frames long.
        left = np.random.randint(0, tmp.shape[0] - len_crop + 1)
        melsp = torch.from_numpy(tmp[np.newaxis, left:left+len_crop, :]).cuda()
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            emb = C(melsp)
        embs.append(emb.detach().squeeze().cpu().numpy())
    utterances.append(np.mean(embs, axis=0))

    # create file list
    for fileName in sorted(fileList):
        utterances.append(os.path.join(speaker, fileName))
    speakers.append(utterances)

Processing speaker: p225
Processing speaker: p226
Processing speaker: p227
Processing speaker: p228


In [7]:
# Persist the per-speaker metadata list to disk.
with open('metadata.pkl', 'wb') as metadata_file:
    pickle.dump(speakers, metadata_file)

Convert Mel-Spectrograms

In [8]:
device = 'cuda:0'
# NOTE(review): the positional sizes presumably have to match the
# architecture stored in 'autovc.ckpt' -- confirm against model_vc.Generator.
G = Generator(32,256,512,32).eval().to(device)

g_checkpoint = torch.load('autovc.ckpt', map_location=device)
G.load_state_dict(g_checkpoint['model'])

# Read the metadata inside a context manager so the file handle is closed.
# pickle can execute arbitrary code on load: only read files this notebook wrote.
with open('metadata.pkl', 'rb') as handle:
    metadata = pickle.load(handle)
# Collects (name, converted_spectrogram) pairs, named '<source>x<target>'.
spect_vc = []

for sbmt_i in metadata:
    # Index 2 is the first utterance listed for this speaker; it is either a
    # path relative to spmelDir or an already-loaded array.
    x_org = sbmt_i[2]
    if isinstance(x_org, str):
        x_org = np.load(os.path.join(spmelDir, x_org))

    x_org, len_pad = pad_seq(x_org)
    uttr_org = torch.from_numpy(x_org[np.newaxis, :, :]).to(device)
    emb_org = torch.from_numpy(sbmt_i[1][np.newaxis, :]).to(device)

    # Convert the source utterance towards every speaker, itself included.
    for sbmt_j in metadata:

        emb_trg = torch.from_numpy(sbmt_j[1][np.newaxis, :]).to(device)

        with torch.no_grad():
            _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_trg)

        # Trim the frames pad_seq appended; the zero case needs its own
        # branch because a ':-0' slice would be empty.
        if len_pad == 0:
            uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
        else:
            uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()

        spect_vc.append( ('{}x{}'.format(sbmt_i[0], sbmt_j[0]), uttr_trg) )

In [9]:
# Persist the list of (name, converted spectrogram) pairs to disk.
with open('results.pkl', 'wb') as results_file:
    pickle.dump(spect_vc, results_file)

Run Vocoder

In [10]:
!pip install wavenet_vocoder
from synthesis import build_model
from synthesis import wavegen

Collecting wavenet_vocoder
  Downloading https://files.pythonhosted.org/packages/0a/da/47da119dbd3cfc0c80b75270e4bc7b49b678bd94600928fa243922ad65bc/wavenet_vocoder-0.1.1.tar.gz
Building wheels for collected packages: wavenet-vocoder
  Building wheel for wavenet-vocoder (setup.py) ... [?25l[?25hdone
  Created wheel for wavenet-vocoder: filename=wavenet_vocoder-0.1.1-cp37-none-any.whl size=12666 sha256=b6c97e1ea6b5535d3def3d07ebd85964e08d6d05027e002bf61d9d3b230c257c
  Stored in directory: /root/.cache/pip/wheels/72/fc/21/02d3785b65dd072b110b44b9df98b8cbf72a89ddea424ff0d9
Successfully built wavenet-vocoder
Installing collected packages: wavenet-vocoder
Successfully installed wavenet-vocoder-0.1.1


In [11]:
# Reload the converted spectrograms inside a context manager so the file
# handle is closed. pickle can execute arbitrary code on load: only read
# files this notebook wrote.
with open('results.pkl', 'rb') as handle:
    spect_vc = pickle.load(handle)
device = torch.device("cuda")
model = build_model().to(device)

# map_location places the checkpoint tensors on the device the model uses,
# whichever device they were saved from.
checkpoint = torch.load("checkpoint_step001000000_ema.pth", map_location=device)
model.load_state_dict(checkpoint["state_dict"])

<All keys matched successfully>

In [12]:
# Synthesize a waveform from every converted spectrogram and write it to
# wavsDir as '<name>.wav' with a sample rate of 16000.
for name, c in spect_vc:
    waveform = wavegen(model, c=c)
    out_path = os.path.join(wavsDir, name+'.wav')
    sf.write(out_path, waveform, 16000)

100%|██████████| 96256/96256 [12:33<00:00, 127.72it/s]
100%|██████████| 96256/96256 [12:32<00:00, 127.94it/s]
100%|██████████| 96256/96256 [12:26<00:00, 128.86it/s]
100%|██████████| 96256/96256 [12:31<00:00, 128.10it/s]
100%|██████████| 109056/109056 [14:11<00:00, 128.03it/s]
100%|██████████| 109056/109056 [14:12<00:00, 127.99it/s]
100%|██████████| 109056/109056 [14:11<00:00, 128.11it/s]
100%|██████████| 109056/109056 [14:12<00:00, 127.93it/s]
100%|██████████| 112128/112128 [14:38<00:00, 127.71it/s]
100%|██████████| 112128/112128 [14:34<00:00, 128.26it/s]
100%|██████████| 112128/112128 [14:35<00:00, 128.00it/s]
100%|██████████| 112128/112128 [14:41<00:00, 127.14it/s]
100%|██████████| 119552/119552 [15:43<00:00, 126.65it/s]
100%|██████████| 119552/119552 [15:42<00:00, 126.81it/s]
100%|██████████| 119552/119552 [15:33<00:00, 128.08it/s]
100%|██████████| 119552/119552 [15:28<00:00, 128.75it/s]
