# 演習：声質変換

## 環境構築

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
cd /content/drive/'My Drive'/AI_Experiment/speech

/content/drive/My Drive/AI_Experiment/speech


In [3]:
!pip3 install pyworld
!pip3 install pysptk
!pip3 install dtw

Collecting pyworld
[?25l  Downloading https://files.pythonhosted.org/packages/af/88/003eef396c966cf00088900167831946b80b8e7650843905cb9590c2d9ca/pyworld-0.2.12.tar.gz (222kB)
[K     |█▌                              | 10kB 21.9MB/s eta 0:00:01[K     |███                             | 20kB 28.6MB/s eta 0:00:01[K     |████▍                           | 30kB 33.7MB/s eta 0:00:01[K     |█████▉                          | 40kB 31.8MB/s eta 0:00:01[K     |███████▍                        | 51kB 33.5MB/s eta 0:00:01[K     |████████▉                       | 61kB 35.3MB/s eta 0:00:01[K     |██████████▎                     | 71kB 26.5MB/s eta 0:00:01[K     |███████████▊                    | 81kB 24.8MB/s eta 0:00:01[K     |█████████████▎                  | 92kB 26.1MB/s eta 0:00:01[K     |██████████████▊                 | 102kB 23.4MB/s eta 0:00:01[K     |████████████████▏               | 112kB 23.4MB/s eta 0:00:01[K     |█████████████████▋              | 122kB 23.4MB/s eta 

## 特徴量の分析

In [4]:
import os
import sys
import glob

from scipy.io import wavfile # for wavfile I/O
import pyworld as pw
import numpy as np
import pysptk as sptk

In [5]:
spklist = ["SF", "TF"]  # speaker list [source female speaker, target female speaker]
featlist = ["mgc","f0","ap"]

In [6]:
# Making directories for speech features
for s in spklist:
    for f in featlist:
        if not os.path.exists("data/{}/{}".format(s,f)):
            os.mkdir("data/{}/{}".format(s,f))

In [7]:
for s in spklist:
    wavlist = os.listdir("data/{}/wav".format(s))
    for wf in wavlist:
        # WORLD analysis for each file
        print("speaker: {} file: {}".format(s,wf))
        fs, data = wavfile.read("data/{}/wav/{}".format(s,wf))
        data = data.astype(np.float)

        f0, t = pw.harvest(data, fs)
        sp = pw.cheaptrick(data, f0, t, fs)
        ap = pw.d4c(data, f0, t, fs)

        alpha = 0.42
        dim = 24
        mgc = sptk.sp2mc(sp, dim, alpha)

        bn, _ = os.path.splitext(wf)

        with open("data/{}/mgc/{}.mgc".format(s,bn),"wb") as f:
            mgc.tofile(f)
        with open("data/{}/f0/{}.f0".format(s,bn),"wb") as f:
            f0.tofile(f)
        with open("data/{}/ap/{}.ap".format(s,bn),"wb") as f:
            ap.tofile(f)

spekaer: SF file: atr503_a06.wav
spekaer: SF file: atr503_a11.wav
spekaer: SF file: atr503_a12.wav
spekaer: SF file: atr503_a39.wav
spekaer: SF file: atr503_a05.wav
spekaer: SF file: atr503_a04.wav
spekaer: SF file: atr503_a10.wav
spekaer: SF file: atr503_a17.wav
spekaer: SF file: atr503_a38.wav
spekaer: SF file: atr503_a13.wav
spekaer: SF file: atr503_a03.wav
spekaer: SF file: atr503_a07.wav
spekaer: SF file: atr503_a16.wav
spekaer: SF file: atr503_a14.wav
spekaer: SF file: atr503_a02.wav
spekaer: SF file: atr503_a15.wav
spekaer: SF file: atr503_a48.wav
spekaer: SF file: atr503_a28.wav
spekaer: SF file: atr503_a01.wav
spekaer: SF file: atr503_a50.wav
spekaer: SF file: atr503_a47.wav
spekaer: SF file: atr503_a49.wav
spekaer: SF file: atr503_a44.wav
spekaer: SF file: atr503_a46.wav
spekaer: SF file: atr503_a29.wav
spekaer: SF file: atr503_a45.wav
spekaer: SF file: atr503_a40.wav
spekaer: SF file: atr503_a41.wav
spekaer: SF file: atr503_a24.wav
spekaer: SF file: atr503_a25.wav
spekaer: S

## フレーム毎時間アラインメント

In [4]:
import os
import sys
import array

from dtw import dtw
import numpy as np
import pysptk as sptk

In [5]:
srcspk = "SF"
tgtspk = "TF"

mgclist = os.listdir("data/{}/mgc".format(srcspk))

if not os.path.isdir("data/{}/data".format(srcspk)):
    os.mkdir("data/{}/data".format(srcspk))
if not os.path.isdir("data/{}/data".format(tgtspk)):
    os.mkdir("data/{}/data".format(tgtspk))

In [6]:
def distfunc(x,y):
    # Euclid distance except first dim
    return np.linalg.norm(x[1:]-y[1:])

In [7]:
dim = 25 # mgc dim + 1
for mf in mgclist:
    print(mf)
    bn, _ = os.path.splitext(mf)
    srcfile = "data/{}/mgc/{}".format(srcspk,mf)
    tgtfile = "data/{}/mgc/{}".format(tgtspk,mf)

    with open(srcfile,"rb") as f:
        x = np.fromfile(f, dtype="<f8", sep="")
        x = x.reshape(len(x)//dim,dim)
    with open(tgtfile,"rb") as f:
        y = np.fromfile(f, dtype="<f8", sep="")
        y = y.reshape(len(y)//dim,dim)
    print("framelen: (x,y) = {} {}".format(len(x),len(y)))
    _,_,_, twf = dtw(x,y,distfunc)
    srcout = "data/{}/data/{}.dat".format(srcspk,bn)
    tgtout = "data/{}/data/{}.dat".format(tgtspk,bn)

    with open(srcout,"wb") as f:
        x[twf[0]].tofile(f)
    with open(tgtout,"wb") as f:
        y[twf[1]].tofile(f)

atr503_a06.mgc
framelen: (x,y) = 1297 1207
atr503_a11.mgc
framelen: (x,y) = 1165 1115
atr503_a12.mgc
framelen: (x,y) = 1201 1253
atr503_a39.mgc
framelen: (x,y) = 2087 2339
atr503_a05.mgc
framelen: (x,y) = 1195 1061
atr503_a04.mgc
framelen: (x,y) = 1095 1149
atr503_a10.mgc
framelen: (x,y) = 881 891
atr503_a17.mgc
framelen: (x,y) = 1515 1321
atr503_a38.mgc
framelen: (x,y) = 1155 1243
atr503_a13.mgc
framelen: (x,y) = 1411 1377
atr503_a03.mgc
framelen: (x,y) = 1027 907
atr503_a07.mgc
framelen: (x,y) = 1817 1757
atr503_a16.mgc
framelen: (x,y) = 1183 1205
atr503_a14.mgc
framelen: (x,y) = 1211 1217
atr503_a02.mgc
framelen: (x,y) = 971 923
atr503_a15.mgc
framelen: (x,y) = 1635 1605
atr503_a48.mgc
framelen: (x,y) = 1321 1461
atr503_a28.mgc
framelen: (x,y) = 1461 1411
atr503_a01.mgc
framelen: (x,y) = 1035 1099
atr503_a50.mgc
framelen: (x,y) = 1587 1755
atr503_a47.mgc
framelen: (x,y) = 1297 1123
atr503_a49.mgc
framelen: (x,y) = 1357 1465
atr503_a44.mgc
framelen: (x,y) = 1151 1235
atr503_a46.mgc
f

## 音声変換モデルの学習

In [8]:
# Listing training/evaluation data
!mkdir -p conf
!ls data/SF/data/ | head -45 | sed -e 's/\.dat//' > conf/train.list
!ls data/SF/data/ | tail -5 | sed -e 's/\.dat//' > conf/eval.list

In [9]:
import numpy as np
import torch
from torch import nn, optim
from torch.nn import functional as F
import os
import sys
import time

In [10]:
def get_dataset(dim=25):
    x = []
    y = []
    datalist = []
    with open("conf/train.list","r") as f:
        for line in f:
            line = line.rstrip()
            datalist.append(line)

    for d in datalist:
        print(d)
        with open("data/SF/data/{}.dat".format(d),"rb") as f:
            dat = np.fromfile(f,dtype="<f8",sep="")
            x.append(dat.reshape(len(dat)//dim,dim))
        with open("data/TF/data/{}.dat".format(d),"rb") as f:
            dat = np.fromfile(f,dtype="<f8",sep="")
            y.append(dat.reshape(len(dat)//dim,dim))
    return x,y

In [11]:
class VCDNN(nn.Module):
        def __init__(self, dim=25, n_units=256):
            super(VCDNN, self).__init__()
            self.fc = nn.ModuleList([
                           nn.Linear(dim, n_units),
                           nn.Linear(n_units, n_units),
                           nn.Linear(n_units, dim)
            ])
            
        def forward(self, x):
            h1 = F.relu(self.fc[0](x))
            h2 = F.relu(self.fc[1](h1))
            h3 = self.fc[2](h2)
            return h3
        
        def get_predata(self, x):
            _x = torch.from_numpy(x.astype(np.float32))
            return self.forward(_x).detach().numpy()

In [12]:
x_train, y_train = get_dataset()
# parameters for training
n_epoch = 50
dim = 25
n_units = 128
N = len(x_train)

model = VCDNN(dim,n_units)
model.double()
optimizer = optim.Adam(model.parameters())

loss_fn = nn.MSELoss()

# loop
model.train()

losses = []
sum_loss = 0

for epoch in range(1, n_epoch + 1):
    sum_loss = 0

    for i in range(0, N):
        x_batch =torch.from_numpy(x_train[i])
        y_batch = torch.from_numpy(y_train[i])
        
        optimizer.zero_grad()
        
        predict_y_batch = model(x_batch)
        loss = loss_fn(predict_y_batch, y_batch)
        loss.backward()
        optimizer.step()
        sum_loss += loss.item()
        
        average_loss = sum_loss / N
        losses.append(average_loss)

        print("epoch: {}/{}  loss: {}".format(epoch, n_epoch, average_loss))

if not os.path.isdir("model"):
    os.mkdir("model")
torch.save(model.state_dict(), "model/vcmodel.model")


atr503_a01
atr503_a02
atr503_a03
atr503_a04
atr503_a05
atr503_a06
atr503_a07
atr503_a08
atr503_a09
atr503_a10
atr503_a11
atr503_a12
atr503_a13
atr503_a14
atr503_a15
atr503_a16
atr503_a17
atr503_a18
atr503_a19
atr503_a20
atr503_a21
atr503_a22
atr503_a23
atr503_a24
atr503_a25
atr503_a26
atr503_a27
atr503_a28
atr503_a29
atr503_a30
atr503_a31
atr503_a32
atr503_a33
atr503_a34
atr503_a35
atr503_a36
atr503_a37
atr503_a38
atr503_a39
atr503_a40
atr503_a41
atr503_a42
atr503_a43
atr503_a44
atr503_a45
epoch: 1/50  loss: 0.021262574215613082
epoch: 1/50  loss: 0.037385685653266935
epoch: 1/50  loss: 0.05361787450015012
epoch: 1/50  loss: 0.07157102506623084
epoch: 1/50  loss: 0.08708577539764471
epoch: 1/50  loss: 0.10125178359364552
epoch: 1/50  loss: 0.11726297395707093
epoch: 1/50  loss: 0.12992376392906163
epoch: 1/50  loss: 0.14331113777376864
epoch: 1/50  loss: 0.15514360971110128
epoch: 1/50  loss: 0.16722487395412833
epoch: 1/50  loss: 0.1780713390731625
epoch: 1/50  loss: 0.185793605949526

In [13]:
!ls ./model/

vcmodel.model


## 学習したモデルによる音声の変換

In [14]:
import numpy as np
import pysptk as sptk
import pyworld as pw
from scipy.io import wavfile
import os
import sys
import time

In [15]:
dim = 25
n_units = 128

model = VCDNN(dim,n_units)
_ = model.load_state_dict(torch.load("model/vcmodel.model"))

In [16]:
# test data
x = []
datalist = []
with open("conf/eval.list","r") as f:
    for line in f:
        line = line.rstrip()
        datalist.append(line)

for d in datalist:
    with open("data/SF/mgc/{}.mgc".format(d),"rb") as f:
        dat = np.fromfile(f,dtype="<f8",sep="")
        x.append(dat.reshape(len(dat)//dim,dim))

if not os.path.isdir("result"):
    os.mkdir("result")
if not os.path.isdir("result/wav"):
    os.mkdir("result/wav")

fs = 16000
fftlen = 512
alpha = 0.42
for i in range(0,len(datalist)):
    outfile = "result/wav/{}.wav".format(datalist[i])
    with open("data/SF/f0/{}.f0".format(datalist[i]),"rb") as f:
        f0 = np.fromfile(f, dtype="<f8", sep="")
    with open("data/SF/ap/{}.ap".format(datalist[i]),"rb") as f:
        ap = np.fromfile(f, dtype="<f8", sep="")
        ap = ap.reshape(len(ap)//(fftlen+1),fftlen+1)
    y = model.get_predata(x[i])
    y = y.astype(np.float64)
    sp = sptk.mc2sp(y, alpha, fftlen*2)
    owav = pw.synthesize(f0, sp, ap, fs)
    owav = np.clip(owav, -32768, 32767)
    wavfile.write(outfile, fs, owav.astype(np.int16))

In [17]:
!ls result/wav

atr503_a46.wav	atr503_a47.wav	atr503_a48.wav	atr503_a49.wav	atr503_a50.wav


## 演習

In [20]:
! python src/convert_f0.py

In [15]:
! python src/convert_both.py -g 0

atr503_a01
atr503_a02
atr503_a03
atr503_a04
atr503_a05
atr503_a06
atr503_a07
atr503_a08
atr503_a09
atr503_a10
atr503_a11
atr503_a12
atr503_a13
atr503_a14
atr503_a15
atr503_a16
atr503_a17
atr503_a18
atr503_a19
atr503_a20
atr503_a21
atr503_a22
atr503_a23
atr503_a24
atr503_a25
atr503_a26
atr503_a27
atr503_a28
atr503_a29
atr503_a30
atr503_a31
atr503_a32
atr503_a33
atr503_a34
atr503_a35
atr503_a36
atr503_a37
atr503_a38
atr503_a39
atr503_a40
atr503_a41
atr503_a42
atr503_a43
atr503_a44
atr503_a45
epoch: 1/50  loss: 0.021114804654559654
epoch: 1/50  loss: 0.037018653073539576
epoch: 1/50  loss: 0.053141535993313165
epoch: 1/50  loss: 0.0709555086234388
epoch: 1/50  loss: 0.08634418289226845
epoch: 1/50  loss: 0.10050577128958875
epoch: 1/50  loss: 0.11658280150722787
epoch: 1/50  loss: 0.12928971604091435
epoch: 1/50  loss: 0.14258951545643542
epoch: 1/50  loss: 0.1543365984012758
epoch: 1/50  loss: 0.16623589386226412
epoch: 1/50  loss: 0.17692241929813513
epoch: 1/50  loss: 0.184511418955469