In [1]:
import torch
import torch.nn as nn
from torch.nn.parameter import Parameter
import torch.nn.functional as F
import torch.nn.init as init
import math
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from torch.utils.data import DataLoader,Dataset
import random

In [2]:

# 加载音频文件
audio_path = 'demo.wav'
y, sr = librosa.load(audio_path, sr=None)

# 提取梅尔频谱图
S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
log_S = librosa.power_to_db(S, ref=np.max)

# 显示梅尔频谱图
# plt.figure(figsize=(10, 4))
# librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel', fmax=8000)
# plt.colorbar(format='%+2.0f dB')
# plt.title('Mel spectrogram')
# plt.tight_layout()
# plt.show()
z = np.pad(y,(20,),"constant",constant_values=(0,))
z.shape,sr

((98600,), 22050)

In [3]:
class affine_coupling_layer(nn.Module):
    def __init__(self,in_dim,hidden_dim):
        super().__init__()
        self.in_dim = in_dim
        self.scale_translate_net = nn.Sequential(
            nn.Linear(in_dim ,hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim,hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim,in_dim)
        )
    def forward(self,xa,xb,mel):
        log_s,b = self.scale_translate_net(torch.cat((xa,mel),-1)).trunk(2,dim=-1)
        ya = xa
        yb = xb*torch.exp(log_s) + b
        log_det_jacobian = log_s.sum(dim = -1)
        return ya,yb,log_det_jacobian
    def inverse(self,y):
        ya,yb = y.trunk(2,dim = -1)
        log_s , b = self.scale_translate_net(ya)
        xa = ya
        xb = (yb -b) / log_s
        return xa,xb
    


In [4]:
class Flow(nn.Module):
    def __init__(self,n_mel_channels,coupling_layer_num,segment_len):
        super().__init__()
        self.n_mel_channels = n_mel_channels
        self.coupling_layer_num = coupling_layer_num
        
        self.transcov = nn.ConvTranspose1d(n_mel_channels,1,2048,stride=512)
        self.coupling_layer = nn.ModuleList()
        for _ in range(coupling_layer_num):
            self.coupling_layer.append(affine_coupling_layer(segment_len,segment_len))
    def forward(self,x,mel):
        mel = self.transcov(mel).unsqueeze(1)
        for i in range(self.coupling_layer_num):
            xa,xb = x.split(x.shape[-1]//2,-1)
            ya,yb,log_det_jecobian = self.coupling_layer[i](xa,xb,mel)
            x = torch.cat(yb,ya)
        return x,log_det_jecobian

        


In [5]:
class FlowLoss(nn.Module):
    def forward(self,z,log_S_list):
        loss_z = torch.sum(z*z)/(2)
        loss_s = torch.zeros(1)
        for i in len(log_S_list):
            loss_s += torch.sum(log_S_list[i])
        return loss_z - loss_s

In [6]:
class FlowDateSet(Dataset):
    def __init__(self,file_path_list,segment_length):
        super().__init__()
        self.file_path_list = file_path_list
        self.segment_length = segment_length
    def __getitem__(self, index):
        path = self.file_path_list[index]
        y, sr = librosa.load(path, sr=None)
        # 提取梅尔频谱图
        audio_len = len(y)
        if audio_len > self.segment_length:
            seg_begin = random.randint(0,audio_len-self.segment_length)         
            y = y[seg_begin:seg_begin+self.segment_length]
        else:
            y = np.pad(y,(0,self.segment_length-audio_len),"constant",constant_values=(0,))
        mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128,center=False)
        mel = librosa.power_to_db(mel, ref=np.max)
        return y,mel
        
    def __len__(self):
        return len(self.file_path_list)



In [7]:
epoch = 10
lr = 1e-5
batch_size = 1

# model 
num_coupling_layer = 2

# features
n_mel_channels = 128
segment_len = 22016
window_size = 2048  
hop_size = 512
frame_len = (segment_len - window_size) / hop_size  + 1

In [8]:
critirien = FlowLoss().to("cuda")
model = Flow(n_mel_channels,num_coupling_layer,segment_len).to("cuda")
dataset = FlowDateSet(["./demo.wav"],segment_len)
dataloader = DataLoader(dataset,batch_size=batch_size )
optim = torch.optim.SGD(model.parameters(),lr=lr)
for i in range(epoch):
    for x,mel in dataloader:
        z,log_det_jecobion = model(x.to("cuda"),mel.to("cuda"))
        loss = critirien(z,log_det_jecobion)
        optim.zero_grad()
        loss.backward()
        optim.step()




RuntimeError: Tensors must have same number of dimensions: got 2 and 4