In [1]:
import torch
import torchaudio
import os
import random
import librosa
import numpy as np
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from torchaudio import transforms
from torch.nn import Sequential
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import det_curve
import warnings
# Debugging aid: make CUDA kernel launches synchronous so that a device-side
# error is reported at the call that caused it. This slows GPU execution.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
class AudioDataset(Dataset):
    """Dataset over the raw audio files stored in one folder.

    Metadata is parsed from each file name: the integer before the first ``P``
    is the speaker id, and the single digit right after the first ``F`` is the
    flag (the evaluation code in this notebook treats flag 1 as the positive
    class).
    """

    def __init__(self, data_folder):
        self.data_folder = data_folder
        # os.listdir() returns entries in arbitrary order; sort them so that a
        # given index refers to the same file on every run and machine.
        self.fileList = sorted(os.listdir(data_folder))

    def __getitem__(self, idx):
        """Load one file and return ``(filename, waveform, speaker_id, flag)``."""
        filename = self.fileList[idx]
        # The sample rate returned by torchaudio.load is discarded.
        waveform, _ = torchaudio.load(os.path.join(self.data_folder, filename))
        speaker_id = int(filename.split('P', 1)[0])
        flag = int(filename.split('F', 1)[1][0])

        return filename, waveform, speaker_id, flag

    def __len__(self):
        """Return the number of files found in ``data_folder``."""
        return len(self.fileList)
    
class FeatureDataset(Dataset):
    """Dataset over pre-extracted feature files stored in one folder.

    Items carry only the file name and the metadata parsed from it; the feature
    file itself is not opened here. The integer before the first ``P`` of the
    name is the speaker id, and the single digit right after the first ``F`` is
    the flag.
    """

    def __init__(self, data_folder):
        self.data_folder = data_folder
        # os.listdir() returns entries in arbitrary order; sort them so that a
        # given index refers to the same file on every run and machine.
        self.fileList = sorted(os.listdir(data_folder))

    def __getitem__(self, idx):
        """Return ``(filename, speaker_id, flag)`` for the file at ``idx``."""
        filename = self.fileList[idx]
        speaker_id = int(filename.split('P', 1)[0])
        flag = int(filename.split('F', 1)[1][0])

        return filename, speaker_id, flag

    def __len__(self):
        """Return the number of files found in ``data_folder``."""
        return len(self.fileList)

In [3]:
# Audio front-end configuration.
sample_rate = 48000
# STFT geometry shared by both transforms: 4800-sample windows advanced by
# 1200 samples. At 48 kHz an n_fft of 4800 makes every frequency bin 10 Hz
# wide, which is what the bin slice in feature_extraction() relies on.
N_FFT = 4800
HOP_LENGTH = 1200

# 40-band mel spectrogram restricted to 0-8000 Hz.
mel_transform = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,  # reuse the constant instead of repeating the literal
        n_fft=N_FFT,
        win_length=N_FFT,
        hop_length=HOP_LENGTH,
        f_min=0,
        f_max=8000,
        n_mels=40,
    ),
)

# Linear-frequency spectrogram with the same framing as the mel transform.
spec_transform = nn.Sequential(
    torchaudio.transforms.Spectrogram(
        n_fft=N_FFT,
        win_length=N_FFT,
        hop_length=HOP_LENGTH,
        center=True,
    ),
)

In [4]:
def feature_extraction(data, out_dir='/amax/home/tangsz/lstm/feature/attack/pos/'):
    """Extract and save the mel and ultrasonic-band features of every clip.

    For each ``(filename, waveform, speaker_id, flag)`` item of ``data`` a
    dictionary ``{'mel': ..., 'spec': ...}`` is written with ``torch.save`` to
    ``out_dir`` under the clip's own file name.

    Args:
        data: iterable of ``(filename, waveform, speaker_id, flag)`` tuples,
            e.g. an ``AudioDataset``. ``speaker_id`` and ``flag`` are unused.
        out_dir: destination folder; created when missing. The default is the
            location that used to be hard-coded here.
    """
    os.makedirs(out_dir, exist_ok=True)

    for (filename, waveform, speaker_id, flag) in data:
        # Mel features from the signal low-passed at 8 kHz.
        audible = torchaudio.functional.lowpass_biquad(waveform, sample_rate, cutoff_freq=8000)
        mel = mel_transform(audible)
        # (1, feature_dim, seq_len) -> (seq_len, feature_dim)
        # assumes a mono waveform; squeeze(0) only drops a singleton channel - TODO confirm
        mel = mel.squeeze(0).transpose(0, 1).to(device)

        # Band-pass around 20 kHz: low-pass at 20300 Hz, then high-pass at 19700 Hz.
        ultrasonic = torchaudio.functional.lowpass_biquad(waveform, sample_rate, cutoff_freq=20300)
        ultrasonic = torchaudio.functional.highpass_biquad(ultrasonic, sample_rate, cutoff_freq=19700)
        spec = spec_transform(ultrasonic)
        # Keep only the 19700-20300 Hz ultrasonic band: bins 1970..2030,
        # i.e. 61 bins of 10 Hz each with n_fft=4800 at 48 kHz.
        spec = spec[:, 1970: 2031, :]
        # (1, 61, seq_len) -> (seq_len, 61)
        spec = spec.squeeze(0).transpose(0, 1).to(device)

        # NOTE(review): the tensors are saved on `device`; loading them on a
        # machine without that device needs torch.load(..., map_location=...).
        torch.save({
            'mel': mel,
            'spec': spec
            }, os.path.join(out_dir, filename))
    print('finish saving')
    
def _to_unit_db(power):
    """Convert a power-spectrogram tensor to dB relative to its peak, then map it with (dB + 80) / 80."""
    # With ref=np.max the peak sits at 0 dB; librosa's default top_db of 80
    # floors the result at -80 dB, so the rescaled values land in [0, 1].
    db = librosa.power_to_db(power.detach().cpu().numpy(), ref=np.max)
    return torch.as_tensor((db + 80) / 80, dtype=torch.float32)


def data_preprocessing(data, data_type):
    """Collate function: load saved features and assemble one model input batch.

    Each ``(filename, speaker_id, flag)`` item is loaded from ``train_path``
    (when ``data_type == "train"``) or ``test_path`` (any other value),
    converted to normalised dB, and cropped to 120 frames. In "train" mode the
    crop start is random and one negative sample is appended per item, so the
    returned batch is twice as long as ``data``.

    Args:
        data: list of ``(filename, speaker_id, flag)`` tuples from a
            ``FeatureDataset``.
        data_type: ``"train"`` or anything else for evaluation.

    Returns:
        ``(inputs, labels, flags)``: ``inputs`` is a CPU float tensor of shape
        (batch, mel_dim + 2 * spec_dim, 120), i.e. 40 + 61 + 61 = 162 channels
        with the features written by ``feature_extraction``; ``labels`` and
        ``flags`` are float tensors already moved to ``device``.

    Raises:
        ValueError: in "train" mode when a clip has 40 frames or fewer, since
            the tripled sequence then leaves no room for a random 120-frame crop.
    """
    crop_len = 120
    is_train = data_type == "train"
    path = train_path if is_train else test_path

    mels = []
    specs = []
    labels = []
    flags = []

    for (filename, speaker_id, flag) in data:
        # Features may have been saved from a CUDA device; map them to the CPU
        # so that loading also works on hosts without that device.
        feature = torch.load(os.path.join(path, filename), map_location='cpu')
        mel = _to_unit_db(feature['mel'])
        spec = _to_unit_db(feature['spec'])

        # Tile along time so that clips shorter than crop_len frames can still
        # be cropped to a fixed length.
        mel = torch.cat((mel, mel, mel), 0)
        start = random.randrange(0, mel.shape[0] - crop_len) if is_train else 0
        # mel: (120, 40) = (seq_len, feature_dim)
        mel = mel[start: start + crop_len, :]

        # The same window is cut from the tiled ultrasonic spectrogram.
        spec = torch.cat((spec, spec, spec, spec), 0)
        # spec: (120, 61) = (seq_len, feature_dim)
        spec = spec[start: start + crop_len, :]

        mels.append(mel)
        specs.append(spec)
        labels.append(speaker_id)
        flags.append(flag)

    if is_train:
        # Negative samples: pair every mel with the ultrasonic spectrogram of
        # the previous item (the list is rotated so the last one comes first)
        # and give these mismatched pairs flag 0.
        specs = specs + specs[-1:] + specs[:-1]
        mels = mels + mels
        labels = labels + labels
        flags = flags + [0] * len(flags)

    # (batch, seq_len, feature_dim)
    mels = torch.stack(mels)
    specs = torch.stack(specs)

    # Frame-to-frame difference of the ultrasonic band along time. The first
    # frame is compared with itself, so its difference is zero.
    grad_specs = torch.zeros_like(specs)
    grad_specs[:, 1:, :] = specs[:, 1:, :] - specs[:, :-1, :]

    # Switch to (batch, feature_dim, seq_len) and stack the three feature groups
    # along the channel axis.
    inputs = torch.cat(
        (mels.transpose(1, 2), specs.transpose(1, 2), grad_specs.transpose(1, 2)),
        1,
    )
    labels = torch.tensor(labels, dtype=torch.float32).to(device)
    flags = torch.tensor(flags, dtype=torch.float32).to(device)

    return inputs, labels, flags

In [5]:
batch_size_train = 64
batch_size_test = 64

# Feature folders. The defaults are the original absolute locations; set the
# TRAIN_FEATURE_DIR / TEST_FEATURE_DIR environment variables to run the notebook
# on another machine. os.path.join(..., '') guarantees a trailing separator,
# because data_preprocessing() may concatenate the folder and the file name.
train_path = os.path.join(
    os.environ.get('TRAIN_FEATURE_DIR', '/amax/home/tangsz/lstm/feature/train_feature/'), '')
test_path = os.path.join(
    os.environ.get('TEST_FEATURE_DIR', '/amax/home/tangsz/lstm/feature/new_test_feature/'), '')

# 50187 - presumably the total number of clips (40150 + 10037 in the split below); verify.
train_data = FeatureDataset(train_path)
test_data = FeatureDataset(test_path)

# Disabled one-off steps, kept for reference:
# feature_extraction(data)
# train_data, test_data = torch.utils.data.random_split(data, [40150, 10037])

# feature_extraction(train_data, "train")
# feature_extraction(test_data)

FileNotFoundError: [WinError 3] 系统找不到指定的路径。: '/amax/home/tangsz/lstm/feature/train_feature/'

In [None]:
# Training batches: shuffled, incomplete last batch dropped. The collate
# function loads the features and, in "train" mode, appends one negative sample
# per file, so every batch holds 2 * batch_size_train samples.
train_loader = torch.utils.data.DataLoader(
    train_data,
    batch_size=batch_size_train,
    shuffle=True,
    drop_last=True,
    collate_fn=lambda x: data_preprocessing(x, "train"),
)

# Evaluation batches. Shuffling is off: with drop_last=True a shuffled loader
# discards a different random subset on every pass, so successive checkpoints
# would be scored on different samples. drop_last stays on because the
# evaluation code may index every batch up to batch_size_test.
test_loader = torch.utils.data.DataLoader(
    test_data,
    batch_size=batch_size_test,
    shuffle=False,
    drop_last=True,
    collate_fn=lambda x: data_preprocessing(x, "test"),
)

# Original author's note: "passing the arguments this way, len is halved".
# The lengths printed here count files, i.e. half the number of training
# samples the model actually sees.
print('train:', len(train_data), 'test:', len(test_data))

In [None]:
# print(inputs.shape)

In [None]:
# def plot():    
#     fig = plt.figure()
#     counter = [i for i in range(1, itera_test+1)]
#     plt.plot(counter, test_losses, color='red')
#     plt.plot(counter, avg_train_loss, color='black')
#     plt.legend(['Test Loss', 'Average Train Loss'], loc='upper right')
#     plt.xlabel('epochs')
#     plt.ylabel('loss')
#     plt.show()
def _plot_accuracy(acc, out_path):
    """Plot a per-epoch accuracy history in percent, save it as a PDF and show it.

    Args:
        acc: sequence of accuracies as fractions, one per evaluated epoch.
        out_path: file the figure is written to (PDF format).
    """
    plt.figure()
    # Derive the x axis from the data itself rather than from a global epoch
    # counter, so that the x and y lengths can never disagree.
    epoch_axis = list(range(1, len(acc) + 1))
    percent = [100 * value for value in acc]
    plt.plot(epoch_axis, percent, color='red')
    plt.xlabel('epochs', fontsize=12)
    plt.ylabel('Accuracy(%)', fontsize=12)
    plt.grid()
    plt.savefig(out_path, format='pdf')
    plt.show()


def plotacc(acc):
    """Plot the accuracy history ``acc`` and save it to ./ACC.pdf."""
    _plot_accuracy(acc, './ACC.pdf')


def plotacc1(acc):
    """Plot the accuracy history ``acc`` and save it to ./ACC1.pdf."""
    _plot_accuracy(acc, './ACC1.pdf')


def plotacc2(acc):
    """Plot the accuracy history ``acc`` and save it to ./ACC2.pdf."""
    _plot_accuracy(acc, './ACC2.pdf')
    
def ploteer(true, score, save_range=(0.035, 0.039), save_path='./EER.pdf'):
    """Plot FAR and FRR against the decision threshold and report the EER.

    The equal error rate is taken at the threshold where the two curves are
    closest, as the mean of FAR and FRR at that point.

    Args:
        true: ground-truth binary labels, 1 being the positive class. Shapes
            (N,) and (N, 1) are both accepted.
        score: predicted score of the positive class for every sample.
        save_range: ``(low, high)``; the figure is written to ``save_path``
            only when ``low < EER < high``. The default is the window that used
            to be hard-coded here. Pass ``None`` to always save.
        save_path: destination of the saved figure (PDF format).

    Returns:
        The equal error rate as a fraction in [0, 1].
    """
    # Flatten so that per-sample arrays of shape (1,) collected in a list work.
    true = np.asarray(true).astype(np.int32).ravel()
    score = np.asarray(score).astype(np.float32).ravel()

    # det_curve returns the false positive rate, false negative rate and thresholds.
    far, frr, threshold = det_curve(true, score, pos_label=1)
    eer_index = np.nanargmin(np.absolute(far - frr))
    eer = float((far[eer_index] + frr[eer_index]) / 2)

    # Start a fresh figure so the curves never land on a previous plot.
    plt.figure()
    plt.plot(threshold, far, threshold, frr)
    plt.legend(["FAR", "FRR"], fontsize=12)
    plt.xlabel('threshold', fontsize=12)
    plt.ylabel('FAR/FRR', fontsize=12)
    plt.grid()
    if save_range is None or save_range[0] < eer < save_range[1]:
        plt.savefig(save_path, format='pdf')
    plt.show()
    print('EER: {:.2f}% \n'.format(100. * eer))
    return eer


In [None]:
class TDNN(nn.Module):
    """Conv1d followed by BatchNorm1d and ReLU.

    The constructor arguments are forwarded to ``nn.Conv1d``; the convolution
    has no bias unless ``bias=True`` is passed.
    """

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False):
        super().__init__()
        self.conv = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias,
        )
        self.bn = nn.BatchNorm1d(out_channels)

    def forward(self, x):
        """Apply convolution, batch normalisation and ReLU, in that order."""
        normalised = self.bn(self.conv(x))
        return torch.relu(normalised)



class SE(nn.Module):
    """Squeeze-and-excitation gate for the 1D case.

    Works on (batch, channels, time) tensors: the time axis is averaged away,
    a two-layer bottleneck of width ``channels // s`` produces one sigmoid gate
    per channel, and the input is rescaled channel-wise by that gate.
    """

    def __init__(self, channels, s=2):
        super().__init__()
        assert channels % s == 0, "{} % {} != 0".format(channels, s)
        bottleneck = channels // s
        self.linear1 = nn.Linear(channels, bottleneck)
        self.linear2 = nn.Linear(bottleneck, channels)

    def forward(self, x):
        """Return ``x`` scaled by its per-channel gate."""
        squeezed = x.mean(dim=2)
        hidden = F.relu(self.linear1(squeezed))
        gate = torch.sigmoid(self.linear2(hidden))
        # Broadcast the (batch, channels) gate over the time axis.
        return x * gate.unsqueeze(2)
    
    
class Res2(nn.Module):
    """Res2-style multi-scale convolution with in_channels == out_channels == channels.

    The channel axis is split into ``scale`` groups of ``channels // scale``
    channels. All groups but the last go through their own convolution, each
    one also receiving the output of the previous group; the last group is
    passed through untouched (when ``scale`` is 1 the single group is convolved).
    """

    def __init__(self, channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False, scale=4):
        super().__init__()
        assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
        self.scale = scale
        self.width = channels // scale
        self.nums = scale - 1 if scale != 1 else scale

        self.convs = nn.ModuleList([
            nn.Conv1d(self.width, self.width, kernel_size, stride, padding, dilation, bias=bias)
            for _ in range(self.nums)
        ])
        self.bns = nn.ModuleList([nn.BatchNorm1d(self.width) for _ in range(self.nums)])

    def forward(self, x):
        """Process the channel groups in sequence and concatenate the results."""
        groups = torch.split(x, self.width, 1)
        pieces = []
        carried = None
        for group, conv, bn in zip(groups, self.convs, self.bns):
            # From the second group on, add the previous group's output first.
            carried = group if carried is None else carried + group
            # Order: conv -> relu -> bn
            carried = bn(F.relu(conv(carried)))
            pieces.append(carried)
        if self.scale != 1:
            # The last group bypasses the convolutions.
            pieces.append(groups[self.nums])
        return torch.cat(pieces, dim=1)


def SE_Res2(channels, kernel_size, stride, padding, dilation, scale):
    """Build an SE-Res2Block: 1x1 TDNN, Res2 convolution, 1x1 TDNN, SE gate.

    ``kernel_size``, ``stride``, ``padding``, ``dilation`` and ``scale`` only
    configure the Res2 stage. The residual connection is not part of this
    block; it is added by the model that uses it.
    """
    stages = [
        TDNN(channels, channels, kernel_size=1, stride=1, padding=0),
        Res2(channels, kernel_size, stride, padding, dilation, scale=scale),
        TDNN(channels, channels, kernel_size=1, stride=1, padding=0),
        SE(channels),
    ]
    return nn.Sequential(*stages)



class AttentiveStatsPool(nn.Module):
    """Attentive weighted mean and standard deviation pooling.

    Maps a (batch, in_dim, time) tensor to (batch, 2 * in_dim): the
    attention-weighted mean over time concatenated with the attention-weighted
    standard deviation.
    """

    def __init__(self, in_dim, bottleneck_dim):
        super().__init__()
        # 1x1 convolutions stand in for Linear layers so the input does not
        # have to be transposed.
        self.linear1 = nn.Conv1d(in_dim, bottleneck_dim, kernel_size=1)  # W and b in the paper
        self.linear2 = nn.Conv1d(bottleneck_dim, in_dim, kernel_size=1)  # V and k in the paper

    def forward(self, x):
        """Pool ``x`` over its time axis with learned attention weights."""
        # tanh on purpose: the original author reports that ReLU was hard to
        # converge here.
        hidden = torch.tanh(self.linear1(x))
        # Attention weights sum to one over the time axis.
        weights = torch.softmax(self.linear2(hidden), dim=2)
        mean = (weights * x).sum(dim=2)
        variance = (weights * x ** 2).sum(dim=2) - mean ** 2
        # Clamp before the square root to keep tiny or negative variances finite.
        std = variance.clamp(min=1e-9).sqrt()
        return torch.cat([mean, std], dim=1)
    
    
class LSTM(nn.Module):
    """TDNN / SE-Res2 encoder with two heads.

    A stack of three SE-Res2 blocks feeds a 1536-channel TDNN. Its output is
    used twice: attentive statistics pooling followed by a linear layer gives
    ``numSpkrs`` logits, and a 2-layer LSTM run over the time axis followed by
    a linear layer gives 2 logits.

    Args:
        in_channels: number of input feature channels.
        channels: channel width of the SE-Res2 stack.
        numSpkrs: size of the first head's output.
    """

    def __init__(self, in_channels, channels, numSpkrs):
        super().__init__()
        self.tdnn1 = TDNN(in_channels, channels, kernel_size=5, padding=2)
        # Three blocks with growing dilation; padding == dilation keeps the
        # sequence length unchanged for kernel_size=3.
        self.layer1 = SE_Res2(channels, kernel_size=3, stride=1, padding=2, dilation=2, scale=8)
        self.layer2 = SE_Res2(channels, kernel_size=3, stride=1, padding=3, dilation=3, scale=8)
        self.layer3 = SE_Res2(channels, kernel_size=3, stride=1, padding=4, dilation=4, scale=8)
        # Fuses the concatenated outputs of the three blocks.
        self.tdnn2 = TDNN(channels * 3, 1536, kernel_size=1, dilation=1)

        self.lstm = nn.LSTM(input_size=1536, hidden_size=512, num_layers=2)

        # Head 1: pooled statistics (mean and std, hence 2 * 1536) -> numSpkrs logits.
        self.fc1 = nn.Sequential(
            AttentiveStatsPool(1536, 128),
            nn.BatchNorm1d(3072),
            nn.Linear(3072, numSpkrs),
            nn.BatchNorm1d(numSpkrs),
        )

        # Head 2: final LSTM hidden state -> 2 logits.
        self.fc2 = nn.Sequential(
            nn.BatchNorm1d(512),
            nn.Linear(512, 2),
            nn.BatchNorm1d(2),
        )

    def forward(self, x):
        """Return ``(out1, out2)`` for a (batch, in_channels, time) input.

        ``out1`` has shape (batch, numSpkrs) and ``out2`` has shape (batch, 2).
        """
        stem = self.tdnn1(x)
        # Every block receives the sum of the stem and all earlier block
        # outputs, and adds that same sum back as a residual.
        x1 = self.layer1(stem) + stem
        x2 = self.layer2(stem + x1) + stem + x1
        x3 = self.layer3(stem + x1 + x2) + stem + x1 + x2

        frames = self.tdnn2(torch.cat([x1, x2, x3], dim=1))

        out1 = self.fc1(frames)

        # nn.LSTM expects (time, batch, features) by default.
        sequence = frames.permute(2, 0, 1)
        _, (hidden, _) = self.lstm(sequence)
        # hidden[-1] is the final hidden state of the top LSTM layer.
        out2 = self.fc2(hidden[-1])

        return out1, out2

In [None]:
learning_rate = 0.001
# Number of batches between two progress messages, roughly one message per
# 8000 files. It is kept an integer (and at least 1) so that the
# `batch_idx % log_interval == 0` check in train() can fire for any batch size;
# with true division it never fires when batch_size_train does not divide 8000.
log_interval = max(1, 8000 // batch_size_train)
# 162 input channels: mel, ultrasonic spectrogram and its time difference stacked.
model = LSTM(in_channels=162, channels=512, numSpkrs=201).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# The learning rate is multiplied by 0.1 at each listed epoch.
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10, 15, 20, 25, 30], gamma=0.1)
# Average training loss of every finished epoch, appended to by train().
avg_train_loss = []
# Counter incremented by train() once per epoch.
itera = 1
def train(epoch):
    """Train the global ``model`` for one epoch and save a checkpoint.

    The loss is a weighted sum of the speaker loss (head 1) and the flag loss
    (head 2). The speaker weight is ``0.5 / epoch**2``, so it shrinks as
    training goes on and the flag loss dominates.

    Args:
        epoch: 1-based epoch number (0 would divide by zero). It sets the loss
            weighting and names the checkpoint file.

    Returns:
        The trained global ``model``.
    """
    global itera
    model.train()
    running_loss = 0
    train_loss = 0
    speaker_weight = 0.5 / epoch / epoch
    for batch_idx, data in enumerate(train_loader):
        inputs, labels, flags = data
        inputs, labels, flags = inputs.to(device), labels.to(device), flags.to(device)
        optimizer.zero_grad()

        out1, out2 = model(inputs)
        # The training collate function puts the original samples in the first
        # half of the batch and their negative copies in the second half. The
        # speaker loss uses the originals only. The half is taken from the
        # actual batch, so a short last batch is handled too.
        n_original = out1.size(0) // 2
        loss1 = criterion(out1[:n_original, :], labels[:n_original].long())
        loss2 = criterion(out2, flags.long())
        loss = speaker_weight * loss1 + (1 - speaker_weight) * loss2

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        train_loss += loss.item() / len(train_loader)

        # Report the mean loss of the last `log_interval` batches.
        if batch_idx != 0 and batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.4f}'.format(
                epoch,
                batch_idx * len(inputs),
                # Twice the file count: each file also yields a negative sample.
                2*len(train_loader.dataset),
                100. * batch_idx / len(train_loader),
                running_loss / log_interval))
            running_loss = 0
    print('\nTrain set: Avg. loss: {:.4f}\n'.format(train_loss))
    avg_train_loss.append(train_loss)
    scheduler.step()
    itera += 1

    model_path = '/amax/home/tangsz/lstm/model/final1/'+ str(epoch)
    # Create the checkpoint folder on first use so the epoch's work is not
    # lost to a missing directory.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    torch.save(model.state_dict(), model_path)

    return model
# Evaluation history, one entry per call of test().
test_losses, test_counter = [], []
# acc: joint decision accuracy, acc1: speaker accuracy on target samples,
# acc2: accuracy of the flag prediction.
acc, acc1, acc2 = [], [], []
# 1-based number of the next evaluation run; incremented at the end of test().
itera_test = 1
def test(epoch):
    """Evaluate the global ``model`` on ``test_loader`` and plot the results.

    A sample with flag 0 is a non-target; any other flag is a target. The
    system accepts a sample when the speaker head predicts the labelled
    speaker AND the flag head predicts 1.

    Metrics are appended to the global histories ``test_losses``, ``acc``
    (joint decision), ``acc1`` (speaker accuracy on targets) and ``acc2``
    (flag accuracy). They are computed over the samples actually evaluated,
    which can be fewer than ``len(test_loader.dataset)`` when the loader drops
    the last incomplete batch. A rate whose denominator is zero is NaN.

    Args:
        epoch: 1-based epoch number; only used to weight the two losses the
            same way as during training.
    """
    global itera_test
    model.eval()
    test_loss = 0
    correct = 0
    correct1 = 0
    correct2 = 0
    target_is_nontarget = 0
    nontarget_is_target = 0
    nontarget = 0
    target = 0
    # Ground-truth flag of every evaluated sample.
    true = []
    # Predicted probability that the flag is 1.
    score = []
    # p(flag = 1) * p(speaker = true speaker)
    total_score = []
    speaker_weight = 0.5 / epoch / epoch
    with torch.no_grad():
        for batch_idx, data in enumerate(test_loader):
            inputs, labels, flags = data
            inputs, labels, flags = inputs.to(device), labels.to(device), flags.to(device)
            labels, flags = labels.long(), flags.long()
            # out1: (batch, numSpkrs) speaker logits, out2: (batch, 2) flag logits
            out1, out2 = model(inputs)
            loss1 = criterion(out1, labels)
            loss2 = criterion(out2, flags)
            loss = speaker_weight * loss1 + (1 - speaker_weight) * loss2
            test_loss += loss.item() / len(test_loader)

            # Normalise the logits into probabilities.
            prob1 = F.softmax(out1, dim=1)
            prob2 = F.softmax(out2, dim=1)
            pred1 = prob1.argmax(dim=1)
            pred2 = prob2.argmax(dim=1)
            # Probability assigned to the labelled speaker.
            p_true_speaker = prob1.gather(1, labels.unsqueeze(1)).squeeze(1)
            p_flag_one = prob2[:, 1]

            is_target = flags != 0
            speaker_ok = pred1 == labels
            flag_one = pred2 == 1
            accepted = speaker_ok & flag_one

            nontarget += int((~is_target).sum())
            target += int(is_target.sum())
            nontarget_is_target += int((~is_target & accepted).sum())
            target_is_nontarget += int((is_target & ~accepted).sum())
            # Speaker accuracy is measured on target samples only.
            correct1 += int((is_target & speaker_ok).sum())
            correct2 += int((pred2 == flags).sum())
            # Correct joint decision: a non-target whose flag is predicted 0,
            # or a target that is accepted.
            correct += int(((~is_target & ~flag_one) | (is_target & accepted)).sum())

            # Whole batches are appended, so the three lists stay aligned
            # across batches.
            true.extend(flags.tolist())
            score.extend(p_flag_one.tolist())
            total_score.extend((p_flag_one * p_true_speaker).tolist())

    evaluated = target + nontarget
    nan = float('nan')
    FAR = nontarget_is_target / nontarget if nontarget else nan
    FRR = target_is_nontarget / target if target else nan
    accuracy = correct / evaluated if evaluated else nan
    accuracy1 = correct1 / target if target else nan
    accuracy2 = correct2 / evaluated if evaluated else nan
    print('FAR: {:.2f}% , FRR: {:.2f}%\n'.format(100. * FAR, 100. * FRR))

    print('\nTest set: Avg. loss: {:.4f}, Accuracy1: {:.2f}%, Accuracy2: {:.2f}%, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss,
        100. * accuracy1,
        100. * accuracy2,
        correct,
        evaluated,
        100. * accuracy))

    test_losses.append(test_loss)
    acc.append(accuracy)
    acc1.append(accuracy1)
    acc2.append(accuracy2)

#     plot()
    plotacc1(acc1)
    plotacc2(acc2)
    plotacc(acc)
    ploteer(true, score)
#     ploteer(true, total_score)
    itera_test += 1
# epochs = 30
# for epoch in range(1, epochs + 1):
#     train(epoch)
#     test(epoch)
# print('Finished training and testing')

In [None]:
# Re-evaluate the checkpoint saved after every training epoch.
model = LSTM(in_channels=162, channels=512, numSpkrs=201).to(device)
epochs = 30
for epoch in range(1, epochs + 1):
    model_path = '/amax/home/tangsz/lstm/model/final1/'+ str(epoch)
    # map_location places the stored tensors on the current device, so
    # checkpoints written on a GPU can also be evaluated on a CPU-only host.
    model.load_state_dict(torch.load(model_path, map_location=device))
    test(epoch)
print('Finished testing')