In [36]:
import librosa
import librosa.display
import torch
import torchaudio
import torchvision
from IPython.display import Audio
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torch.optim as optim
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, accuracy_score
import torchvision.transforms as transforms

In [39]:
def random_seed(rs=10):
    np.random.seed(rs)
    torch.manual_seed(rs)
    torch.cuda.manual_seed(rs)
    torch.backends.cudnn.deterministic = True

random_seed()

In [40]:
class AudioDataset(torch.utils.data.Dataset):
  def __init__(self, training=True, sample_rate=24000, n_mels=64, new_size=299, train_size=25000):
    dataset = torchaudio.datasets.LIBRITTS('.', download=True)
    # Разобъём на тренировочную и валидационную выборки
    validation_size = len(dataset)-train_size
    train, validation = torch.utils.data.random_split(dataset, [train_size, validation_size])
    if training:
      self.data = train
    else:
      self.data = validation
    self.sample_rate=sample_rate
    self.n_mels = n_mels
    self.resampling = torchaudio.transforms.Resample(new_freq=sample_rate)
    self.transform = transforms.Compose([transforms.ToPILImage(),
                                         transforms.Resize((new_size, new_size)), transforms.ToTensor()])
   
    df = pd.read_csv('speakers.tsv', sep='\t',index_col=None).reset_index()
    self.speakers = dict(zip(df['index'], df['READER']))

  def mel_spectr(self, wave):
    # Преобразование Фурье с логарифмической шкалой частот 
    transform = torchaudio.transforms.MelSpectrogram(n_mels=self.n_mels, sample_rate=self.sample_rate)
    # Переведем амплитуду в децибелы 
    to_db = torchaudio.transforms.AmplitudeToDB()

    spec = transform(wave)
    spec = to_db(spec)
    spec = torch.squeeze(spec)
    return spec

  def spec_to_img(self, spec):
    mean, std = torch.mean(spec), torch.std(spec)
    # отцентрируем и нормализуем спектр в диапазоне от 0 до 255
    spec = (spec-mean)/(std+0.000001)
    min_val, max_val = torch.min(spec), torch.max(spec)
    spec = 255*(spec -min_val)/(max_val- min_val)
    return spec

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index):

    item = list(self.data[index])
    waveform = item[0]
   
    waveform = self.resampling(waveform)
    waveform = self.mel_spectr(waveform)
    waveform = self.spec_to_img(waveform)
    waveform = self.transform(waveform)
    speaker_id = item[4]
    gender = self.speakers[speaker_id]
    if gender == 'M':
      gender = 1.
    elif gender == 'F':
      gender = 0.
    return waveform.float(), gender

In [41]:
class Model(torch.nn.Module):
    def __init__(self):
      super().__init__()
      self.conv = torch.nn.Conv2d(in_channels=1, out_channels=3, kernel_size=1)
      self.inception  = torchvision.models.inception_v3(pretrained=True)
      for parameter in self.inception.parameters():
          parameter.requires_grad = False
      self.inception.aux_logits=False
      in_features = self.inception.fc.in_features
      self.inception.fc =  torch.nn.Linear(in_features=in_features, out_features=1)
      self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        x = self.conv(x)
        x = self.inception(x)
        x = self.sigmoid(x)
        return x

In [42]:
class CNN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.c = torch.nn.Conv2d(in_channels= 1, out_channels=3, kernel_size=1)
        self.conv1 = torch.nn.Conv2d(in_channels= 3, out_channels=64, kernel_size=3, stride=2, padding=2, dilation=5)
        self.bn1 = torch.nn.BatchNorm2d(64)
        self.conv2 = torch.nn.Conv2d(in_channels=64, out_channels=4, kernel_size=3, padding=2, dilation=3)
        self.bn2 = torch.nn.BatchNorm2d(4)
        self.conv1x1 = torch.nn.Conv2d(in_channels=4, out_channels=1, kernel_size=1)
        self.adpool = torch.nn.AdaptiveAvgPool2d((5, 5))
        self.flatten = torch.nn.Flatten()
        self.fc = torch.nn.Linear(in_features=25, out_features=1, bias=True)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        x = self.c(x)
        x = F.relu(self.conv1(x))
        x = self.bn1(x)
        x = F.relu(self.conv2(x))
        x = self.bn2(x)
        x = self.conv1x1(x)
        x = self.adpool(x)
        x = self.flatten(x)
        x = self.fc(x)
        x = self.sigmoid(x)
        return x


In [43]:
def test(model, dataloader, device_name, threshold=0.5):
    device = torch.device(device_name)
    model = model.to(device)
    predicted_labels = []
    predicted_probabilities = []
    loss_func = torch.nn.BCELoss()
    labels = []
    test_loss = 0
    
    test_loader = iter(dataloader)
    model.eval()

    with torch.no_grad():
        for batch, (X, Y) in enumerate((test_loader)):
            X, Y = X.to(device), Y.to(device)
            pred = model(X)

            pred_labels = pred >= threshold
            loss = loss_func(pred.double(), Y.unsqueeze(1))

            labels.extend(Y.tolist())
            predicted_labels.extend(pred_labels.squeeze().tolist())
            predicted_probabilities.extend(pred.squeeze().tolist())
            test_loss += loss.item()

    accuracy = accuracy_score(labels, predicted_labels)
    rocauc = roc_auc_score(labels, predicted_probabilities)
    print('Test Loss: %0.2f %% ' % (test_loss),
            'Accuracy: %0.2f %% ' % (accuracy),
            'RocAUC: %0.2f %% ' % (rocauc))


def train(dataloader, testloader, device_name, model, lr=0.001, num_epoch=30, path = './weights',
          gamma=0.2, milestones=[25, 40]):
    device = torch.device(device_name if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr = lr)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=gamma)
    loss_func = torch.nn.BCELoss()

    for epoch in tqdm(range(num_epoch)):
        train_loader = iter(dataloader)
        train_loss = 0
        for batch, (X, Y) in enumerate(train_loader):
            X, Y = X.to(device), Y.to(device)
            optimizer.zero_grad()
            pred = model(X)
            
            loss = loss_func(pred.double(), Y.unsqueeze(1))
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
            scheduler.step()

        if epoch % 15 == 0:
            print("Epoch:  ", epoch, "    Train Loss:  ", train_loss)
        if epoch % 2 == 0:
            torch.save(model.state_dict(), path)
        # if epoch % 8 == 0 and epoch >0:
            # test(model, testloader, device, inception=inception)
        if epoch % 1 == 0 :
            test(model, testloader, device)

    return model


*Inception model*

In [27]:
train_data = AudioDataset(training=True, train_size=25000)
validation = AudioDataset(training=False)

train_loader = torch.utils.data.DataLoader(train_data,  batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(validation,  batch_size=64)

In [46]:
model = Model()
model = train(train_loader, test_loader, 'cuda:0', model, lr=0.01, num_epoch=2, 
              path="./weights_inception")

*Dilated CNN model*

In [None]:
train_data = AudioDataset(training=True, new_size=100)
validation = AudioDataset(training=False, new_size=100)
train_loader_cnn = torch.utils.data.DataLoader(train_data,  batch_size=64, shuffle=True)
test_loader_cnn = torch.utils.data.DataLoader(validation,  batch_size=64)

In [9]:
cnn_model = CNN()

In [None]:
cnn_model = train(train_loader_cnn, test_loader_cnn, 'cuda:0', cnn_model, lr=0.01, num_epoch=30, path="./weights_cnn",
                  milestones=[8, 27])

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch:   0     Train Loss:   270.4504811123195
Test Loss: 113.63 %  Accuracy: 0.47 %  RocAUC: 0.53 % 
Epoch:   15     Train Loss:   270.2598693713859
Test Loss: 89.17 %  Accuracy: 0.53 %  RocAUC: 0.51 % 


In [None]:
cnn_model = train(train_loader_cnn, test_loader_cnn, 'cuda:0', cnn_model, lr=0.001, num_epoch=21, path="./weights_cnn",
                  milestones=[9, 18])

  0%|          | 0/21 [00:00<?, ?it/s]

Epoch:   0     Train Loss:   269.09093327473437


In [13]:
cnn_model.load_state_dict(torch.load("weights_cnn(4)", map_location=torch.device('cpu')))

<All keys matched successfully>

In [15]:
cnn_model = train(train_loader_cnn, test_loader_cnn, 'cpu', cnn_model, lr=0.001, num_epoch=25, path="./weights_cnn",
                  milestones=[6, 21])

  0%|          | 0/25 [00:00<?, ?it/s]

Epoch:   0     Train Loss:   269.72026048027664
Test Loss: 88.85 %  Accuracy: 0.54 %  RocAUC: 0.55 % 
Epoch:   15     Train Loss:   269.0701161756675
Test Loss: 88.83 %  Accuracy: 0.54 %  RocAUC: 0.54 % 
Test Loss: 88.75 %  Accuracy: 0.54 %  RocAUC: 0.55 % 


In [16]:
test(cnn_model, test_loader_cnn, 'cpu')

Test Loss: 88.75 %  Accuracy: 0.54 %  RocAUC: 0.55 % 
