In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [None]:
import os
# Import the project-local MFCC helper. The import is performed while the
# working directory is the Drive root (presumably where mfcc.py lives --
# TODO confirm), then the original working directory is restored.
orig_dir = os.getcwd()
os.chdir('/content/drive/MyDrive')
try:
    from mfcc import MFCC
finally:
    # Always go back, even if the import fails, so a failed import does not
    # leave the kernel sitting in a different working directory.
    os.chdir(orig_dir)

In [None]:
# Select the compute device: CUDA when PyTorch reports it is available,
# otherwise fall back to the CPU. The chosen device is echoed for the reader.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(device)

cuda


In [None]:
def extract_features(file_path):
    """Compute MFCC features for one audio file and return them transposed.

    Args:
        file_path: Path passed straight through to ``MFCC``.

    Returns:
        The ``.T`` (transpose) of whatever ``MFCC(file_path)`` returns.
        NOTE(review): downstream code slices along axis 0 and later permutes
        to put a 42-wide axis in the channel position, so this is presumably
        (frames, coefficients) -- confirm against the ``mfcc`` module.
    """
    return MFCC(file_path).T


# Walk the dataset directory: each sub-directory of `data_dir` is treated as
# one speaker, and every file inside it contributes one feature matrix.
data_dir = '/content/drive/MyDrive/SPEECH_A6/sre_new_dataset'

# os.listdir() returns entries in arbitrary order, so sort them to make the
# speaker -> integer label mapping stable between runs. Non-directory entries
# are skipped because calling os.listdir() on them below would fail.
speaker_dirs = sorted(
    entry for entry in os.listdir(data_dir)
    if os.path.isdir(os.path.join(data_dir, entry))
)
features = []  # one feature matrix per audio file
labels = []    # integer speaker id, parallel to `features`

for label, speaker_dir in enumerate(speaker_dirs):
    speaker_path = os.path.join(data_dir, speaker_dir)
    # Sorted as well so the order of `features` is deterministic.
    for wav_file in sorted(os.listdir(speaker_path)):
        file_path = os.path.join(speaker_path, wav_file)
        features.append(extract_features(file_path))
        labels.append(label)

In [None]:
# Length (along axis 0) of each fixed-size slice cut from a feature matrix.
slice_dim = 400
# Number of speaker classes; the classifier emits one logit per speaker.
# NOTE(review): hard-coded -- must be >= the number of speaker directories.
num_speakers = 100
output_dim = num_speakers

In [None]:
# Cut every feature matrix into consecutive, non-overlapping slices of exactly
# `slice_dim` rows. Each slice inherits the label of the file it came from.
# A trailing remainder shorter than `slice_dim` is discarded.
split_features = []
split_labels = []
for utterance, label in zip(features, labels):
  n_rows = len(utterance)
  # The `+ 1` keeps a final slice that ends exactly at the last row
  # (start + slice_dim == n_rows); the previous strict `<` check dropped it.
  for start in range(0, n_rows - slice_dim + 1, slice_dim):
    split_features.append(utterance[start:start + slice_dim])
    split_labels.append(label)

In [None]:
# Hold out 20% of the slices for validation. A fixed random_state makes the
# split (and therefore the reported accuracies) reproducible between runs.
train_features, test_features, train_labels, test_labels = train_test_split(
    split_features, split_labels, test_size=0.2, random_state=42
)

# Stack each list of equally-shaped slices into one ndarray before handing it
# to torch.tensor: building a tensor directly from a list of ndarrays is very
# slow and triggers a PyTorch warning.
train_features = torch.tensor(np.array(train_features)).to(device)
test_features = torch.tensor(np.array(test_features)).to(device)
# Labels are plain Python ints; converting the list directly keeps them int64,
# which is what nn.CrossEntropyLoss expects for class-index targets.
train_labels = torch.tensor(train_labels).to(device)
test_labels = torch.tensor(test_labels).to(device)
print(len(split_features), len(train_features), len(test_features), len(train_labels))

  train_features, test_features, train_labels, test_labels = torch.tensor(train_features).to(device), torch.tensor(test_features).to(device), torch.tensor(train_labels).to(device), torch.tensor(test_labels).to(device)


7857 6285 1572 6285


In [None]:
# Sanity check: report every training sample whose first dimension is not
# exactly `slice_dim`. Nothing is printed when all samples are well-formed.
for sample in train_features:
  if sample.shape[0] != slice_dim:
    print('Error')

In [None]:
class SpeakerDataset(Dataset):
    """Map-style dataset pairing each feature slice with its speaker label.

    Args:
        data: Indexable collection of feature slices.
        labels: Indexable collection of labels, parallel to ``data``.
    """

    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the length of ``data``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.data[idx], self.labels[idx]

In [None]:
class TDNN(nn.Module):
    """Stack of 1-D convolutions followed by average pooling over the last axis.

    Four length-preserving convolutions (kernel sizes 9, 7, 5, 3 with matching
    padding) each followed by ReLU, then a kernel-size-1 convolution that maps
    to ``output_dim`` channels. The result is averaged over the last dimension
    so each input sequence yields one vector of ``output_dim`` scores.

    Args:
        input_dim: Number of input channels expected by the first convolution.
        output_dim: Number of output channels of the last convolution.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()

        # Kernel shrinks layer by layer; padding = (kernel_size - 1) / 2 keeps
        # the sequence length unchanged through every layer.
        self.conv1 = nn.Conv1d(input_dim, 128, kernel_size=9, stride=1, padding=4)
        self.conv2 = nn.Conv1d(128, 256, kernel_size=7, stride=1, padding=3)
        self.conv3 = nn.Conv1d(256, 256, kernel_size=5, stride=1, padding=2)
        self.conv4 = nn.Conv1d(256, 256, kernel_size=3, stride=1, padding=1)
        # Per-position projection to the output channels.
        self.conv5 = nn.Conv1d(256, output_dim, kernel_size=1, stride=1)

        self.relu = nn.ReLU()

    def forward(self, x):
        """Run the network on ``x`` of shape (batch, input_dim, length).

        Returns:
            Tensor of shape (batch, output_dim): the last convolution's output
            averaged over the length axis (no softmax is applied).
        """
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            x = self.relu(conv(x))

        scores = self.conv5(x)
        return scores.mean(dim=-1)

batch_size = 32

# Wrap the train / held-out tensors so they can be served in mini-batches.
train_dataset, test_dataset = (
    SpeakerDataset(train_features, train_labels),
    SpeakerDataset(test_features, test_labels),
)

# Only the training batches are reshuffled each epoch; evaluation order is fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Model, optimiser and loss. `input_dim` is the number of channels fed to the
# first convolution (the feature width after the permute below).
input_dim = 42
model = TDNN(input_dim=input_dim, output_dim=output_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-4)
loss_fn = nn.CrossEntropyLoss()

num_epochs = 85

# Step-wise learning-rate schedule: once `epoch` exceeds a threshold the
# corresponding rate is applied. Entries are ordered so the last matching
# threshold wins. (A multiplicative-decay variant was tried earlier and
# abandoned in favour of these fixed steps.)
lr_steps = ((40, 5e-5), (55, 3e-5), (70, 1e-5))

for epoch in range(num_epochs):
    # ---- training ----
    model.train()
    for threshold, stepped_lr in lr_steps:
        if epoch > threshold:
            optimizer.param_groups[0]['lr'] = stepped_lr

    for mfccs, targets in train_loader:
        optimizer.zero_grad()
        # Batches arrive as (batch, slice_dim, features); Conv1d wants the
        # feature axis in the channel position, hence the permute.
        mfccs = mfccs.permute(0, 2, 1)
        outputs = model(mfccs.float())
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

    # ---- validation on the held-out split ----
    model.eval()
    correct, total = 0, 0

    with torch.no_grad():
        for mfccs, targets in test_loader:
            mfccs = mfccs.permute(0, 2, 1)
            outputs = model(mfccs.float())
            # Predicted class = index of the largest score per sample.
            predicted = torch.max(outputs, dim=1).indices.reshape(-1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()

    accuracy = 100 * correct / total

    print(f"Epoch {epoch+1}/{num_epochs} - Validation Accuracy: {accuracy:.2f}%")

Epoch 1/85 - Validation Accuracy: 1.97%
Epoch 2/85 - Validation Accuracy: 1.72%
Epoch 3/85 - Validation Accuracy: 3.63%
Epoch 4/85 - Validation Accuracy: 4.33%
Epoch 5/85 - Validation Accuracy: 4.39%
Epoch 6/85 - Validation Accuracy: 4.71%
Epoch 7/85 - Validation Accuracy: 5.03%
Epoch 8/85 - Validation Accuracy: 6.23%
Epoch 9/85 - Validation Accuracy: 7.57%
Epoch 10/85 - Validation Accuracy: 8.59%
Epoch 11/85 - Validation Accuracy: 9.80%
Epoch 12/85 - Validation Accuracy: 14.31%
Epoch 13/85 - Validation Accuracy: 13.61%
Epoch 14/85 - Validation Accuracy: 18.45%
Epoch 15/85 - Validation Accuracy: 20.04%
Epoch 16/85 - Validation Accuracy: 19.91%
Epoch 17/85 - Validation Accuracy: 24.36%
Epoch 18/85 - Validation Accuracy: 32.12%
Epoch 19/85 - Validation Accuracy: 32.19%
Epoch 20/85 - Validation Accuracy: 36.26%
Epoch 21/85 - Validation Accuracy: 39.82%
Epoch 22/85 - Validation Accuracy: 38.68%
Epoch 23/85 - Validation Accuracy: 45.42%
Epoch 24/85 - Validation Accuracy: 47.52%
Epoch 25/85 