# Siemese Network For Speaker Verification

### Importing datasets

In [395]:
import pickle
with open('hw4_trs.pkl','rb') as train_file:
    train_df = pickle.load(train_file)
train_df.shape

(500, 16180)

In [396]:
with open('hw4_tes.pkl','rb') as test_file:
    test_df = pickle.load(test_file)
test_df.shape

(200, 22631)

# Generating Positive pairs

In [684]:
import itertools
import random
import librosa

In [685]:
def pad(x):
        target = np.zeros((45,513), dtype=int)
        source = x
        target[:x.shape[0]] = source
        return target

In [686]:
def positive_pairs(pos_speaker, L):
    comb = list(itertools.combinations(range(10), 2))
    positive_pair = []
    random_comb = random.sample(comb, L)

    for x,y in random_comb:
        stft1 = np.abs(librosa.stft(pos_speaker[x], n_fft=1024, hop_length=512)).T
        stft2 = np.abs(librosa.stft(pos_speaker[y], n_fft=1024, hop_length=512)).T
        positive_pair.append([pad(stft1), pad(stft2)])
        
    return positive_pair

# Generating Negative pairs

In [687]:
def negative_pairs(pos_i, df, L):
    df = list(df)
    
    pos = df[pos_i:pos_i+10]
    neg = df[:pos_i] + df[pos_i+10:]
    neg_speaker = random.sample(neg, L)
    negative_pair = []

    for i in range(L):
        pos_speaker = random.choice(pos)
        stft1 = np.abs(librosa.stft(pos_speaker, n_fft=1024, hop_length=512)).T
        stft2 = np.abs(librosa.stft(neg_speaker[i], n_fft=1024, hop_length=512)).T
        negative_pair.append([pad(stft1), pad(stft2)])
        
    return negative_pair

In [688]:
type(train_df)

numpy.ndarray

In [689]:
train_df[:10].shape

(10, 16180)

In [690]:
np.append(np.ones(45, dtype = int), np.zeros(45, dtype = int))

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])

In [702]:
x_train = []
y_train = []

for i in range(50):
    pos = positive_pairs(train_df[i*10:i*10+10], 45)
    neg = negative_pairs(i*10, train_df, 45)
    x_train = x_train + pos + neg
    target = np.append(np.zeros(45, dtype = int), np.ones(45, dtype = int))
    y_train.append(target)
    
    

In [703]:
x_train = np.array(x_train)
x_train.shape

(4500, 2, 45, 513)

In [704]:
y_train = np.hstack(y_train)
y_train.shape

(4500,)

In [705]:
x_test = []
y_test = []

for i in range(20):
    pos = positive_pairs(test_df[i*10:i*10+10], 45)
    neg = negative_pairs(i*10, test_df, 45)
    x_test = x_test + pos + neg
    target = np.append(np.zeros(45, dtype = int), np.ones(45, dtype = int))
    y_test.append(target)
    
    
x_test = np.array(x_test)
print(x_test.shape)

y_test = np.hstack(y_test)
print(y_test.shape)

(1800, 2, 45, 513)
(1800,)


### Creating data loaders

In [706]:
from torch import Tensor
# Creating a train loader from the magnitude signals
dataset = torch.utils.data.TensorDataset(Tensor(x_train), Tensor(y_train))
trainloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)
dataiter = iter(trainloader)
inputs, targets = dataiter.next()

print(inputs.shape)
print(targets.shape)

torch.Size([32, 2, 45, 513])
torch.Size([32])


In [707]:
# Creating a train loader from the magnitude signals
dataset = torch.utils.data.TensorDataset(Tensor(x_test), Tensor(y_test))
testloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)
dataiter = iter(testloader)
inputs, targets = dataiter.next()

print(inputs.shape)
print(targets.shape)

torch.Size([32, 2, 45, 513])
torch.Size([32])


## Siemese network for speaker verification

In [749]:
class SiameseNetwork(nn.Module):
    def __init__(self,input_dim,hidden_dim, num_layers, dropout_prob):
        super(SiameseNetwork, self).__init__()
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        self.num_layers = num_layers
        self.dropout_prob = dropout_prob

        self.gru = nn.GRU(input_dim, hidden_dim, num_layers, batch_first=True, dropout=self.dropout_prob)
        self.fc2 = nn.Linear(4500, 1)
        self.sigmoid = nn.Sigmoid()
        self.flat = nn.Flatten()
        self.cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        


    def forward(self, input1, input2):#did not know how to let two resnet share the same param.
        h0 = torch.zeros(self.num_layers, input1.size(0), self.hidden_dim).requires_grad_()
        out1, _ = self.gru(input1, h0)
        out1 = F.tanh(out1)

        h1 = torch.zeros(self.num_layers, input2.size(0), self.hidden_dim).requires_grad_()
        out2, _ = self.gru(input2, h1)
        out2 = F.tanh(out2)

        output = torch.mul(out1, out2)
        output = self.flat(output)
        
        output = self.fc2(output)
        output = F.sigmoid(output)
        return output

## Training the network

### Trained the model for 75 epochs using GRU and tanh as activation function

In [750]:
from torch.autograd import Variable
model_net = SiameseNetwork(513,100,2,0.2)
criterion = nn.BCELoss()
optimizer = optim.Adam(model_net.parameters(), lr=0.0009)
model_net.train()


for epoch in range(75):
    train_loss = []
    train_accuracy = []
    for img, labels in trainloader:
        img0, img1 = img[:,0], img[:,1] 
        outputs = model_net(img0,img1)
        predicted = torch.round(outputs.data)
        loss = criterion(outputs.squeeze(), labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss.append(loss.item())
        train_accuracy.append((predicted.squeeze() == labels).sum().item() / predicted.squeeze().size(0))
    print('Epoch: {}, Train loss: {}, Train Accuracy: {}'.format(epoch, np.round(np.mean(train_loss),4),np.round(np.mean(train_accuracy),4) ))
    

Epoch: 0, Train loss: 0.5869, Train Accuracy: 0.673
Epoch: 1, Train loss: 0.3562, Train Accuracy: 0.8539
Epoch: 2, Train loss: 0.2642, Train Accuracy: 0.8959
Epoch: 3, Train loss: 0.2036, Train Accuracy: 0.9224
Epoch: 4, Train loss: 0.1427, Train Accuracy: 0.948
Epoch: 5, Train loss: 0.1124, Train Accuracy: 0.9601
Epoch: 6, Train loss: 0.0886, Train Accuracy: 0.9701
Epoch: 7, Train loss: 0.0618, Train Accuracy: 0.9789
Epoch: 8, Train loss: 0.0575, Train Accuracy: 0.9816
Epoch: 9, Train loss: 0.0341, Train Accuracy: 0.99
Epoch: 10, Train loss: 0.0208, Train Accuracy: 0.9945
Epoch: 11, Train loss: 0.0116, Train Accuracy: 0.9978
Epoch: 12, Train loss: 0.0132, Train Accuracy: 0.9971
Epoch: 13, Train loss: 0.0241, Train Accuracy: 0.991
Epoch: 14, Train loss: 0.0341, Train Accuracy: 0.9872
Epoch: 15, Train loss: 0.0212, Train Accuracy: 0.9938
Epoch: 16, Train loss: 0.0101, Train Accuracy: 0.9973
Epoch: 17, Train loss: 0.007, Train Accuracy: 0.9978
Epoch: 18, Train loss: 0.0059, Train Accurac

## Testing the network

In [751]:
prediction_final = []
test_loss = []
test_accuracy = []
model_net.eval()
with torch.no_grad():
    for img, label in testloader:

        img0, img1 = img[:,0], img[:,1]

        outputs = model_net(img0, img1)
        predicted = torch.round(outputs)
        #print(predicted.squeeze())
        loss = criterion(outputs.squeeze(),label)
        test_loss.append(loss.item())

        #Calculating accuracy
        test_accuracy.append((predicted.squeeze() == label).sum().item() / predicted.squeeze().size(0))
        prediction_final.append(predicted)

    test_accuracy_final.append(np.round(np.mean(test_accuracy),4))
    print('Test loss: {}, Test accuracy: {}'.format(np.round(np.mean(test_loss),2), np.round(np.mean(test_accuracy),4)))

Test loss: 2.77, Test accuracy: 0.7253


## Got a test accuracy og 72%