# Siamese Network
Metric learning on the Speech Commands dataset (takes about 30 min.)

In [None]:
%%shell
pip install torchaudio

In [None]:
%%shell
git clone https://github.com/tky823/DNN-based_source_separation.git

In [None]:
%cd "/content/DNN-based_source_separation/egs/tutorials/siamese-net"

In [None]:
import sys
sys.path.append("/content/DNN-based_source_separation/src")

In [None]:
import os
import random

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
plt.rcParams['font.size'] = 18

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision, torchaudio
from torchaudio.datasets import SPEECHCOMMANDS

In [None]:
from criterion.distance import L2Loss
from criterion.metric_learn import ContrastiveWithDistanceLoss

## Generate dataset
We use only 5 classes: the spoken digits "zero" through "four".

In [None]:
# Words kept from the dataset. The position of a word in this list is used
# as its integer class target by SpeechCommandsDataset.
classes = [
    "zero", "one", "two", "three", "four"
]

In [None]:
class SpeechCommandsDataset(SPEECHCOMMANDS):
    """Speech Commands restricted to a fixed list of word classes.

    The dataset is downloaded into the current directory. Only clips whose
    parent folder name appears in ``classes`` are kept, and the integer target
    of a clip is the index of its word in ``classes``.

    Args:
        subset: ``"training"``, ``"validation"``, ``"testing"`` or ``None``.
            ``None`` keeps every clip of the selected classes regardless of
            the official split.
        transform: Optional callable applied to the padded waveform.
        classes: Sequence of word labels to keep.

    Attributes:
        targets: 1-D ``torch.long`` tensor with one class index per kept clip,
            in the same order as the items of the dataset.

    Raises:
        ValueError: If ``subset`` is not one of the supported values.
    """

    def __init__(self, subset=None, transform=None, classes=classes):
        # Validate before the (potentially long) download starts.
        if subset not in (None, "training", "validation", "testing"):
            raise ValueError(
                "subset must be None, 'training', 'validation' or 'testing', "
                "got {!r}".format(subset)
            )

        super().__init__("./", download=True)

        self.transform = transform
        self.classes = classes

        def load_all_paths(filename):
            """Read a split list file and return normalized clip paths."""
            filepath = os.path.join(self._path, filename)
            with open(filepath) as fileobj:
                return [
                    os.path.normpath(os.path.join(self._path, line.strip()))
                    for line in fileobj
                    if line.strip()
                ]

        def extract_number(paths):
            """Keep the paths whose label folder is in ``self.classes``."""
            target_paths = []
            targets = []
            for path in paths:
                # The label is the name of the folder containing the clip.
                label = os.path.basename(os.path.dirname(path))
                if label in self.classes:
                    target_paths.append(path)
                    targets.append(self.classes.index(label))

            return target_paths, torch.tensor(targets, dtype=torch.long)

        if subset == "validation":
            paths = load_all_paths("validation_list.txt")
        elif subset == "testing":
            paths = load_all_paths("testing_list.txt")
        elif subset == "training":
            excludes = set(
                load_all_paths("validation_list.txt")
                + load_all_paths("testing_list.txt")
            )
            # The walker built by the parent class may spell a path differently
            # from the paths joined above (e.g. without a leading "./"), so
            # compare normalized forms; otherwise held-out clips would leak
            # into the training split.
            paths = [
                w for w in self._walker
                if os.path.normpath(w) not in excludes
            ]
        else:
            paths = list(self._walker)

        # Always filter by class and build ``targets`` so that every item can
        # be mapped to a class index in ``__getitem__``.
        self._walker, self.targets = extract_number(paths)

    def __getitem__(self, idx):
        """Return ``(waveform, target)`` for the clip at ``idx``.

        The waveform is zero-padded symmetrically along its last dimension to
        ``sample_rate`` samples (one second) before ``transform`` is applied.
        """
        waveform, sample_rate, label, _speaker_id, _utterance_id = super().__getitem__(idx)
        target = self.classes.index(label)

        # Split the missing samples as evenly as possible between both sides.
        padding = sample_rate - waveform.size(-1)
        padding_left = padding // 2
        padding_right = padding - padding_left
        waveform = F.pad(waveform, (padding_left, padding_right))

        if self.transform is not None:
            waveform = self.transform(waveform)

        return waveform, target

In [None]:
class PairedSpeechCommandsDataset:
    """Dataset of randomly drawn clip pairs for contrastive training.

    Every item is ``(input_left, input_right, is_same)`` where ``is_same`` is
    1 when both clips belong to the same class and 0 otherwise. Pairs are
    sampled with the ``random`` module on each access; the requested index is
    not used to select the pair.

    Args:
        subset: Split forwarded to ``SpeechCommandsDataset``.
        transform: Transform forwarded to ``SpeechCommandsDataset``.
        num_samples: Reported length of the dataset. Defaults to the number
            of clips in the underlying dataset.
    """

    def __init__(self, subset="training", transform=None, num_samples=None):
        self.original_dataset = SpeechCommandsDataset(subset=subset, transform=transform)
        self.n_class = len(self.original_dataset.classes)
        self.class_list = list(range(self.n_class))

        # target_list[c] holds the item indices of all clips of class c.
        all_targets = self.original_dataset.targets
        self.target_list = [
            torch.where(all_targets==class_idx)[0].tolist()
            for class_idx in self.class_list
        ]

        self.num_samples = len(self.original_dataset) if num_samples is None else num_samples

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        # Shuffling in place and reading the first two entries yields two
        # distinct classes.
        random.shuffle(self.class_list)
        positive_class, negative_class = self.class_list[0], self.class_list[1]
        is_same = random.randint(0, 1)

        if is_same != 1:
            # Negative pair: one clip from each of the two classes.
            idx_left = random.choice(self.target_list[positive_class])
            idx_right = random.choice(self.target_list[negative_class])
        else:
            # Positive pair: two different clips of the same class.
            idx_left, idx_right = random.sample(self.target_list[positive_class], 2)

        input_left, _ = self.original_dataset[idx_left]
        input_right, _ = self.original_dataset[idx_right]

        return input_left, input_right, is_same

In [None]:
# Seed both the Python and the PyTorch random number generators.
random.seed(111)
torch.manual_seed(111)
num_samples = 500000  # number of random pairs reported as the length of the paired training set
batch_size = 64  # batch size of the paired training loader
sr = 16000  # NOTE(review): not referenced by the other visible cells; the dataset pads using the sample rate returned with each clip

In [None]:
class Log10:
    """Callable transform computing ``10 * log10(input + eps)``.

    Args:
        eps: Constant added to the input before the logarithm so that zero
            entries do not produce ``-inf``.
    """

    def __init__(self, eps=1e-12):
        self.eps = eps

    def __call__(self, input):
        stabilized = input + self.eps
        return 10 * stabilized.log10()

In [None]:
# Feature pipeline applied to every padded waveform: a spectrogram computed
# with torchaudio's default settings, followed by the Log10 transform.
transform = torchvision.transforms.Compose([
    torchaudio.transforms.Spectrogram(),
    Log10()
])

In [None]:
# Paired training data. PairedSpeechCommandsDataset draws a random pair on
# every access, so num_samples only fixes how many pairs one pass contains.
train_dataset = PairedSpeechCommandsDataset("training", transform=transform, num_samples=num_samples)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

## Model

In [None]:
class ConvBlock(nn.Module):
    """Conv2d -> PReLU -> 2x2 max-pooling -> dropout.

    Args:
        in_channels: Number of input channels of the convolution.
        out_channels: Number of output channels of the convolution.
        kernel_size: Kernel size of the convolution.
        stride: Stride of the convolution.
        dropout: Dropout probability applied after pooling.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, dropout=0.3):
        super().__init__()
        self.conv2d = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride)
        self.prelu = nn.PReLU()
        self.pool2d = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input):
        x = self.conv2d(input)
        x = self.prelu(x)
        x = self.pool2d(x)
        output = self.dropout(x)

        return output


class BasicModel(nn.Module):
    """Four ConvBlocks followed by a two-layer MLP producing an embedding.

    Args:
        embed_dim: Dimension of the output embedding.
        dropout: Dropout probability used in every ConvBlock.
    """

    def __init__(self, embed_dim=2, dropout=0.3):
        super().__init__()

        # Forward `dropout` to every block; previously the argument was
        # accepted but ignored, so the blocks always used their own default.
        net = []
        net.append(ConvBlock(1, 16, 5, dropout=dropout))
        net.append(ConvBlock(16, 32, 3, dropout=dropout))
        net.append(ConvBlock(32, 64, 3, dropout=dropout))
        net.append(ConvBlock(64, 128, 3, dropout=dropout))

        # 128*10*3 is the flattened size after the four blocks for a
        # 1 x 201 x 81 input (presumably the default torchaudio spectrogram
        # of a one-second 16 kHz clip — confirm if the transform changes).
        fc_net = []
        fc_net.append(nn.Linear(128*10*3, 512))
        fc_net.append(nn.PReLU())
        fc_net.append(nn.Linear(512, embed_dim))

        self.net = nn.Sequential(*net)
        self.fc_net = nn.Sequential(*fc_net)

    def forward(self, input):
        """Map a batch of single-channel feature maps to embeddings."""
        x = self.net(input)
        x = x.view(-1, 128*10*3)
        output = self.fc_net(x)

        return output

In [None]:
# Build the network with its default arguments (embed_dim=2, dropout=0.3).
model = BasicModel()

In [None]:
# Display the module hierarchy of the network.
print(model)

In [None]:
# Move the model parameters to the GPU when CUDA is available.
if torch.cuda.is_available():
    model.cuda()

## Training
Embed in 2D

In [None]:
# Adam over all model parameters with learning rate 1e-3.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# L2Loss and ContrastiveWithDistanceLoss are imported from the cloned
# repository's `criterion` package. NOTE(review): presumably a Euclidean
# distance that the contrastive loss applies to the two embeddings — confirm
# against their source.
criterion = L2Loss()
contrastive_criterion = ContrastiveWithDistanceLoss(distance_fn=criterion)

In [None]:
# One pass over the paired training loader, recording the loss of every batch.
model.train()

train_loss = []
use_cuda = torch.cuda.is_available()

for idx, batch in enumerate(train_loader):
    input_left, input_right, is_same = batch
    if use_cuda:
        input_left, input_right, is_same = input_left.cuda(), input_right.cuda(), is_same.cuda()

    optimizer.zero_grad()

    # Both inputs go through the same network (shared weights).
    output_left = model(input_left)
    output_right = model(input_right)

    loss = contrastive_criterion(output_left, output_right, is_same)
    loss.backward()
    optimizer.step()

    loss_value = loss.item()
    train_loss.append(loss_value)

    # Report progress every 50 iterations.
    if (idx + 1) % 50 == 0:
        print("{}/{} Loss: {:.5f}".format(idx + 1, len(train_loader), loss_value))

In [None]:
# Moving average of the training loss over windows of 100 iterations:
# average_loss[k] is the mean of train_loss[k:k + 100].
train_loss = np.array(train_loss)
window = 100

# Add up the `window` shifted views of the loss curve, then divide once.
average_loss = sum(train_loss[shift: shift - window] for shift in range(window)) / window

In [None]:
# Plot the per-iteration loss together with its 100-iteration moving average.
plt.figure(figsize=(12, 8))
# The first 100 iterations are dropped so that both curves have the same
# length (average_loss has 100 fewer entries than train_loss).
plt.plot(train_loss[100:], color='deepskyblue')
plt.plot(average_loss, color='black')
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.show()
# plt.savefig("./loss/train_loss.png", bbox_inches='tight')

In [None]:
# For visualization
train_dataset = SpeechCommandsDataset("training", transform=transform)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1, shuffle=False)

In [None]:
# Embed every training clip and collect the embeddings with their labels.
model.eval()

x = []
labels = []
use_cuda = torch.cuda.is_available()

with torch.no_grad():
    for features, target in train_loader:
        if use_cuda:
            features, target = features.cuda(), target.cuda()
        # The loader uses batch_size=1, so drop the batch dimension.
        embedding = model(features).squeeze(dim=0)
        x.append(embedding.cpu().numpy())
        labels.append(target.squeeze(dim=0).cpu().item())

x = np.array(x)
labels = np.array(labels)

In [None]:
# Scatter plot of the training embeddings, one colour per class.
plt.figure(figsize=(12, 8))

# `x` and `labels` were computed from the training split, so the class names
# are taken from train_dataset. (test_dataset is only created in the Test
# section, so referring to it here fails when cells run top to bottom.)
for class_idx, label in enumerate(train_dataset.classes):
    x_class = x[labels == class_idx]
    plt.scatter(x_class[:, 0], x_class[:, 1], label=label)

plt.legend()
plt.show()
# plt.savefig("./embedding/train_embedding.png", bbox_inches='tight')

## Test

In [None]:
# Unpaired testing split, read one clip at a time in a fixed order.
test_dataset = SpeechCommandsDataset("testing", transform=transform)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)

In [None]:
# Embed every testing clip and collect the embeddings with their labels.
model.eval()

x = []
labels = []
use_cuda = torch.cuda.is_available()

with torch.no_grad():
    for features, target in test_loader:
        if use_cuda:
            features, target = features.cuda(), target.cuda()
        # The loader uses batch_size=1, so drop the batch dimension.
        embedding = model(features).squeeze(dim=0)
        x.append(embedding.cpu().numpy())
        labels.append(target.squeeze(dim=0).cpu().item())

x = np.array(x)
labels = np.array(labels)

In [None]:
# Scatter plot of the testing embeddings, one colour per class.
plt.figure(figsize=(12, 8))

for class_idx, label in enumerate(test_dataset.classes):
    # Select the embeddings whose label equals this class index.
    x_class = x[labels == class_idx]
    plt.scatter(x_class[:, 0], x_class[:, 1], label=label)

plt.legend()
plt.show()
# plt.savefig("./embedding/test_embedding.png", bbox_inches='tight')