In [None]:
# Mount Google Drive at /content/gdrive so the later cells can reach the
# project files stored on Drive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Disabled setup step: both commands below are commented out, so running this
# cell does nothing.
# %cd /content/gdrive/MyDrive/KLTN/source
# !git clone https://github.com/clovaai/voxceleb_trainer

In [None]:
# Switch the working directory to the project folder on Drive; the training
# commands in this notebook use paths relative to it (./trainSpeakerNet.py,
# ./configs/..., ../../dataset/...).
%cd /content/gdrive/MyDrive/KLTN/source/voxceleb_trainer_component/

/content/gdrive/.shortcut-targets-by-id/19nyuDPQ1eh6564W5mYp2mmfVYxIS9Gyu/KLTN/source/voxceleb_trainer_component


In [None]:
!pip install asteroid_filterbanks -q
!pip install soundfile==0.10.3.post1 -q

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
librosa 0.10.0.post2 requires soundfile>=0.12.1, but you have soundfile 0.10.3.post1 which is incompatible.[0m[31m
[0m

In [None]:
# Launch trainSpeakerNet.py with the RawNet3_AAM config on the zalo_dataset
# train/val lists, starting from the checkpoint ../rawnet3/model.pt, with
# --max_epoch 60.
# NOTE(review): the run logs "Save path: exps/RawNet3_AAM" even though
# --save_path points at .../exps_component -- presumably the YAML config takes
# precedence over the command-line value; confirm where checkpoints are written.
!python ./trainSpeakerNet.py --config ./configs/RawNet3_AAM.yaml --train_list ../../dataset/zalo_dataset/dataset_fix/train_list.txt --train_path ../../dataset/zalo_dataset/dataset_fix/train --test_list ../../dataset/zalo_dataset/dataset_fix/veri_val.txt --test_path ../../dataset/zalo_dataset/dataset_fix/val --initial_model ../rawnet3/model.pt --max_epoch 60 --save_path /content/gdrive/MyDrive/KLTN/source/voxceleb_trainer_component/exps_component

Python Version: 3.10.12 (main, Jun  7 2023, 12:45:35) [GCC 9.4.0]
PyTorch Version: 2.0.1+cu118
Number of GPUs: 1
Save path: exps/RawNet3_AAM
self.encoder_type ECA
Initialised AAMSoftmax margin 0.100 scale 30.000
Initialised Adam optimizer
Initialised step LR scheduler
Model ../rawnet3/model.pt loaded!
Processing 8000 of 8000:Loss 3.409056 TEER/TAcc 44.812% - 1.24 Hz 
 2023-06-18 10:15:38 Epoch 1, TEER/TAcc 44.81, TLOSS 3.409056, LR 0.001000
Reading 900 of 973: 6.03 Hz, embedding size 256
Computing 19900 of 20000: 2933.42 Hz
 2023-06-18 10:18:25 Epoch 1, VEER 6.6000, MinDCF 0.35480
Processing 8064 of 8064:Loss 1.300100 TEER/TAcc 71.987% - 69.79 Hz 
 2023-06-18 10:26:46 Epoch 2, TEER/TAcc 71.99, TLOSS 1.300100, LR 0.001000
Reading 900 of 973: 6.22 Hz, embedding size 256
Computing 19900 of 20000: 3016.47 Hz
 2023-06-18 10:29:29 Epoch 2, VEER 6.5900, MinDCF 0.33630
Processing 8032 of 8032:Loss 1.026781 TEER/TAcc 76.008% - 71.27 Hz 
 2023-06-18 10:31:40 Epoch 3, TEER/TAcc 76.01, TLOSS 1.026

In [None]:
import torch
import torch.nn as nn
from asteroid_filterbanks import Encoder, ParamSincFB


In [None]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class PreEmphasis(torch.nn.Module):
    """First-order pre-emphasis filter: ``y[t] = x[t] - coef * x[t - 1]``.

    The filter is applied with a 1-D convolution over a batch of waveforms.
    One sample of reflect padding is added on the left so the output has the
    same number of samples as the input.

    Args:
        coef: pre-emphasis coefficient applied to the previous sample.
    """

    def __init__(self, coef: float = 0.97) -> None:
        super().__init__()
        self.coef = coef
        # F.conv1d computes a cross-correlation, so the kernel is stored
        # already flipped as [-coef, 1] with shape (out_ch=1, in_ch=1, k=2).
        # Registered as a buffer: it follows .to()/.cuda() and is saved in the
        # state dict, but is not a trainable parameter.
        self.register_buffer(
            "flipped_filter",
            torch.tensor([-self.coef, 1.0], dtype=torch.float32).view(1, 1, 2),
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Filter a batch of waveforms.

        Args:
            input: tensor with exactly two dimensions, (batch, samples).

        Returns:
            Tensor of shape (batch, 1, samples).

        Raises:
            AssertionError: if ``input`` is not 2-dimensional.
        """
        assert (
            input.dim() == 2
        ), "The number of dimensions of input tensor must be 2!"
        # Add the channel axis expected by conv1d: (batch, 1, samples).
        input = input.unsqueeze(1)
        # Reflect-pad one sample on the left to match lengths of in/out.
        input = F.pad(input, (1, 0), "reflect")
        return F.conv1d(input, self.flipped_filter)


class AFMS(nn.Module):
    """
    Alpha-feature map scaling (AFMS), applied to the output of each residual block [1, 2].

    A learnable per-channel offset ``alpha`` is added to the feature map, and
    the sum is multiplied by a per-channel gate in (0, 1). The gate is obtained
    by averaging the input over time, passing it through a linear layer and a
    sigmoid.

    Reference:
    [1] RawNet2 : https://www.isca-speech.org/archive/Interspeech_2020/pdfs/1011.pdf
    [2] AFMS    : https://www.koreascience.or.kr/article/JAKO202029757857763.page
    """

    def __init__(self, nb_dim: int) -> None:
        """
        :param nb_dim: number of channels of the feature maps this layer scales
        """
        super().__init__()
        # Additive per-channel term, broadcast over batch and time.
        self.alpha = nn.Parameter(torch.ones((nb_dim, 1)))
        # Linear + sigmoid that turn the time-averaged features into the gate.
        self.fc = nn.Linear(nb_dim, nb_dim)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        """
        :param x: feature map of shape (batch, nb_dim, time)
        :return: scaled feature map with the same shape as ``x``
        """
        batch, channels = x.size(0), x.size(1)

        # Average over time -> (batch, channels).
        pooled = F.adaptive_avg_pool1d(x, 1).view(batch, -1)
        # One gate value per channel -> (batch, channels, 1).
        gate = self.sig(self.fc(pooled)).view(batch, channels, -1)

        return (x + self.alpha) * gate


class Bottle2neck(nn.Module):
    """Multi-scale bottleneck block with a residual shortcut and AFMS scaling.

    Processing steps:

    1. ``1x1 conv -> ReLU -> BN`` maps ``inplanes`` to ``width * scale``
       channels, where ``width = planes // scale``.
    2. The result is split into ``scale`` groups of ``width`` channels. The
       first ``scale - 1`` groups go through a chain of dilated convolutions
       (each followed by ReLU and BN); from the second group on, the input of
       each convolution is the group summed with the previous group's output.
       The last group is forwarded untouched.
    3. The groups are concatenated and passed through a second
       ``1x1 conv -> ReLU -> BN`` producing ``planes`` channels.
    4. The shortcut (identity, or a bias-free 1x1 conv when the channel count
       changes) is added, the result is optionally max-pooled along time, and
       finally rescaled by ``AFMS``.

    Args:
        inplanes: number of input channels.
        planes: number of output channels.
        kernel_size: kernel size of the grouped convolutions (required).
        dilation: dilation of the grouped convolutions (required).
        scale: number of channel groups. With ``scale == 1`` there is a single
            group that goes through one convolution.
        pool: kernel size of the trailing ``nn.MaxPool1d``; a falsy value
            disables pooling.

    Raises:
        TypeError: if ``kernel_size`` or ``dilation`` is left as ``None``.
    """

    def __init__(
        self,
        inplanes,
        planes,
        kernel_size=None,
        dilation=None,
        scale=4,
        pool=False,
    ):
        super().__init__()

        # The defaults are placeholders only; fail with a clear message rather
        # than an arithmetic error on None further down.
        if kernel_size is None or dilation is None:
            raise TypeError(
                "Bottle2neck requires explicit kernel_size and dilation values"
            )

        width = planes // scale

        self.conv1 = nn.Conv1d(inplanes, width * scale, kernel_size=1)
        self.bn1 = nn.BatchNorm1d(width * scale)

        # Number of groups that get their own convolution. With a single group
        # there is no pass-through group, so that one group is convolved.
        self.nums = 1 if scale == 1 else scale - 1

        # For odd kernel sizes this padding keeps the time length unchanged.
        num_pad = (kernel_size // 2) * dilation

        self.convs = nn.ModuleList(
            [
                nn.Conv1d(
                    width,
                    width,
                    kernel_size=kernel_size,
                    dilation=dilation,
                    padding=num_pad,
                )
                for _ in range(self.nums)
            ]
        )
        self.bns = nn.ModuleList(
            [nn.BatchNorm1d(width) for _ in range(self.nums)]
        )

        self.conv3 = nn.Conv1d(width * scale, planes, kernel_size=1)
        self.bn3 = nn.BatchNorm1d(planes)

        self.relu = nn.ReLU()

        self.width = width

        # Kept as ``False`` (not a module) when pooling is disabled; forward()
        # checks its truthiness.
        self.mp = nn.MaxPool1d(pool) if pool else False
        self.afms = AFMS(planes)

        if inplanes != planes:  # if change in number of filters
            self.residual = nn.Sequential(
                nn.Conv1d(inplanes, planes, kernel_size=1, stride=1, bias=False)
            )
        else:
            self.residual = nn.Identity()

    def forward(self, x):
        """
        :param x: feature map of shape (batch, inplanes, time)
        :return: feature map with ``planes`` channels; the time axis is reduced
            by the max-pool when ``pool`` was given
        """
        residual = self.residual(x)

        out = self.bn1(self.relu(self.conv1(x)))

        spx = torch.split(out, self.width, 1)
        branches = []
        sp = None
        for i in range(self.nums):
            # From the second group on, feed in the previous group's output.
            sp = spx[i] if i == 0 else sp + spx[i]
            sp = self.bns[i](self.relu(self.convs[i](sp)))
            branches.append(sp)

        # The last group bypasses the convolutions. It only exists when there
        # are more groups than convolutions, i.e. scale > 1.
        if len(spx) > self.nums:
            branches.append(spx[self.nums])
        out = torch.cat(branches, 1)

        out = self.bn3(self.relu(self.conv3(out)))

        out = out + residual
        if self.mp:
            out = self.mp(out)
        return self.afms(out)


In [None]:
class RawNet3(nn.Module):
    """RawNet3-style speaker embedding network on raw waveforms, with an extra
    trainable head (``new_component``) applied to the embedding.

    Pipeline: pre-emphasis + instance norm -> parametric sinc filterbank
    (``ParamSincFB``) -> three ``block`` layers -> 1x1 conv to 1536 channels ->
    attentive statistics pooling (weighted mean and std, 3072 values) ->
    BN -> linear to ``nOut`` -> optional BN -> ``new_component``.

    Args:
        block: class used for the three backbone layers; it is called as
            ``block(in_ch, out_ch, kernel_size=..., dilation=..., scale=..., pool=...)``.
        model_scale: forwarded to every ``block`` as ``scale``.
        context: if true, the attention input is the frame features
            concatenated with their utterance-level mean and std.
        summed: if true, layer 3 receives ``maxpool(layer1 output) + layer2 output``
            instead of the layer 2 output alone.
        C: channel width of the backbone layers.
        **kwargs: must contain ``nOut`` (embedding size), ``encoder_type``
            (``"ECA"`` or ``"ASP"``), ``log_sinc``, ``norm_sinc``
            (``"mean"``, ``"mean_std"`` or anything else for no normalisation),
            ``out_bn`` and ``sinc_stride``. Other keys are ignored.

    Raises:
        KeyError: if a required key is missing from ``kwargs``.
        ValueError: if ``encoder_type`` is neither ``"ECA"`` nor ``"ASP"``.
    """

    def __init__(self, block, model_scale, context, summed, C=1024, **kwargs):
        super().__init__()

        nOut = kwargs["nOut"]

        self.context = context
        self.encoder_type = kwargs["encoder_type"]
        self.log_sinc = kwargs["log_sinc"]
        self.norm_sinc = kwargs["norm_sinc"]
        self.out_bn = kwargs["out_bn"]
        self.summed = summed

        self.preprocess = nn.Sequential(
            PreEmphasis(), nn.InstanceNorm1d(1, eps=1e-4, affine=True)
        )
        # Filterbank front-end: C // 4 filters with kernel length 251.
        self.conv1 = Encoder(
            ParamSincFB(
                C // 4,
                251,
                stride=kwargs["sinc_stride"],
            )
        )
        self.relu = nn.ReLU()
        self.bn1 = nn.BatchNorm1d(C // 4)

        self.layer1 = block(
            C // 4, C, kernel_size=3, dilation=2, scale=model_scale, pool=5
        )
        self.layer2 = block(
            C, C, kernel_size=3, dilation=3, scale=model_scale, pool=3
        )
        self.layer3 = block(C, C, kernel_size=3, dilation=4, scale=model_scale)
        # Fuses the concatenated outputs of the three layers.
        self.layer4 = nn.Conv1d(3 * C, 1536, kernel_size=1)

        if self.context:
            # Frame features plus their broadcast mean and std.
            attn_input = 1536 * 3
        else:
            attn_input = 1536
        print("self.encoder_type", self.encoder_type)
        if self.encoder_type == "ECA":
            # One attention weight per channel and frame.
            attn_output = 1536
        elif self.encoder_type == "ASP":
            # One attention weight per frame, shared by all channels.
            attn_output = 1
        else:
            raise ValueError("Undefined encoder")

        self.attention = nn.Sequential(
            nn.Conv1d(attn_input, 128, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Conv1d(128, attn_output, kernel_size=1),
            nn.Softmax(dim=2),  # normalise the weights over the time axis
        )

        # 3072 = 2 * 1536: weighted mean and weighted std are concatenated.
        self.bn5 = nn.BatchNorm1d(3072)

        self.fc6 = nn.Linear(3072, nOut)
        self.bn6 = nn.BatchNorm1d(nOut)

        self.mp3 = nn.MaxPool1d(3)

        # Extra head on top of the embedding. Its width follows nOut so the
        # model also works for embedding sizes other than 256.
        # NOTE(review): there is no non-linearity between these layers, so the
        # three of them compose to a single affine map -- confirm that this is
        # intended.
        self.new_component = nn.Sequential(
            nn.Linear(nOut, nOut),
            nn.Linear(nOut, nOut),
            nn.Linear(nOut, nOut),
        )

    def forward(self, x):
        """
        :param x: input mini-batch (bs, samp)
        :return: embeddings of shape (bs, nOut)
        """

        # The front-end runs with autocast explicitly disabled.
        with torch.cuda.amp.autocast(enabled=False):
            x = self.preprocess(x)
            x = torch.abs(self.conv1(x))
            if self.log_sinc:
                x = torch.log(x + 1e-6)
            if self.norm_sinc == "mean":
                x = x - torch.mean(x, dim=-1, keepdim=True)
            elif self.norm_sinc == "mean_std":
                m = torch.mean(x, dim=-1, keepdim=True)
                # Floor the std out of place: writing into the tensor returned
                # by torch.std would invalidate it for the backward pass.
                s = torch.std(x, dim=-1, keepdim=True).clamp(min=0.001)
                x = (x - m) / s

        x1 = self.layer1(x)
        x2 = self.layer2(x1)
        # layer2 pools by 3, so layer1's output is pooled by 3 to line up.
        x1_pooled = self.mp3(x1)
        x3 = self.layer3(x1_pooled + x2 if self.summed else x2)

        x = self.layer4(torch.cat((x1_pooled, x2, x3), dim=1))
        x = self.relu(x)

        t = x.size()[-1]

        if self.context:
            global_x = torch.cat(
                (
                    x,
                    torch.mean(x, dim=2, keepdim=True).repeat(1, 1, t),
                    torch.sqrt(
                        torch.var(x, dim=2, keepdim=True).clamp(
                            min=1e-4, max=1e4
                        )
                    ).repeat(1, 1, t),
                ),
                dim=1,
            )
        else:
            global_x = x

        w = self.attention(global_x)

        # Attention-weighted mean and standard deviation over time; the
        # variance is clamped before the square root.
        mu = torch.sum(x * w, dim=2)
        sg = torch.sqrt(
            (torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-4, max=1e4)
        )

        x = torch.cat((mu, sg), 1)

        x = self.bn5(x)

        x = self.fc6(x)

        if self.out_bn:
            x = self.bn6(x)
        x = self.new_component(x)
        return x



In [None]:
# Instantiate RawNet3 with Bottle2neck blocks and a 256-dimensional embedding.
# `grad_mult` is not read by RawNet3.__init__; it is swallowed by **kwargs.
model = RawNet3(
    Bottle2neck,
    model_scale=8,
    context=True,
    summed=True,
    encoder_type="ECA",
    nOut=256,
    out_bn=False,
    sinc_stride=10,
    log_sinc=True,
    norm_sinc="mean",
    grad_mult=1,
)

# Freeze every parameter, then make only the new head trainable again.
model.requires_grad_(False)
model.new_component.requires_grad_(True)

self.encoder_type ECA


In [None]:
!pip install torchinfo -q
from torchinfo import summary
summary(model, input_size=(1, 48000)) # component

Layer (type:depth-idx)                   Output Shape              Param #
RawNet3                                  [1, 256]                  1,024
├─Sequential: 1-1                        [1, 1, 48000]             --
│    └─PreEmphasis: 2-1                  [1, 1, 48000]             --
│    └─InstanceNorm1d: 2-2               [1, 1, 48000]             (2)
├─Encoder: 1-2                           [1, 256, 4775]            256
├─Bottle2neck: 1-3                       [1, 1024, 955]            --
│    └─Sequential: 2-3                   [1, 1024, 4775]           --
│    │    └─Conv1d: 3-1                  [1, 1024, 4775]           (262,144)
│    └─Conv1d: 2-4                       [1, 1024, 4775]           (263,168)
│    └─ReLU: 2-5                         [1, 1024, 4775]           --
│    └─BatchNorm1d: 2-6                  [1, 1024, 4775]           (2,048)
│    └─ModuleList: 2-25                  --                        (recursive)
│    │    └─Conv1d: 3-2                  [1, 128, 47

In [None]:
# Same training command as the earlier run, but with --max_epoch 20 and no
# --save_path argument.
# NOTE(review): the output stored under this cell reports a different Python /
# PyTorch version and an earlier date than the first training cell, so it
# appears to come from a previous session -- re-run before relying on these
# numbers.
!python ./trainSpeakerNet.py --config ./configs/RawNet3_AAM.yaml --train_list ../../dataset/zalo_dataset/dataset_fix/train_list.txt --train_path ../../dataset/zalo_dataset/dataset_fix/train --test_list ../../dataset/zalo_dataset/dataset_fix/veri_val.txt --test_path ../../dataset/zalo_dataset/dataset_fix/val --initial_model ../rawnet3/model.pt --max_epoch 20

Python Version: 3.9.16 (main, Dec  7 2022, 01:11:51) 
[GCC 9.4.0]
PyTorch Version: 2.0.0+cu118
Number of GPUs: 1
Save path: exps/RawNet3_AAM
self.encoder_type ECA
Initialised AAMSoftmax margin 0.100 scale 30.000
Initialised Adam optimizer
Initialised step LR scheduler
Model ../rawnet3/model.pt loaded!
Processing 8000 of 8000:Loss 3.519989 TEER/TAcc 44.475% - 4.94 Hz 
 2023-04-12 09:59:40 Epoch 1, TEER/TAcc 44.48, TLOSS 3.519989, LR 0.001000
Reading 900 of 973: 6.24 Hz, embedding size 256
Computing 19900 of 20000: 3069.37 Hz
 2023-04-12 10:02:22 Epoch 1, VEER 5.4000, MinDCF 0.33780
Processing 8064 of 8064:Loss 0.897120 TEER/TAcc 81.448% - 26.86 Hz 
 2023-04-12 10:09:03 Epoch 2, TEER/TAcc 81.45, TLOSS 0.897120, LR 0.001000
Reading 900 of 973: 6.33 Hz, embedding size 256
Computing 19900 of 20000: 3108.15 Hz
 2023-04-12 10:11:43 Epoch 2, VEER 4.6400, MinDCF 0.31150
Processing 8032 of 8032:Loss 0.638218 TEER/TAcc 86.417% - 27.04 Hz 
 2023-04-12 10:16:46 Epoch 3, TEER/TAcc 86.42, TLOSS 0.638

In [None]:
# Scratch cell: prints a fixed string and touches no notebook state.
print("hi")

hi
