# Fully connected network for speech component codes


## Database

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import pickle
tr = torch

In [6]:
class CodesDataset(Dataset):
    def __init__(self):
        with open("dbase.pkl", "rb") as f:
            self.db = pickle.load(f)
    
        self.n = len(self.db)
        
        
    def __len__(self):
        return self.n
    
    
    def __getitem__(self, idx):
        src = tr.from_numpy(self.db[idx]["src"])
        trg = tr.from_numpy(self.db[idx]["trg"])
        
        return {"input": src, "target": trg, "id": self.db[idx]["uttr_id"] }

In [7]:
test = CodesDataset()
test[0]

{'input': tensor([[[ 0.1466,  0.0354,  0.0592,  ...,  0.0052, -0.2884, -0.4797],
          [ 0.2820,  0.0950, -0.1234,  ...,  0.0071, -0.1590, -0.2231],
          [ 0.2846,  0.1012, -0.0919,  ...,  0.0302, -0.1551, -0.1574],
          ...,
          [-0.3304,  0.3403,  0.1219,  ...,  0.0090, -0.9641, -0.9719],
          [-0.3302,  0.3470,  0.1309,  ...,  0.0085, -0.9641, -0.9695],
          [-0.3235,  0.3069,  0.1620,  ...,  0.0074, -0.9641, -0.9490]]]),
 'target': tensor([[[ 2.6619e-01, -1.3325e-02,  1.2860e-01,  ..., -5.3857e-04,
           -1.7154e-01, -4.3410e-01],
          [ 4.8241e-01,  1.9633e-01, -2.1905e-03,  ..., -1.2183e-02,
           -8.6611e-02, -8.7461e-02],
          [ 2.0369e-01,  2.5058e-01,  3.1114e-02,  ...,  2.2069e-02,
           -2.0906e-01, -1.9413e-01],
          ...,
          [-3.3287e-01,  3.6905e-01,  1.9288e-01,  ...,  9.5199e-03,
           -9.2972e-01, -9.3548e-01],
          [-3.2955e-01,  3.7391e-01,  2.0085e-01,  ...,  9.1452e-03,
           -9.2972e

## Network

In [17]:
class CodeConverter(tr.nn.Module):
    def __init__(self, layer_dims=None, activation=None, dropout_p=0):
        super().__init__()
        self.layer_dims = layer_dims
        self.activation = activation
        if self.layer_dims is None:
            self.layer_dims = [24*82, 1024, 512, 512, 1024, 24*82]
        
        if self.activation is None:
            self.activation = tr.nn.LeakyReLU()
            
        self.layers = tr.nn.ModuleList()
        self.layers.append(tr.nn.Flatten())
        
        n_layers = len(self.layer_dims)
        for cnt, (i, o) in enumerate(zip(self.layer_dims, self.layer_dims[1:])):
            self.layers.append(tr.nn.Linear(i, o))
            
            # do not apply activation on last layer
            if cnt < n_layers-1:
                self.layers.append(self.activation)
                
            # apply dropout on hidden layers
            if cnt > 0 and cnt < n_layers-1:
                self.layers.append(tr.nn.Dropout(dropout_p))
            
            
    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        
        # codes are in range -1, 1
        x = tr.tanh(x)
        
        return x.view(x.shape[0], 24, 82)


In [18]:
code_converter = CodeConverter()
y = code_converter(tr.randn(1, 24, 82))
print(y.shape, y)

torch.Size([1, 24, 82]) tensor([[[-5.7939e-04,  9.6043e-03, -1.3084e-04,  ..., -2.7354e-04,
           4.6815e-02, -2.1515e-04],
         [-2.3149e-04,  2.8743e-02, -1.6837e-04,  ...,  2.8753e-02,
           5.6629e-02, -2.5217e-04],
         [-1.5955e-04,  3.5345e-02, -2.2946e-04,  ...,  3.4506e-02,
          -2.9878e-05,  5.6669e-03],
         ...,
         [-2.6281e-04,  2.1909e-02,  2.7877e-03,  ..., -1.0122e-05,
          -2.4419e-04, -1.5579e-04],
         [ 3.0493e-02,  1.3447e-02,  1.5064e-02,  ..., -7.7468e-05,
          -2.6115e-04,  9.8044e-03],
         [ 4.0743e-03, -2.6301e-05,  7.5645e-02,  ...,  3.9122e-03,
          -2.6865e-04, -3.7632e-04]]], grad_fn=<ViewBackward>)


In [19]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(count_parameters(code_converter))

5346224


In [21]:
cc = CodeConverter(layer_dims=[24*82, 1024, 1024, 24*82])
print(count_parameters(cc))

5083056
