<h1>Demo</h1>

Load the Speech Commands dataset with TensorFlow Datasets.

Convert it to a PyTorch dataset.

Train and evaluate a CNN and a ResNet on it.

In [22]:
import torch 
import torchaudio
import torchaudio.transforms as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import tensorflow as tf
import tensorflow_datasets as tfds

In [None]:
#tensorflow convert to torch dataset
class SpeechCommandsDataset(torch.utils.data.Dataset):
    """In-memory PyTorch dataset built from one split of TFDS ``speech_commands``.

    The split is read from ``./tensorflow_datasets`` with ``download=False``,
    so the prepared data must already exist in that directory. Every example
    is converted eagerly in ``__init__`` and kept in ``self.data`` as an
    ``(audio, label)`` pair of tensors.

    Args:
        split: Name of the TFDS split to load (default ``'train'``).
        preprocess: Callable applied to each audio tensor before it is stored;
            the default returns the tensor unchanged.
    """

    def __init__(self, split='train', preprocess=lambda x: x):
        self.data = []
        examples = tfds.load(
            'speech_commands',
            split=split,
            data_dir='./tensorflow_datasets',
            download=False,
        )
        for example in examples:
            # Go through NumPy to obtain a torch tensor, then preprocess it.
            waveform = preprocess(torch.tensor(np.array(example['audio'])))
            target = torch.tensor(int(example['label']))
            self.data.append((waveform, target))

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(audio, label)`` pair stored at position ``idx``."""
        return self.data[idx]


In [24]:
def preprocess(audio, n_samples=16000):
    """Bring a 1-D audio tensor to a fixed length and scale it by 1/32768.

    Clips shorter than ``n_samples`` are zero-padded at the end. Clips longer
    than ``n_samples`` are cropped to their first ``n_samples`` values, so
    every returned tensor has the same length and can be stacked into a batch.

    Args:
        audio: 1-D tensor of samples; its length is taken from dim 0.
        n_samples: Target number of samples (default 16000).

    Returns:
        A floating-point tensor of length ``n_samples`` holding the
        padded/cropped input divided by 32768.0.
    """
    length = audio.size(0)
    if length < n_samples:
        audio = F.pad(input=audio, pad=(0, n_samples - length), mode='constant', value=0)
    elif length > n_samples:
        # The original only padded; over-long clips kept their length.
        audio = audio[:n_samples]

    # 32768 == 2**15; presumably the input is 16-bit PCM -- confirm against the data.
    return audio / 32768.0

# Build the train / validation / test datasets, each run through `preprocess`.
train_dataset, valid_dataset, test_dataset = (
    SpeechCommandsDataset(split_name, preprocess=preprocess)
    for split_name in ('train', 'validation', 'test')
)


In [25]:
# Wrap every split in a shuffled DataLoader with batches of 64.
train_loader, valid_loader, test_loader = (
    torch.utils.data.DataLoader(split_dataset, batch_size=64, shuffle=True)
    for split_dataset in (train_dataset, valid_dataset, test_dataset)
)

# Sanity check: show the shape, the first waveform and the labels of one batch.
for data, target in train_loader:
    print(data.shape, data[0], target, sep='\n')
    break


torch.Size([64, 16000])
tensor([0.0047, 0.0218, 0.0165,  ..., 0.0240, 0.0307, 0.0124])
tensor([11, 11, 11,  6, 10, 11,  8,  4,  3,  3,  5,  9, 11, 11, 11, 11,  5, 11,
        11, 11,  1, 11, 11, 11,  6,  9, 11, 11,  6,  5, 11, 11, 11,  3, 11, 11,
        11, 11, 11, 11,  9, 11, 11, 11, 11, 11,  4, 11, 11, 11, 11,  2,  2, 11,
        11,  3,  2, 11,  7, 11,  4, 11, 11, 11])


In [26]:
# cnn 
class SpeechCommandsCNN(nn.Module):
    """1-D convolutional classifier producing per-class log-probabilities.

    The network is four ``Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d(4)``
    stages, followed by average pooling over the whole remaining length, a
    linear layer and a log-softmax.

    Args:
        n_input: Number of input channels.
        n_output: Number of output classes.
        stride: Stride of the first convolution (kernel size 80).
        n_channel: Channel count of the first two stages; the last two stages
            use twice as many.
    """

    def __init__(self, n_input=1, n_output=12, stride=32, n_channel=32):
        super().__init__()
        wide = 2 * n_channel
        stage_specs = (
            (n_input, n_channel, {"kernel_size": 80, "stride": stride}),
            (n_channel, n_channel, {"kernel_size": 3}),
            (n_channel, wide, {"kernel_size": 3}),
            (wide, wide, {"kernel_size": 3}),
        )
        # Registers conv1/bn1/pool1 ... conv4/bn4/pool4 in that order.
        for idx, (c_in, c_out, conv_kwargs) in enumerate(stage_specs, start=1):
            setattr(self, f"conv{idx}", nn.Conv1d(c_in, c_out, **conv_kwargs))
            setattr(self, f"bn{idx}", nn.BatchNorm1d(c_out))
            setattr(self, f"pool{idx}", nn.MaxPool1d(4))
        self.fc1 = nn.Linear(wide, n_output)

    def forward(self, x):
        """Map ``x`` to log-probabilities, normalised over the last (third) axis."""
        stages = (
            (self.conv1, self.bn1, self.pool1),
            (self.conv2, self.bn2, self.pool2),
            (self.conv3, self.bn3, self.pool3),
            (self.conv4, self.bn4, self.pool4),
        )
        for conv, bn, pool in stages:
            x = pool(F.relu(bn(conv(x))))
        # Average over the whole last axis, then move channels last for the linear layer.
        pooled = F.avg_pool1d(x, x.shape[-1]).permute(0, 2, 1)
        return F.log_softmax(self.fc1(pooled), dim=2)
    
def count_parameters(model):
    """Return the total number of elements in ``model``'s trainable parameters.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Instantiate the CNN with its default arguments and print how many
# trainable parameters it has.
model = SpeechCommandsCNN()
print("number of parameters: ", count_parameters(model))

# training

def train(model, device, train_loader, optimizer, epoch, log_interval,
          progress_bar=None, progress_step=None):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Network to train; called on ``(batch, 1, samples)`` input and
            expected to return ``(batch, 1, n_classes)`` log-probabilities.
        device: Device the batches are moved to.
        train_loader: Iterable of ``(data, target)`` batches, where ``data`` is
            2-D ``(batch, samples)``.
        optimizer: Optimizer stepping ``model``'s parameters.
        epoch: Current epoch number. Not used by the function; kept so
            existing calls remain valid.
        log_interval: The progress-bar postfix is refreshed with the current
            loss every ``log_interval`` batches.
        progress_bar: Object with ``update`` and ``set_postfix_str`` methods.
            Defaults to the notebook-level ``pbar``.
        progress_step: Amount passed to ``progress_bar.update`` per batch.
            Defaults to the notebook-level ``pbar_update``.
    """
    # Existing callers rely on the notebook globals; keep that working.
    if progress_bar is None:
        progress_bar = pbar
    if progress_step is None:
        progress_step = pbar_update

    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.to(device)
        target = target.to(device)
        # (batch, samples) -> (batch, 1, samples): add the channel axis.
        data = data.view(data.shape[0], 1, data.shape[1])

        optimizer.zero_grad()
        output = model(data)

        # Drop only the singleton axis 1. A bare squeeze() would also drop the
        # batch axis when a batch holds a single example.
        loss = F.nll_loss(output.squeeze(1), target)
        loss.backward()
        optimizer.step()

        progress_bar.update(progress_step)
        if batch_idx % log_interval == 0:
            progress_bar.set_postfix_str(f"Loss: {loss.item():.6f}")
def valid(model, device, val_loader, epoch, progress_bar=None, progress_step=None):
    """Evaluate ``model`` on ``val_loader`` and print the accuracy.

    Args:
        model: Network called on ``(batch, 1, samples)`` input and expected to
            return ``(batch, 1, n_classes)`` log-probabilities.
        device: Device the batches are moved to.
        val_loader: DataLoader yielding ``(data, target)`` batches; its
            ``dataset`` length is used as the number of examples.
        epoch: Epoch number shown in the printed summary.
        progress_bar: Object with an ``update`` method. Defaults to the
            notebook-level ``pbar``.
        progress_step: Amount passed to ``progress_bar.update`` per batch.
            Defaults to the notebook-level ``pbar_update``.

    Returns:
        ``(val_loss, accuracy)``: the mean negative log-likelihood per example
        and the fraction of correct predictions. The printed summary ends with
        a tab rather than a newline.
    """
    # Existing callers rely on the notebook globals; keep that working.
    if progress_bar is None:
        progress_bar = pbar
    if progress_step is None:
        progress_step = pbar_update

    val_loss = 0
    correct = 0
    model.eval()
    with torch.no_grad():
        for data, target in val_loader:
            data = data.to(device)
            target = target.to(device)
            # (batch, samples) -> (batch, 1, samples): add the channel axis.
            data = data.view(data.shape[0], 1, data.shape[1])
            # Drop only axis 1 so a single-example batch keeps its batch axis.
            output = model(data).squeeze(1)
            val_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.argmax(dim=1)
            correct += pred.eq(target).sum().item()
            progress_bar.update(progress_step)
    val_loss /= len(val_loader.dataset)
    print(f"Valid Epoch: {epoch}\tAccuracy: {correct}/{len(val_loader.dataset)} ({100. * correct / len(val_loader.dataset):.0f}%)",end='\t')
    return val_loss, correct / len(val_loader.dataset)


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data = data.to(device)
            target = target.to(device)
            data = data.view(data.shape[0], 1, data.shape[1])
            output = model(data)
            test_loss += F.nll_loss(output.squeeze(), target, reduction='sum').item()
            pred = output.argmax(dim=2, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print(f"\nTest Epoch: {epoch}\tAccuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)")
    print()
    



batch_size = 256

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One shuffled loader per split, all with the same batch size.
train_loader, val_loader, test_loader = (
    torch.utils.data.DataLoader(split_dataset, batch_size=batch_size, shuffle=True)
    for split_dataset in (train_dataset, valid_dataset, test_dataset)
)

model = SpeechCommandsCNN().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)
n_epoch, log_interval = 10, 10
# Each train or validation batch advances the bar by this fraction of an epoch.
pbar_update = 1 / (len(train_loader) + len(val_loader))

with tqdm(total=n_epoch) as pbar:
    for epoch in range(1, n_epoch + 1):
        train(model, device, train_loader, optimizer, epoch, log_interval)
        valid(model, device, val_loader, epoch)
# Evaluate once on the test split after the last epoch.
test(model, device, test_loader)

number of parameters:  25420


 10%|█         | 1.0186666666666724/10 [00:08<00:55,  6.13s/it, Loss: 0.832583]   

Valid Epoch: 1	Accuracy: 6597/10102 (65%)	

 20%|██        | 2.0053333333333128/10 [00:15<00:46,  5.82s/it, Loss: 0.771579]

Valid Epoch: 2	Accuracy: 7829/10102 (77%)	

 30%|███       | 3.02133333333337/10 [00:23<00:50,  7.18s/it, Loss: 0.456503]  

Valid Epoch: 3	Accuracy: 7997/10102 (79%)	

 40%|████      | 4.018666666666756/10 [00:32<00:38,  6.38s/it, Loss: 0.436752] 

Valid Epoch: 4	Accuracy: 7437/10102 (74%)	

 50%|█████     | 5.005333333333314/10 [00:39<00:28,  5.70s/it, Loss: 0.388748] 

Valid Epoch: 5	Accuracy: 6921/10102 (69%)	

 60%|██████    | 6.023999999999869/10 [00:46<00:27,  6.95s/it, Loss: 0.271213] 

Valid Epoch: 6	Accuracy: 7812/10102 (77%)	

 70%|███████   | 7.005333333333094/10 [00:54<00:17,  5.99s/it, Loss: 0.433177] 

Valid Epoch: 7	Accuracy: 8277/10102 (82%)	

 80%|████████  | 8.013333333332984/10 [01:01<00:11,  5.76s/it, Loss: 0.406713] 

Valid Epoch: 8	Accuracy: 8099/10102 (80%)	

 90%|█████████ | 9.010666666666207/10 [01:09<00:05,  5.96s/it, Loss: 0.574722]

Valid Epoch: 9	Accuracy: 8741/10102 (87%)	

100%|█████████▉| 9.999999999999432/10 [01:16<00:00,  7.66s/it, Loss: 0.384388]


Valid Epoch: 10	Accuracy: 8738/10102 (86%)	
Test Epoch: 10	Accuracy: 3536/4890 (72%)



In [28]:
#res net
class ResidualBlock(nn.Module):
    """Two length-preserving ``Conv1d`` + ``BatchNorm1d`` layers with a skip connection.

    Both convolutions use kernel size 3 with padding 1 and keep the channel
    count at ``n_channel``, so the output has the same shape as the input.

    Args:
        n_channel: Number of input and output channels.
    """

    def __init__(self, n_channel):
        super().__init__()
        # Registers conv1, bn1, conv2, bn2 in that order.
        for idx in (1, 2):
            setattr(self, f"conv{idx}", nn.Conv1d(n_channel, n_channel, kernel_size=3, padding=1))
            setattr(self, f"bn{idx}", nn.BatchNorm1d(n_channel))

    def forward(self, x):
        """Return ``relu(x + branch(x))`` where the branch is conv-bn-relu-conv-bn."""
        branch = self.conv1(x)
        branch = F.relu(self.bn1(branch))
        branch = self.bn2(self.conv2(branch))
        return F.relu(x + branch)
    
class ResNet(nn.Module):
    """1-D residual network producing per-class log-probabilities.

    A strided ``Conv1d`` stem (kernel 80, stride 4) with batch norm, ReLU and
    ``MaxPool1d(4)`` feeds a stack of ``ResidualBlock`` modules. The result is
    averaged over the whole remaining length and passed through a linear layer
    and a log-softmax.

    Args:
        n_input: Number of input channels.
        n_output: Number of output classes.
        n_channel: Channel count used by the stem and every residual block.
        n_res_block: Number of residual blocks in the stack.
    """

    def __init__(self, n_input=1, n_output=12, n_channel=45, n_res_block=20):
        super().__init__()
        self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=4)
        self.bn1 = nn.BatchNorm1d(n_channel)
        self.pool = nn.MaxPool1d(4)
        blocks = []
        for _ in range(n_res_block):
            blocks.append(ResidualBlock(n_channel))
        self.resblocks = nn.Sequential(*blocks)
        self.fc1 = nn.Linear(n_channel, n_output)

    def forward(self, x):
        """Map ``x`` to log-probabilities, normalised over the last (third) axis."""
        stem = F.relu(self.bn1(self.conv1(x)))
        features = self.resblocks(self.pool(stem))
        # Average over the whole last axis, then move channels last for the linear layer.
        pooled = F.avg_pool1d(features, features.shape[-1]).permute(0, 2, 1)
        return F.log_softmax(self.fc1(pooled), dim=2)

batch_size = 256

# Resolve the device before it is used. The model was previously moved to
# `device` ahead of this assignment, which only worked because an earlier
# cell had already defined it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = ResNet(n_res_block=10).to(device)
print("number of parameters: ", count_parameters(model))

# One shuffled loader per split, all with the same batch size.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

optimizer = optim.Adam(model.parameters(), lr=0.01)
n_epoch = 10
log_interval = 10
# Each train or validation batch advances the bar by this fraction of an epoch.
pbar_update = 1 / (len(train_loader)+len(val_loader))

with tqdm(total=n_epoch) as pbar:
    for epoch in range(1, n_epoch + 1):
        train(model,device,train_loader,optimizer, epoch, log_interval)
        valid(model,device,val_loader,epoch)
# Evaluate once on the test split after the last epoch.
test(model,device,test_loader)


number of parameters:  128487


 10%|█         | 1.0026666666666728/10 [00:27<01:40, 11.17s/it, Loss: 1.244031]   

Valid Epoch: 1	Accuracy: 3473/10102 (34%)	

 20%|██        | 2.0053333333333128/10 [00:54<01:48, 13.63s/it, Loss: 0.976835]

Valid Epoch: 2	Accuracy: 5236/10102 (52%)	

 30%|███       | 3.0026666666667023/10 [01:23<01:24, 12.08s/it, Loss: 0.858800]

Valid Epoch: 3	Accuracy: 1347/10102 (13%)	

 40%|████      | 4.002666666666758/10 [01:53<01:09, 11.67s/it, Loss: 0.532914] 

Valid Epoch: 4	Accuracy: 5658/10102 (56%)	

 50%|█████     | 5.005333333333314/10 [02:21<01:08, 13.79s/it, Loss: 0.480026] 

Valid Epoch: 5	Accuracy: 5984/10102 (59%)	

 60%|██████    | 6.002666666666538/10 [02:48<00:47, 11.83s/it, Loss: 0.642656] 

Valid Epoch: 6	Accuracy: 7606/10102 (75%)	

 70%|███████   | 7.005333333333094/10 [03:16<00:46, 15.55s/it, Loss: 0.625366] 

Valid Epoch: 7	Accuracy: 3180/10102 (31%)	

 80%|████████  | 8.002666666666318/10 [03:47<00:24, 12.51s/it, Loss: 0.625673] 

Valid Epoch: 8	Accuracy: 6041/10102 (60%)	

 90%|█████████ | 9.005333333332874/10 [04:19<00:13, 14.07s/it, Loss: 0.470058]

Valid Epoch: 9	Accuracy: 8584/10102 (85%)	

100%|█████████▉| 9.999999999999432/10 [04:49<00:00, 28.90s/it, Loss: 0.334061]


Valid Epoch: 10	Accuracy: 7805/10102 (77%)	
Test Epoch: 10	Accuracy: 3174/4890 (65%)

