In [1]:
import numpy as np
import sounddevice as sd
import time
from IPython.display import Audio 
from matplotlib import pyplot as plt
import os
import pandas as pd

In [80]:
# https://medium.com/analytics-vidhya/pytorch-for-deep-learning-lstm-for-sequence-data-d0708fdf5717

from torch.utils.data import Dataset
from torchvision import transforms
import torch
import torch.nn as nn
import torch.nn.functional as F 
import torch.optim as optim
from tqdm import tqdm

data_folder = "/Users/will/Documents/COM4511/ass/COM4511/task4VAD/audio"
labs_folder = "/Users/will/Documents/COM4511/ass/COM4511/task4VAD/labels"

training_prefixes = ["N", "V"]
validation_prefixes = ["E"]
testing_prefixes = ["C"]

# reading training data

def read_data_to_list(prefixes):
    data_out = []
    labs_out = []
    os.chdir(data_folder)
    for file in os.listdir():
        if file[0] in prefixes:
            path = f"{data_folder}/{file}"
            with open(path, 'rb') as f:
                data_out.append(np.load(f))
    
    os.chdir(labs_folder)
    for file in os.listdir():
        if file[0] in prefixes:
            path = f"{labs_folder}/{file}"
            with open(path, 'rb') as f:
                labs_out.append(np.load(f))
                
    return data_out, labs_out
    
    
training_data, training_labs =  read_data_to_list(training_prefixes)

testing_data, testing_labs =  read_data_to_list(testing_prefixes)
validation_data, validation_labs =  read_data_to_list(validation_prefixes)


class Timeseries(Dataset):
    def __init__(self, x, y, seq_len):
        self.x = torch.from_numpy(x)
        self.y = torch.from_numpy(y)
        self.len = x.shape[0]
        self.seq_len = seq_len
    
    # def __getitem__(self, idx):
    #     return self.transform(self.x[idx]), self.transform(self.y[idx])

    def __getitem__(self, idx):
        return self.x[idx:idx+self.seq_len], torch.tensor(self.y[idx:idx+self.seq_len], dtype=torch.float32)


    def __len__(self):
        return self.len - (self.seq_len - 1)

sequence_length = 2048

train_datasets = [Timeseries(x, y, seq_len=sequence_length) for (x, y) in zip(training_data, training_labs)]

from torch.utils.data import DataLoader

train_loader = DataLoader(torch.utils.data.ConcatDataset(train_datasets), shuffle=True, batch_size=512) 

train_loader

<torch.utils.data.dataloader.DataLoader at 0x12d561b20>

In [81]:


torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


cpu


In [86]:
# lstm = nn.LSTM(13, 1) # input MFCCs are 13 dim, output is 1 dim as it is binary classification
# hidden = (torch.randn(1, 1, 13),
#           torch.randn(1, 1, 1))



class LSTM_network(nn.Module):
    def __init__(self):
        super(LSTM_network,self).__init__()
        self.lstm = nn.LSTM(input_size=13,hidden_size=5,num_layers=1,batch_first=True)
        self.fc1 = nn.Linear(in_features=5,out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self,x):
        output,_status = self.lstm(x)
        output = self.fc1(torch.relu(output))
        return self.sigmoid(output)

model = LSTM_network()

In [92]:
crit = nn.BCELoss() # binary classfication task, BCEL is obvious choice.
optimiser = optim.Adam(model.parameters(), lr=0.0005) # https://deepdatascience.wordpress.com/2016/11/18/which-lstm-optimizer-to-use/
epochs = 1

In [93]:
# training løøp
for e in range(epochs):
    data_shape = 1048576
    for i, data in tqdm(enumerate(train_loader)):
        
        
        y_pred = model(data[0][:]).reshape(-1, 1)
        # .reshape(data_shape)
        loss = crit(y_pred, data[:][1].reshape(-1, 1))
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()
        if i % 100 == 0:
            print(f"{i}th data sample, loss: {loss}")
    print(f"{e}th epoch, loss: {loss}")

  return self.x[idx:idx+self.seq_len], torch.tensor(self.y[idx:idx+self.seq_len], dtype=torch.float32)
1it [00:02,  2.07s/it]

0th data sample, loss: 1.0024621486663818


101it [02:25,  1.25s/it]

100th data sample, loss: 0.8601197004318237


201it [04:42,  1.24s/it]

200th data sample, loss: 0.6748944520950317


301it [07:07,  1.72s/it]

300th data sample, loss: 0.5558712482452393


382it [09:17,  1.46s/it]


KeyboardInterrupt: 