In [1]:
import librosa
import numpy as np
import os
import soundfile

import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.nn import functional as F

from torch.utils.data import TensorDataset, DataLoader
from scipy.special import softmax
from random import uniform
from math import cos, sin, pi

In [2]:
samples = 70000
n_mels = 128
h_features = 168
w_features = 137
num_audio = 13936

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
# device = "cpu"
device

'cuda'

In [4]:
targets = open("./train/targets.tsv","r")
y_dict = {}

for line in targets:
    line = line.split()
    y_dict[line[0]] = int(line[1])

In [77]:
X = np.empty((num_audio, h_features, w_features), dtype=np.float64)
y = np.empty(num_audio, dtype=int)
i = 0

for file in os.listdir("./train"):
    filename = os.fsdecode(file)
    if filename.endswith(".wav"):
        audio, sr = librosa.load(f"./train/{filename}")
        
        #Trim silence
        if len(audio)> 0:
            audio, _ = librosa.effects.trim(audio)

        #Trim if audio length > samples 
        if len(audio) > samples:
            audio = audio[0:0+samples]
        
        #Else pad blanks if shorter 
        else:
            padding = samples - len(audio)
            offset = padding // 2
            audio = np.pad(audio, (offset, samples - len(audio) - offset), "mean")
        
        #Get Mel spectogram of audio
        spectrogram = librosa.feature.melspectrogram(y=audio,
                                                     sr=sr,
                                                     n_mels=n_mels)
        #Convert to log scale (DB)
        spectrogram = librosa.power_to_db(spectrogram)
        # print(spectrogram.shape, end=" ")
        
        #Get MFCC and second derivatives
        mfcc = librosa.feature.mfcc(S=spectrogram)
        
        delta2_mfcc = librosa.feature.delta(mfcc, order=2)
        
        #Append MFCC to spectrogram and flatten
        features = np.concatenate((spectrogram,mfcc,delta2_mfcc),axis=0)
        
        if i % 1000 == 0:
            print(f'Processed {i}')
        
        
        if i == 1024:
            break
        
        
        X[i] = features
        y[i] = y_dict[filename[:-4]]
        i += 1

Processed 0
Processed 1000


In [78]:
X = torch.tensor(X, dtype=torch.float32, device=device)
y = torch.tensor(y, dtype=torch.int64, device=device)
X = X.reshape((num_audio, 1, 168, 137))

In [79]:
X = X[:1024]
y = y[:1024]

In [55]:
X.shape, X.dtype, y.shape, y.dtype

(torch.Size([1024, 1, 168, 137]),
 torch.float32,
 torch.Size([1024]),
 torch.int64)

In [88]:
BATCH_SIZE = 256

train_dataset = TensorDataset(X, y)

train_dataloader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle=True, num_workers=2)

loaders = {"train": train_dataloader}

In [90]:
X, y

(tensor([[[[-3.9163e+01, -3.8709e+01, -3.8502e+01,  ..., -3.2786e+01,
            -3.7995e+01, -3.8318e+01],
           [-4.8700e+01, -5.2733e+01, -5.2704e+01,  ..., -3.4232e+01,
            -4.6748e+01, -5.1451e+01],
           [-5.2726e+01, -5.2733e+01, -5.2733e+01,  ..., -2.9987e+01,
            -4.4160e+01, -5.2733e+01],
           ...,
           [ 2.0176e-02,  2.0176e-02,  2.0176e-02,  ..., -4.7059e-02,
            -4.7059e-02, -4.7059e-02],
           [-7.1335e-02, -7.1335e-02, -7.1335e-02,  ..., -4.0950e-01,
            -4.0950e-01, -4.0950e-01],
           [-8.5954e-02, -8.5954e-02, -8.5954e-02,  ...,  5.5621e-01,
             5.5621e-01,  5.5621e-01]]],
 
 
         [[[-3.1075e+01, -2.2716e+01, -2.5695e+01,  ..., -4.0714e+01,
            -4.5393e+01, -3.8303e+01],
           [-3.2313e+01, -2.0403e+01, -1.7515e+01,  ..., -3.0716e+01,
            -3.3296e+01, -3.0829e+01],
           [-3.5557e+01, -2.4091e+01, -2.1740e+01,  ..., -3.3820e+01,
            -3.5691e+01, -3.3292e+01

In [27]:
class Block(nn.Module):

    def __init__(self, in_channels, out_channels, identity_downsample=None, stride=1):
        super(Block, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()
        self.identity_downsample = identity_downsample

    def forward(self, x):
        identity = x
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)
        if self.identity_downsample is not None:
            identity = self.identity_downsample(identity)
        x += identity
        x = self.relu(x)
        return x

In [28]:
class ResNet_18(nn.Module):
    def __init__(self, image_channels, num_classes):
        super(ResNet_18, self).__init__()
        self.in_channels = 64
        self.conv1 = nn.Conv2d(image_channels, 64, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        #resnet layers
        self.layer1 = self.__make_layer(64, 64, stride=1)
        self.layer2 = self.__make_layer(64, 128, stride=2)
        self.layer3 = self.__make_layer(128, 256, stride=2)
        self.layer4 = self.__make_layer(256, 512, stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def __make_layer(self, in_channels, out_channels, stride):
        identity_downsample = None
        if stride != 1:
            identity_downsample = self.identity_downsample(in_channels, out_channels)

        return nn.Sequential(
            Block(in_channels, out_channels, identity_downsample=identity_downsample, stride=stride),
            Block(out_channels, out_channels)
        )

    def identity_downsample(self, in_channels, out_channels):
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(out_channels)
        )

    def forward(self, x):

        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.shape[0], -1)
        x = self.fc(x)
        return x

In [29]:
train_dataset[0][0].dtype, train_dataset[0][0].shape

(torch.float32, torch.Size([1, 168, 137]))

In [84]:
train_dataloader.dataset[0]

(tensor([[[-3.9163e+01, -3.8709e+01, -3.8502e+01,  ..., -3.2786e+01,
           -3.7995e+01, -3.8318e+01],
          [-4.8700e+01, -5.2733e+01, -5.2704e+01,  ..., -3.4232e+01,
           -4.6748e+01, -5.1451e+01],
          [-5.2726e+01, -5.2733e+01, -5.2733e+01,  ..., -2.9987e+01,
           -4.4160e+01, -5.2733e+01],
          ...,
          [ 2.0176e-02,  2.0176e-02,  2.0176e-02,  ..., -4.7059e-02,
           -4.7059e-02, -4.7059e-02],
          [-7.1335e-02, -7.1335e-02, -7.1335e-02,  ..., -4.0950e-01,
           -4.0950e-01, -4.0950e-01],
          [-8.5954e-02, -8.5954e-02, -8.5954e-02,  ...,  5.5621e-01,
            5.5621e-01,  5.5621e-01]]], device='cuda:0'),
 tensor(1, device='cuda:0'))

In [76]:
model = ResNet_18(1, 2)
batch = []
next(iter(train_dataloader))
# model.forward(batch[0])

[tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           ...,
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.]]],
 
 
         [[[0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           ...,
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.]]],
 
 
         [[[0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           ...,
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.]]],
 
 
         ...,
 
 
         [[[0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           ..

In [31]:
def train_model(model, criterion, optimizer, max_epochs, lr_scheduler = None):
    accuracy = {"train": [], "valid": []}
    print(1, y)
    for epoch in range(max_epochs):
        print(2, y)
        for k, dataloader in loaders.items():
            print(3, y)
            epoch_correct = 0
            epoch_all = 0
            for x_batch, y_batch in dataloader:
                print(4, y)
                # x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                # y_batch = y_batch.type(torch.LongTensor).to(device)
                if k == "train":
                    model.train()
                    outp = model(x_batch)
                    optimizer.zero_grad()
                    loss = criterion(outp, y_batch)
                    loss.backward()
                    optimizer.step()
                else:
                    model.eval()
                    with torch.no_grad():
                        outp = model(x_batch)
                preds = outp.argmax(-1)
                correct = (y_batch == preds).sum().item()
                epoch_correct += correct
                epoch_all += BATCH_SIZE

            if k == "train":
                print(f"Epoch: {epoch+1}")
            print(f"Loader: {k}. Accuracy: {epoch_correct/epoch_all}")
            accuracy[k].append(epoch_correct/epoch_all)
        
        # if k == "train" and lr_scheduler is not None:
        #     lr_scheduler.step(epoch_correct/epoch_all)
            
    return accuracy["valid"]

In [58]:
X_,y_ = X.clone(),y.clone()

In [87]:
X,y = X_.clone(), y_.clone()

In [97]:
for x_batch, y_batch in train_dataloader:
    print(type(x_batch))
    # print(x_batch)

<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>


In [64]:
train_dataset.tensors[0]

tensor([[[[-3.9163e+01, -3.8709e+01, -3.8502e+01,  ..., -3.2786e+01,
           -3.7995e+01, -3.8318e+01],
          [-4.8700e+01, -5.2733e+01, -5.2704e+01,  ..., -3.4232e+01,
           -4.6748e+01, -5.1451e+01],
          [-5.2726e+01, -5.2733e+01, -5.2733e+01,  ..., -2.9987e+01,
           -4.4160e+01, -5.2733e+01],
          ...,
          [ 2.0176e-02,  2.0176e-02,  2.0176e-02,  ..., -4.7059e-02,
           -4.7059e-02, -4.7059e-02],
          [-7.1335e-02, -7.1335e-02, -7.1335e-02,  ..., -4.0950e-01,
           -4.0950e-01, -4.0950e-01],
          [-8.5954e-02, -8.5954e-02, -8.5954e-02,  ...,  5.5621e-01,
            5.5621e-01,  5.5621e-01]]],


        [[[-3.1075e+01, -2.2716e+01, -2.5695e+01,  ..., -4.0714e+01,
           -4.5393e+01, -3.8303e+01],
          [-3.2313e+01, -2.0403e+01, -1.7515e+01,  ..., -3.0716e+01,
           -3.3296e+01, -3.0829e+01],
          [-3.5557e+01, -2.4091e+01, -2.1740e+01,  ..., -3.3820e+01,
           -3.5691e+01, -3.3292e+01],
          ...,
   

In [51]:
X, y

(tensor([[[[-3.9163e+01, -3.8709e+01, -3.8502e+01,  ..., -3.2786e+01,
            -3.7995e+01, -3.8318e+01],
           [-4.8700e+01, -5.2733e+01, -5.2704e+01,  ..., -3.4232e+01,
            -4.6748e+01, -5.1451e+01],
           [-5.2726e+01, -5.2733e+01, -5.2733e+01,  ..., -2.9987e+01,
            -4.4160e+01, -5.2733e+01],
           ...,
           [ 2.0176e-02,  2.0176e-02,  2.0176e-02,  ..., -4.7059e-02,
            -4.7059e-02, -4.7059e-02],
           [-7.1335e-02, -7.1335e-02, -7.1335e-02,  ..., -4.0950e-01,
            -4.0950e-01, -4.0950e-01],
           [-8.5954e-02, -8.5954e-02, -8.5954e-02,  ...,  5.5621e-01,
             5.5621e-01,  5.5621e-01]]],
 
 
         [[[-3.1075e+01, -2.2716e+01, -2.5695e+01,  ..., -4.0714e+01,
            -4.5393e+01, -3.8303e+01],
           [-3.2313e+01, -2.0403e+01, -1.7515e+01,  ..., -3.0716e+01,
            -3.3296e+01, -3.0829e+01],
           [-3.5557e+01, -2.4091e+01, -2.1740e+01,  ..., -3.3820e+01,
            -3.5691e+01, -3.3292e+01

In [18]:
max_epochs = 1

model = ResNet_18(1, 2).to(device)
print(-1,y)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-4)
print(0,y)
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5)
train_model(model, criterion, optimizer, max_epochs)
print(4,y)

-1 tensor([1, 1, 1,  ..., 1, 0, 1], device='cuda:0')
0 tensor([1, 1, 1,  ..., 1, 0, 1], device='cuda:0')
1 tensor([1, 1, 1,  ..., 1, 0, 1], device='cuda:0')
2 tensor([1, 1, 1,  ..., 1, 0, 1], device='cuda:0')
3 tensor([1, 1, 1,  ..., 1, 0, 1], device='cuda:0')
4 tensor([0, 0, 0,  ..., 0, 0, 0], device='cuda:0')
4 tensor([0, 0, 0,  ..., 0, 0, 0], device='cuda:0')
4 tensor([0, 0, 0,  ..., 0, 0, 0], device='cuda:0')
4 tensor([0, 0, 0,  ..., 0, 0, 0], device='cuda:0')
Epoch: 1
Loader: train. Accuracy: 1.0
4 tensor([0, 0, 0,  ..., 0, 0, 0], device='cuda:0')


In [20]:
torch.cuda.empty_cache()
del model
del X
del y

In [15]:
train_model(model, criterion, optimizer, max_epochs)

Epoch: 1
Loader: train. Accuracy: 0.9360085227272728
Epoch: 2
Loader: train. Accuracy: 0.9414772727272728
Epoch: 3
Loader: train. Accuracy: 0.9467329545454546
Epoch: 4
Loader: train. Accuracy: 0.9499289772727273
Epoch: 5
Loader: train. Accuracy: 0.9530539772727272


[]

In [ ]:
train_model(model, criterion, optimizer, 20)

In [72]:
num_audio = 3413

X = np.empty((num_audio, h_features, w_features), dtype=np.float64)
y_name = []
i = 0

for file in os.listdir("./test"):
    filename = os.fsdecode(file)
    if filename.endswith(".wav"):
        audio, sr = librosa.load(f"./test/{filename}")

        #Trim silence
        if len(audio)> 0:
            audio, _ = librosa.effects.trim(audio)

        #Trim if audio length > samples 
        if len(audio) > samples:
            audio = audio[0:0+samples]

        #Else pad blanks if shorter 
        else:
            padding = samples - len(audio)
            offset = padding // 2
            audio = np.pad(audio, (offset, samples - len(audio) - offset), "mean")

        #Get Mel spectogram of audio
        spectrogram = librosa.feature.melspectrogram(y=audio,
                                                     sr=sr,
                                                     n_mels=n_mels)
        #Convert to log scale (DB)
        spectrogram = librosa.power_to_db(spectrogram)
        # print(spectrogram.shape, end=" ")

        #Get MFCC and second derivatives
        mfcc = librosa.feature.mfcc(S=spectrogram)

        delta2_mfcc = librosa.feature.delta(mfcc, order=2)

        #Append MFCC to spectrogram and flatten
        features = np.concatenate((spectrogram,mfcc,delta2_mfcc),axis=0)

        if i % 1000 == 0:
            print(f'Processed {i}')
            print(filename)
        X[i] = features
        y_name.append(filename[:-4])
        # y_name[i] = filename[:-4]
        i += 1

Processed 0
00100026dbdffcd01cde6ee9b9a9d273.wav
Processed 1000
4e7f12b9dd85e72c30894cbefeba6bb6.wav
Processed 2000
9767325a483f8943f729495a2607500e.wav
Processed 3000
e2369bcf621e0e46eecb1e74affbfd92.wav


In [173]:
len(y_name), y.shape

(3413, torch.Size([13936]))

In [73]:
X = torch.tensor(X, dtype=torch.float32)
# y = torch.tensor(y, dtype=torch.int, device=device)
X = X.reshape((num_audio, 1, 168, 137))


In [15]:
test_dataset = TensorDataset(X)

test_dataloader = DataLoader(test_dataset, batch_size = 1, shuffle=False, num_workers=2)

In [75]:
torch.save(model.state_dict(), "./model_safe")

In [14]:
device = torch.device('cpu')
model = ResNet_18(1, 2)
model.load_state_dict(torch.load("./model_safe", map_location=device))

<All keys matched successfully>

In [16]:
y = []
for x_batch in test_dataloader:
    model.eval()
    with torch.no_grad():
        x_batch= x_batch[0].to(device)
        y.append(model(x_batch))
    # print(x_batch.shape)

In [77]:
len(y)

3413

In [17]:
f = open("answer.txt", "w")
for i in range(num_audio):
    f.write(f'{y[i].argmax(-1).item()}\n')
f.close()

In [78]:
f = open("answer.txt", "w")
for i in range(num_audio):
    f.write(f'{y_name[i]} {y[i].argmax(-1).item()}\n')
f.close()