In [1]:
import os
os.chdir('..')

In [2]:
import librosa
import torch
import torch.nn as nn
import torch.functional as F 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader

In [3]:
UrbarnSound8k_path = r"dataset\UrbanSound8K"

In [4]:
df = pd.read_csv(r"dataset\UrbanSound8K\metadata\UrbanSound8K.csv")
df.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [5]:
classes = {
    0: 'air_conditioner',
    1: 'car_horn',
    2: 'children_playing',
    3: 'dog_bark',
    4: 'drilling',
    5: 'engine_idling',
    6: 'gun_shot',
    7: 'jackhammer',
    8: 'siren',
    9: 'street_music'
}

In [6]:
def features_extract(file):
    audio, sample_rate = librosa.load(file, res_type='kaiser_fast')
    feature = librosa.feature.mfcc(y=audio,sr=sample_rate,n_mfcc=50)
    scaled_feature = np.mean(feature.T,axis=0)
    return scaled_feature

In [7]:
feature_parsing = []
for idx in tqdm(range(len(df))):
    file_name = df.iloc[idx]['slice_file_name']
    fold_number = df.iloc[idx]['fold']
    fold = 'fold' + str(fold_number) 
    file_path = os.path.join(UrbarnSound8k_path, 'audio', fold, file_name)
    feature_parsing.append(features_extract(file_path))

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
100%|██████████| 8732/8732 [05:22<00:00, 27.12it/s]


In [8]:
df['feature'] = feature_parsing

In [9]:
class AudioDataLoader():
    def __init__(self, data_set):
        self.data = data_set
        self.data_len = len(self.data)
    def __len__(self):
        return self.data_len
    def __getitem__(self, idx):
        feature = torch.tensor(self.data.iloc[idx]['feature'])
        label = torch.tensor(self.data.iloc[idx]['classID'])
        label = label.long()
        return feature, label

In [10]:
def get_dataloader_fold(valid_fold):
    train_df = df[df['fold'] != valid_fold]
    valid_df = df[df['fold'] == valid_fold]
    train_loader = AudioDataLoader(train_df)
    train_loader = DataLoader(train_loader, batch_size=32)
    val_loader = AudioDataLoader(valid_df)
    val_loader = DataLoader(val_loader, batch_size=32)
    return train_loader, val_loader

In [11]:
class DenseLayer(nn.Module):
    def __init__(self, in_features, out_features, drop_rate=0.2):
        super(DenseLayer, self).__init__()
        self.fc = nn.Linear(in_features, out_features)
        self.dropout = nn.Dropout(drop_rate)
        self.relu = nn.ReLU()
    def forward(self, x):
        out = self.fc(x)
        out = self.relu(out)
        out = self.dropout(out)
        return out

class SimpleAudioClassifier(nn.Module):
    def __init__(self, num_feature, num_labels):
        super(SimpleAudioClassifier, self).__init__()
        self.num_labels = num_labels
        self.fc1 = DenseLayer(num_feature, 128)
        self.fc2 = DenseLayer(128, 256)
        self.fc3 = DenseLayer(256, 512)
        self.fc4 = DenseLayer(512, 512)
        self.fc5 = DenseLayer(512, 256)
        self.fc6 = DenseLayer(256, 128)
        self.fc7 = nn.Linear(128, num_labels)
    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        x = self.fc4(x)
        x = self.fc5(x)
        x = self.fc6(x)
        x = self.fc7(x)
        return x

In [12]:
def init_weight(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        torch.nn.init.constant_(m.bias, 0)

In [13]:
def cal_accuracy(pred, label):
    pred_label = torch.argmax(pred, 1)
    correct = (pred_label == label).sum().item()
    return correct / len(label)

In [14]:
def train(model, data_loader, optimizer, criterion):
    for idx, (feature, label) in enumerate(data_loader):
        feature = feature.to('cuda')
        label = label.to('cuda')
        optimizer.zero_grad()
        output = model(feature)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        acc = cal_accuracy(output, label)
    return acc
def eval(model, data_loader, optimizer, criterion):
    for idx, (feature, label) in enumerate(data_loader):
        feature = feature.to('cuda')
        label = label.to('cuda')
        output = model(feature)
        loss = criterion(output, label)
        acc = cal_accuracy(output, label)
    return acc

In [15]:
def fold_cycle(valid_fold):
    print('Train and validate on fold {}'.format(valid_fold))
    train_loader, valid_loader = get_dataloader_fold(valid_fold)
    model = SimpleAudioClassifier(num_feature=50, num_labels=10).to('cuda')
    model.apply(init_weight)
    # print numbers of parameters
    params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('Number of parameters: {}'.format(params))
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    criterion = nn.CrossEntropyLoss()
    count = 0
    for epoch in tqdm(range(50)):
        train_acc = train(model, train_loader, optimizer, criterion)
        if train_acc >=0.9:
            count +=1
        else:
            count = 0
        if count == 3:
            break
    eval_acc = eval(model, valid_loader, optimizer, criterion)
    print(eval_acc)
    return eval_acc

In [16]:
eval_total_acc = []
for i in range(1, 11):
    eval_total_acc.append(fold_cycle(i))
    print('\n')


Train and validate on fold 1
Number of parameters: 599306


100%|██████████| 50/50 [02:09<00:00,  2.59s/it]


0.6666666666666666


Train and validate on fold 2
Number of parameters: 599306


100%|██████████| 50/50 [02:11<00:00,  2.64s/it]


0.7083333333333334


Train and validate on fold 3
Number of parameters: 599306


100%|██████████| 50/50 [02:10<00:00,  2.60s/it]


0.3793103448275862


Train and validate on fold 4
Number of parameters: 599306


100%|██████████| 50/50 [02:03<00:00,  2.48s/it]


0.43333333333333335


Train and validate on fold 5
Number of parameters: 599306


 94%|█████████▍| 47/50 [01:59<00:07,  2.55s/it]


0.375


Train and validate on fold 6
Number of parameters: 599306


100%|██████████| 50/50 [02:06<00:00,  2.53s/it]


0.5652173913043478


Train and validate on fold 7
Number of parameters: 599306


100%|██████████| 50/50 [02:04<00:00,  2.50s/it]


0.3333333333333333


Train and validate on fold 8
Number of parameters: 599306


100%|██████████| 50/50 [02:08<00:00,  2.56s/it]


0.6666666666666666


Train and validate on fold 9
Number of parameters: 599306


 92%|█████████▏| 46/50 [02:00<00:10,  2.62s/it]


0.4375


Train and validate on fold 10
Number of parameters: 599306


100%|██████████| 50/50 [02:07<00:00,  2.56s/it]

1.0







In [20]:
m = 0.6666666666666666 + 0.7083333333333334 + 0.3793103448275862 + 0.43333333333333335 + 0.375 + 0.5652173913043478 + 0.3333333333333333 + 0.6666666666666666 + 0.4375 + 1.0
m/10


0.5565361069465268