In [1]:
import os
import os.path
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import numpy as np
import librosa
import random
import warnings

warnings.filterwarnings(action='ignore')

from torch.utils.data import Dataset
from tqdm import tqdm
from functools import lru_cache
#from torchvision.datasets.utils import makedir_exist_ok

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(device)

random.seed(777)
torch.manual_seed(777)

if device == 'cuda':
    torch.cuda.manual_seed_all(777)

cuda


In [3]:
learning_rate = 0.0001
training_epochs = 100
batch_size = 4

In [4]:
loss = nn.CrossEntropyLoss().to(device)

input = torch.randn(3, 5, requires_grad=True)

print(input.size(), input)


# target = torch.empty(3, dtype=torch.long).random_(5)
target = torch.empty(3, dtype=torch.long).random_(5)
target_onehot = torch.nn.functional.one_hot(target, num_classes=5).float()

print(target.size(), target)
print(target_onehot.size(), target_onehot)

output = loss(input, target)
output.backward()

print("#"*10)

loss_bce = nn.BCEWithLogitsLoss().to(device)
output = loss_bce(input, target_onehot)
output.backward()

torch.Size([3, 5]) tensor([[-0.4015, -0.5934,  1.6885,  0.5554, -0.9433],
        [-0.6758,  1.4425, -1.4711, -1.3173,  2.3271],
        [ 0.0894,  0.6558, -0.5836, -0.1074,  0.0367]], requires_grad=True)
torch.Size([3]) tensor([3, 0, 0])
torch.Size([3, 5]) tensor([[0., 0., 0., 1., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.]])
##########


### 1. 데이터 준비

#### (1) Dataset

"""
        BS = 3개
        
        [
          (1_sound_feature, 1_target),   ==> ([1_Seq_len, 40], 3)
          (2_sound_feature, 2_target)    ==> ([2_Seq_len, 40], 4)
          (3_sound_feature, 3_target)    ==> ([3_Seq_len, 40], 1)
        ]
        
        Batched Sample
        batched_X = [
                      [[40], [40], [40], [40], [40]]
                      [[40], [40], [??], [??], [??]] # ?? 일반적으로 zero padding  (제로 패딩)
                      [[40], [??], [??], [??], [??]]
                    ]
                    
        shape = (Batch_size=3, Seq_len = [5, 2, 1], Feat_dim=40)  # 길이 고정: max 길이로 고정함
        
        batched_Y = [3, 4, 1]        
        
        """

        [(s1_f, s1_t), (s2_f, s2_t), (s3_f, s3_t), ...., (s16_f, s16_t)]
        
        [
            [1, 2, 3, 4, ,...                 10] 
            [2, 3, 4, , [40], ,
            [5, 2]
        ]
        
        ==> 
            
        오디오 spectrogram에서 제로의 숫자는 무슨 의미일까요?
        제로 패딩의 방법으로 하는게 적절한가? 
            
        제로(임의 단어) 패딩 자연어 처리 
        
            
        간단히 제로 패딩 해결하는 방법 ==> BS= 1
        BS=1    ???
        BS=16  ??? 
        (1) 결과물이 다르다 (2) 학습 속도가 다르다        
         결과물의 차이가 큰 차이가 없으면 
            
        [
            [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            [2, 3, 4, [0,..., 0], 0 ,0 ,0 ,0 ,0 ,0 ],
            [5, 3, 0, 0 ,                   ]
        
        ]
        
        입력 문장 
        [
            [w1, w2, w3],
            [w2]
        ]
        
        ==> 
         [
            [1, 2,      3, </s>],
            [2, </s>, 0 , 0    ]
        ]

In [5]:
class UrbanSound8K(Dataset):

    classes = [
                (0, 'air_conditioner'),
                (1, 'car_horn'),
                (2, 'children_playing'),
                (3, 'dog_bark'),
                (4, 'drilling'),
                (5, 'engine_idling'),
                (6, 'gun_shot'),
                (7, 'jackhammer'),
                (8, 'siren'),
                (9, 'street_music')
            ]
    
    def __init__(self, root, train=True, fold_list=None):
        
        super(UrbanSound8K, self).__init__()
        
        self.root = root
        self.train = train
        
        self.metadata = self._read_metadata(fold_list)

    @lru_cache(maxsize=100000)
    def __getitem__(self, index):
        file_name = self.metadata['file_names'][index]
        label = self.metadata['labels'][index]
        folder = self.metadata['folders'][index]
        
        wav_file_path = os.path.join(self.audio_folder,
                                     "fold{}".format(folder),
                                     file_name)
            
        sound, sample_rate= librosa.load(wav_file_path, sr=16000, mono=True, res_type='kaiser_fast')
        melspec = librosa.feature.melspectrogram(y=sound, sr=sample_rate, n_mels=40)
        
        log_melspec = librosa.power_to_db(melspec, ref=np.max)
        
        # print("before : ", log_melspec.shape)
        
        log_melspec = log_melspec.T
        # log_melspec = log_melspec.transpose()
        # print("after : ", log_melspec.shape)       
        # log_melspec = np.mean(log_melspec.T, axis=0)
        
        sound_feature = torch.FloatTensor(log_melspec)
        target = label

        return sound_feature, target # 
    
    def __len__(self):
        return len(self.metadata['file_names']) # [:]
    
    @property
    def audio_folder(self):
        return os.path.join(self.root, self.__class__.__name__, 'audio')
    
    @property
    def metadata_folder(self):
        return os.path.join(self.root, self.__class__.__name__, 'metadata')
    
    @property
    def class_to_idx(self):
        return {_class: i for i, (cid, _class) in enumerate(self.classes)}
    
    @property
    def idx_to_class(self):
        return {i: _class for i, (cid, _class) in enumerate(self.classes)}
    
    def _read_metadata(self, fold_list):
        csv_path = os.path.join(self.metadata_folder, self.__class__.__name__+".csv")
        
        print(csv_path)
        
        metadata = {}
        
        file_names = []
        labels = []
        folders = []
        
        if os.path.exists(csv_path):
            csvData = pd.read_csv(csv_path)
            # print(csvData)
            
            for i in range(0,len(csvData)):
                if csvData.iloc[i, 5] in fold_list:
                    file_names.append(csvData.iloc[i, 0])
                    labels.append(csvData.iloc[i, 6])
                    folders.append(csvData.iloc[i, 5])
            
            metadata['file_names'] = file_names
            metadata['labels'] = labels
            metadata['folders'] = folders
            
        else:
            raise RuntimeError('Metadata(csv format) not found.')
        
        return metadata

In [6]:
def audio_collate_fn(batch):
    # print(batch)
    
    # batch = [(x1, y1), (x2, y2), (x3, y3), (x4, y4)]
    # print("test: ", batch[0][0].size(), batch[0][1])
    
    batch_X = [x for x, y in batch]  # ==> [x1, x2, x3, x4]
    
#     print("="*20)
#     for idx, x in enumerate(batch_X):
#         print(idx, x.size())
#     print("="*20)
    
    padded_batch_X = torch.nn.utils.rnn.pad_sequence(batch_X, batch_first=True, padding_value=0.0)
    # print("padded_batch_X: ", padded_batch_X.size(), padded_batch_X)

    batch_Y = [y for x, y in batch]
    # print("batch_Y: ", batch_Y)
    # padded_batch_Y = torch.nn.utils.rnn.pad_sequence(batch_Y, batch_first=True)
    # print("padded_Y: {}".format(padded_Y))
    
    return padded_batch_X, torch.LongTensor(batch_Y)

In [7]:
start = time.time()
urbansound8k_train = UrbanSound8K(root='./data',
                                  train=True,
                                  fold_list=[1, 2, 3, 4, 5, 6, 7, 8, 9])
end = time.time()
print("Prepare (Train Data Set): %.4f (sec.)" % (end-start))

start = time.time()
urbansound8k_test = UrbanSound8K(root='./data',
                                 train=False,
                                 fold_list=[10])
end = time.time()
print("Prepare (Test Data Set): %.4f (sec.)" % (end-start))

./data\UrbanSound8K\metadata\UrbanSound8K.csv
Prepare (Train Data Set): 0.5726 (sec.)
./data\UrbanSound8K\metadata\UrbanSound8K.csv
Prepare (Test Data Set): 0.1960 (sec.)


#### (2) DataLoader

In [8]:
train_loader = torch.utils.data.DataLoader(dataset=urbansound8k_train,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           collate_fn=audio_collate_fn,
                                           drop_last=False)

test_loader = torch.utils.data.DataLoader(dataset=urbansound8k_test,
                                           batch_size=batch_size,
                                           shuffle=False,
                                           collate_fn=audio_collate_fn,
                                           drop_last=False)

In [9]:
for X, Y in train_loader:
    print(X.size(), Y.size(), Y)
    break
    
print(len(train_loader))

torch.Size([4, 126, 40]) torch.Size([4]) tensor([4, 0, 5, 0])
1974


### 2. 모델 (torch.nn.Module)

In [10]:
class UrbanSoundRNN(torch.nn.Module):
    def __init__(self, feat_dim, hidden_dim, num_class):
        super().__init__()

        self.rnn = torch.nn.LSTM(input_size=feat_dim,
                                 hidden_size=hidden_dim,
                                 num_layers=2,
                                 batch_first=True, # batch_first=False 
                                 dropout=0.2,
                                 bidirectional=False)
            
        rnn_output_dim = 2*hidden_dim if self.rnn.bidirectional else hidden_dim
        
        self.fc = torch.nn.Linear(rnn_output_dim, num_class)

    def forward(self, x):
        output, _ = self.rnn(x)
        output = self.fc(output)
        
        # print(output.size())
        # print(output[:, -1, :].size())
        return output[:, -1, :]
    
model = UrbanSoundRNN(feat_dim=40, hidden_dim=64, num_class=10).to(device)

In [11]:
print(model)

UrbanSoundRNN(
  (rnn): LSTM(40, 64, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=64, out_features=10, bias=True)
)


In [12]:
criterion = torch.nn.CrossEntropyLoss().to(device)    # Softmax is internally computed.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.01)

### 3. 학습

        Y = [2, 3, 4, ..., 9]  # (0 ~ 9)
        
        
        _Y = [
                [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],                # (내부에서) one-hot encoding 바꾸자 
                [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
            
              ] # (BS, 10)
        
        hypotheis_shape = (BS, 10)
        
        hypothesis = [
                          [0,      0,   1,   0, 0, 0, 0, 0, 0, 0]
                        [-1.2, 2.0, 8.0, 4.1, 2.2, -1.2, 2.0, 3.0, 4.1, 2.2],
        ]
        
    
          hyp.argmax(1) = [2, 4, .... , 9]
        
        cost = 모든 모든 샘플에 대해서,
        
                [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
        
        logit = [-1.2, 2.0, 8.0, 4.1, 2.2, -1.2, 2.0, 3.0, 4.1, 2.2]
        
        y_hyp = softmax(logit)
              = [0.0, 0.1, 0.3, 0.25, .....]
        
        
        cost = - sum( y*log(y_hyp)) => 0*log(0.0) + 0*log(0.1) + 1*log(0.9) + 0* 
               
        [1, 2, 3, 4] ==> 
        
        [
            [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
            []
            []
            []
        ]

In [13]:
# training_epochs = 2

total_batch = len(train_loader)
# print(total_batch)

start_time = time.time()
print('Learning started.')

for epoch in range(training_epochs):
    train_loss = 0.
    train_acc = 0.
   
    test_loss = 0.
    test_acc = 0.
    
    model.train()
    
    for i, (X, Y) in enumerate(train_loader):
        
        # print(X.size(), Y.size(), type(Y))
        # break
        
        X = X.to(device)
        Y = Y.to(device)

        optimizer.zero_grad()
        hypothesis = model(X)
#         print("HYP: ", hypothesis.size())
#         print("Y  : ", Y.size())
#         input()
        # break
        
        loss = criterion(hypothesis, Y)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        train_acc += (hypothesis.argmax(1) == Y).sum().item()
        
        if epoch == 0:
            secs = int(time.time() - start_time)
            et = int((len(train_loader) - i) * (secs/(i+1)))
            print("(train) : {:4d}/{:4d} # estimated end time: {:10d} (sec.)".format(i+1, len(train_loader), et), end="\r", flush=True)
        
    train_loss = train_loss / (len(train_loader)* batch_size)
    train_acc  = train_acc / (len(train_loader)* batch_size)
    
    start_time_eval = time.time()
    model.eval()
    
    for i, (X, Y) in enumerate(test_loader):
        X = X.to(device)
        Y = Y.to(device)

        with torch.no_grad():
            pred = model(X)
            loss = criterion(pred, Y)

            test_loss += loss.item()
            test_acc += (pred.argmax(1) == Y).sum().item()
        
        if epoch == 0:
            secs = int(time.time() - start_time_eval)
            et = int((len(test_loader) - i) * (secs/(i+1)))
            print("(test) : {:4d}/{:4d} # estimated end time: {:10d} (sec.)".format(i+1, len(test_loader), et), end="\r", flush=True)
            
    test_loss = test_loss / (len(test_loader)* batch_size)
    test_acc  = test_acc / (len(test_loader)* batch_size)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | ", "time in %d minutes, %d seconds" %(mins, secs))
    print('Epoch: %d' %(epoch + 1), " | ", f'(train)\tLoss: {train_loss:.8f}\t|\tAcc: {train_acc * 100:.2f}%')
    print('Epoch: %d' %(epoch + 1), " | ", f'(test)\tLoss: {test_loss:.8f}\t|\tAcc: {test_acc * 100:.2f}%')
    print()
    
print('Learning finished')

Learning started.
Epoch: 1  |  time in 5 minutes, 6 seconds         0 (sec.))
Epoch: 1  |  (train)	Loss: 0.49861006	|	Acc: 27.33%
Epoch: 1  |  (test)	Loss: 0.44593228	|	Acc: 35.60%

Epoch: 2  |  time in 5 minutes, 23 seconds
Epoch: 2  |  (train)	Loss: 0.41625429	|	Acc: 40.06%
Epoch: 2  |  (test)	Loss: 0.41355031	|	Acc: 41.07%

Epoch: 3  |  time in 5 minutes, 41 seconds
Epoch: 3  |  (train)	Loss: 0.38505458	|	Acc: 42.77%
Epoch: 3  |  (test)	Loss: 0.41201097	|	Acc: 41.19%

Epoch: 4  |  time in 5 minutes, 57 seconds
Epoch: 4  |  (train)	Loss: 0.36820044	|	Acc: 45.42%
Epoch: 4  |  (test)	Loss: 0.38417386	|	Acc: 44.05%

Epoch: 5  |  time in 6 minutes, 14 seconds
Epoch: 5  |  (train)	Loss: 0.35875480	|	Acc: 46.67%
Epoch: 5  |  (test)	Loss: 0.49197984	|	Acc: 29.76%

Epoch: 6  |  time in 6 minutes, 30 seconds
Epoch: 6  |  (train)	Loss: 0.35450659	|	Acc: 47.92%
Epoch: 6  |  (test)	Loss: 0.40113971	|	Acc: 42.26%

Epoch: 7  |  time in 6 minutes, 46 seconds
Epoch: 7  |  (train)	Loss: 0.34594213	|	

Epoch: 56  |  time in 20 minutes, 49 seconds
Epoch: 56  |  (train)	Loss: 0.25218128	|	Acc: 65.67%
Epoch: 56  |  (test)	Loss: 0.34859731	|	Acc: 53.93%

Epoch: 57  |  time in 21 minutes, 7 seconds
Epoch: 57  |  (train)	Loss: 0.24290009	|	Acc: 67.22%
Epoch: 57  |  (test)	Loss: 0.37768529	|	Acc: 47.02%

Epoch: 58  |  time in 21 minutes, 24 seconds
Epoch: 58  |  (train)	Loss: 0.25812467	|	Acc: 64.39%
Epoch: 58  |  (test)	Loss: 0.34111225	|	Acc: 56.43%

Epoch: 59  |  time in 21 minutes, 42 seconds
Epoch: 59  |  (train)	Loss: 0.24029488	|	Acc: 66.88%
Epoch: 59  |  (test)	Loss: 0.35685009	|	Acc: 51.43%

Epoch: 60  |  time in 21 minutes, 59 seconds
Epoch: 60  |  (train)	Loss: 0.24722873	|	Acc: 65.79%
Epoch: 60  |  (test)	Loss: 0.35023278	|	Acc: 53.45%

Epoch: 61  |  time in 22 minutes, 17 seconds
Epoch: 61  |  (train)	Loss: 0.23513603	|	Acc: 67.77%
Epoch: 61  |  (test)	Loss: 0.33783776	|	Acc: 57.86%

Epoch: 62  |  time in 22 minutes, 34 seconds
Epoch: 62  |  (train)	Loss: 0.23423728	|	Acc: 67.9

### 4. 예측

In [14]:
r = random.randint(0, len(urbansound8k_test) - 1)

print('Random One: ', r)

X, Y = urbansound8k_test[r]
print(X.size())

with torch.no_grad():
    seq_len, feat_dim = X.size()
    X = X.view(-1, seq_len, feat_dim).to(device)
    
    pred = model(X)
    
    print(pred)
    print(Y, pred.argmax(1).item())
    
    correct_label = urbansound8k_test.idx_to_class[Y]
    pred_label = urbansound8k_test.idx_to_class[pred.argmax(1).item()]
    print(correct_label, pred_label)
    
    
    file_name = urbansound8k_test.metadata['file_names'][r]
    label = urbansound8k_test.metadata['labels'][r]
    folder = urbansound8k_test.metadata['folders'][r]
        
    wav_file_path = os.path.join(urbansound8k_test.audio_folder,
                                 "fold{}".format(folder),
                                 file_name)
    print(wav_file_path)

Random One:  234
torch.Size([126, 40])
tensor([[ 3.6089, -2.1406,  0.7250, -1.0899,  1.3096,  1.3269, -2.5937,  0.2028,
         -1.0116, -0.3977]], device='cuda:0')
7 0
jackhammer air_conditioner
./data\UrbanSound8K\audio\fold10\162134-7-10-3.wav


In [15]:
print(wav_file_path)

import IPython.display as ipd

ipd.Audio(wav_file_path)

./data\UrbanSound8K\audio\fold10\162134-7-10-3.wav
