# Install library

In [2]:
# !pip install scipy
# !pip install spafe

# Import

In [3]:
import librosa 
import librosa.display as dsp
from IPython.display import Audio

import scipy.io.wavfile
from spafe.features.gfcc import gfcc

import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import time
import random

from sklearn.metrics import classification_report
import torch.nn.functional as F
from sklearn.metrics import log_loss
from sklearn import metrics
import math

import torch
from torch.utils.data import DataLoader, Dataset
import torchvision.datasets as datasets  
import torchvision.transforms as transforms
from tqdm.auto import tqdm
import torch.nn as nn


import torch.optim as optim  


# GPU Setting

In [4]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') #GPU 할당
print(device)

cuda


# Make Log/Model Folder

In [5]:
os.makedirs('./audio_logs', exist_ok=True)
os.makedirs('./audio_model', exist_ok=True)

# Fixed RandomSeed

In [6]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(42)

# Hyperparameter Setting
## USER can change options through this line

In [7]:
sr = 16000 #USER OPTION : 8000 / 16000
f_method = 'mfcc' #USER OPTION : mfcc / mel / logmel / gtcc

print(f_method, sr)

mfcc 16000


In [8]:
#this is class configuration
#you can change hyperparameters here
class CFG:    
    sample_rate = sr #set the sampling rate
    num_epochs = 30 #set the epochs
    batch_size = 10 #set the batch size
    learning_rate = 0.001 #set the learning rate
    kernel_name = f'{f_method}_{sr}' #set kernel name for pth
    DEBUG = False #choose debug mode

# Customdataset

The f_method & sr value selected in the above cell determines the feature extraction method & sampling rate.

In [9]:
class CustomDataset(Dataset):
    def __init__(self, file_path, f_method, train_mode=True, transforms=None): 
        self.csv = pd.read_csv(file_path)
        self.train_mode = train_mode
        self.transforms = transforms        

    def __len__(self):
        if CFG.DEBUG:
            return 10
        else:
            return self.csv.shape[0]     
    
    def __getitem__(self, index): 
        file_name = self.csv.iloc[index].file_name 
        audio, sr = librosa.load(f'../../dataset/audio/{file_name}', CFG.sample_rate)
        
        if f_method == 'mfcc':
            audio = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)  
        
        if f_method == 'mel':
            audio = librosa.feature.melspectrogram(audio, sr)
            
        if f_method == 'log_mel':
            audio = librosa.feature.melspectrogram(audio, sr)
            audio = librosa.power_to_db(audio)
       
        if f_method == 'gtcc':
            fs, sig = scipy.io.wavfile.read(f'../../dataset/audio/{file_name}')
            audio = gfcc(sig, num_ceps=200, nfilts=200, fs = CFG.sample_rate)
                    
        audio = torch.tensor(audio).float()
        audio = audio.reshape(audio.shape[0], audio.shape[1], 1)  
        
        label = self.csv.iloc[index].label 
        
        if self.transforms is not None:  
            audio = self.transforms(audio)

        return audio, label  

## Dataloader 

In [10]:
train_dataset = CustomDataset('../../dataset/train.csv', f_method, train_mode = True, transforms = None) 
train_loader = DataLoader(train_dataset, batch_size = CFG.batch_size, shuffle=True, num_workers=0)

valid_dataset = CustomDataset('../../dataset/valid.csv', f_method, train_mode = False, transforms = None)
valid_loader = DataLoader(valid_dataset, batch_size = CFG.batch_size, shuffle=False, num_workers=0)

In [11]:
train_batches = len(train_loader)
valid_batches = len(valid_loader)

print('total train batches :', train_batches)
print('total valid batches :', valid_batches)

total train batches : 230
total valid batches : 77


# Model Define

In [17]:
class CNNclassification(torch.nn.Module):
    def __init__(self):
        super(CNNclassification, self).__init__()
        self.layer1 = torch.nn.Sequential(
            nn.Conv2d(40, 10, kernel_size=2, stride=1, padding=1), #cnn layer
            nn.ReLU(), #activation function
            nn.MaxPool2d(kernel_size=2, stride=2)) #pooling layer
        
        self.layer2 = torch.nn.Sequential(
            nn.Conv2d(10, 100, kernel_size=2, stride=1, padding=1), #cnn layer
            nn.ReLU(), #activation function
            nn.MaxPool2d(kernel_size=2, stride=2)) #pooling layer
        
        self.layer3 = torch.nn.Sequential(
            nn.Conv2d(100, 200, kernel_size=2, stride=1, padding=1), #cnn layer
            nn.ReLU(), #activation function
            nn.MaxPool2d(kernel_size=2, stride=2)) #pooling layer 
        
        self.fc_layer = nn.Sequential( 
            nn.Linear(2400, 2) #fully connected layer(ouput layer)
           
        )    
        
    def forward(self, x):

        x = self.layer1(x) 
        
        x = self.layer2(x) 
        
        x = self.layer3(x)
        
        x = torch.flatten(x, start_dim=1) 

        out = self.fc_layer(x)
        return out

# Train/Validataion

In [18]:
model = CNNclassification().to(device)
criterion = torch.nn.CrossEntropyLoss().to(device) 
optimizer = optim.Adam(model.parameters(), lr=CFG.learning_rate)
scheduler = None 

In [19]:
from tqdm.auto import tqdm
def train(model, optimizer, train_loader, scheduler, device): 
    model.to(device)
    n = len(train_loader)
    best_loss = np.inf
    
    for epoch in range(1,CFG.num_epochs):  
        model.train()
        running_loss = 0.0
        
        
        for wav, label in tqdm(iter(train_loader)):
            wav, label = wav.to(device), label.to(device)
            optimizer.zero_grad() 
        
            logit = model(wav)
            loss = criterion(logit, label) 
            
            loss.backward() 
            optimizer.step()  
            running_loss += loss.item()
        print(f'-----  Epoch {epoch} -----')     
        print('Train loss: %.10f' %(running_loss / len(train_loader)))
        
        if scheduler is not None:
            scheduler.step()
            
        model.eval()  
        valid_loss = 0.0
        correct = 0

        real_list = []
        pred_list = []
       
        with torch.no_grad(): 
            for wav, label in tqdm(iter(valid_loader)):
                
                wav, label = wav.to(device), label.to(device)
                logit = model(wav)
                valid_loss += criterion(logit, label)
                pred = logit.argmax(dim=1, keepdim=False) 
                correct += pred.eq(label.view_as(pred)).sum().item() 

                real_list.extend(label.cpu().tolist())
                pred_list.extend(pred.cpu().tolist())

        valid_acc = 100 * correct / len(valid_loader.dataset)
        valid_loss_total = valid_loss / len(valid_loader)
        print('Vaild set: Loss: {:.4f}, Accuracy: {}/{} ( {:.0f}%)\n'.format(valid_loss / len(valid_loader), correct, len(valid_loader.dataset), 100 * correct / len(valid_loader.dataset)))

        content = time.ctime() + ' ' + f'Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {(running_loss / len(train_loader)):.5f}, valid loss: {valid_loss / len(valid_loader):.5f}, Accuracy : {correct}/{len(valid_loader.dataset)} ({100 * correct / len(valid_loader.dataset)})%'

        with open(os.path.join('./audio_logs', f'log_{CFG.kernel_name}.txt'), 'a') as appender:
            appender.write(content + '\n')
        
        if best_loss > valid_loss_total:
            best_loss = valid_loss_total
            torch.save(model.state_dict(), f'./audio_model/{CFG.kernel_name}.pth')  
            print('Model Saved.')

# Run

In [20]:
train(model, optimizer, train_loader, scheduler, device)

  from ipykernel import kernelapp as app
100%|██████████| 230/230 [01:39<00:00,  2.31it/s]


-----  Epoch 1 -----
Train loss: 0.6093886139


100%|██████████| 77/77 [00:33<00:00,  2.31it/s]


Vaild set: Loss: 0.2928, Accuracy: 669/766 ( 87%)

Model Saved.


100%|██████████| 230/230 [01:40<00:00,  2.29it/s]


-----  Epoch 2 -----
Train loss: 0.2505155540


100%|██████████| 77/77 [00:33<00:00,  2.33it/s]


Vaild set: Loss: 0.3038, Accuracy: 658/766 ( 86%)



100%|██████████| 230/230 [01:39<00:00,  2.31it/s]


-----  Epoch 3 -----
Train loss: 0.1915025198


100%|██████████| 77/77 [00:32<00:00,  2.34it/s]


Vaild set: Loss: 0.2053, Accuracy: 706/766 ( 92%)

Model Saved.


100%|██████████| 230/230 [01:38<00:00,  2.34it/s]


-----  Epoch 4 -----
Train loss: 0.1888234310


100%|██████████| 77/77 [00:32<00:00,  2.33it/s]


Vaild set: Loss: 0.2372, Accuracy: 693/766 ( 90%)



100%|██████████| 230/230 [01:38<00:00,  2.33it/s]


-----  Epoch 5 -----
Train loss: 0.1489587832


100%|██████████| 77/77 [00:33<00:00,  2.33it/s]


Vaild set: Loss: 0.1910, Accuracy: 710/766 ( 93%)

Model Saved.


100%|██████████| 230/230 [01:39<00:00,  2.31it/s]


-----  Epoch 6 -----
Train loss: 0.1332320378


100%|██████████| 77/77 [00:33<00:00,  2.33it/s]


Vaild set: Loss: 0.2138, Accuracy: 706/766 ( 92%)



100%|██████████| 230/230 [01:39<00:00,  2.32it/s]


-----  Epoch 7 -----
Train loss: 0.1294843637


100%|██████████| 77/77 [00:32<00:00,  2.36it/s]


Vaild set: Loss: 0.1907, Accuracy: 725/766 ( 95%)

Model Saved.


100%|██████████| 230/230 [01:38<00:00,  2.34it/s]


-----  Epoch 8 -----
Train loss: 0.1106897195


100%|██████████| 77/77 [00:32<00:00,  2.34it/s]


Vaild set: Loss: 0.1742, Accuracy: 720/766 ( 94%)

Model Saved.


100%|██████████| 230/230 [01:38<00:00,  2.33it/s]


-----  Epoch 9 -----
Train loss: 0.0773953235


100%|██████████| 77/77 [00:32<00:00,  2.35it/s]


Vaild set: Loss: 0.2704, Accuracy: 713/766 ( 93%)



100%|██████████| 230/230 [01:37<00:00,  2.35it/s]


-----  Epoch 10 -----
Train loss: 0.0816413134


100%|██████████| 77/77 [00:32<00:00,  2.38it/s]


Vaild set: Loss: 0.2857, Accuracy: 716/766 ( 93%)



100%|██████████| 230/230 [01:37<00:00,  2.35it/s]


-----  Epoch 11 -----
Train loss: 0.0757656947


100%|██████████| 77/77 [00:32<00:00,  2.38it/s]


Vaild set: Loss: 0.2815, Accuracy: 708/766 ( 92%)



100%|██████████| 230/230 [01:38<00:00,  2.34it/s]


-----  Epoch 12 -----
Train loss: 0.0844450480


100%|██████████| 77/77 [00:32<00:00,  2.37it/s]


Vaild set: Loss: 0.2207, Accuracy: 728/766 ( 95%)



100%|██████████| 230/230 [01:38<00:00,  2.34it/s]


-----  Epoch 13 -----
Train loss: 0.0870361663


100%|██████████| 77/77 [00:32<00:00,  2.38it/s]


Vaild set: Loss: 0.2673, Accuracy: 702/766 ( 92%)



100%|██████████| 230/230 [01:38<00:00,  2.34it/s]


-----  Epoch 14 -----
Train loss: 0.0963253081


100%|██████████| 77/77 [00:32<00:00,  2.34it/s]


Vaild set: Loss: 0.1973, Accuracy: 708/766 ( 92%)



100%|██████████| 230/230 [01:38<00:00,  2.33it/s]


-----  Epoch 15 -----
Train loss: 0.0647013936


100%|██████████| 77/77 [00:32<00:00,  2.35it/s]


Vaild set: Loss: 0.2171, Accuracy: 727/766 ( 95%)



100%|██████████| 230/230 [01:38<00:00,  2.34it/s]


-----  Epoch 16 -----
Train loss: 0.0330637417


100%|██████████| 77/77 [00:32<00:00,  2.37it/s]


Vaild set: Loss: 0.2728, Accuracy: 719/766 ( 94%)



100%|██████████| 230/230 [01:38<00:00,  2.33it/s]


-----  Epoch 17 -----
Train loss: 0.0429976419


100%|██████████| 77/77 [00:32<00:00,  2.36it/s]


Vaild set: Loss: 0.3486, Accuracy: 706/766 ( 92%)



100%|██████████| 230/230 [01:38<00:00,  2.34it/s]


-----  Epoch 18 -----
Train loss: 0.0561052588


100%|██████████| 77/77 [00:32<00:00,  2.36it/s]


Vaild set: Loss: 0.3533, Accuracy: 705/766 ( 92%)



100%|██████████| 230/230 [01:38<00:00,  2.34it/s]


-----  Epoch 19 -----
Train loss: 0.0623354742


100%|██████████| 77/77 [00:33<00:00,  2.33it/s]


Vaild set: Loss: 0.3783, Accuracy: 721/766 ( 94%)



100%|██████████| 230/230 [01:39<00:00,  2.32it/s]


-----  Epoch 20 -----
Train loss: 0.0565891877


100%|██████████| 77/77 [00:34<00:00,  2.26it/s]


Vaild set: Loss: 0.2539, Accuracy: 712/766 ( 93%)



100%|██████████| 230/230 [01:41<00:00,  2.27it/s]


-----  Epoch 21 -----
Train loss: 0.0594024644


100%|██████████| 77/77 [00:33<00:00,  2.32it/s]


Vaild set: Loss: 0.3359, Accuracy: 724/766 ( 95%)



100%|██████████| 230/230 [01:48<00:00,  2.12it/s]


-----  Epoch 22 -----
Train loss: 0.0239055073


100%|██████████| 77/77 [00:46<00:00,  1.64it/s]


Vaild set: Loss: 0.3362, Accuracy: 700/766 ( 91%)



100%|██████████| 230/230 [02:22<00:00,  1.62it/s]


-----  Epoch 23 -----
Train loss: 0.0500455304


100%|██████████| 77/77 [00:46<00:00,  1.65it/s]


Vaild set: Loss: 0.4757, Accuracy: 710/766 ( 93%)



100%|██████████| 230/230 [02:22<00:00,  1.62it/s]


-----  Epoch 24 -----
Train loss: 0.0567782752


100%|██████████| 77/77 [00:47<00:00,  1.63it/s]


Vaild set: Loss: 0.4385, Accuracy: 709/766 ( 93%)



100%|██████████| 230/230 [02:19<00:00,  1.65it/s]


-----  Epoch 25 -----
Train loss: 0.0525973833


100%|██████████| 77/77 [00:34<00:00,  2.25it/s]


Vaild set: Loss: 0.3656, Accuracy: 718/766 ( 94%)



100%|██████████| 230/230 [01:42<00:00,  2.24it/s]


-----  Epoch 26 -----
Train loss: 0.0382134621


100%|██████████| 77/77 [00:33<00:00,  2.31it/s]


Vaild set: Loss: 0.4223, Accuracy: 717/766 ( 94%)



100%|██████████| 230/230 [01:40<00:00,  2.28it/s]


-----  Epoch 27 -----
Train loss: 0.0049557147


100%|██████████| 77/77 [00:32<00:00,  2.33it/s]


Vaild set: Loss: 0.4210, Accuracy: 728/766 ( 95%)



100%|██████████| 230/230 [01:40<00:00,  2.29it/s]


-----  Epoch 28 -----
Train loss: 0.0012117462


100%|██████████| 77/77 [00:33<00:00,  2.31it/s]


Vaild set: Loss: 0.3937, Accuracy: 725/766 ( 95%)



100%|██████████| 230/230 [01:40<00:00,  2.30it/s]


-----  Epoch 29 -----
Train loss: 0.0004384588


100%|██████████| 77/77 [00:33<00:00,  2.32it/s]

Vaild set: Loss: 0.4242, Accuracy: 727/766 ( 95%)






# Test

In [22]:
os.getcwd()

'C:\\Users\\lab\\Desktop\\yj\\FINAL_TEST_CODE\\code\\deep_learning'

In [26]:
best_model = CNNclassification().to(device)
best_model.load_state_dict(torch.load(f'./audio_model/{CFG.kernel_name}.pth'))

test_dataset = CustomDataset('../../dataset/test.csv', f_method, train_mode = False, transforms = None)
test_loader = DataLoader(test_dataset, batch_size = CFG.batch_size, shuffle=False, num_workers=0)

In [28]:
best_model.eval() 
test_loss = 0.0
correct = 0

real_list = []
pred_list = []
prob_list = []

with torch.no_grad(): 
    for wav, label in tqdm(iter(test_loader)):
        
        wav, label = wav.to(device), label.to(device)
        logit = best_model(wav)
        vali_loss+=criterion(logit, label)
        prob = F.sigmoid(logit[:,1])
        prob_list.extend(prob.cpu().tolist())
        pred = logit.argmax(dim=1, keepdim=False)  
        correct += pred.eq(label.view_as(pred)).sum().item()  
        real_list.extend(label.cpu().tolist())
        pred_list.extend(pred.cpu().tolist())


  from ipykernel import kernelapp as app
100%|██████████| 77/77 [00:35<00:00,  2.18it/s]


# Evaluation

In [29]:
test_acc = 100 * correct / len(test_loader.dataset)
test_loss_total = test_loss / len(test_loader)
print("test_loader: ", len(test_loader))
print('Test Loss: {:.4f}, Accuracy: {}/{} ( {:.0f}%)\n'.format(test_loss_total, correct, len(test_loader.dataset), test_acc))
 
print(classification_report(real_list, pred_list)) 
fpr, tpr, thresholds = metrics.roc_curve(real_list, prob_list)
AUC = metrics.auc(fpr, tpr)

CEL = log_loss(real_list, prob_list)

print("CEL: ", CEL)
print("AUC: ", AUC) 

test_loader:  77
Test Loss: 0.0000, Accuracy: 725/766 ( 95%)

              precision    recall  f1-score   support

           0       0.95      0.93      0.94       369
           1       0.94      0.96      0.95       397

    accuracy                           0.95       766
   macro avg       0.95      0.95      0.95       766
weighted avg       0.95      0.95      0.95       766

CEL:  0.18426052004349008
AUC:  0.9811868143870356
mfcc_16000
