In [None]:
# Install libraries for data_loader
!pip install pandas
!pip install sklearn
!pip install albumentations
!pip install torchvision
!pip install tqdm

In [None]:
# Import DataLoader and corresponding libraries
import pandas
import torchvision.transforms as TT
# import albumentations as T
# import albumentations.augmentations.transforms as T_transforms
# from albumentations.pytorch import ToTensorV2
from torch.utils.data import DataLoader, Dataset, sampler
from sklearn.model_selection import StratifiedKFold
from torchvision import utils
from PIL import Image

In [None]:
# Import libraries for tensors
import numpy as np
import torch

In [None]:
# Import tqdm for progress bar construction
import tqdm

In [None]:
# Tensor dtypes and compute device shared by the notebook (from Assignment 2)
dtype, ltype = torch.float, torch.long

# Use the first CUDA device when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print('using device:', device)

In [None]:
# Directories for Data
FF1010_Path, AudioImage_Path = './data/', './image/'

# Hyper-parameters for K-Fold Cross Validation
# N: fold count, seed: random seed handed to the fold splitter
N, seed = 5, 42

In [None]:
# Stratified K-fold splitter shared by the dataframe-preparation cells.
# The fold count comes from the N hyper-parameter instead of a hard-coded 5,
# so changing N actually changes the number of folds.
skf = StratifiedKFold(
    n_splits=N, shuffle=True, random_state=seed
)

In [None]:
# Modify dataframe for K-Fold Cross Validation (ff1010)
ff1010_csv = pandas.read_csv(FF1010_Path + 'metadata.csv')

# Build the .npy path of every row from its label: hasbird == 0 rows point into
# 'nocall/', hasbird == 1 rows into 'bird/'. Rows with any other label get no
# path and are removed by dropna() below.
for has_bird, subdir in ((0, 'nocall/'), (1, 'bird/')):
    rows = ff1010_csv['hasbird'] == has_bird
    # astype(str) keeps the concatenation valid when file names are numeric IDs
    ff1010_csv.loc[rows, 'filepath'] = (
        FF1010_Path + subdir
        + ff1010_csv.loc[rows, 'filename'].astype(str) + '.npy'
    )

# Drop incomplete rows and renumber, so that row labels equal row positions
ff1010_csv = ff1010_csv.dropna().reset_index(drop=True)

# Add 'fold' attribute for dataset classification
ff1010_dataframe = ff1010_csv.copy()
# Create the column as integers up front; filling a missing column through
# partial .loc assignments would leave it as float64 (0.0, 1.0, ...).
ff1010_dataframe['fold'] = -1
for n, (_, nth_groups) in enumerate(
    skf.split(ff1010_dataframe, ff1010_dataframe['hasbird'])):
    # skf.split yields positional indices; they are valid .loc labels here
    # because the index was reset above.
    ff1010_dataframe.loc[nth_groups, 'fold'] = n

In [None]:
# # Modify dataframe for K-Fold Cross Validation (birdclef2021)
# birdclef_csv = pandas.read_csv(AudioImage_Path + 'metadata.csv')
# birdclef_csv.loc[birdclef_csv['label_id'] >= 0,'filepath'] = \
#     AudioImage_Path + birdclef_csv.query('label_id >= 0')['primary_label'] + '/' + \
#     birdclef_csv.query('label_id >= 0')['filename'] + '.npy'

# birdclef_csv = birdclef_csv.dropna()
# birdclef_csv = birdclef_csv.reset_index(drop=True)

# # Add 'fold' attribute for dataset classification
# birdclef_dataframe = birdclef_csv.copy()
# for n, (_, nth_groups) in enumerate(
#     skf.split(birdclef_dataframe, birdclef_dataframe['label_id'])):
#     birdclef_dataframe.loc[nth_groups, 'fold'] = int(n)

In [None]:
# Hyper-parameters for training: mini-batch sizes passed to the DataLoaders
ff1010_batch = 32  # used by the no-call detector loaders (nocall_train)
birdclef_batch = 32  # used by the bird classifier loaders (bird_train)

Model 1 (No-call detector)

In [None]:
# Class for ff1010 dataset
class FF1010(Dataset):
    """Dataset of ff1010 samples stored as ``.npy`` arrays with a per-row label.

    Args:
        dataframe: table with a 'filepath' column (path of each ``.npy`` file)
            and the label column named by ``labels``.
        process: 'train' applies resize, random flips and normalisation;
            'valid' applies resize and normalisation only. Any other value
            leaves the loaded array untransformed.
        labels: name of the label column in ``dataframe``.
    """

    def __init__(self, dataframe, process='train', labels='hasbird'):
        self.dataframe = dataframe
        self.filepaths = dataframe['filepath'].values
        self.labels = dataframe[labels].values
        self.process = process

        # Transforms for each train and validation (torchvision API).
        # ToTensor comes first: Normalize only accepts tensors, and the resize
        # and flips then run directly on the tensor (tensor input for Resize
        # needs torchvision >= 0.8).
        # NOTE: the JPEG/WEBP ImageCompression augmentations that used to be
        # sketched here are albumentations transforms and remain disabled.
        self.train_transform = TT.Compose([
            TT.ToTensor(),
            TT.Resize([128, 281]),
            TT.RandomHorizontalFlip(p=0.5),
            TT.RandomVerticalFlip(p=0.5),
            TT.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
        ])
        self.val_transform = TT.Compose([
            TT.ToTensor(),
            TT.Resize([128, 281]),
            TT.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
        ])

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(source, label)``.

        ``label`` is a long tensor on ``device``. ``source`` is the transformed
        tensor on ``device`` for 'train'/'valid', else the raw numpy array.
        """
        source = np.load(self.filepaths[idx])

        # Rearrange numpy arrays: move the first axis to the end
        source = source.transpose(1, 2, 0)
        # Add RGB dimension: replicate the squeezed array three times on a new last axis
        source = np.stack((np.squeeze(source), ) * 3, -1)

        # Apply transform. torchvision transforms take the image positionally
        # and return the transformed tensor itself (not a dict).
        if self.process == 'train':
            source = self.train_transform(source).to(device=device, dtype=dtype)
        elif self.process == 'valid':
            source = self.val_transform(source).to(device=device, dtype=dtype)

        return source, torch.tensor(self.labels[idx], dtype=ltype).to(device)

    def __len__(self):
        """Number of rows in the backing dataframe."""
        return len(self.dataframe)

In [None]:
from time import sleep

# Train loop for nocall detector
def nocall_train(train_dataframe, val_dataframe):
    """Build the no-call detector loaders and iterate over each of them once.

    The training and validation steps are still placeholders, so no losses
    are computed yet.

    Args:
        train_dataframe: rows used to build the training dataset.
        val_dataframe: rows used to build the validation dataset.

    Returns:
        Tuple ``(val_losses, train_losses)``; both are currently ``None``.
    """
    train_set = FF1010(train_dataframe, process='train', labels='hasbird')
    val_set = FF1010(val_dataframe, process='valid', labels='hasbird')

    # Construct data loader for train and validation; both draw every row of
    # their dataframe in random order, and only training drops the last
    # incomplete batch.
    train_loader = DataLoader(
        train_set,
        batch_size=ff1010_batch,
        sampler=sampler.SubsetRandomSampler(range(len(train_dataframe))),
        drop_last=True,
    )
    val_loader = DataLoader(
        val_set,
        batch_size=ff1010_batch,
        sampler=sampler.SubsetRandomSampler(range(len(val_dataframe))),
        drop_last=False,
    )

    # TODO: implement training here (for now the loop only exercises the loader).
    for inputs, targets in tqdm.tqdm(train_loader):
        sleep(0.01)

    # TODO: implement validation here (for now the loop only exercises the loader).
    for inputs, targets in tqdm.tqdm(val_loader):
        sleep(0.01)

    train_losses, val_losses = None, None
    return val_losses, train_losses

In [None]:
# Train, Validate and Test for nocall detector
def nocall(dataframe, val_index, test_index):
    """Split ``dataframe`` by its 'fold' column and run the no-call training loop.

    Args:
        dataframe: table with a 'fold' column.
        val_index: fold used for validation.
        test_index: fold held out for testing; excluded from training.

    Returns:
        None. The accuracy test is not implemented yet.
    """
    # Check that validation fold is not same as test fold
    assert val_index != test_index, \
        'Validation and test should be done on different fold.'

    train_query = f'fold != {val_index!s} and fold != {test_index!s}'
    val_query = f'fold == {val_index!s}'

    # Training rows: every fold except the validation and test folds
    train_dataframe = dataframe.query(train_query).reset_index(drop=True)
    # drop=False keeps the previous row labels as an extra column
    val_dataframe = dataframe.query(val_query).reset_index(drop=False)

    val_losses, train_losses = nocall_train(train_dataframe, val_dataframe)

    # TODO: implement the accuracy test here.

    return

In [None]:
# Run the no-call pipeline: fold 0 is the validation fold, fold 1 is held out as the test fold
nocall(ff1010_dataframe, 0, 1)

Model 2 (Bird classifier)

In [None]:
# Class for mel-spectrogram images dataset
class AudioImage(Dataset):
    """Dataset of mel-spectrogram images stored as ``.npy`` arrays with a per-row label.

    Args:
        dataframe: table with a 'filepath' column (path of each ``.npy`` file)
            and the label column named by ``labels``.
        process: stored on the instance; the same transform is applied
            regardless of its value.
        labels: name of the label column in ``dataframe``.
    """

    def __init__(self, dataframe, process='train', labels='label_id'):
        self.dataframe = dataframe
        self.filepaths = dataframe['filepath'].values
        self.labels = dataframe[labels].values
        self.process = process

        # Single transform shared by train and validation:
        # resize the PIL image, convert it to a tensor, then normalise.
        self.transform = TT.Compose([
            TT.Resize([128, 281]),
            TT.ToTensor(),
            TT.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
        ])

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(frames, label)``.

        ``frames`` stacks the transformed frames of the sample along a new
        first axis and lives on ``device``; ``label`` is a long tensor on
        ``device``.
        """
        source = np.load(self.filepaths[idx])

        # Rearrange numpy arrays: move the first axis to the end
        source = source.transpose(1, 2, 0)

        # Add RGB dimension: replicate the squeezed array three times on a new last axis
        source = np.stack((np.squeeze(source), ) * 3, -1)
        # A single-frame sample is 3-D at this point; give it a frame axis too
        if len(source.shape) == 3:
            source = np.expand_dims(source, axis=2)
        # Bring the frame axis to the front: (frame, H, W, C)
        source = source.transpose(2, 0, 1, 3)

        # Apply transform frame by frame and stack the results. Stacking
        # (rather than writing into a buffer sized from the un-resized H x W)
        # stays correct whatever the input resolution, since Resize changes
        # every frame to 128 x 281.
        augmented = torch.stack(
            [self.transform(Image.fromarray(frame)) for frame in source]
        ).to(device)

        # Return the transformed tensor, not the raw numpy array
        return augmented, torch.tensor(self.labels[idx], dtype=ltype).to(device)

    def __len__(self):
        """Number of rows in the backing dataframe."""
        return len(self.dataframe)

In [None]:
# Train loop for bird specification
def bird_train(train_dataframe, val_dataframe):
    """Build the bird-classifier loaders and iterate over each of them once.

    The training and validation steps are still placeholders, so no losses
    are computed yet.

    Args:
        train_dataframe: rows used to build the training dataset.
        val_dataframe: rows used to build the validation dataset.

    Returns:
        Tuple ``(val_losses, train_losses)``; both are currently ``None``.
    """
    train_set = AudioImage(train_dataframe, process='train', labels='label_id')
    val_set = AudioImage(val_dataframe, process='valid', labels='label_id')

    # Construct data loader for train and validation; both draw every row of
    # their dataframe in random order, and only training drops the last
    # incomplete batch.
    train_loader = DataLoader(
        train_set,
        batch_size=birdclef_batch,
        sampler=sampler.SubsetRandomSampler(range(len(train_dataframe))),
        drop_last=True,
    )
    val_loader = DataLoader(
        val_set,
        batch_size=birdclef_batch,
        sampler=sampler.SubsetRandomSampler(range(len(val_dataframe))),
        drop_last=False,
    )

    # TODO: implement training here (for now the loop only exercises the loader).
    for inputs, targets in tqdm.tqdm(train_loader):
        sleep(0.01)

    # TODO: implement validation here (for now the loop only exercises the loader).
    for inputs, targets in tqdm.tqdm(val_loader):
        sleep(0.01)

    train_losses, val_losses = None, None
    return val_losses, train_losses

In [None]:
# Train, Validate and Test for bird specification
def bird(dataframe, val_index, test_index):
    """Split ``dataframe`` by its 'fold' column and run the bird training loop.

    Args:
        dataframe: table with a 'fold' column.
        val_index: fold used for validation.
        test_index: fold held out for testing; excluded from training.

    Returns:
        None. The accuracy test is not implemented yet.
    """
    # Check that validation fold is not same as test fold
    assert val_index != test_index, \
        'Validation and test should be done on different fold.'

    train_query = f'fold != {val_index!s} and fold != {test_index!s}'
    val_query = f'fold == {val_index!s}'

    # Training rows: every fold except the validation and test folds
    train_dataframe = dataframe.query(train_query).reset_index(drop=True)
    # drop=False keeps the previous row labels as an extra column
    val_dataframe = dataframe.query(val_query).reset_index(drop=False)

    val_losses, train_losses = bird_train(train_dataframe, val_dataframe)

    # TODO: implement the accuracy test here.

    return

In [None]:
# Run the bird pipeline: fold 0 is the validation fold, fold 1 is held out as the test fold.
# birdclef_dataframe is only built by the birdclef2021 dataframe cell, which is
# currently commented out, so guard the call instead of failing with NameError
# on Restart & Run All.
if 'birdclef_dataframe' in globals():
    bird(birdclef_dataframe, 0, 1)
else:
    print('birdclef_dataframe is not defined; skipping the bird classifier run.')