In [1]:
# standard library
import os
import typing as tp
import warnings
from pathlib import Path

# third-party libraries
import cv2
import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import torch
import torch.nn as nn
from efficientnet_pytorch import EfficientNet
from sklearn.model_selection import StratifiedKFold
from torch.utils.tensorboard import SummaryWriter

# Suppress every warning raised from this point on.
warnings.simplefilter('ignore')

In [2]:
# directories
#
# BASE_DIR defaults to the checkout location this notebook was written on.
# Set the BIRDCALL_BASE_DIR environment variable to run it from another
# machine without editing this cell.
BASE_DIR = Path(os.environ.get(
    'BIRDCALL_BASE_DIR',
    '/Users/yohei/Documents/bird2021/solution4birdcall2021',
))
SMALLDATA_DIR = BASE_DIR / 'data' / 'small'           # reduced copy of the dataset
TRAINDATA_DIR = SMALLDATA_DIR / 'train_short_audio'   # <species>/<file>.ogg recordings
LOG_DIR = BASE_DIR / 'reports' / 'logs'               # TensorBoard event files
OUTPUT_DIR = BASE_DIR / 'models' / 'output'

In [3]:
# Location of the metadata CSV inside the small-data directory.
train_meta_path = SMALLDATA_DIR.joinpath('train_metadata.csv')

In [4]:
train_meta = pd.read_csv(train_meta_path)

# Only two bird species are handled here.
# .copy() detaches the filtered rows from the frame they were selected from,
# so columns added to train_meta later are written to an independent frame.
train_meta = train_meta.query('primary_label in ["acafly", "acowoo"]').copy()

train_meta.head()

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,date,filename,license,rating,time,url
0,acafly,['amegfi'],"['begging call', 'call', 'juvenile']",35.386,-84.125,Empidonax virescens,Acadian Flycatcher,Mike Nelson,2012-08-12,XC109605.ogg,Creative Commons Attribution-NonCommercial-Sha...,2.5,09:30,https://www.xeno-canto.org/109605
1,acafly,[],['call'],9.1334,-79.6501,Empidonax virescens,Acadian Flycatcher,Allen T. Chartier,2000-12-26,XC11209.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.0,?,https://www.xeno-canto.org/11209
2,acafly,[],['call'],5.7813,-75.7452,Empidonax virescens,Acadian Flycatcher,Sergio Chaparro-Herrera,2012-01-10,XC127032.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.0,15:20,https://www.xeno-canto.org/127032
3,acafly,['whwbec1'],['call'],4.6717,-75.6283,Empidonax virescens,Acadian Flycatcher,Oscar Humberto Marin-Gomez,2009-06-19,XC129974.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.5,07:50,https://www.xeno-canto.org/129974
4,acafly,['whwbec1'],['call'],4.6717,-75.6283,Empidonax virescens,Acadian Flycatcher,Oscar Humberto Marin-Gomez,2009-06-19,XC129981.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.5,07:50,https://www.xeno-canto.org/129981


In [5]:
# Species code -> class index (position of the 1 in the one-hot label).
BIRD_CODE = dict(acafly=0, acowoo=1)
# Reverse lookup: class index -> species code.
INV_BIRD_CODE = dict(zip(BIRD_CODE.values(), BIRD_CODE.keys()))

In [6]:
# Display the inverse mapping (class index -> species code) as a sanity check.
INV_BIRD_CODE

{0: 'acafly', 1: 'acowoo'}

In [7]:
# Clip length in seconds: SpectrogramDataset multiplies it by the file's
# sample rate to get the number of samples kept per example.
PERIOD = 5

def mono_to_color(
    X: np.ndarray, mean=None, std=None,
    norm_max=None, norm_min=None, eps=1e-6
):
    """Convert a single-channel array into a 3-channel ``uint8`` image.

    The input is replicated three times along a new trailing axis,
    standardized, clipped to ``[norm_min, norm_max]`` and rescaled to
    ``[0, 255]``.

    Args:
        X: Input array (e.g. a spectrogram).
        mean: Value subtracted before scaling. Defaults to the mean of ``X``.
        std: Value the centred data is divided by (plus ``eps``). Defaults to
            the standard deviation of the centred data.
        norm_max: Upper clipping bound applied to the standardized data.
            Defaults to its maximum.
        norm_min: Lower clipping bound applied to the standardized data.
            Defaults to its minimum.
        eps: Small constant added to ``std`` to avoid division by zero; also
            the threshold under which the data is treated as constant.

    Returns:
        ``uint8`` array with the shape of ``X`` plus a trailing axis of
        length 3. All zeros when the standardized data is (nearly) constant.
    """
    # Stack X as [X, X, X] along a new last axis.
    X = np.stack([X, X, X], axis=-1)

    # Standardize. Defaults are resolved with explicit ``is None`` checks:
    # ``value or default`` would silently discard a legitimate 0 / 0.0 and
    # raises on array-valued arguments.
    if mean is None:
        mean = X.mean()
    X = X - mean
    if std is None:
        std = X.std()
    Xstd = X / (std + eps)

    _min, _max = Xstd.min(), Xstd.max()
    if norm_max is None:
        norm_max = _max
    if norm_min is None:
        norm_min = _min

    if (_max - _min) > eps:
        # Normalize to [0, 255]
        V = np.clip(Xstd, norm_min, norm_max)
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # Constant input: there is nothing to scale, so return a black image.
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V

In [8]:
class SpectrogramDataset(torch.utils.data.Dataset):
    """Dataset that serves audio recordings as log-mel-spectrogram images.

    Every item is read from ``TRAINDATA_DIR / <ebird_name> / <ogg_file_name>``,
    optionally transformed, converted to a mel spectrogram in dB, rendered as
    a 3-channel image with ``mono_to_color`` and resized to ``img_size`` rows.

    Args:
        file_list: List of ``[ebird_name, ogg_file_name]`` pairs.
        img_size: Height, in pixels, of the returned image. The width is
            scaled by the same factor, so the aspect ratio is kept.
        waveform_transforms: Optional callable applied to the raw waveform.
            When given it replaces the built-in random crop/pad step.
        spectrogram_transforms: Optional callable applied to the dB-scaled
            mel spectrogram.
        melspectrogram_parameters: Extra keyword arguments forwarded to
            ``librosa.feature.melspectrogram``. ``None`` means no extras.
    """

    def __init__(
        self,
        file_list: tp.List[tp.List[str]], img_size=224,
        waveform_transforms=None, spectrogram_transforms=None, melspectrogram_parameters=None
    ):
        self.file_list = file_list  # list of list: [ebird_name, ogg_file_name]
        self.img_size = img_size
        self.waveform_transforms = waveform_transforms
        self.spectrogram_transforms = spectrogram_transforms
        # A fresh dict per instance: a ``{}`` default in the signature would
        # be one object shared by every dataset created without the argument.
        self.melspectrogram_parameters = (
            {} if melspectrogram_parameters is None else melspectrogram_parameters
        )

    def __len__(self):
        """Return the number of recordings in the dataset."""
        return len(self.file_list)

    @staticmethod
    def _fit_to_length(y: np.ndarray, target_length: int) -> np.ndarray:
        """Randomly pad or crop ``y`` to exactly ``target_length`` samples.

        Shorter signals are placed at a random offset inside a zero buffer,
        longer ones are cropped at a random offset. The result is always
        ``float32``.
        """
        len_y = len(y)
        if len_y < target_length:
            # +1 so that the right-most placement can be drawn as well
            # (the upper bound of np.random.randint is exclusive).
            start = np.random.randint(target_length - len_y + 1)
            padded = np.zeros(target_length, dtype=np.float32)
            padded[start:start + len_y] = y
            return padded
        if len_y > target_length:
            start = np.random.randint(len_y - target_length + 1)
            return y[start:start + target_length].astype(np.float32)
        return y.astype(np.float32)

    def __getitem__(self, idx: int):
        """Load recording ``idx`` and return ``(image, labels)``.

        Returns:
            image: ``float32`` array of shape ``(3, img_size, width)`` with
                values in ``[0, 1]``.
            labels: ``float32`` one-hot vector of length ``len(BIRD_CODE)``.
        """
        ebird_name, ogg_file_name = self.file_list[idx]
        ogg_path = TRAINDATA_DIR / ebird_name / ogg_file_name
        y, sr = sf.read(ogg_path)
        if y.ndim > 1:
            # Multi-channel files come back as (frames, channels); average
            # the channels so the rest of the pipeline sees a mono signal.
            y = y.mean(axis=1)

        # Augment the waveform (shifting, pitch change, added noise).
        if self.waveform_transforms:
            y = self.waveform_transforms(y)
        else:
            # TODO: instead of cutting the sound at random, sound event
            # detection should be used to choose the segment.
            y = self._fit_to_length(y, sr * PERIOD)

        # ``y`` is passed by keyword: recent librosa releases no longer
        # accept it positionally.
        melspec = librosa.feature.melspectrogram(y=y, sr=sr, **self.melspectrogram_parameters)
        melspec = librosa.power_to_db(melspec).astype(np.float64)

        if self.spectrogram_transforms:
            melspec = self.spectrogram_transforms(melspec)

        image = mono_to_color(melspec)
        height, width, _ = image.shape
        # cv2.resize takes (width, height); scale the width by the same
        # factor as the height.
        image = cv2.resize(image, (int(width * self.img_size / height), self.img_size))
        image = np.moveaxis(image, 2, 0)  # HWC -> CHW
        image = (image / 255.0).astype(np.float32)

        labels = np.zeros(len(BIRD_CODE), dtype="f")
        labels[BIRD_CODE[ebird_name]] = 1

        return image, labels

In [9]:
# define device
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# define model: pretrained EfficientNet-B0 whose classifier is replaced by a
# small MLP head (EfficientNet is imported in the import cell at the top).
model = EfficientNet.from_pretrained('efficientnet-b0')

# Layer sizes are derived rather than hard-coded, so the head stays correct
# if the backbone variant or the set of species in BIRD_CODE changes.
in_features = model._fc.in_features
model._fc = nn.Sequential(
        nn.Linear(in_features, 512), nn.ReLU(), nn.Dropout(p=0.2),
        nn.Linear(512, 512), nn.ReLU(), nn.Dropout(p=0.2),
        nn.Linear(512, len(BIRD_CODE)))
model = model.to(device)

# loss function, optimizer
# BCEWithLogitsLoss takes raw logits and float targets, which matches the
# float32 one-hot labels produced by SpectrogramDataset.
loss_func = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

Loaded pretrained weights for efficientnet-b0


In [10]:
# validation: stratified 4-fold splitter; shuffling uses a fixed seed so the
# fold membership is the same on every run.

skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

In [11]:
# assign() returns a new frame, so the "fold" column is never written into a
# selection taken from another frame. -1 marks rows without a fold yet.
train_meta = train_meta.assign(fold=-1)
# Look the column position up by name instead of assuming it is the last one.
fold_col = train_meta.columns.get_loc("fold")
for fold_id, (_, val_index) in enumerate(skf.split(train_meta, train_meta["primary_label"])):
    # val_index holds positional row indices, hence .iloc.
    train_meta.iloc[val_index, fold_col] = fold_id

In [12]:
# Preview the first rows, now including the "fold" column.
train_meta.head()

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,date,filename,license,rating,time,url,fold
0,acafly,['amegfi'],"['begging call', 'call', 'juvenile']",35.386,-84.125,Empidonax virescens,Acadian Flycatcher,Mike Nelson,2012-08-12,XC109605.ogg,Creative Commons Attribution-NonCommercial-Sha...,2.5,09:30,https://www.xeno-canto.org/109605,0
1,acafly,[],['call'],9.1334,-79.6501,Empidonax virescens,Acadian Flycatcher,Allen T. Chartier,2000-12-26,XC11209.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.0,?,https://www.xeno-canto.org/11209,3
2,acafly,[],['call'],5.7813,-75.7452,Empidonax virescens,Acadian Flycatcher,Sergio Chaparro-Herrera,2012-01-10,XC127032.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.0,15:20,https://www.xeno-canto.org/127032,3
3,acafly,['whwbec1'],['call'],4.6717,-75.6283,Empidonax virescens,Acadian Flycatcher,Oscar Humberto Marin-Gomez,2009-06-19,XC129974.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.5,07:50,https://www.xeno-canto.org/129974,2
4,acafly,['whwbec1'],['call'],4.6717,-75.6283,Empidonax virescens,Acadian Flycatcher,Oscar Humberto Marin-Gomez,2009-06-19,XC129981.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.5,07:50,https://www.xeno-canto.org/129981,0


In [13]:
# Fold held out for validation; all other folds are used for training.
use_fold = 0
batch_size = 5

train_file_list = train_meta.query("fold != @use_fold")[['primary_label', 'filename']].values.tolist()
valid_file_list = train_meta.query("fold == @use_fold")[['primary_label', 'filename']].values.tolist()
train_dataset = SpectrogramDataset(file_list=train_file_list)
valid_dataset = SpectrogramDataset(file_list=valid_file_list)
# Shuffle the training data: the rows previewed from the metadata are grouped
# by species, so unshuffled batches would mostly contain a single class.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Validation order does not affect the summed loss, so it stays sequential.
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size)

In [15]:
# tensorboard config
writer = SummaryWriter(log_dir=LOG_DIR)

epochs = 50
# try/finally guarantees the event file is flushed and closed even when the
# run is interrupted (e.g. by stopping the cell).
try:
    for epoch in range(epochs):
        # training
        model.train()
        train_loss = 0.0  # sum of the per-batch losses over the epoch
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = loss_func(output, target)
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
        print({'train/loss': train_loss})
        writer.add_scalar('train/loss', train_loss, global_step=epoch)

        # evaluating
        model.eval()
        valid_loss = 0.0  # sum of the per-batch losses over the epoch
        # No gradients are needed for validation; skipping them saves memory.
        with torch.no_grad():
            for data, target in valid_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                loss = loss_func(output, target)
                valid_loss += loss.item()
        print({'valid/loss': valid_loss})
        # Log under its own tag so it does not overwrite the training curve.
        writer.add_scalar('valid/loss', valid_loss, global_step=epoch)
finally:
    writer.close()

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


KeyboardInterrupt: 

In [None]:
# save model
# Weights exactly as they are stored on the current device.
torch.save(model.state_dict(), 'model_gpu.pth')
# state_dict() returns a mapping of tensors, which has no .to() method of its
# own, so every tensor is moved to the CPU individually before saving.
cpu_state_dict = {name: tensor.cpu() for name, tensor in model.state_dict().items()}
torch.save(cpu_state_dict, 'model_cpu.pth')