In [1]:
import os
import shutil
import math
from itertools import accumulate
from collections import Counter
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchaudio

## data organization

In [2]:
# hyperparameters
train_valid_data_dirs = ['../original_data/ff1010bird/wav/', '../original_data/warblrb10k_public/wav/']
test_data_dirs = ['../original_data/birdaudiodetectionchallenge_test/wav/']
train_valid_label_files = ['../original_data/ff1010bird/ff1010bird_metadata.csv', '../original_data/warblrb10k_public/warblrb10k_public_metadata.csv']
test_label_files = ['../original_data/birdaudiodetectionchallenge_test/badch_testset_blankresults.csv']
new_dir = '../data'
valid_ratio = 0.2

In [3]:
def copyfile(filename, target_dir):
    os.makedirs(target_dir, exist_ok=True)
    shutil.copy(filename, target_dir)

In [4]:
def read_labels(filename):
    label_df = pd.read_csv(filename)
    label_df['itemid'] = label_df['itemid'].astype(str)
    label_dict = dict(zip(label_df['itemid'], label_df['hasbird']))
    return label_dict

In [5]:
def organize_train_valid(data_dirs, new_dir, label_dicts, valid_ratio=0.1):
    train_dfs = []
    valid_dfs = []
    for data_dir, label_dict in zip(data_dirs, label_dicts):
        train_df, valid_df = _organize_train_valid(data_dir, new_dir, label_dict, valid_ratio)
        train_dfs.append(train_df)
        valid_dfs.append(valid_df)
    return pd.concat(train_dfs, ignore_index=True), pd.concat(valid_dfs, ignore_index=True)

def _organize_train_valid(data_dir, new_dir, label_dict, valid_ratio):
    n_valid = Counter(label_dict.values())
    train_df = pd.DataFrame(columns=['filename', 'label'])
    valid_df = pd.DataFrame(columns=['filename', 'label'])
    for label, n_sample in n_valid.items():
        n_valid[label] = max(1, math.floor(n_sample * valid_ratio))
    for filename in os.listdir(data_dir):
        label = label_dict[filename.split('.')[0]]
        if n_valid[label] > 0:
            copyfile(os.path.join(data_dir, filename),
                     os.path.join(new_dir, 'valid'))
            n_valid[label] -= 1
            valid_df = valid_df.append({'filename':filename, 'label': label}, ignore_index=True)
        else:
            copyfile(os.path.join(data_dir, filename),
                     os.path.join(new_dir, 'train'))
            train_df = train_df.append({'filename':filename, 'label': label}, ignore_index=True)
    return train_df, valid_df

In [6]:
def organize_test(data_dirs, new_dir):
    test_df = pd.DataFrame(columns=['filename'])
    for data_dir in data_dirs:
        for filename in os.listdir(data_dir):
            copyfile(os.path.join(data_dir, filename),
                 os.path.join(new_dir, 'test'))
            test_df = test_df.append({'filename':filename}, ignore_index=True)
    return test_df

In [7]:
train_valid_label_dicts = [read_labels(label_file) for label_file in train_valid_label_files]

In [8]:
train_df, valid_df = organize_train_valid(train_valid_data_dirs, new_dir, train_valid_label_dicts, valid_ratio=valid_ratio)

In [9]:
test_df = organize_test(test_data_dirs, new_dir)

In [10]:
train_df.to_csv('../data/train_labels.csv', sep=',', index=False)
valid_df.to_csv('../data/valid_labels.csv', sep=',', index=False)
test_df.to_csv('../data/test_labels.csv', sep=',', index=False)

#### test

In [11]:
train_df.head(2)

Unnamed: 0,filename,label
0,184751.wav,1
1,148879.wav,1


In [12]:
valid_df.head(2)

Unnamed: 0,filename,label
0,9226.wav,0
1,75860.wav,0


In [13]:
test_df.head(2)

Unnamed: 0,filename
0,3cfce2b5-40c2-4942-8590.wav
1,9e03a196-7692-4c30-96fb.wav


In [14]:
len(train_df), len(valid_df), len(test_df)

(12552, 3138, 8620)

In [15]:
len(train_df) + len(valid_df)

15690