In [1]:
import warnings
from pathlib import Path
from joblib import delayed, Parallel

import librosa
import audioread
import soundfile as sf

import pandas as pd

In [2]:
# Sampling rate (Hz) that every clip is converted to.
TARGET_SR = 32_000
# Number of concurrent workers handed to joblib.Parallel (n_jobs).
NUM_THREAD = 8

In [3]:
# Original data
TRAIN_AUDIO_DIR = Path("../../../../input//birdsong-recognition/train_audio/")
TRAIN_RESAMPLED_DIR = Path("../../../../input/birdsong-recognition/train_audio_resampled/")

# Read train.csv
train = pd.read_csv("../../../../input/birdsong-recognition/train.csv")

# Extract [ebird_code, filename] pairs, one per row of train.csv
train_audio_infos = train[["ebird_code", "filename"]].values.tolist()

# Create one output directory per ebird_code.
# exist_ok=True keeps this cell re-runnable: without it, a second run
# (or Restart & Run All after a previous run) raises FileExistsError.
TRAIN_RESAMPLED_DIR.mkdir(parents=True, exist_ok=True)
for ebird_code in train.ebird_code.unique():
    ebird_dir = TRAIN_RESAMPLED_DIR / ebird_code
    ebird_dir.mkdir(exist_ok=True)

# Additional data
TRAIN_AUDIO_ADD_DIR = Path("../../../../input/additional_data/train_audio/")
TRAIN_RESAMPLED_ADD_DIR = Path("../../../../input/additional_data/train_audio_resampled/")

# Read train_add.csv
train_add = pd.read_csv("../../../../input/additional_data/train_add.csv")

# Extract [ebird_code, filename] pairs, one per row of train_add.csv
train_audio_infos_add = train_add[["ebird_code", "filename"]].values.tolist()

# Create one output directory per ebird_code (re-runnable, see above)
TRAIN_RESAMPLED_ADD_DIR.mkdir(parents=True, exist_ok=True)
for ebird_code in train_add.ebird_code.unique():
    ebird_dir = TRAIN_RESAMPLED_ADD_DIR / ebird_code
    ebird_dir.mkdir(exist_ok=True)

In [5]:
# Define the resampling function
warnings.simplefilter("ignore")
def resample(ebird_code: str, filename: str, target_sr: int, audio_dir: Path, resample_dir: Path) -> str:
    """Load one audio file, resample it to mono at ``target_sr`` and save it as WAV.

    The source is read from ``audio_dir / ebird_code / filename`` and the result
    is written to ``resample_dir / ebird_code / <filename with ".mp3" -> ".wav">``.
    The destination sub-directory is not created here; it must already exist.

    Parameters
    ----------
    ebird_code : str
        Sub-directory name shared by the source and destination trees.
    filename : str
        Source file name inside ``audio_dir / ebird_code``.
    target_sr : int
        Sampling rate passed to ``librosa.load`` and ``sf.write``.
    audio_dir : Path
        Root of the source audio tree (a ``str`` is accepted and converted).
    resample_dir : Path
        Root of the output tree (a ``str`` is accepted and converted).

    Returns
    -------
    str
        ``"OK"`` on success, otherwise the message of the exception that was
        raised. On failure the *source* file path is appended to
        ``resample_dir / "skipped.txt"``.
    """
    # The directories are combined with "/", so plain strings must be coerced.
    audio_dir = Path(audio_dir)
    resample_dir = Path(resample_dir)

    # Keep source and destination in separate names: reusing `filename` for the
    # ".wav" name made the error log record a non-existent ".wav" source path
    # whenever the write step (rather than the load step) failed.
    src_path = audio_dir / ebird_code / filename
    dst_path = resample_dir / ebird_code / filename.replace(".mp3", ".wav")

    try:
        y, _ = librosa.load(
            src_path,
            sr=target_sr, mono=True, res_type="kaiser_fast")
        sf.write(dst_path, y, samplerate=target_sr)
        return "OK"
    except Exception as e:
        # Best effort: record the failed source file and report the error
        # message to the caller instead of aborting the whole batch.
        with open(resample_dir / "skipped.txt", "a") as f:
            f.write(str(src_path) + "\n")
        return str(e)

## Resample the original data

In [9]:
# Resample every original recording in parallel and write the WAV files.
# Each task returns "OK" or the error message for that file.
original_jobs = (
    delayed(resample)(code, name, TARGET_SR, TRAIN_AUDIO_DIR, TRAIN_RESAMPLED_DIR)
    for code, name in train_audio_infos
)
msg_list = Parallel(n_jobs=NUM_THREAD, verbose=1)(original_jobs)

# Record the resampled-audio metadata alongside the original train.csv columns.
train["resampled_sampling_rate"] = TARGET_SR
train["resampled_filename"] = train["filename"].str.replace(".mp3", ".wav", regex=False)
train["resampled_channels"] = "1 (mono)"

# Save the augmented table next to the resampled audio.
train_mod_path = TRAIN_RESAMPLED_DIR / "train_mod.csv"
train.to_csv(train_mod_path, index=False)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    6.4s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   23.5s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   51.6s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:  1.4min
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:  2.3min
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:  3.4min
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:  4.6min
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed:  6.0min
[Parallel(n_jobs=8)]: Done 4034 tasks      | elapsed:  7.6min
[Parallel(n_jobs=8)]: Done 4984 tasks      | elapsed:  9.6min
[Parallel(n_jobs=8)]: Done 6034 tasks      | elapsed: 11.5min
[Parallel(n_jobs=8)]: Done 7184 tasks      | elapsed: 13.5min
[Parallel(n_jobs=8)]: Done 8434 tasks      | elapsed: 15.8min
[Parallel(n_jobs=8)]: Done 9784 tasks      | elapsed: 18.2min
[Parallel(n_jobs=8)]: Done 11234 tasks      | elapsed: 20.9min

## Resample the additional data

In [6]:
# Resample every additional recording in parallel and write the WAV files.
# Each task returns "OK" or the error message for that file.
additional_jobs = (
    delayed(resample)(code, name, TARGET_SR, TRAIN_AUDIO_ADD_DIR, TRAIN_RESAMPLED_ADD_DIR)
    for code, name in train_audio_infos_add
)
msg_list = Parallel(n_jobs=NUM_THREAD, verbose=1)(additional_jobs)

# Record the resampled-audio metadata alongside the original train_add.csv columns.
train_add["resampled_sampling_rate"] = TARGET_SR
train_add["resampled_filename"] = train_add["filename"].str.replace(".mp3", ".wav", regex=False)
train_add["resampled_channels"] = "1 (mono)"

# Save the augmented table next to the resampled audio.
train_add_mod_path = TRAIN_RESAMPLED_ADD_DIR / "train_add_mod.csv"
train_add.to_csv(train_add_mod_path, index=False)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    7.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   23.9s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:  1.0min
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:  1.9min
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:  3.3min
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:  4.6min
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:  6.0min
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed:  7.7min
[Parallel(n_jobs=8)]: Done 4034 tasks      | elapsed:  9.9min
[Parallel(n_jobs=8)]: Done 4984 tasks      | elapsed: 12.2min
[Parallel(n_jobs=8)]: Done 6034 tasks      | elapsed: 15.1min
[Parallel(n_jobs=8)]: Done 7184 tasks      | elapsed: 17.7min
[Parallel(n_jobs=8)]: Done 8434 tasks      | elapsed: 21.9min
[Parallel(n_jobs=8)]: Done 9784 tasks      | elapsed: 25.4min
[Parallel(n_jobs=8)]: Done 11234 tasks      | elapsed: 29.5min