In [None]:
import ast
import os
from collections import defaultdict
from multiprocessing import Pool, cpu_count
from pathlib import Path

import cv2
import librosa
import numpy as np
import soundfile as sf
import tqdm

%matplotlib inline

In [None]:
# Root of the input audio; the cells below walk it as <INPUT>/<sub-directory>/<file>.
INPUT =  "../input/train_resampled"
# Root of the generated files; the sub-directory names of INPUT are mirrored here.
OUTPUT = "../output/train_npz"
# Sampling rate (Hz) passed to librosa for loading and for the mel spectrogram.
SAMPLE_RATE = 32_000
# Mu-law parameter; audio2quantized also uses it as the number of bin edges.
MU = 256
# CPU count reported by multiprocessing; the conversion loop uses half of it.
NUM_WORKERS = cpu_count()

print(NUM_WORKERS)

In [None]:
def audio_to_spec(audio):
    """Compute a dB-scaled mel spectrogram of an audio signal.

    Parameters
    ----------
    audio : array-like
        Audio samples, handed to ``librosa.feature.melspectrogram`` as ``y``.

    Returns
    -------
    numpy.ndarray
        ``float32`` mel spectrogram (128 mel bands, 20 Hz - 16 kHz) converted
        from power to decibels.
    """
    # `y` is passed by keyword: recent librosa releases accept keyword-only
    # arguments here, and older releases accept the keyword form as well.
    mel_power = librosa.feature.melspectrogram(
        y=audio, sr=SAMPLE_RATE, fmin=20, fmax=16000, n_mels=128
    )
    return librosa.power_to_db(mel_power).astype(np.float32)

def audio2vec(path):
    """Save the dB mel spectrogram of one audio file under OUTPUT.

    ``path`` must expose ``.parent.name`` and ``.name`` (e.g. a ``pathlib.Path``);
    the array is written with ``np.save`` to
    ``<OUTPUT>/<parent directory name>/<file name>.npz``.
    """
    samples, _ = sf.read(path)
    target = f"{OUTPUT}/{path.parent.name}/{path.name}.npz"
    np.save(target, audio_to_spec(samples))
    
def mono_to_color(X, mean=None, std=None, norm_max=None, norm_min=None, eps=1e-6):
    """Convert a single-channel array into a 3-channel ``uint8`` image.

    The input is replicated along a new trailing axis, standardized, clipped to
    ``[norm_min, norm_max]`` and rescaled to ``[0, 255]``.

    Parameters
    ----------
    X : numpy.ndarray
        Input array; the result has shape ``X.shape + (3,)``.
    mean, std : float, optional
        Statistics used for standardization. When ``None`` they are computed
        from the data. An explicit ``0`` is honored.
    norm_max, norm_min : float, optional
        Clipping bounds applied to the standardized values. When ``None`` the
        data maximum / minimum are used. An explicit ``0`` is honored.
    eps : float
        Added to ``std`` to avoid division by zero, and used as the threshold
        below which the data range is treated as constant.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array; all zeros when the standardized data range is <= ``eps``.
    """
    # Stack X as [X, X, X]
    X = np.stack([X, X, X], axis=-1)

    # Standardize. `is None` (rather than `or`) keeps caller-supplied zeros.
    if mean is None:
        mean = X.mean()
    X = X - mean
    if std is None:
        std = X.std()
    Xstd = X / (std + eps)

    _min, _max = Xstd.min(), Xstd.max()
    if norm_max is None:
        norm_max = _max
    if norm_min is None:
        norm_min = _min

    if (_max - _min) > eps:
        # Normalize to [0, 255]
        V = np.clip(Xstd, norm_min, norm_max)
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        return V.astype(np.uint8)

    # Constant input: just zeros
    return np.zeros_like(Xstd, dtype=np.uint8)

def audio2pict(path):
    """Write the mel spectrogram of one audio file as a JPEG under OUTPUT.

    ``path`` must expose ``.parent.name`` and ``.name`` (e.g. a ``pathlib.Path``);
    the image goes to ``<OUTPUT>/<parent directory name>/<file name>.jpg``.
    """
    samples, _ = sf.read(path)
    image = mono_to_color(audio_to_spec(samples))
    cv2.imwrite(f"{OUTPUT}/{path.parent.name}/{path.name}.jpg", image)
    
def audio2quantized(path):
    """Mu-law compand one audio file, quantize it to ``uint8`` and save it.

    ``path`` must expose ``.parent.name`` and ``.name`` (e.g. a ``pathlib.Path``).
    The audio is loaded as mono at ``SAMPLE_RATE`` and the resulting codes are
    written with ``np.save`` to ``<OUTPUT>/<parent directory name>/<file name>.npz``.
    """
    data, _ = librosa.load(path=path, sr=SAMPLE_RATE, mono=True)
    # Mu-law companding of the waveform.
    mu_x = np.sign(data) * np.log(1 + MU * np.abs(data)) / np.log(MU + 1)
    bins = np.linspace(-1, 1, MU)
    quantized = np.digitize(mu_x, bins) - 1
    # np.digitize returns 0 for values below bins[0], so samples under -1 would
    # become -1 here and wrap around to 255 in the uint8 cast. Clamp first.
    quantized = np.clip(quantized, 0, MU - 1).astype(np.uint8)
    np.save(f"{OUTPUT}/{path.parent.name}/{path.name}.npz", quantized)

In [None]:
recs = defaultdict(list)
# Pool() rejects a size of 0, which NUM_WORKERS // 2 yields on a single-core machine.
pool_size = max(1, NUM_WORKERS // 2)
for directory in tqdm.tqdm_notebook(Path(INPUT).iterdir(), total=len(os.listdir(INPUT))):
    if directory.name == ".DS_Store":
        continue
    # Pure-Python equivalent of `mkdir -p`: no shell involved, and safe for
    # directory names containing quotes or other shell metacharacters.
    os.makedirs(f"{OUTPUT}/{directory.name}", exist_ok=True)
    file_paths = [f for f in directory.iterdir() if f.name != ".DS_Store"]
    with Pool(pool_size) as p:
        # Alternative converters; enable exactly one of the three.
        #p.map(audio2vec, file_paths)
        #p.map(audio2pict, file_paths)
        p.map(audio2quantized, file_paths)

### Check for files to ignore (empty outputs)

In [None]:
# Print every generated file whose size on disk is zero bytes.
output_dirs = Path(OUTPUT).iterdir()
for directory in tqdm.tqdm_notebook(output_dirs, total=len(os.listdir(OUTPUT))):
    if directory.name != ".DS_Store":
        file_paths = [f for f in directory.iterdir() if f.name != ".DS_Store"]
        for path in file_paths:
            if os.path.getsize(path) < 1:
                print(path)

In [None]:
# Hand-listed input files, given relative to INPUT.
paths = [
    f"{INPUT}/{relative}"
    for relative in (
        "comrav/XC246425.wav",
        "prawar/XC479026.wav",
        "snobun/XC487557.wav",
        "snobun/XC487556.wav",
        "stejay/XC503349.wav",
    )
]

In [None]:
# Run the spectrogram -> image pipeline on the first listed file only.
x, _ = sf.read(paths[0])
x_spex = audio_to_spec(x)
print(x_spex.shape)

# Written to the current working directory for a quick visual check.
cv2.imwrite("tmp.jpg", mono_to_color(x_spex))

### Audio Detection

In [None]:
from matplotlib import pyplot as plt
import torch

In [None]:
# Load one previously saved array. The ".npz.npy" suffix is presumably the result of
# np.save appending ".npy" to the "<name>.npz" targets used by the converters above.
arr = np.load("../output/train_npz/aldfly/XC134874.wav.npz.npy")

In [None]:
# Plot the raw values of the loaded array.
plt.plot(arr)

In [None]:
# Display the dimensions of the loaded array.
arr.shape

In [None]:
# Mel spectrogram of the loaded array (cast to float first). `y` is passed by
# keyword because recent librosa releases accept keyword-only arguments here.
spect = librosa.feature.melspectrogram(y=arr.astype(float), sr=SAMPLE_RATE, fmin=20, fmax=16000, n_mels=128)

In [None]:
# Plot the maximum of the spectrogram taken over axis 0, i.e. one peak value per column.
plt.plot(spect.max(0))

In [None]:
# Wrap the spectrogram in a tensor and prepend a leading axis of size 1.
x = torch.Tensor(spect).unsqueeze(0)
x.shape

In [None]:
# Max-pool along the last axis in windows of 32.
max_pool = torch.nn.MaxPool1d(kernel_size=32)
h = max_pool(x)
h.shape

In [None]:
# Plot, for each pooled column, the maximum over axis 0 of the first (only) batch item.
plt.plot(h[0].numpy().max(0))

In [None]:
# Same curve turned into a boolean mask: True where the pooled peak exceeds 0.1 * 1e7 (= 1e6).
# NOTE(review): the threshold is hard-coded; consider a named constant in the config cell.
plt.plot(h[0].numpy().max(0) > 0.1*1e7)

### noise analysis

In [None]:
from matplotlib import pyplot as plt
import IPython

In [None]:
# Collect the files of the first INPUT entry that is not ".DS_Store", then stop.
recs = defaultdict(list)
input_dirs = Path(INPUT).iterdir()
for directory in tqdm.tqdm_notebook(input_dirs, total=len(os.listdir(INPUT))):
    if directory.name != ".DS_Store":
        file_paths = [f for f in directory.iterdir() if f.name != ".DS_Store"]
        break

In [None]:
# The pick from `file_paths` is immediately overridden by the fixed path below.
path = file_paths[4]
path = "../input/train_resampled/ameavo/XC304534.wav"
print(path)
data, sr = librosa.load(path=path, sr=SAMPLE_RATE, mono=True)

# Waveform: first 160000 samples, drawn as unconnected pixel markers.
plt.plot(data[:160000], ',', linestyle="None")
plt.show()

# Spectrogram rendered as an image.
x_spex = audio_to_spec(data)
pct = mono_to_color(x_spex)
plt.imshow(pct)
plt.show()

# Audio player for listening to the recording.
IPython.display.Audio(data=data, rate=sr)

In [None]:
N = len(data)
dt = 32000

F = np.fft.fft(data)
F_abs = np.abs(F)
F_abs_amp = F_abs / N * 2

fq = np.linspace(0, 1.0/dt, N)

plt.xlabel('freqency(Hz)', fontsize=14)
plt.ylabel('amplitude', fontsize=14)
plt.plot(fq, F_abs_amp)

In [None]:
fc = 1e-5 # カットオフ（周波数）
F[(fq > fc)] = 0

#ac = 0.00002 # 振幅強度の閾値
#F[(F_abs_amp < ac)] = 0

F_abs = np.abs(F)
F_abs_amp = F_abs / N * 2

plt.xlabel('freqency(Hz)', fontsize=14)
plt.ylabel('amplitude', fontsize=14)
plt.plot(fq, F_abs_amp)

In [None]:
# Back to the time domain; keep the real part, doubled.
F2_ifft = np.fft.ifft(F)
F2_ifft_real = 2 * np.real(F2_ifft)

# Audio player for the filtered signal.
IPython.display.Audio(data=F2_ifft_real, rate=sr)

### More Information

In [None]:
import pandas as pd
# Load the training metadata table and preview its first three rows.
train = pd.read_csv("../input/train.csv")
train.head(3)

In [None]:
# `secondary_labels` holds a stringified list. It comes from a CSV file, so it is
# parsed with ast.literal_eval (Python literals only) rather than eval, which
# would execute arbitrary expressions found in the data.
train["multi_label"] = train.apply(
    lambda x: [x["primary_label"]] + ast.literal_eval(x["secondary_labels"]), axis=1
)

# Map each primary label to the ebird_code of the group it appears in.
primary_label2ebird_code = {
    df["primary_label"].unique()[0]: ebird_code
    for ebird_code, df in train[["ebird_code", "primary_label"]].groupby("ebird_code")
}

# Translate every label to its ebird_code; labels without a known code are dropped.
lst = [
    [primary_label2ebird_code[lab] for lab in multi_label if lab in primary_label2ebird_code]
    for multi_label in train["multi_label"]
]
train["multi_ebird_code"] = lst

In [None]:
def type2label(t):
    """Encode a free-text type description as ``[has_call, has_song]``.

    Matching is a case-insensitive substring test; each flag is ``1`` when the
    keyword occurs in ``t`` and ``0`` otherwise.
    """
    lowered = t.lower()
    return [int(keyword in lowered) for keyword in ("call", "song")]

# Add a [has_call, has_song] flag pair for every row, derived from the "type" column.
train["type_label"] = train["type"].map(type2label)

In [None]:
# Spot-check ten random rows of the two derived columns (no seed set, so rows vary per run).
train[["multi_ebird_code", "type_label"]].sample(10)