In [1]:
import os
import warnings

import librosa
import numpy as np
import pandas as pd
from pandarallel import pandarallel

In [2]:
# Set up pandarallel before any DataFrame.parallel_apply call in this notebook;
# progress_bar=True asks it to display progress while the workers run.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
# Input and output locations, relative to the notebook's working directory.
raw_folder = './raw/'
dataset_folder = './dataset/'
# Frequency band (Hz) kept by `properties`; spectrum bins outside it are zeroed.
max_freq = 280
min_freq = 40


# exist_ok=True avoids the check-then-create race and also creates any
# missing parent directories.
os.makedirs(dataset_folder, exist_ok=True)
# Every sub-directory of raw_folder is one sample set. os.listdir returns
# entries in arbitrary order, so sort to keep the processing order reproducible.
samples = sorted(
    d for d in os.listdir(raw_folder) if os.path.isdir(os.path.join(raw_folder, d))
)
n_samples = len(samples)
print(samples)

['cv-valid-dev', 'cv-valid-test', 'cv-valid-train']


In [4]:
# Output schema: eight spectral features followed by the label column.
columns = [
    "meanfreq",
    "sd",
    "median",
    "q25",
    "q75",
    "iqr",
    "skew",
    "kurt",
    "label",
]

In [10]:
def properties(y: np.ndarray, fs: int) -> list:
    """Summarise the band-limited magnitude spectrum of one audio fragment.

    The magnitude spectrum of ``y`` is restricted to the module-level band
    ``[min_freq, max_freq]``, normalised so it sums to 1, and then described
    by its mean, spread and quantiles over frequency, plus the skewness and
    kurtosis of the normalised amplitudes.

    Parameters
    ----------
    y : np.ndarray
        Real-valued signal; handled as a one-dimensional array of ``len(y)``
        samples.
    fs : int
        Sampling rate of ``y`` in samples per second.

    Returns
    -------
    list
        ``[meanfreq, sd, median, q25, q75, iqr, skew, kurt]``, or a list of
        ``len(columns) - 1`` NaNs when the band contains no energy.
    """
    spec = np.abs(np.fft.rfft(y))
    freq = np.fft.rfftfreq(len(y), d=1 / fs)

    # Mask on the real bin frequencies so the band edges are exact for any
    # signal length (no bins-per-Hz estimate needed).
    spec[(freq < min_freq) | (freq > max_freq)] = 0
    total = spec.sum()
    if total == 0:
        # np.nan, not np.NaN: the capitalised alias no longer exists in NumPy 2.
        return [np.nan for _ in range(len(columns) - 1)]

    amp = spec / total
    mean = (freq * amp).sum()
    sd = np.sqrt(np.sum(amp * ((freq - mean) ** 2)))

    amp_cumsum = np.cumsum(amp)
    last_bin = len(freq) - 1

    def quantile_freq(q):
        # Number of bins with cumulative mass <= q, which is already the
        # 0-based index of the first bin whose cumulative mass exceeds q.
        # Adding 1 would report the following bin instead.
        idx = int(np.searchsorted(amp_cumsum, q, side="right"))
        # Clamp so rounding in the cumulative sum can never index past the end.
        return freq[min(idx, last_bin)]

    median = quantile_freq(0.5)
    q25 = quantile_freq(0.25)
    q75 = quantile_freq(0.75)
    iqr = q75 - q25

    # Shape statistics are taken over the normalised amplitudes themselves.
    z = amp - amp.mean()
    w = amp.std()
    skew = ((z ** 3).sum() / (len(spec) - 1)) / w ** 3
    kurt = ((z ** 4).sum() / (len(spec) - 1)) / w ** 4
    return [mean, sd, median, q25, q75, iqr, skew, kurt]

In [11]:
def extract_data(row, wav_folder):
    """Compute spectral features for each 5-second fragment of one recording.

    Parameters
    ----------
    row : mapping with ``'filename'`` and ``'gender'`` entries
        One row of a sample description table.
    wav_folder : str
        Directory that ``row['filename']`` is joined onto.

    Returns
    -------
    list of list
        One inner list per entry of ``columns``. Position ``k`` of every inner
        list belongs to the ``k``-th accepted fragment; the last inner list
        holds the ``gender`` label. All inner lists are empty when no fragment
        is accepted.
    """
    filename = os.path.join(wav_folder, row['filename'])
    gender = row['gender']

    # Silence warnings only while loading. catch_warnings restores the previous
    # filter state on exit instead of leaving extra global filters installed.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        data, rate = librosa.load(filename, sr=None)

    # Split the audio into 5-second fragments.
    step = rate * 5
    props = [[] for _ in range(len(columns))]
    for start in range(0, len(data), step):
        fragment = data[start : start + step]
        # Skip fragments no longer than one second and fragments that are all zeros.
        if len(fragment) > rate and fragment.any():
            prop = properties(fragment, rate)
            prop.append(gender)
            for value, bucket in zip(prop, props):
                bucket.append(value)
    return props

In [12]:
# Turn every raw sample set into a per-fragment feature table on disk.
for sample in samples:
    descr_file = os.path.join(raw_folder, sample + '.csv')
    wav_folder = os.path.join(raw_folder, sample)
    out_file = os.path.join(dataset_folder, sample + '.csv')

    # Rows without a gender cannot be labelled, so drop them up front.
    df = pd.read_csv(descr_file)
    df = df.dropna(subset=['gender'])
    df = df.reset_index()

    print(sample)
    # Each row yields one list per output column (see extract_data).
    expanded = df.parallel_apply(
        extract_data,
        axis=1,
        result_type='expand',
        wav_folder=wav_folder,
    )
    myData = expanded.set_axis(columns, axis='columns')
    # Unpack the parallel per-column lists into one row per fragment, then
    # discard rows left with missing values.
    myData = myData.explode(columns, ignore_index=True)
    myData = myData.dropna()

    myData.to_csv(out_file, index=False)

cv-valid-dev


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=385), Label(value='0 / 385'))), HB…

cv-valid-test


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=386), Label(value='0 / 386'))), HB…

cv-valid-train


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=18515), Label(value='0 / 18515')))…