In [None]:
import numpy as np

In [105]:
def split_signal_overlapping_windows(signal, SR=32000, window_len=10, window_hop=7):
    """ Automatically pad and split signal into overlapping windows of size window_len.
    The overlap is determined by the formula `window_len - window_hop`
    """
    signal_original_len = signal.shape[1]  # assuming 1st dim is channel
    
    pad_size = window_hop * SR - (signal_original_len - window_len * SR) % (window_hop * SR)
    signal = np.pad(signal, ((0, 0), (0, pad_size)))
    window_count = (signal.shape[1] - window_len * SR) / (window_hop * SR) + 1
    
    windows_idx = np.arange(SR * window_len, dtype=int).reshape(1, -1) + window_hop * SR * np.arange(window_count, dtype=int).reshape(-1, 1)
    signal = np.squeeze(signal[:, windows_idx])
    
    return np.split(signal, window_count, axis=0)

In [111]:
split_signal_overlapping_windows(np.ones((1, 4 * SR)))

[array([1., 1., 1., ..., 0., 0., 0.])]

In [None]:
def iterator_from_data():
    dpath = data_point.path
    data, _ = torchaudio.load(dpath)
    name = data_point.name
    
    for subsample in enumerate(split_signal_overlapping_windows(signal), 1):
        subsample, f"{raw_name}_{i}{ext}", data_point.label 
    
    if data.shape[1] <= sampling_rate * n_sec:
        raw_name, ext = os.path.splitext(name)
        yield data, f"{raw_name}_1{ext}", data_point.label
        
        return
    
    for i, chunk in enumerate(torch.split(data, n_sec * sampling_rate, dim=1), 1):
        raw_name, ext = os.path.splitext(name)
        yield chunk, f"{raw_name}_{i}{ext}", data_point.label  

In [2]:
import pandas as pd
import glob
import os

In [None]:
def extract_original_filename(file_name):
    parts = file_name.split("@")
    if len(parts) == 0:
        return file_name
    
    return parts[0]

In [3]:
df_source = pd.read_csv('/root/data/gdsc_data/data_source/train/metadata.csv')
df_transformed =  pd.read_csv('/root/data/gdsc_data/data_processed/data_8c86715/01_apply_ir_function/train/metadata.csv')

In [140]:
df_transformed["original_filename"] = df_transformed["file_name"].map(extract_original_filename)

In [19]:
df_grouped = df_transformed.groupBy("original_filename").aggregate(
    {
        "file_name": lambda x: ", ".join(x),
        "label": min
    }
).reset_index()

In [6]:
df_grouped.to_csv('~/temp.csv', index=False)

Unnamed: 0,file_name,label,original_filename
0,Roeselianaroeselii_XC751814-dat028-019_edit1.wav,56,Roeselianaroeselii_XC751814-dat028-019_edit1.wav
1,Roeselianaroeselii_XC752367-dat006-010.wav,56,Roeselianaroeselii_XC752367-dat006-010.wav
2,Yoyettacelis_GBIF2465208563_IN36000894_50988.wav,64,Yoyettacelis_GBIF2465208563_IN36000894_50988.wav
3,Gomphocerippusrufus_XC752285-dat001-045.wav,26,Gomphocerippusrufus_XC752285-dat001-045.wav
4,Phaneropteranana_XC755717-221013-Phaneroptera-...,41,Phaneropteranana_XC755717-221013-Phaneroptera-...
...,...,...,...
1747,Conocephalusdorsalis_XC752036-dat004-002.wav,16,Conocephalusdorsalis_XC752036-dat004-002.wav
1748,Pseudochorthippusmontanus_XC752605-dat012-003_...,54,Pseudochorthippusmontanus_XC752605-dat012-003_...
1749,Pholidopteralittoralis_XC752439-dat049-001_edi...,44,Pholidopteralittoralis_XC752439-dat049-001_edi...
1750,Gomphocerussibiricus_XC751796-dat187-005_edit2...,27,Gomphocerussibiricus_XC751796-dat187-005_edit2...


In [143]:
glob.glob('/root/data/gdsc_data/data_processed/data_8c86715/01_apply_ir_function/train/Roeselianaroeselii_XC751814-dat028-019_edit1*.flac')

['/root/data/gdsc_data/data_processed/data_8c86715/01_apply_ir_function/train/Roeselianaroeselii_XC751814-dat028-019_edit1@conv_Plate 250_xcg1V2.flac',
 '/root/data/gdsc_data/data_processed/data_8c86715/01_apply_ir_function/train/Roeselianaroeselii_XC751814-dat028-019_edit1@conv_Ocean_sog1v2.flac',
 '/root/data/gdsc_data/data_processed/data_8c86715/01_apply_ir_function/train/Roeselianaroeselii_XC751814-dat028-019_edit1@conv_Waves Hallway_mcg1v2.flac',
 '/root/data/gdsc_data/data_processed/data_8c86715/01_apply_ir_function/train/Roeselianaroeselii_XC751814-dat028-019_edit1@conv_TLV - C Room_SBg1v2.flac',
 '/root/data/gdsc_data/data_processed/data_8c86715/01_apply_ir_function/train/Roeselianaroeselii_XC751814-dat028-019_edit1@conv_SOS Studio_xcg1v2.flac',
 '/root/data/gdsc_data/data_processed/data_8c86715/01_apply_ir_function/train/Roeselianaroeselii_XC751814-dat028-019_edit1@conv_Cello Studio 1_MBg2v2.flac',
 '/root/data/gdsc_data/data_processed/data_8c86715/01_apply_ir_function/train/R