# Description

## Setup

In [2]:
# Need newer torch to use all augmentation functions
%pip install torch==2.0.* torchaudio

Collecting torch==2.0.*
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.*)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl (11.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Collecting nvidia-cufft-cu11==10.9.0.58 (from torch==2.0.*)
  Downloading nvidia_cufft_cu11-10.9.0.58-py3-none-manylinux1_x86_64.whl (168.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.4/168.4 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting nvidia-curand-cu11==10.2.10.91 (from torch==2.0.*)
  Downloading nvidia_curand_cu11-10.2.10.91-py3-none-manylinux1_x86_64.whl (54.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Reload to get installed packages

In [None]:
import os
# Terminate the kernel process immediately (os._exit skips normal interpreter
# cleanup) so that the restarted kernel imports the freshly installed packages.
os._exit(00)

## Imports

In [1]:
import glob
import os
import multiprocessing
import random

from itertools import repeat, product

from collections import namedtuple
from csv import writer

import torch
import torchaudio
import pandas as pd
import tqdm

import tqdm.contrib.itertools as tqdm_itertools
from IPython.display import Audio
from datasets import Audio as DSAudio, load_dataset

tqdm_chain = tqdm_itertools.itertools.chain

### Constants and parameters

In [2]:
# Sampling rate (Hz) that the resamplers in this notebook take as their input rate.
SOURCE_SAMPLING_RATE = 44100
# Sampling rate (Hz) that audio is resampled to and saved with.
MODEL_SAMPLING_RATE = 32000

# Sub-folder name (under the processed-data base directory) that receives this run's output.
DATA_TAG = 'data_8c86715'

## Hyperparameters

## Base entities

### Datapoint struct

In [3]:
# One source recording: full path on disk, label read from metadata.csv,
# and the file name exactly as it is listed in metadata.csv.
DataPoint = namedtuple('DataPoint', ['path', 'label', 'name'])

### Paths

In [4]:
# Everything is resolved relative to the directory two levels above the current
# working directory, so the notebook must be started from its own folder.
base_dir = os.path.abspath(os.path.join(os.getcwd(), '../../'))
# Raw input data and processed output data live side by side under gdsc_data/.
data_source_base = os.path.join(base_dir, 'gdsc_data/data_source')
data_processed_base = os.path.join(base_dir, 'gdsc_data/data_processed')

In [5]:
# Source folders of the train and test splits (trailing slash kept on purpose,
# some cells below build paths by string concatenation).
train_data_source_path = os.path.join(data_source_base, 'train/')
test_data_source_path = os.path.join(data_source_base, 'test/')

In [6]:
# Root folder that is searched recursively for *.flac room impulse responses (RIRs).
rir_library_path = os.path.join(base_dir, 'audio_data/rir_library_flac_sampled/')

### Helper functions

In [7]:
def resample_source_wav(data: DataPoint, resampler):
    """Load the audio file at ``data.path`` and return it passed through ``resampler``.

    The sample rate reported by the file is discarded, so ``resampler`` is
    applied unchanged whatever rate the file actually has.
    """
    waveform, _ = torchaudio.load(data.path)
    resampled = resampler(waveform)
    return resampled


def iter_datapoint_from(source_data_path):
    """Yield one ``DataPoint`` per row of ``metadata.csv`` found in ``source_data_path``.

    Every row is unpacked as ``(file name, label)``, so the CSV has to contain
    exactly two columns in that order. The file name is joined onto
    ``source_data_path`` to form the data point's ``path``.
    """
    table = pd.read_csv(os.path.join(source_data_path, 'metadata.csv'))
    for row in table.itertuples(index=False):
        fname, label = row
        yield DataPoint(
            path=os.path.join(source_data_path, fname),
            label=label,
            name=fname,
        )
        
        
def rir_path_to_name_path(rir_path):
    """Return ``(name, rir_path)`` where ``name`` is the file name without its last extension."""
    file_name = os.path.basename(rir_path)
    stem = os.path.splitext(file_name)[0]
    return stem, rir_path
            

## Worker's init and main functions

In [8]:
def worker_init(worker_exec, source_dir, processed_dir):
    """Pool initializer: attach per-process state to ``worker_exec`` as attributes.

    Args:
        worker_exec: Object (here the worker function itself) that receives the
            attributes ``torch``, ``torchaudio``, ``source_dir``,
            ``processed_dir`` and ``resampler``.
        source_dir: Root folder of the source data.
        processed_dir: Root folder the processed files are written below.
    """
    import torch
    import torchaudio

    # One torch thread per worker process
    # (presumably to avoid oversubscription when one worker runs per CPU).
    torch.set_num_threads(1)

    state = {
        'torch': torch,
        'torchaudio': torchaudio,
        'source_dir': source_dir,
        'processed_dir': processed_dir,
        'resampler': torchaudio.transforms.Resample(SOURCE_SAMPLING_RATE, MODEL_SAMPLING_RATE),
    }
    for attr_name, attr_value in state.items():
        setattr(worker_exec, attr_name, attr_value)
    
    
def worker_main(rir_name, rir_path, data_point):
    """Convolve one source recording with one room impulse response and save it as FLAC.

    Reads the attributes that ``worker_init`` attaches to this function in each
    worker process: ``torch``, ``torchaudio``, ``processed_dir``, ``source_dir``
    and ``resampler``.

    Args:
        rir_name: Name of the impulse response; embedded in the output file name.
        rir_path: Path of the impulse-response audio file.
        data_point: ``DataPoint`` describing the source recording.

    Returns:
        Tuple ``(dest_fpath, dest_name, label)``: full path of the written file,
        its bare file name, and the label copied from ``data_point``.
    """
    # Load the libraries and settings initialized once per process
    torch = worker_main.torch
    torchaudio = worker_main.torchaudio
    processed_dir = worker_main.processed_dir
    source_dir = worker_main.source_dir
    resampler = worker_main.resampler

    # Convert the source recording to the model sampling rate
    wav_resampled = resample_source_wav(data_point, resampler)

    # Get the room IR. The convolution is only meaningful when both signals share
    # one sampling rate, so bring the IR to the model rate if it is stored at another.
    rir, rir_rate = torchaudio.load(rir_path)
    if rir_rate != MODEL_SAMPLING_RATE:
        rir = torchaudio.functional.resample(rir, rir_rate, MODEL_SAMPLING_RATE)

    # Normalize the IR to unit L2 norm, then apply it
    rir = rir / torch.norm(rir, p=2)
    wav_resampled = torchaudio.functional.fftconvolve(wav_resampled, rir)

    # Prepare the output location
    source_name, _ = os.path.splitext(data_point.name)
    dest_name = f'{source_name}@conv_{rir_name}.flac'
    # Mirror the source file's sub-folder (e.g. train/) below processed_dir
    dest_relpath = os.path.dirname(os.path.relpath(data_point.path, source_dir))
    dest_fpath = os.path.join(processed_dir, dest_relpath, dest_name)

    os.makedirs(os.path.dirname(dest_fpath), exist_ok=True)

    torchaudio.save(
        dest_fpath,
        wav_resampled,
        sample_rate=MODEL_SAMPLING_RATE,
        format="flac",
        bits_per_sample=24)

    return dest_fpath, dest_name, data_point.label


def worker_starmap(args):
    """Unpack one ``(rir_name, rir_path, data_point)`` job tuple and run ``worker_main`` on it.

    Needed because ``Pool.imap_unordered`` passes each job as a single argument.
    """
    return worker_main(*args)

## Scheduler

In [9]:
# Number of impulse responses every source recording is convolved with
RIR_SAMPLE_SIZE = 20
# Seed for picking the impulse responses, so a re-run selects the same ones
RIR_SAMPLE_SEED = 0

# (name, path) of every impulse response in the library. Sorted because glob
# returns files in arbitrary order, which would defeat the seed.
rir_library = sorted(map(rir_path_to_name_path, glob.glob(os.path.join(rir_library_path, '**/*.flac'), recursive=True)))
# Materialized once, so the job count below needs no second pass over metadata.csv
source_library = list(iter_datapoint_from(train_data_source_path))

if len(rir_library) < RIR_SAMPLE_SIZE:
    raise ValueError(
        f"Found only {len(rir_library)} impulse responses in '{rir_library_path}', "
        f"need at least {RIR_SAMPLE_SIZE}"
    )

rir_sample = random.Random(RIR_SAMPLE_SEED).sample(rir_library, RIR_SAMPLE_SIZE)
all_combinations = product(rir_sample, source_library)

# Every sampled impulse response is paired with every source recording
source_library_length = len(source_library)
all_combinations_length = len(rir_sample) * source_library_length

# Flatten ((rir_name, rir_path), data_point) into the argument tuple worker_starmap expects
work_generator = ((rir_name, rir_path, dp) for (rir_name, rir_path), dp in all_combinations)

In [10]:
# Number of (impulse response, recording) jobs that will be scheduled; this counts
# only the RIR_SAMPLE_SIZE sampled impulse responses, not the whole library.
print(f"Total possible rir+sound combinations {all_combinations_length}")

Total possible rir+sound combinations 35040


In [26]:
# Output folder of this processing step.
# os.stat raises FileNotFoundError when the folder does not exist, so this cell
# only succeeds after the folder has been created.
# NOTE(review): the previous note here said "if raises exception, make sure you
# delete the folder with all the data first", which does not match os.stat's
# behavior — confirm which check is intended.
dest_path = os.path.join(data_processed_base, DATA_TAG, '01_apply_ir_function/')
os.stat(dest_path)

os.stat_result(st_mode=16877, st_ino=11323095158871739323, st_dev=58, st_nlink=3, st_uid=0, st_gid=0, st_size=6144, st_atime=1689006410, st_mtime=1689006410, st_ctime=1689006410)

In [23]:
# Show the input and output folders before the long-running job is started.
print(f"Reading files from '{train_data_source_path}'")
print(f"Going to write into '{dest_path}'")

Reading files from '/root/data/gdsc_data/data_source/train/'
Going to write into '/root/data/gdsc_data/data_processed/data_8c86715/00_apply_ir_function/'


In [24]:
n_cpu = multiprocessing.cpu_count()

rows = []
# The context manager terminates the worker processes when the block is left,
# including when a job raises; `del pool` alone only dropped the name.
with multiprocessing.Pool(
    processes=n_cpu,
    initializer=worker_init,
    initargs=(worker_main, data_source_base, dest_path),
) as pool:
    results = pool.imap_unordered(worker_starmap, work_generator)
    for _, dest_fname, label in tqdm.tqdm(results, total=all_combinations_length):
        rows.append((dest_fname, label))

# NOTE(review): worker_main writes each file below a sub-folder mirrored from the
# source tree, while this metadata.csv sits directly in dest_path and lists bare
# file names — confirm that consumers resolve the files correctly.
df = pd.DataFrame(rows, columns=['file_name', 'label'])
df.to_csv(os.path.join(dest_path, 'metadata.csv'), index=False)

100%|██████████| 35040/35040 [1:09:00<00:00,  8.46it/s]


In [None]:
## Restructure the dataset: map each original file to all of its available "variants", stored as a comma-separated list

In [27]:
# Reload the metadata from disk (presumably so the cells below can run without
# repeating the long processing cell).
df = pd.read_csv(os.path.join(dest_path, 'metadata.csv'))

In [28]:
def extract_original_filename(file_name):
    """Map an augmented file name back to the name of its source recording.

    Augmented names look like ``<source stem>@conv_<rir name>.flac``; everything
    before the first ``@`` plus ``.wav`` is returned. A name without ``@`` is
    not an augmented name and is returned unchanged.
    """
    stem, separator, _ = file_name.partition("@")
    # str.split never returns an empty list, so the former `len(parts) == 0`
    # test could not detect a missing "@"; check for the separator itself.
    if not separator:
        return file_name

    return f"{stem}.wav"

In [29]:
# For every augmented file, derive the name of the recording it was generated from.
df["original_filename"] = df["file_name"].map(extract_original_filename)

In [30]:
# One row per original recording: all of its augmented file names joined with
# ", " and the smallest label of the group.
# The column keeps the name "file_name": the former trailing
# `.rename({"file_name": "file_list"})` had no `columns=` argument, so it
# addressed index labels and never renamed anything, and the cells below as well
# as the saved CSV rely on "file_name".
df_grouped = (
    df.groupby("original_filename")
    .aggregate(
        {
            "file_name": lambda x: ", ".join(x),
            # String alias instead of the builtin `min`, which newer pandas
            # versions warn about when used as an aggregator
            "label": "min",
        }
    )
    .reset_index()
)

In [31]:
# Persist the grouped view next to metadata.csv.
df_grouped.to_csv(os.path.join(dest_path, 'metadata_grouped_train.csv'), index=False)

In [117]:
# Sanity check: split the joined list back into file names and show the entry with index label 0.
df_grouped["file_name"].map(lambda x: x.split(", "))[0]

['Achetadomesticus_XC489192-Achetadomesticus_poland_psz_20140510_22.00h_3498_edit1@conv_Barrymore_xcg1v2.flac',
 'Achetadomesticus_XC489192-Achetadomesticus_poland_psz_20140510_22.00h_3498_edit1@conv_Wells Fargo Pavilion_scg1v2.flac',
 'Achetadomesticus_XC489192-Achetadomesticus_poland_psz_20140510_22.00h_3498_edit1@conv_Fire Station_MQg1v2.flac',
 'Achetadomesticus_XC489192-Achetadomesticus_poland_psz_20140510_22.00h_3498_edit1@conv_Apollo Theatre_sog2v2.flac',
 'Achetadomesticus_XC489192-Achetadomesticus_poland_psz_20140510_22.00h_3498_edit1@conv_Rishon Hall_xcg1v2.flac',
 'Achetadomesticus_XC489192-Achetadomesticus_poland_psz_20140510_22.00h_3498_edit1@conv_Lyceum Theater_MBg2v2.flac',
 'Achetadomesticus_XC489192-Achetadomesticus_poland_psz_20140510_22.00h_3498_edit1@conv_Alexandrinsky_mcg1v2.flac',
 'Achetadomesticus_XC489192-Achetadomesticus_poland_psz_20140510_22.00h_3498_edit1@conv_Leas Cliff Hall_scg1v2.flac',
 'Achetadomesticus_XC489192-Achetadomesticus_poland_psz_20140510_22.

## Merge validation with train
We need more data for training, so we'll opt for a tiny validation dataset.
The downsized validation dataset will then be convolved with all the impulse responses (IRs) again to make it bigger.

We'll only take one sample per species for the validation dataset.

In [None]:
# Source folder of the validation split.
val_data_source_path = os.path.join(data_source_base, 'val/')

In [114]:
# Display the grouped frame.
df_grouped

Unnamed: 0,original_filename,file_name,label
0,Achetadomesticus_XC489192-Achetadomesticus_pol...,Achetadomesticus_XC489192-Achetadomesticus_pol...,0
1,Achetadomesticus_XC489192-Achetadomesticus_pol...,Achetadomesticus_XC489192-Achetadomesticus_pol...,0
2,Achetadomesticus_XC489193-Achetadomesticus_pol...,Achetadomesticus_XC489193-Achetadomesticus_pol...,0
3,Achetadomesticus_XC751735-dat001-056.wav,Achetadomesticus_XC751735-dat001-056@conv_Barr...,0
4,Achetadomesticus_XC751738-dat004-038.wav,Achetadomesticus_XC751738-dat004-038@conv_Barr...,0
...,...,...,...
1747,Yoyettarepetens_GBIF2901727728_IN62757977_1431...,Yoyettarepetens_GBIF2901727728_IN62757977_1431...,65
1748,Yoyettarepetens_GBIF2980855819_IN63945649_1462...,Yoyettarepetens_GBIF2980855819_IN63945649_1462...,65
1749,Yoyettarepetens_GBIF2984406883_IN64632242_1480...,Yoyettarepetens_GBIF2984406883_IN64632242_1480...,65
1750,Yoyettarepetens_GBIF2984556076_IN64757137_1483...,Yoyettarepetens_GBIF2984556076_IN64757137_1483...,65


## Legacy processing (TO BE REMOVED)

In [15]:
def split_nsec_chunks(data_point: DataPoint, sampling_rate, n_sec=10):
    """Yield ``(chunk, chunk_name, label)`` pieces of at most ``n_sec`` seconds.

    The audio at ``data_point.path`` is cut along dimension 1 into pieces of
    ``n_sec * sampling_rate`` samples; the final piece may be shorter. A
    recording that is not longer than one piece is yielded whole. Chunk names
    are ``<stem>_<i><ext>`` with ``i`` counted from 1.
    """
    waveform, _ = torchaudio.load(data_point.path)
    stem, extension = os.path.splitext(data_point.name)
    chunk_len = n_sec * sampling_rate

    if waveform.shape[1] <= chunk_len:
        pieces = [waveform]
    else:
        pieces = torch.split(waveform, chunk_len, dim=1)

    for index, piece in enumerate(pieces, 1):
        yield piece, f"{stem}_{index}{extension}", data_point.label
        
        
# Shared resampler from the source rate to the model rate; default for resample_wavs
resampler = torchaudio.transforms.Resample(SOURCE_SAMPLING_RATE, MODEL_SAMPLING_RATE)
def resample_wavs(data, fname, processed_dir, sr=MODEL_SAMPLING_RATE, sampler=resampler):
    """Resample ``data`` with ``sampler`` and save it as ``processed_dir/fname``.

    Args:
        data: Waveform to resample.
        fname: File name of the output inside ``processed_dir``.
        processed_dir: Folder the file is written to.
        sr: Sample rate recorded in the saved file; should be the output rate
            of ``sampler``.
        sampler: Callable that resamples the waveform.
    """
    # Honor the arguments; the body used to read the module-level `resampler`
    # and MODEL_SAMPLING_RATE regardless of what the caller passed.
    resampled_wav = sampler(data)
    torchaudio.save(os.path.join(processed_dir, fname), resampled_wav, sample_rate=sr)

        
def make_data_point(data_path):
    """Yield one ``DataPoint`` per row of ``metadata.csv`` in ``data_path``.

    Assumes that metadata.csv lies along with the .wav files. Kept as a thin
    alias of ``iter_datapoint_from`` so the two cannot drift apart.
    """
    yield from iter_datapoint_from(data_path)


### Create new directories

In [10]:
# NOTE(review): `data_processed` is not assigned in any cell visible in this
# notebook — confirm where it comes from before running this cell.
for split_dir in ('train/', 'test/', 'val/'):
    os.makedirs(os.path.join(data_processed, split_dir), exist_ok=True)

## Split files and append to metadata file

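`tqdm` appeared to be buggy: the progress bar works on the first run but shows nothing on a second iteration unless the variable names change. The likely cause is that the wrapped iterable is a generator, which is empty once it has been consumed (for example by counting its items to compute `total`). The `total`-based variant is therefore commented out.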

In [11]:
def preprocess_files(store_at, source_metadata_path):
    '''
    Split every recording listed in ``source_metadata_path``'s metadata.csv into
    chunks, resample and save each chunk in ``store_at``, and (re)create
    ``store_at``'s metadata.csv with one ``file_name, label`` row per chunk.

    Need to reconsider how the test dataset gets handled.
    '''
    metadata_path = os.path.join(store_at, 'metadata.csv')

    # split_nsec_chunks returns one generator per recording; flatten them into a
    # single stream of (data, fname, label). Iterating over map(...) directly
    # would try to unpack the generators themselves.
    chunks = (
        chunk
        for data_point in make_data_point(source_metadata_path)
        for chunk in split_nsec_chunks(data_point, SOURCE_SAMPLING_RATE)
    )

    # (Re)create the CSV file with headers and keep it open for all rows.
    # newline='' is what the csv module expects of files handed to a writer.
    with open(metadata_path, 'w', newline='') as f_object:
        writer_object = writer(f_object)
        writer_object.writerow(['file_name', 'label'])

        # No `total`: counting the chunks up front would consume the generator
        for data, fname, label in tqdm.tqdm(chunks):
            # Resample and save .wav
            resample_wavs(data, fname, store_at)

            # Write to CSV
            writer_object.writerow([fname, label])

In [12]:
# Chunk and resample the train and validation splits.
# NOTE(review): `data_processed` is not assigned in any cell visible in this
# notebook — confirm where it comes from before running this cell.
preprocess_files(store_at = os.path.join(data_processed, 'train/'), source_metadata_path = train_data_source_path)
preprocess_files(store_at = os.path.join(data_processed, 'val/'), source_metadata_path = val_data_source_path)

----------------------------
processing up to 10 files in /root/data/data_processed/train/
----------------------------


6479it [08:18, 12.98it/s]


----------------------------
processing up to 10 files in /root/data/data_processed/val/
----------------------------


2018it [02:31, 13.36it/s]


## Split the test dataset
Could be added to other methods, but likely requires lots of changes, and this only needs to be done once.

In [57]:
n_sec = 10

# NOTE(review): `test_data_path` and `data_processed` are not assigned in any
# cell visible in this notebook — confirm they are defined before running.
test_processed_dir = os.path.join(data_processed, 'test/')

# Read in the file locations
metadata = os.path.join(test_data_path, 'metadata.csv')
df_audio = pd.read_csv(metadata)

# (Re)create the CSV file with headers and keep it open for all rows.
# newline='' is what the csv module expects of files handed to a writer.
with open(os.path.join(test_processed_dir, 'metadata.csv'), 'w', newline='') as f_object:
    writer_object = writer(f_object)
    writer_object.writerow(['file_name'])

    # Go through each file
    for row in tqdm.tqdm(df_audio.itertuples(index=False), total=len(df_audio)):
        # Extract the information (the file name is the first column)
        fname = row[0]
        full_path = os.path.join(test_data_path, fname)
        data, _ = torchaudio.load(full_path)
        raw_name, ext = os.path.splitext(fname)

        # Chunk it up
        for i, chunk in enumerate(torch.split(data, n_sec * SOURCE_SAMPLING_RATE, dim=1), 1):
            outfile_name = f"{raw_name}_{i}{ext}"
            resample_wavs(chunk, outfile_name, test_processed_dir, sr=MODEL_SAMPLING_RATE, sampler=resampler)

            # Write to CSV
            writer_object.writerow([outfile_name])

556it [02:09,  4.28it/s]


In [58]:
#Audio(full_path, rate = 22000)

#Audio(os.path.join(processed_dir, 'test/2_2.wav'), rate=32000)

# Old snippets I didn't want to delete just yet

### Backup, because I'm about to break it with TQDM: