# Data Prep

In [1]:
import csv
import functools
import os
import random
from collections import Counter
from dataclasses import dataclass
import pandas as pd
from tqdm import tqdm


from speechbrain.dataio.dataio import (
    merge_csvs,
    read_audio_info,
)
from speechbrain.utils.data_utils import download_file, get_all_files
from speechbrain.utils.logger import get_logger
from speechbrain.utils.parallel import parallel_map

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def process_and_split_line(wav_file, split_interval, expected_sample_rate=16000) -> list:
    """Build manifest rows for one audio file, optionally cut into fixed-length segments.

    Arguments
    ---------
    wav_file : str
        Path of the audio file. Only its metadata is read (via ``read_audio_info``).
    split_interval : int or float
        Segment length in seconds. ``0`` disables splitting and yields a single
        row for the whole file.
    expected_sample_rate : int or None
        Sample rate every file is required to have (default 16000, the value
        the original code asserted). Pass ``None`` to skip the check.

    Returns
    -------
    list
        Rows of ``[ID, wav, start, stop, duration]``. ``start`` and ``stop`` are
        integer frame offsets and ``duration`` is in seconds. When splitting is
        disabled, ``start`` and ``stop`` are both ``0``. When splitting is
        enabled and the file has zero frames, the list is empty.

    Raises
    ------
    ValueError
        If ``split_interval`` is negative, or positive but shorter than one frame.
    AssertionError
        If the file's sample rate differs from ``expected_sample_rate``.
    """
    # A negative step would never reach `duration`, so the loop would not terminate.
    if split_interval < 0:
        raise ValueError(f"split_interval must be >= 0, got {split_interval}")

    info = read_audio_info(wav_file)
    # Check the rate before it is used for any conversion. Raised explicitly so
    # the check also runs under `python -O`, where `assert` is stripped.
    if expected_sample_rate is not None and info.sample_rate != expected_sample_rate:
        raise AssertionError(
            f"{wav_file}: expected sample rate {expected_sample_rate}, "
            f"got {info.sample_rate}"
        )

    duration = info.num_frames
    # Frame offsets must be integers even when the interval is fractional.
    split_frames = int(round(split_interval * info.sample_rate))
    if split_interval > 0 and split_frames == 0:
        raise ValueError(
            f"split_interval {split_interval} is shorter than one frame "
            f"at {info.sample_rate} Hz"
        )

    # ID is "<parent dir>/<file name>"; a bare file name is used as-is.
    id_name = os.path.join(*wav_file.split(os.sep)[-2:])

    if split_frames == 0:
        # 0/0 for start/stop marks an unsplit file.
        return [[id_name, wav_file, 0, 0, duration / info.sample_rate]]

    new_rows = []
    for start in range(0, duration, split_frames):
        # The last segment is clipped to the end of the file.
        stop = min(start + split_frames, duration)
        new_rows.append([
            # The start time in seconds is appended directly, with no separator.
            id_name + str(start / info.sample_rate),
            wav_file,
            start,
            stop,
            (stop - start) / info.sample_rate,
        ])
    return new_rows

In [3]:
# Directory that later cells write their generated CSVs into (used in f-string paths).
save_location = "/users/rwhetten/african_brq/csvs_jz/new"


In [17]:
# Source manifests to process; uncomment an entry to include that corpus.
csvs = [
    # "/users/fkponou/data/speechbrain/To_Ryan/cappfm.csv",
    # "/users/rwhetten/african_brq/csvs_jz/librispeech/train.csv",
    # "/users/rwhetten/african_brq/csvs_jz/librispeech/dev-clean.csv",
    # "/users/fkponou/data/speechbrain/To_Ryan/igbo.csv",
    "/users/rwhetten/african_brq/csvs/yor_org.csv",
]

# Segment length passed to process_and_split_line, which multiplies it by the sample rate.
split_int = 20

# One DataFrame per manifest, keyed by the file name without its extension.
dfs = {
    os.path.splitext(os.path.basename(csv_path))[0]: pd.read_csv(csv_path)
    for csv_path in csvs
}

In [18]:
# Column order matches the rows returned by process_and_split_line.
columns = ["ID", "wav", "start", "stop", "duration"]

processed_dfs = {}
for ds, source_df in dfs.items():
    # Flatten the per-file row lists into one list of rows for this dataset.
    data = [
        row
        for file in tqdm(source_df.file_path)
        for row in process_and_split_line(file, split_int)
    ]

    df = pd.DataFrame(data, columns=columns)
    total_hours = df.duration.sum() / 3600
    print(f"Hours in {ds}: {total_hours}")
    processed_dfs[ds] = df
    # Written with a _test suffix, so it does not overwrite a plain <ds>.csv.
    df.to_csv(f"{save_location}/{ds}_test.csv", index=False)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 44172/44172 [02:25<00:00, 302.97it/s]


Hours in yor: 72.5639


In [19]:
# for ds in processed_dfs:
#     processed_dfs[ds].to_csv(f"{save_location}/{ds}.csv", index=False)

In [20]:
# Load the four per-corpus manifests in one pass; the names are kept because
# the following cell concatenates fong, i, y and ls.
fong, i, y, ls = map(
    pd.read_csv,
    (
        "/users/rwhetten/african_brq/csvs_jz/new/cappfm.csv",
        "/users/rwhetten/african_brq/csvs_jz/new/igbo.csv",
        "/users/rwhetten/african_brq/csvs_jz/new/yor.csv",
        "/users/rwhetten/african_brq/csvs_jz/new/train.csv",
    ),
)


In [21]:
# fiy = cappfm + igbo + yor; efiy is the same three plus the train manifest.
fiy_frames = [fong, i, y]
pd.concat(fiy_frames).to_csv(f"{save_location}/fiy.csv", index=False)
pd.concat([*fiy_frames, ls]).to_csv(f"{save_location}/efiy.csv", index=False)

In [23]:
# Reload the combined fiy manifest and sum its duration column.
# The / 3600 gives hours, assuming duration is stored in seconds — TODO confirm.
fiy = pd.read_csv(f"{save_location}/fiy.csv")
fiy.duration.sum() / 3600

286.1103775520834

In [24]:
# Same check for the efiy manifest (fiy plus the train manifest).
# The / 3600 gives hours, assuming duration is stored in seconds — TODO confirm.
efiy = pd.read_csv(f"{save_location}/efiy.csv")
efiy.duration.sum() / 3600

1247.1647759548612

In [29]:
# Summed duration of the finetune/train.csv manifest; / 60 / 60 converts to
# hours, assuming duration is stored in seconds — TODO confirm.
pd.read_csv("/users/rwhetten/african_brq/csvs/finetune/train.csv").duration.sum() / 60 / 60

26.292688888888893

In [2]:
# Location of the valid.csv manifest that is loaded into valid_df.
valid_path = "/users/rwhetten/african_brq/csvs/valid.csv"

In [5]:
# Load the manifest at valid_path into a DataFrame.
valid_df = pd.read_csv(valid_path)

In [9]:
# Add start/stop columns filled with 0, the same 0/0 pair that
# process_and_split_line writes for a file that is not split.
for col in ('start', 'stop'):
    valid_df[col] = [0] * len(valid_df)

In [14]:
# Write the frame as valid_2.csv under save_location, without the index column.
valid_df.to_csv(f"{save_location}/valid_2.csv", index=False)

## Test Noise Augmentation

In [2]:
from speechbrain.dataio.dataio import read_audio
from speechbrain.augment.time_domain import AddNoise, AddReverb
import IPython
import torch

In [37]:
# finetune/train.csv manifest; its wav column supplies the sample file used below.
d = pd.read_csv("/users/rwhetten/african_brq/csvs/finetune/train.csv")

In [40]:
# Read the first file listed in the manifest and show the tensor's shape.
signal = read_audio(d.wav[0])
signal.shape

torch.Size([45056])

In [39]:
# Inline player for the same file, given to IPython as a path.
IPython.display.Audio(d.wav[0])

torch.Size([1, 45056])

In [93]:
# Add a leading batch dimension: the 1-D signal becomes a batch of one.
clean = signal.unsqueeze(0)  # [batch, time]
# Noise source is the file list in noise.csv; snr_low/snr_high presumably bound
# the mixing SNR in dB — confirm against the SpeechBrain AddNoise docs.
noisifier = AddNoise(
    '/users/rwhetten/african_brq/dataaug/noise.csv', 
    snr_low=10,
    snr_high=15,
)
# Second argument is presumably the relative length of each batch item
# (1.0 = full length) — TODO confirm.
noisy = noisifier(clean, torch.ones(1))

In [92]:
# Play the augmented waveform; a raw array carries no sample rate, so 16000 is passed explicitly.
IPython.display.Audio(noisy.numpy(), rate=16000)

In [96]:
# Same augmentation as before, but mixing in the files listed in speech.csv
# instead of noise.csv. This rebinds both noisifier and noisy.
noisifier = AddNoise(
    '/users/rwhetten/african_brq/dataaug/speech.csv', 
    snr_low=10,
    snr_high=15,
)
noisy = noisifier(clean, torch.ones(1))

In [97]:
# Play the speech.csv-augmented waveform; the rate is given explicitly because the input is a raw array.
IPython.display.Audio(noisy.numpy(), rate=16000)

## Test Reverb

In [3]:
from speechbrain.augment.preparation import prepare_dataset_from_URL

In [102]:
# Archive of room impulse responses; the logged output shows it being
# downloaded to <dest_folder>/data.zip and extracted into dest_folder.
RIR_DATASET_URL = 'https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1'


# NOTE(review): csv_file is a relative path, and later cells open 'rir.csv'
# relative to the working directory — confirm this call writes it there.
prepare_dataset_from_URL(
    URL=RIR_DATASET_URL,
    dest_folder="/users/rwhetten/african_brq/dataaug/rir",
    ext="wav",
    csv_file="rir.csv",
)

Downloading https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1 to /users/rwhetten/african_brq/dataaug/rir/data.zip


RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1: 246MB [00:09, 27.1MB/s]                                                                 


Extracting /users/rwhetten/african_brq/dataaug/rir/data.zip to /users/rwhetten/african_brq/dataaug/rir


In [104]:
# Peek at the first rows of rir.csv (path is relative to the working directory).
pd.read_csv('rir.csv').head()

Unnamed: 0,ID,duration,wav,wav_format,wav_opts
0,Room045-00055_0,1.0,/users/rwhetten/african_brq/dataaug/rir/RIRs/s...,wav,
1,Room045-00071_1,1.0,/users/rwhetten/african_brq/dataaug/rir/RIRs/s...,wav,
2,Room045-00040_2,1.0,/users/rwhetten/african_brq/dataaug/rir/RIRs/s...,wav,
3,Room045-00012_3,1.0,/users/rwhetten/african_brq/dataaug/rir/RIRs/s...,wav,
4,Room045-00014_4,1.0,/users/rwhetten/african_brq/dataaug/rir/RIRs/s...,wav,


In [106]:
# Build the reverb augmentation from rir.csv, apply it to the clean batch of
# one defined earlier in this notebook, and play the result at 16000 Hz.
reverb = AddReverb('rir.csv')
reverbed = reverb(clean)
IPython.display.Audio(reverbed.numpy(), rate=16000)

In [20]:
import numpy as np

In [38]:
# Saved arrays named before_aug / after_aug. No cell shown in this notebook
# writes them — TODO record which script produces these files.
b = "/users/rwhetten/african_brq/before_aug.npy"
a = "/users/rwhetten/african_brq/after_aug.npy"

before = np.load(b)
after = np.load(a)

In [39]:
# First entry of the before_aug array.
before[0]

array([-0.18460709, -0.02245504,  0.0662688 , ...,  0.        ,
        0.        ,  0.        ], dtype=float32)

In [40]:
# First entry of the after_aug array, for comparison with before[0].
after[0]

array([-0.05838 ,  0.008156,  0.01822 , ...,  0.02057 ,  0.02058 ,
        0.00929 ], dtype=float16)

In [41]:
# Listen to entry 1 of the before_aug array at 16000 Hz.
IPython.display.Audio(before[1], rate=16000)

In [43]:
# Listen to entry 1 of the after_aug array at 16000 Hz.
IPython.display.Audio(after[1], rate=16000)

In [47]:
# Rebinds save_location: from here on, output goes to the store directory.
save_location = "/users/rwhetten/african_brq/store"

# Per-corpus tgts_* manifests, loaded in one pass (same names as the earlier cells).
fong, i, y, ls = map(
    pd.read_csv,
    (
        "/users/rwhetten/african_brq/store/tgts_cappfm.csv",
        "/users/rwhetten/african_brq/store/tgts_igbo.csv",
        "/users/rwhetten/african_brq/store/tgts_yor.csv",
        "/users/rwhetten/african_brq/store/tgts_train.csv",
    ),
)

# tgts_fiy = cappfm + igbo + yor; tgts_efiy adds the train manifest.
tgts_frames = [fong, i, y]
pd.concat(tgts_frames).to_csv(f"{save_location}/tgts_fiy.csv", index=False)
pd.concat([*tgts_frames, ls]).to_csv(f"{save_location}/tgts_efiy.csv", index=False)

In [48]:
# Summed duration of the cappfm frame; / 3600 gives hours if duration is in seconds — TODO confirm.
fong.duration.sum() / 3600

109.75143755208335

In [50]:
# Reload tgts_fiy.csv and sum its duration column (/ 3600: hours if duration is in seconds).
# This rebinds fiy.
fiy = pd.read_csv("/users/rwhetten/african_brq/store/tgts_fiy.csv")
fiy.duration.sum() / 3600

286.1103775520834

In [51]:
# Reload tgts_efiy.csv and sum its duration column (/ 3600: hours if duration is in seconds).
# This rebinds efiy.
efiy = pd.read_csv("/users/rwhetten/african_brq/store/tgts_efiy.csv")
efiy.duration.sum() / 3600

1247.1647759548612

In [52]:
# Scratch arithmetic. NOTE(review): the notebook does not say what 23250 and
# 250 stand for — name these constants or add a sentence of context.
23250 / 250

93.0

In [54]:
# Scratch arithmetic on the previous result (93). NOTE(review): the meaning of
# the factor 600 is not recorded — document it.
93 * 600

55800