# Data Prep

In [1]:
import csv
import functools
import os
import random
from collections import Counter
from dataclasses import dataclass
import pandas as pd
from tqdm import tqdm


from speechbrain.dataio.dataio import (
    merge_csvs,
    read_audio_info,
)
from speechbrain.utils.data_utils import download_file, get_all_files
from speechbrain.utils.logger import get_logger
from speechbrain.utils.parallel import parallel_map

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def process_and_split_line(wav_file, split_interval) -> list:
    """Build manifest rows for one audio file, optionally cut into chunks.

    Arguments
    ---------
    wav_file : str
        Path to the audio file. Its last two path components (parent
        directory and file name) form the base of every row ID; a path with
        a single component uses just that component.
    split_interval : int or float
        Chunk length in seconds. ``0`` disables splitting and yields a single
        row covering the whole file.

    Returns
    -------
    list
        One ``[ID, wav, start, stop, duration]`` list per chunk. ``start`` and
        ``stop`` are integer frame offsets and ``duration`` is in seconds.
        Without splitting, ``start`` and ``stop`` are both ``0``. With
        splitting, a file that has zero frames produces no rows.

    Raises
    ------
    AssertionError
        If the file's sample rate is not 16000.
    ValueError
        If ``split_interval`` is negative, or positive but shorter than one
        frame.
    """
    info = read_audio_info(wav_file)
    sample_rate = info.sample_rate
    # Validate the sample rate before any arithmetic relies on it.
    assert sample_rate == 16000, (
        f"Expected 16000 Hz audio, got {sample_rate} Hz: {wav_file}"
    )
    # A negative interval would move `start` backwards forever.
    if split_interval < 0:
        raise ValueError(
            f"split_interval must be >= 0, got {split_interval}"
        )

    duration = info.num_frames
    # Slicing the last two components also works for a bare file name.
    id_name = os.path.join(*wav_file.split(os.sep)[-2:])

    if split_interval == 0:
        return [[id_name, wav_file, 0, 0, duration / sample_rate]]

    # Frame offsets must be integers even when the interval is fractional.
    chunk_frames = int(round(split_interval * sample_rate))
    if chunk_frames < 1:
        raise ValueError(
            f"split_interval {split_interval} is shorter than one frame"
        )

    new_rows = []
    for start in range(0, duration, chunk_frames):
        # The final chunk is clipped to the end of the file.
        stop = min(start + chunk_frames, duration)
        new_rows.append([
            id_name + str(start / sample_rate),
            wav_file,
            start,
            stop,
            (stop - start) / sample_rate,
        ])
    return new_rows

In [13]:
# Directory that the notebook's to_csv calls write their CSV manifests into.
save_location = "/users/rwhetten/african_brq/csvs"


In [11]:
# Chunk length passed to process_and_split_line, which multiplies it by the
# sample rate (i.e. it is expressed in seconds).
split_int = 30

# Source manifests to load; the commented-out entries are alternative copies
# that are currently not used.
csvs = [
    # "/users/fkponou/data/speechbrain/To_Ryan/cappfm.csv",
    # "/users/fkponou/data/speechbrain/To_Ryan/igbo.csv",
    # "/users/fkponou/data/speechbrain/To_Ryan/yor.csv",
    "/users/rwhetten/african_brq/csvs/igbo.csv",
    "/users/rwhetten/african_brq/csvs/yor.csv",
]

# Key each loaded frame by the manifest's file name without its extension.
dfs = {}
for ds in csvs:
    manifest_file = os.path.basename(ds)
    ds_name, _extension = os.path.splitext(manifest_file)
    dfs[ds_name] = pd.read_csv(ds)

In [15]:
# Column layout of the rows returned by process_and_split_line.
columns = ["ID", "wav", "start", "stop", "duration"]

# For every loaded dataset: chunk each listed audio file, report the total
# amount of audio, keep the resulting frame in memory and write it to disk.
processed_dfs = {}
for ds in dfs:
    data = []
    for wav_path in tqdm(dfs[ds].file_path):
        data.extend(process_and_split_line(wav_path, split_int))

    df = pd.DataFrame(data, columns=columns)
    # Row durations are frames / sample rate, so dividing by 3600 gives hours.
    print(f"Hours in {ds}: {df.duration.sum() / 3600}")
    processed_dfs[ds] = df
    # index=False keeps the pandas row index out of the file, consistent with
    # the other to_csv calls in this notebook.
    df.to_csv(f"{save_location}/{ds}_test.csv", index=False)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 68850/68850 [02:36<00:00, 439.79it/s]


Hours in igbo: 103.79504000000001


100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 44172/44172 [08:42<00:00, 84.52it/s]


Hours in yor: 72.5639


In [41]:
# Persist each processed manifest as <dataset>.csv without the row index.
for ds, manifest in processed_dfs.items():
    manifest.to_csv(f"{save_location}/{ds}.csv", index=False)

In [17]:
# Load the existing cappfm manifest; a later cell concatenates it with the
# processed igbo and yor frames.
# NOTE(review): presumably it already has the ID/wav/start/stop/duration
# columns — confirm before relying on the merged file.
fong = pd.read_csv("/users/rwhetten/african_brq/csvs/cappfm.csv")

In [19]:
# Stack the cappfm, igbo and yor manifests row-wise and write them out as a
# single CSV without the row index.
frames_to_merge = [fong, processed_dfs['igbo'], processed_dfs['yor']]
merged_manifest = pd.concat(frames_to_merge)
merged_manifest.to_csv(f"{save_location}/f_i_y.csv", index=False)

In [22]:
# Read the merged f_i_y.csv manifest back from disk.
all_csvs = pd.read_csv("/users/rwhetten/african_brq/csvs/f_i_y.csv")

In [25]:
# Sum of the duration column divided by 3600, displayed as the cell result.
total_duration = all_csvs["duration"].sum()
total_duration / 3600

286.1103775520833

In [20]:
# Sum of the validation manifest's duration column, divided by 60 twice,
# displayed as the cell result.
valid_duration = pd.read_csv("/users/rwhetten/african_brq/csvs/valid.csv")["duration"].sum()
valid_duration / 60 / 60

3.690030555555556

In [2]:
# Path of the validation manifest that is read into valid_df.
valid_path = "/users/rwhetten/african_brq/csvs/valid.csv"

In [5]:
# Load the validation manifest into a DataFrame.
valid_df = pd.read_csv(valid_path)

In [9]:
# Set start and stop to 0 on every validation row, the same values
# process_and_split_line emits for a file it does not split.
# A scalar is broadcast to all rows, so no list has to be built by hand.
valid_df['start'] = 0
valid_df['stop'] = 0

In [14]:
# Write the updated frame as valid_2.csv, a different file name from the
# valid.csv it was read from; index=False leaves the row index out.
valid_df.to_csv(f"{save_location}/valid_2.csv", index=False)