In [1]:
import pathlib

import pandas as pd
from tqdm.notebook import tqdm
import vak

In [2]:
# NOTE(review): machine-specific absolute path -- adjust it to the local checkout.
# The explicit `%cd` magic is used because a bare `cd` depends on IPython automagic.
%cd /home/ildefonso/Documents/repos/vocalpy/2023-messy-experiments

/home/ildefonso/Documents/repos/vocalpy/2023-messy-experiments


First we make a constant representing the path to the root of the original dataset, and from there we get the two csv files that contain the train and test splits.

We load those csv files into `pandas.DataFrame`s.

## 1. Prep source files

After doing this once you should be able to re-load the csv for step 2 below.

In [3]:
import crowsetta


# Shared crowsetta transcriber for the 'timit' annotation format;
# `simpleseq_from_timit_phn_path` uses it to load annotation files.
TIMIT_TRANSCRIBER = crowsetta.Transcriber(format='timit')


def simpleseq_from_timit_phn_path(
    annot_path: pathlib.Path
) -> crowsetta.formats.seq.SimpleSeq:
    """Load a TIMIT annotation file and re-wrap it as a ``SimpleSeq``.

    Parameters
    ----------
    annot_path : pathlib.Path
        Path to the annotation file, loaded with ``TIMIT_TRANSCRIBER``.

    Returns
    -------
    crowsetta.formats.seq.SimpleSeq
        Annotation built from the onsets, offsets and labels of the sequence
        that the loaded annotation converts to, carrying over the
        ``annot_path`` attribute of the loaded annotation.
    """
    timit_annot = TIMIT_TRANSCRIBER.from_file(annot_path)
    sequence = timit_annot.to_seq()
    return crowsetta.formats.seq.SimpleSeq(
        annot_path=timit_annot.annot_path,
        labels=sequence.labels,
        offsets_s=sequence.offsets_s,
        onsets_s=sequence.onsets_s,
    )

In [4]:
import librosa
import numpy as np


def get_frame_features(audio_path, hop_length_s=0.001, n_fft_s=0.025, n_mels=40, n_mfcc=13):
    """Convert an audio file into framewise features.

    Adapted from https://github.com/felixkreuk/SegFeat, which describes the
    features as follows:

    > ... we extracted 13 Mel-Frequency Cepstrum Coefficients (MFCCs),
    with delta and delta-delta features every 10 ms,
    with a processing window size of 10ms.

    Note that this function defaults to a hop length of 1 ms (not 10 ms)
    and an FFT window of 25 ms.

    > Moreover, we concatenated four additional features based
    on the spectral changes between adjacent frames, using
    MFCCs to represent the spectral properties of the frames.
    Define D(t,j) = d(a[t-j], a[t+j]) to be the Euclidean distance
    between the MFCC feature vectors a[t-j] and a[t+j], where
    a[t] is in R^39 for 1 <= t <= T. The features are denoted by D(t,j),
    for j in {1, 2, 3, 4}. We observed this set of features greatly
    improves performance over the standard MFCC features.

    See also: https://groups.google.com/g/librosa/c/V4Z1HpTKn8Q/m/1-sMpjxjCSoJ

    Parameters
    ----------
    audio_path
        Path to the audio file; loaded at its native sampling rate (``sr=None``).
    hop_length_s : float
        Hop between frames in seconds; truncated to a whole number of samples.
    n_fft_s : float
        FFT window length in seconds; truncated to a whole number of samples.
    n_mels : int
        Number of mel bands passed to ``librosa.feature.mfcc``.
    n_mfcc : int
        Number of MFCCs passed to ``librosa.feature.mfcc``.

    Returns
    -------
    frames : numpy.ndarray
        MFCCs, their first- and second-order deltas, and the four distance
        features, concatenated along the first axis; frames run along the
        second axis.
    times : numpy.ndarray
        One time value per frame, from ``librosa.frames_to_time``.
    """
    samples, samplerate = librosa.load(audio_path, sr=None)
    # int() truncates, so a fractional number of samples is rounded down
    hop_length = int(hop_length_s * samplerate)
    n_fft = int(n_fft_s * samplerate)

    mfcc = librosa.feature.mfcc(
        y=samples,
        sr=samplerate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        n_mfcc=n_mfcc,
    )
    # stack MFCCs with their first- and second-order deltas along the feature axis
    spect = np.concatenate(
        [
            mfcc,
            librosa.feature.delta(mfcc, order=1),
            librosa.feature.delta(mfcc, order=2),
        ],
        axis=0,
    )

    def _padded_distance(j):
        # Euclidean distance between feature columns 2*j frames apart,
        # zero-padded by j on each side so it has one value per frame
        lag = 2 * j
        difference = spect[:, lag:] - spect[:, :-lag]
        euclidean = (difference ** 2).sum(0) ** 0.5
        edge = np.zeros(j)
        return np.concatenate([edge, euclidean, edge], axis=0)

    dist = np.stack([_padded_distance(j) for j in (1, 2, 3, 4)])
    frames = np.concatenate([spect, dist], axis=0)

    # NOTE(review): passing n_fft makes librosa add an n_fft // 2 offset to the
    # times -- confirm that this is intended for the STFT settings used above.
    frame_inds = np.arange(frames.shape[1])
    times = librosa.frames_to_time(
        frame_inds, sr=samplerate, hop_length=hop_length, n_fft=n_fft
    )
    return frames, times

In [5]:
# Root of the raw TIMIT data, relative to the current working directory.
TIMIT_ROOT = pathlib.Path("./data/raw/timit.deepai/")
# Directory whose upper-cased split sub-directories (e.g. "TRAIN") are globbed
# for annotation and audio files.
TIMIT_DATA_ROOT = TIMIT_ROOT / 'data'

In [6]:
# NOTE(review): this list does not appear to be read by any later cell --
# `prep_files_for_split` builds its own local `annot_paths`, and the name is
# reassigned before the label map is built. Presumably left over from
# exploration; confirm before removing.
annot_paths = sorted((TIMIT_DATA_ROOT / "TRAIN").glob("**/*PHN"))

In [8]:
def prep_files_for_split(split_name, dst):
    """Generate feature and annotation files for one split of the dataset.

    For every audio/annotation pair found under
    ``TIMIT_DATA_ROOT / split_name.upper()`` this writes two files into ``dst``:

    - ``{dialect_region}-{speaker}-{audio file name}.spect.npz``, holding the
      framewise features (``s``), frame times (``t``) and a feature index (``f``)
    - ``{dialect_region}-{speaker}-{annotation stem}.csv``, the annotation
      saved through ``SimpleSeq.to_file``

    Parameters
    ----------
    split_name : str
        Name of the split; upper-cased to get the sub-directory name.
    dst : pathlib.Path
        Directory the generated files are written to. It must already exist.

    Returns
    -------
    pandas.DataFrame
        One row per source file, with columns ``audio_path``, ``spect_path``,
        ``annot_path``, ``annot_format``, ``spect_dur`` and ``timebin_dur``.

    Raises
    ------
    AssertionError
        If the number of annotation files differs from the number of audio files.
    ValueError
        If an audio file and the annotation file paired with it are not in the
        same directory or do not share the same base name.
    """
    split_root = TIMIT_DATA_ROOT / split_name.upper()
    annot_paths = sorted(split_root.glob("**/*PHN"))
    audio_paths = sorted(split_root.glob("**/*WAV.wav"))
    assert len(annot_paths) == len(audio_paths)

    records = []
    timebin_dur = None
    # zip() has no length, so give tqdm the total explicitly;
    # without it the progress bar cannot show how far along we are
    for audio_path, annot_path in tqdm(
        zip(audio_paths, annot_paths), total=len(audio_paths), desc=split_name
    ):
        # The two sorted lists are paired by position, and equal lengths alone
        # do not guarantee that each pair belongs to the same utterance.
        if (
            audio_path.parent != annot_path.parent
            or audio_path.name.split(".")[0] != annot_path.name.split(".")[0]
        ):
            raise ValueError(
                f"Audio file and annotation file do not match: {audio_path}, {annot_path}"
            )

        # get strings for file names
        dialect_region = annot_path.parents[1].name
        speaker = annot_path.parents[0].name

        frames, times = get_frame_features(audio_path)
        if timebin_dur is None:
            # Computed from the first file only and reused for every other file.
            # NOTE(review): assumes all files share one sampling rate -- confirm.
            timebin_dur = np.diff(times).mean()
        fname = f"{dialect_region}-{speaker}-{audio_path.name}.spect.npz"
        frames_path = dst / fname
        np.savez(frames_path, s=frames, t=times, f=np.arange(frames.shape[0]))

        a_simpleseq = simpleseq_from_timit_phn_path(annot_path)
        simpleseq_path = dst / f"{dialect_region}-{speaker}-{annot_path.stem}.csv"
        a_simpleseq.to_file(simpleseq_path)

        records.append(
            {
                'audio_path': audio_path,
                "spect_path": frames_path,
                "annot_path": simpleseq_path,
                "annot_format": "simple-seq",
                "spect_dur": frames.shape[1] * timebin_dur,
                "timebin_dur": timebin_dur,
            }
        )

    source_paths_df = pd.DataFrame.from_records(records)
    return source_paths_df

We set a destination (`DST`) for the files we will generate.

In [9]:
# Destination directory for the generated feature (.npz) and annotation (.csv) files.
DST = pathlib.Path("./data/spectrograms-annotations/timit.deepai")
# Create it together with any missing parents; no error if it already exists.
DST.mkdir(exist_ok=True, parents=True)

In [10]:
# Fixed seed so that the train/val assignment is the same on every run.
VAL_SPLIT_SEED = 42
# Fraction of the training files that is held out as the validation split.
VAL_FRACTION = 0.1

rng = np.random.default_rng(VAL_SPLIT_SEED)
split_dfs = []

for split_name in ('train', 'test'):
    source_paths_df = prep_files_for_split(split_name, DST)
    source_paths_df['split'] = split_name
    if split_name == 'train':
        n_val = int(len(source_paths_df) * VAL_FRACTION)
        # replace=False: every drawn row is distinct, so exactly n_val files
        # end up in 'val' (drawing with replacement yields duplicates and
        # therefore a smaller validation split than requested)
        val_inds = rng.choice(len(source_paths_df), size=n_val, replace=False)
        source_paths_df.loc[val_inds, 'split'] = 'val'
    split_dfs.append(source_paths_df)

# ignore_index=True: each per-split frame starts its index at 0,
# so without it the combined frame would contain duplicate index labels
source_files_df = pd.concat(split_dfs, ignore_index=True)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [11]:
# Path of the csv that lists the prepared source files; the file name combines
# the name of DST with the string returned by vak's `get_timenow_as_str`
# (presumably a timestamp, so that each run writes a new file -- confirm).
source_files_csv_path = DST / f"{DST.name}_prep_{vak.common.timenow.get_timenow_as_str()}.csv"

In [12]:
# index=False so that re-loading the csv with pd.read_csv does not add an
# "Unnamed: 0" column to the table
source_files_df.to_csv(source_files_csv_path, index=False)

## 2. Make frame classification dataset from source files

In [13]:
# Collect every prep csv found in DST, sorted by path so the last entry can be
# picked below (presumably the newest, given the time string in the file name).
prep_csv_paths = sorted(DST.glob(f"{DST.name}_prep_*.csv"))
assert len(prep_csv_paths) > 0, "No source files csv paths"
source_files_csv_path = prep_csv_paths[-1]  # use most recent

In [14]:
# Load the table of prepared source files from the selected csv.
# NOTE(review): if that csv was written together with its index, the loaded
# table has an extra "Unnamed: 0" column.
source_files_df = pd.read_csv(source_files_csv_path)

In [19]:
import json


# The label map is stored inside the dataset directory, so that directory has
# to be defined and has to exist before the json file can be written.
# The same path is assigned to `dataset_path` again in a later cell.
dataset_path = pathlib.Path(
    "./data/prep/multiclass/timit.deepai/generated_20240323"
)
dataset_path.mkdir(exist_ok=True, parents=True)

labelmap_json_path = dataset_path / "labelmap.json"


if not labelmap_json_path.exists():
    # No saved label map yet: collect every label used in any annotation file,
    # map the labels with vak, and save the mapping for later runs.
    SIMPLESEQ_SCRIBE = crowsetta.Transcriber(format='simple-seq')
    annot_paths = source_files_df['annot_path'].values
    labelset = {
        lbl
        for annot_path in annot_paths
        for lbl in SIMPLESEQ_SCRIBE.from_file(annot_path).to_seq().labels
    }
    labelmap = vak.common.labels.to_map(labelset, map_unlabeled=True)
    with labelmap_json_path.open("w") as fp:
        json.dump(labelmap, fp)
else:
    # Re-use the label map saved by an earlier run.
    with labelmap_json_path.open("r") as fp:
        labelmap = json.load(fp)

In [20]:
# Directory that the prepared dataset is written to.
# NOTE(review): the date in the directory name is hard-coded, and the label map
# cell above already reads `dataset_path` -- keep the two in sync.
dataset_path = pathlib.Path(
    "./data/prep/multiclass/timit.deepai/generated_20240323"
)

In [21]:
# Create the dataset directory together with any missing parents;
# no error if it already exists.
dataset_path.mkdir(exist_ok=True, parents=True)

In [22]:
# Hand the table of source files to vak's `make_splits`, together with the
# dataset directory and the label map, with 'spect' as the input type and
# 'train' as the purpose. The returned table is saved as the dataset csv below.
dataset_df = vak.prep.frame_classification.make_splits.make_splits(
    dataset_df=source_files_df,
    dataset_path=dataset_path,
    input_type='spect',
    purpose='train',
    labelmap=labelmap,
)

[########################################] | 100% Completed | 16.44 s
[########################################] | 100% Completed | 18.61 s
[########################################] | 100% Completed | 15.26 s


In [23]:
# Ask vak's helper for the path of the dataset csv, then save the table there.
# NOTE(review): the second and third arguments are hard-coded strings
# (presumably the dataset name and a timestamp) -- confirm the expected format.
dataset_csv_path = vak.prep.dataset_df_helper.get_dataset_csv_path(
    dataset_path, 'timit.deepai', '2024-03-23'
)
dataset_df.to_csv(
    dataset_csv_path, index=False
)  # index is False to avoid having "Unnamed: 0" column when loading


In [27]:
# Get the frame duration of the dataset from vak's validator, for "spect" input.
frame_dur = vak.prep.frame_classification.validators.validate_and_get_frame_dur(
    dataset_df, "spect"
)

# Collect the dataset csv file name, the frame duration and the input and file
# formats in a vak Metadata object, then call its `to_json` with the dataset
# directory (presumably this writes a metadata json file there -- confirm).
metadata = vak.datasets.frame_classification.Metadata(
    dataset_csv_filename=str(dataset_csv_path.name),
    frame_dur=frame_dur,
    input_type="spect",
    audio_format="wav",
    spect_format="npz",
)
metadata.to_json(dataset_path)