In [1]:
# Import required libraries and module for data preprocessing
import pandas as pd
import numpy as np
import soundfile as sf
import librosa
import os

In [19]:
# Function for extracting and combining labels from .lab files into a DataFrame
def load_ultrasuite_labels(src_directory, src_dataset):
    
    directory = src_directory + src_dataset + '/word_labels/lab/'
    columns = ['start_time', 'end_time', 'utterance']
    all_labels_df = pd.DataFrame()

    for filename in os.listdir(directory):
    
        filepath = directory + filename
    
        labels_df = pd.read_csv(filepath, sep=" ", header=None, names=columns)
    
        # Extract the speaker, session and speech data from the filename and add to the dataframe
        labels_df['dataset'] = src_dataset
        labels_df['speaker'] = filename[0:3]
        if len(filename[4:-9]) == 0:
            labels_df['session'] = None
        else:
            labels_df['session'] = filename[4:-9]
        labels_df['speech_waveform'] = filename[-8:-4]

        # Tidy up data formatting and correct time based units
        labels_df['utterance'] = labels_df['utterance'].str.lower()
        labels_df['start_time'] = pd.to_timedelta(labels_df['start_time'] * 100)
        labels_df['end_time'] = pd.to_timedelta(labels_df['end_time'] * 100)

        # Append incoming labels to existing dataframe
        all_labels_df = all_labels_df.append(labels_df, ignore_index=True)
    
    return all_labels_df

In [20]:
# Load the labels for the Ultrax Speech Sound Disorders dataset
uxssd_df = load_ultrasuite_labels('data/ultrasuite/labels-uxtd-uxssd-upx/', 'uxssd')

In [21]:
# Preview the data
uxssd_df.head()

Unnamed: 0,start_time,end_time,utterance,dataset,speaker,session,speech_waveform
0,00:00:01.340000,00:00:02.040000,th,uxssd,02M,BL1,069B
1,00:00:02.460000,00:00:03.350000,atha,uxssd,02M,BL1,069B
2,00:00:03.790000,00:00:04.650000,eethee,uxssd,02M,BL1,069B
3,00:00:05.210000,00:00:06.110000,otho,uxssd,02M,BL1,069B
4,00:00:00.970000,00:00:01.480000,core,uxssd,04M,Maint1,017A


In [22]:
# Load the labels for the Ultrax Typically Developing dataset
uxtd_df = load_ultrasuite_labels('data/ultrasuite/labels-uxtd-uxssd-upx/', 'uxtd')

In [26]:
# Preview the data
uxtd_df.head()

Unnamed: 0,start_time,end_time,utterance,dataset,speaker,session,speech_waveform
0,00:00:07.300000,00:00:08.180000,watch,uxtd,37M,,001A
1,00:00:08.270000,00:00:09.219999,fishing,uxtd,37M,,001A
2,00:00:09.539999,00:00:10.500000,gloves,uxtd,37M,,001A
3,00:00:10.640000,00:00:11.520000,spider,uxtd,37M,,001A
4,00:00:01.170000,00:00:02.020000,r,uxtd,30F,,010B


In [25]:
# Preview the dataframe info
uxtd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6094 entries, 0 to 6093
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype          
---  ------           --------------  -----          
 0   start_time       6094 non-null   timedelta64[ns]
 1   end_time         6094 non-null   timedelta64[ns]
 2   utterance        6094 non-null   object         
 3   dataset          6094 non-null   object         
 4   speaker          6094 non-null   object         
 5   session          0 non-null      object         
 6   speech_waveform  6094 non-null   object         
dtypes: object(5), timedelta64[ns](2)
memory usage: 333.4+ KB


In [24]:
# Load the labels for the Ultraphonix dataset
upx_df = load_ultrasuite_labels('data/ultrasuite/labels-uxtd-uxssd-upx/', 'upx')

In [27]:
# Preview the data
upx_df.head()

Unnamed: 0,start_time,end_time,utterance,dataset,speaker,session,speech_waveform
0,00:00:00,00:00:01.400000,sigh,upx,20M,Post,012A
1,00:00:01.639999,00:00:02.910000,sausages,upx,20M,Post,012A
2,00:00:03.140000,00:00:03.970000,snail,upx,20M,Post,012A
3,00:00:04.890000,00:00:05.699999,beige,upx,20M,Post,012A
4,00:00:00.510000,00:00:01.240000,sack,upx,16M,BL3,016A


In [None]:
def extract_segments(y, sr, segments):
    # compute segment regions in number of samples
    starts = np.floor(segments.start_time.dt.total_seconds() * sr).astype(int)
    ends = np.ceil(segments.end_time.dt.total_seconds() * sr).astype(int)
    
    file_path = 'data/transformed/'
    
    i = 0
    # slice the audio into segments
    for start, end in zip(starts, ends):
        audio_seg = y[start:end]
        print('extracting audio segment:', len(audio_seg), 'samples')
        
        file_name = file_path + str(segments.utterance[i]) + ".wav"
        
        sf.write(file_name, audio_seg, sr)
        i += 1

In [None]:
path = 'data/ultrasuite/core-uxssd-sample/core/01M/BL1/001A.wav'
y, sr = librosa.load(path, sr=22050)
extract_segments(y, sr, ultrasuite_df)