In [16]:
# modules
import os
import pandas as pd
import numpy as np
from tqdm import  tqdm
from os.path import basename as bn, join, split as sp

from sklearn.utils import shuffle

# DATASET Path and Constants

In [17]:
# Read paths
ROOT_TIMIT_DATA_PATH = "/home/jeevan/datasets/TIMIT Acoustic-Phonetic Continuous Speech Corpus (LDC93S1)/TIMIT"

# Write paths
ALL_EXP_FOLDER = "./exports/"
(lambda fp : os.mkdir(fp) if not os.path.exists(fp) else 0)(ALL_EXP_FOLDER) #make export folder

# Vowel info Export CSV filename
ALL_TIMIT_VOWELS_IMP_FILENAME = "a_all-timit_vowels.csv"
ALL_TIMIT_VOWELS_IMP_FILEPATH = join(ALL_EXP_FOLDER, ALL_TIMIT_VOWELS_IMP_FILENAME)

# Vowel subset Export CSV filename
SUBSET_TIMIT_VOWELS_EXP_FILENAME = "b_subset-timit_vowels_vowlimLIM.csv"
SUBSET_TIMIT_VOWELS_EXP_FILEPATH = join(ALL_EXP_FOLDER, SUBSET_TIMIT_VOWELS_EXP_FILENAME)

# TIMIT SAMPLING RATE
TIMIT_AUDIO_FS = 16000

### Import All TIMIT Vowel Info dataframe

In [18]:
ALL_TIMIT_VOWELS_DF = pd.read_csv(ALL_TIMIT_VOWELS_IMP_FILEPATH)
ALL_TIMIT_VOWEL_LIST = pd.unique(ALL_TIMIT_VOWELS_DF["vowel_name"])
print(ALL_TIMIT_VOWELS_DF["vowel_name"].value_counts())
ALL_TIMIT_VOWELS_DF.describe()


ix      11587
iy       9663
ih       6760
ae       5404
eh       5293
ax       4956
axr      4790
aa       4197
ao       4096
ay       3242
ah       3185
ey       3088
ow       2913
er       2846
ux       2488
oy        947
aw        945
uh        756
uw        725
ax-h      493
Name: vowel_name, dtype: int64


Unnamed: 0,index,start_sample,end_sample,duration_sample,start_second,end_second,duration_second
count,78374.0,78374.0,78374.0,78374.0,78374.0,78374.0,78374.0
mean,6.265688,24477.171166,26012.632455,1535.461288,1.529823,1.62579,0.095966
std,4.480745,15701.519448,15768.373059,778.745234,0.981345,0.985523,0.048672
min,0.0,417.0,793.0,74.0,0.026062,0.049563,0.004625
25%,3.0,11770.0,13284.0,960.0,0.735625,0.83025,0.06
50%,6.0,22360.0,23853.0,1387.0,1.3975,1.490813,0.086688
75%,9.0,34440.0,36040.0,1963.0,2.1525,2.2525,0.122688
max,24.0,112600.0,114980.0,7735.0,7.0375,7.18625,0.483438


### Apply filters: Duration Thr

In [19]:
MIN_SAMPLE_DUR = 1500
dur_filt = ALL_TIMIT_VOWELS_DF["duration_sample"] > MIN_SAMPLE_DUR
FILTERED_ALL_TIMIT_VOWELS_DF = ALL_TIMIT_VOWELS_DF.loc[dur_filt]
FILTERED_ALL_TIMIT_VOWELS_DF

Unnamed: 0,index,audio_filepath,wav_file,person_id,sex,start_sample,end_sample,duration_sample,start_second,end_second,duration_second,vowel_name
0,0,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SA1.WAV,FAKS0,F,11240,12783,1543,0.702500,0.798937,0.096437,iy
1,1,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SA1.WAV,FAKS0,F,14078,16157,2079,0.879875,1.009813,0.129938,ae
3,3,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SA1.WAV,FAKS0,F,19962,21514,1552,1.247625,1.344625,0.097000,aa
4,4,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SA1.WAV,FAKS0,F,26280,28591,2311,1.642500,1.786938,0.144437,uw
8,8,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SA1.WAV,FAKS0,F,40546,42357,1811,2.534125,2.647312,0.113187,aa
...,...,...,...,...,...,...,...,...,...,...,...,...
78360,11,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SX442.WAV,MTCS0,M,43090,45712,2622,2.693125,2.857000,0.163875,ae
78361,12,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SX442.WAV,MTCS0,M,47571,49640,2069,2.973187,3.102500,0.129312,ae
78364,1,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SX82.WAV,MTCS0,M,8647,10234,1587,0.540438,0.639625,0.099187,er
78371,8,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SX82.WAV,MTCS0,M,26843,28490,1647,1.677687,1.780625,0.102938,ay


## FUNCTION: Make Vowel info Subset

In [20]:
def make_vowel_subset(vowel: str, df: pd.DataFrame,  limit) -> pd.DataFrame:
    filt = df["vowel_name"] == vowel
    vowel_df = df[filt]

    filt_sex = lambda s: vowel_df["sex"] == s
    male_vowel_df = shuffle(vowel_df[filt_sex("M")], random_state=6)
    female_vowel_df = shuffle(vowel_df[filt_sex("F")], random_state=8)

    m_size = limit if len(male_vowel_df) > limit else len(male_vowel_df)
    f_size = limit if len(female_vowel_df) > limit else len(female_vowel_df)

    male_vowel_df   = male_vowel_df[:m_size]
    female_vowel_df = female_vowel_df[:f_size]

    vowel_df = pd.concat([male_vowel_df, female_vowel_df]) 

    return vowel_df
    
make_vowel_subset(
    vowel=ALL_TIMIT_VOWEL_LIST[np.random.randint(0, len(ALL_TIMIT_VOWEL_LIST))],
    df=FILTERED_ALL_TIMIT_VOWELS_DF,
    limit=20
)

Unnamed: 0,index,audio_filepath,wav_file,person_id,sex,start_sample,end_sample,duration_sample,start_second,end_second,duration_second,vowel_name
73456,7,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SX421.WAV,MREM0,M,30919,33003,2084,1.932438,2.062688,0.13025,aa
31597,2,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SX443.WAV,MKDT0,M,11640,14333,2693,0.7275,0.895813,0.168313,aa
68949,5,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SX245.WAV,MBBR0,M,14997,16960,1963,0.937312,1.06,0.122688,aa
30861,2,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SX415.WAV,MJMA0,M,7436,9134,1698,0.46475,0.570875,0.106125,aa
52097,21,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SI1433.WAV,MSMS0,M,59640,61720,2080,3.7275,3.8575,0.13,aa
77727,13,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SX341.WAV,MMPM0,M,43841,46097,2256,2.740063,2.881063,0.141,aa
74121,2,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SI2267.WAV,MSDB0,M,9970,11880,1910,0.623125,0.7425,0.119375,aa
14416,6,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SI1213.WAV,MKLT0,M,19000,20929,1929,1.1875,1.308062,0.120563,aa
48496,0,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SX337.WAV,MJEE0,M,2360,5560,3200,0.1475,0.3475,0.2,aa
19149,8,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SA1.WAV,MRMS1,M,31640,34450,2810,1.9775,2.153125,0.175625,aa


### Create and export Vowel Info SUBSET dataframe

In [21]:
VOWEL_LIMIT = 500
vowel_df = FILTERED_ALL_TIMIT_VOWELS_DF
SUBSET_TIMIT_VOWELS_DF = pd.concat([make_vowel_subset(vowel, limit=VOWEL_LIMIT, df=vowel_df) for vowel in tqdm(ALL_TIMIT_VOWEL_LIST)])

SUBSET_TIMIT_VOWELS_DF.reset_index(inplace=True)
SUBSET_TIMIT_VOWELS_DF.index.name = "index"
SUBSET_TIMIT_VOWELS_DF.drop(columns=["index", "level_0"], inplace=True)

SUBSET_TIMIT_VOWELS_DF.to_csv(SUBSET_TIMIT_VOWELS_EXP_FILEPATH.replace("LIM", f"{VOWEL_LIMIT}"), index=True)

SUBSET_TIMIT_VOWELS_DF 

100%|██████████| 20/20 [00:00<00:00, 100.03it/s]


Unnamed: 0_level_0,audio_filepath,wav_file,person_id,sex,start_sample,end_sample,duration_sample,start_second,end_second,duration_second,vowel_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SX282.WAV,MMDM2,M,10360,11880,1520,0.647500,0.742500,0.095000,iy
1,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SX404.WAV,MWJG0,M,24647,26819,2172,1.540437,1.676187,0.135750,iy
2,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SX327.WAV,MMEB0,M,10330,12825,2495,0.645625,0.801562,0.155938,iy
3,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SX129.WAV,MDLR1,M,37633,41082,3449,2.352063,2.567625,0.215562,iy
4,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SX208.WAV,MCTT0,M,35861,38298,2437,2.241312,2.393625,0.152312,iy
...,...,...,...,...,...,...,...,...,...,...,...
15372,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SX92.WAV,MSFV0,M,40647,43446,2799,2.540437,2.715375,0.174937,ax-h
15373,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SX67.WAV,MJEE0,M,55915,58171,2256,3.494687,3.635687,0.141000,ax-h
15374,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SX403.WAV,MTMR0,M,41023,43102,2079,2.563938,2.693875,0.129938,ax-h
15375,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SI1448.WAV,MRJM3,M,25483,27080,1597,1.592688,1.692500,0.099812,ax-h


### Stats

In [22]:
SUBSET_TIMIT_VOWELS_DF["vowel_name"].value_counts()

iy      1000
ao      1000
ey      1000
ay      1000
ae      1000
ow      1000
eh      1000
ih      1000
aa      1000
er      1000
axr      927
ux       926
ah       864
oy       787
aw       781
ix       411
uw       389
uh       175
ax       112
ax-h       5
Name: vowel_name, dtype: int64