In [2]:
import json
import os

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# Dataset

In [3]:
# Root directory of the TIMIT dataset; the train/test CSV listings are read from here.
timit_path = (
    '/kaggle/input/'
    'darpa-timit-acousticphonetic-continuous-speech/'
)

In [4]:
# Directory that the relative 'path_from_data_dir' values are joined onto.
data_path = (
    '/kaggle/input/'
    'darpa-timit-acousticphonetic-continuous-speech/'
    'data/'
)

In [5]:
# Read the train and test file listings (first CSV column becomes the index)
# and stack them into a single frame.
listing_frames = [
    pd.read_csv(os.path.join(timit_path, csv_name), index_col=0)
    for csv_name in ('train_data.csv', 'test_data.csv')
]
df = pd.concat(objs=listing_frames)

# Keep only the rows whose 'is_converted_audio' flag is False.
# reset_index() without drop keeps the previous index as an 'index' column.
not_converted = df['is_converted_audio'].eq(False)
df = df[not_converted].reset_index()

In [6]:
# Print the (rows, columns) size, then preview the first five rows.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head(5)

(25200, 12)


Unnamed: 0,index,test_or_train,dialect_region,speaker_id,filename,path_from_data_dir,path_from_data_dir_windows,is_converted_audio,is_audio,is_word_file,is_phonetic_file,is_sentence_file
0,2.0,TRAIN,DR4,MMDM0,SI1311.PHN,TRAIN/DR4/MMDM0/SI1311.PHN,TRAIN\\DR4\\MMDM0\\SI1311.PHN,False,False,False,True,False
1,3.0,TRAIN,DR4,MMDM0,SI1311.WRD,TRAIN/DR4/MMDM0/SI1311.WRD,TRAIN\\DR4\\MMDM0\\SI1311.WRD,False,False,True,False,False
2,4.0,TRAIN,DR4,MMDM0,SX321.PHN,TRAIN/DR4/MMDM0/SX321.PHN,TRAIN\\DR4\\MMDM0\\SX321.PHN,False,False,False,True,False
3,5.0,TRAIN,DR4,MMDM0,SX321.WRD,TRAIN/DR4/MMDM0/SX321.WRD,TRAIN\\DR4\\MMDM0\\SX321.WRD,False,False,True,False,False
4,6.0,TRAIN,DR4,MMDM0,SI681.TXT,TRAIN/DR4/MMDM0/SI681.TXT,TRAIN\\DR4\\MMDM0\\SI681.TXT,False,False,False,False,True


In [7]:
# Number of listing rows per split label.
split_counts = df['test_or_train'].value_counts()
split_counts

TRAIN    18480
TEST      6720
Name: test_or_train, dtype: int64

In [8]:
# Count rows that exactly repeat an earlier row across all columns.
duplicate_rows = df.duplicated()
duplicate_rows.sum()

0

# Audio/Phonemes/Words existence

In [9]:
# Rows flagged as phonetic files; `.eq(True)` treats a missing flag as False.
is_phonetic_row = df['is_phonetic_file'].eq(True)
phoneme_present_df = df[is_phonetic_row]
phoneme_present_df.shape

(3360, 12)

In [10]:
# Rows flagged as word files; `.eq(True)` treats a missing flag as False.
is_word_row = df['is_word_file'].eq(True)
word_present = df[is_word_row]
word_present.shape

(6300, 12)

In [11]:
# Rows flagged as audio files; `.eq(True)` treats a missing flag as False.
is_audio_row = df['is_audio'].eq(True)
audio_present_df = df[is_audio_row]
audio_present_df.shape

(6300, 12)

# Form audio-phonemes-words groups

In [12]:
# Group the per-file listing rows into one record per entry:
#   data[entry_id] -> {'audio_file': path, 'word_file': path, 'phonetic_file': path}
# where entry_id is the row's relative path with everything from the first '.'
# onwards removed. A record only gets the keys for which a matching row exists.


def _is_set(flag):
    """Return True only when `flag` is a boolean True (Python or NumPy).

    NaN and any non-boolean value count as "not set". An identity test
    (`flag is True`) would reject NumPy booleans, and plain truthiness would
    accept NaN, so the type is checked explicitly.
    """
    return isinstance(flag, (bool, np.bool_)) and bool(flag)


# (flag column, key stored in the record), tested in this order;
# a row contributes at most one path.
_FILE_KIND_COLUMNS = (
    ('is_audio', 'audio_file'),
    ('is_word_file', 'word_file'),
    ('is_phonetic_file', 'phonetic_file'),
)

data = {}

# iterrows() is a generator without a length, so the row count is passed
# explicitly; otherwise tqdm cannot show a total or percentage.
for _row_label, row in tqdm(df.iterrows(), total=len(df)):
    path = row['path_from_data_dir']
    # Split on the first '.', the same rule the "Fix Split" cell uses to
    # derive entry_id, so the keys of `data` line up with it.
    entry_id = path.split('.')[0]
    entry = data.setdefault(entry_id, {})

    for flag_column, file_key in _FILE_KIND_COLUMNS:
        if _is_set(row[flag_column]):
            entry[file_key] = os.path.join(data_path, path)
            break

0it [00:00, ?it/s]

# Test set Analysis

In [39]:
# For correct counting keep only the phonetic-file rows: a row can be an
# audio, word or phonetic file, and those rows repeat the same speakers and
# dialects, so counting all of them would count each speaker/dialect several times.
not_converted = df['is_converted_audio'].eq(False)
is_phonetic = df['is_phonetic_file'].eq(True)

# reset_index() without drop keeps the previous index as an extra column.
df_phonemes = df[not_converted & is_phonetic].reset_index()

print(df_phonemes.shape)
df_phonemes.head()

(3360, 14)


Unnamed: 0,level_0,index,test_or_train,dialect_region,speaker_id,filename,path_from_data_dir,path_from_data_dir_windows,is_converted_audio,is_audio,is_word_file,is_phonetic_file,is_sentence_file,entry_id
0,0,2.0,TRAIN,DR4,MMDM0,SI1311.PHN,TRAIN/DR4/MMDM0/SI1311.PHN,TRAIN\\DR4\\MMDM0\\SI1311.PHN,False,False,False,True,False,TRAIN/DR4/MMDM0/SI1311
1,2,4.0,TRAIN,DR4,MMDM0,SX321.PHN,TRAIN/DR4/MMDM0/SX321.PHN,TRAIN\\DR4\\MMDM0\\SX321.PHN,False,False,False,True,False,TRAIN/DR4/MMDM0/SX321
2,6,9.0,TRAIN,DR4,MMDM0,SX51.PHN,TRAIN/DR4/MMDM0/SX51.PHN,TRAIN\\DR4\\MMDM0\\SX51.PHN,False,False,False,True,False,TRAIN/DR4/MMDM0/SX51
3,9,13.0,TRAIN,DR4,MMDM0,SX231.PHN,TRAIN/DR4/MMDM0/SX231.PHN,TRAIN\\DR4\\MMDM0\\SX231.PHN,False,False,False,True,False,TRAIN/DR4/MMDM0/SX231
4,14,19.0,TRAIN,DR4,MMDM0,SX141.PHN,TRAIN/DR4/MMDM0/SX141.PHN,TRAIN\\DR4\\MMDM0\\SX141.PHN,False,False,False,True,False,TRAIN/DR4/MMDM0/SX141


In [40]:
# Partition the phonetic-file rows by the dataset's own TRAIN/TEST label.
split_label = df_phonemes['test_or_train']
df_train = df_phonemes.loc[split_label.eq('TRAIN')].copy()
df_test = df_phonemes.loc[split_label.eq('TEST')].copy()

## Speakers Intersection

In [None]:
# Distinct speaker ids per split; the cell's value is the set of ids present in both.
test_speakers = set(df_test['speaker_id'].unique())
train_speakers = set(df_train['speaker_id'].unique())

train_speakers.intersection(test_speakers)

set()

## Dialects Intersection

In [41]:
# Distinct dialect regions per split.
train_dialects = set(df_train['dialect_region'].unique())
test_dialects = set(df_test['dialect_region'].unique())

shared_dialects = train_dialects & test_dialects  # same dialects as in the train set
unseen_dialects = test_dialects - train_dialects  # new dialects in the test set

# One print call; sep='\n' puts each item on its own line and the empty
# string produces the blank separator line.
print(
    f'train_dialects: {train_dialects}',
    f'test_dialects: {test_dialects}',
    '',
    f'train_dialects & test_dialects: {shared_dialects}',
    f'test_dialects - train_dialects: {unseen_dialects}',
    sep='\n',
)

train_dialects: {'DR3', 'DR4', 'DR2'}
test_dialects: {'DR2', 'DR1', 'DR7', 'DR6', 'DR5', 'DR3', 'DR8', 'DR4'}

train_dialects & test_dialects: {'DR3', 'DR4', 'DR2'}
test_dialects - train_dialects: {'DR1', 'DR6', 'DR7', 'DR8', 'DR5'}


As we can see, our test set consists entirely of new speakers and includes 5 new dialects (DR1, DR5, DR6, DR7, DR8), which means we have to validate our model on dialects and speakers that were not present during training.

# Split Dataset

In [42]:
# Split the phonetic-file rows three ways:
#   train - TRAIN rows from every dialect region except DR2
#   valid - TRAIN rows from dialect region DR2
#   test  - all TEST rows
in_train_portion = df_phonemes['test_or_train'].eq('TRAIN')
in_test_portion = df_phonemes['test_or_train'].eq('TEST')
is_dr2 = df_phonemes['dialect_region'].eq('DR2')

df_train = df_phonemes[in_train_portion & ~is_dr2].copy()
df_valid = df_phonemes[in_train_portion & is_dr2].copy()
df_test = df_phonemes[in_test_portion].copy()

print(f"Train: {len(df_train)}, Valid: {len(df_valid)}, Test: {len(df_test)}")

Train: 1440, Valid: 240, Test: 1680


## Proof of Split

In [59]:
# Check that no speaker id appears in both the train and validation frames.
train_speakers = set(df_train['speaker_id'].unique())
valid_speakers = set(df_valid['speaker_id'].unique())

overlap = valid_speakers.intersection(train_speakers)

if not overlap:
    print("No overlap between validation and train speaker IDs.")
else:
    print("Overlap found:", overlap)

No overlap between validation and train speaker IDs.


In [60]:
# Check that no dialect region appears in both the train and validation frames.
train_dialects = set(df_train['dialect_region'].unique())
valid_dialects = set(df_valid['dialect_region'].unique())

overlap = valid_dialects.intersection(train_dialects)

if not overlap:
    print("No overlap between validation and train dialect regions.")
else:
    print("Overlap found:", overlap)

No overlap between validation and train dialect regions.


# Fix Split

In [61]:
# Build the final splits from the full listing `df`, one entry per unique
# sample, each carrying its audio/word/phonetic paths from `data`.

# entry_id: the relative path with everything from the first '.' onwards removed.
df['entry_id'] = df['path_from_data_dir'].str.split('.').str[0]

# One row per entry_id holding its 'test_or_train' and 'dialect_region' labels.
entry_labels = df.groupby('entry_id')[['test_or_train', 'dialect_region']].first()

# Same rule as the frame-level split: DR2 of the TRAIN portion is validation,
# the rest of TRAIN is train, and TEST is test.
is_train_portion = entry_labels['test_or_train'].eq("TRAIN")
is_test_portion = entry_labels['test_or_train'].eq("TEST")
is_dr2 = entry_labels['dialect_region'].eq("DR2")

train_entries = entry_labels.loc[is_train_portion & ~is_dr2].index.tolist()
valid_entries = entry_labels.loc[is_train_portion & is_dr2].index.tolist()
test_entries = entry_labels.loc[is_test_portion].index.tolist()


def _complete_entries(entry_ids):
    """Map each entry id to its record in `data`, keeping only records with three paths.

    Three paths means the entry has all of its audio, word and phonetic files.
    """
    return {
        entry_id: data[entry_id]
        for entry_id in entry_ids
        if len(data[entry_id]) == 3
    }


train = _complete_entries(train_entries)
valid = _complete_entries(valid_entries)
test = _complete_entries(test_entries)

print(f"Train: {len(train)}, Valid: {len(valid)}, Test: {len(test)}")


Train: 1440, Valid: 240, Test: 1680


In [71]:
# Persist the three splits as one JSON object keyed by split name; each split
# maps an entry_id to that entry's file paths.
# `json` is imported in the notebook's import cell at the top, so a fresh
# "Restart & Run All" does not depend on an import buried in the last cell.
dataset_splits = {
    "train": train,
    "valid": valid,
    "test": test,
}

OUTPUT_PATH = 'TIMIT_DATA_SPLIT.json'
with open(OUTPUT_PATH, "w") as f:
    json.dump(dataset_splits, f)