In [1]:
from tqdm.auto import tqdm

import os
import numpy as np
import pandas as pd

# Dataset

In [2]:
# Root directory of the Kaggle TIMIT dataset; the train/test CSV listings are read from here.
timit_path = (
    '/kaggle/input/darpa-timit-acousticphonetic-continuous-speech/'
)

In [3]:
# `data/` sub-directory of the dataset root.
# NOTE(review): not referenced by any other cell visible in this notebook.
data_path = (
    '/kaggle/input/darpa-timit-acousticphonetic-continuous-speech/data/'
)

In [4]:
# Read the train and test file listings (first CSV column becomes the index)
# and stack them into a single frame.
listing_files = ('train_data.csv', 'test_data.csv')
df = pd.concat(
    objs=[
        pd.read_csv(os.path.join(timit_path, listing_name), index_col=0)
        for listing_name in listing_files
    ]
)

# Keep only phonetic-transcription entries that are not converted audio.
# reset_index() without drop=True keeps the previous row labels as an
# 'index' column, which is selected again when the final table is assembled.
is_original_phonetic = (
    df['is_converted_audio'].eq(False)
    & df['is_phonetic_file'].eq(True)
)
df = df[is_original_phonetic].reset_index()

In [5]:
# Print the (rows, columns) shape of the filtered listing, then preview its first rows.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head(5)

(3360, 12)


Unnamed: 0,index,test_or_train,dialect_region,speaker_id,filename,path_from_data_dir,path_from_data_dir_windows,is_converted_audio,is_audio,is_word_file,is_phonetic_file,is_sentence_file
0,2.0,TRAIN,DR4,MMDM0,SI1311.PHN,TRAIN/DR4/MMDM0/SI1311.PHN,TRAIN\\DR4\\MMDM0\\SI1311.PHN,False,False,False,True,False
1,4.0,TRAIN,DR4,MMDM0,SX321.PHN,TRAIN/DR4/MMDM0/SX321.PHN,TRAIN\\DR4\\MMDM0\\SX321.PHN,False,False,False,True,False
2,9.0,TRAIN,DR4,MMDM0,SX51.PHN,TRAIN/DR4/MMDM0/SX51.PHN,TRAIN\\DR4\\MMDM0\\SX51.PHN,False,False,False,True,False
3,13.0,TRAIN,DR4,MMDM0,SX231.PHN,TRAIN/DR4/MMDM0/SX231.PHN,TRAIN\\DR4\\MMDM0\\SX231.PHN,False,False,False,True,False
4,19.0,TRAIN,DR4,MMDM0,SX141.PHN,TRAIN/DR4/MMDM0/SX141.PHN,TRAIN\\DR4\\MMDM0\\SX141.PHN,False,False,False,True,False


In [6]:
# Number of phonetic files in each original partition (TRAIN / TEST).
partition_counts = df['test_or_train'].value_counts()
partition_counts

TRAIN    1680
TEST     1680
Name: test_or_train, dtype: int64

In [7]:
# Count rows that are exact duplicates of an earlier row (0 means all rows are unique).
n_duplicate_rows = df.duplicated().sum()
n_duplicate_rows

0

# Test set Analysis

In [8]:
# Separate the listing by its original partition label; .copy() gives each
# subset its own data so later column assignments do not touch `df`.
df_train = df.loc[df['test_or_train'].eq('TRAIN')].copy()
df_test = df.loc[df['test_or_train'].eq('TEST')].copy()

## Speakers Intersection

In [9]:
# Distinct speaker ids seen in each partition.
train_speakers = {*df_train['speaker_id'].unique()}
test_speakers = {*df_test['speaker_id'].unique()}

# Speakers present in both partitions (an empty set means none are shared).
train_speakers.intersection(test_speakers)

set()

## Dialects Intersection

In [10]:
# Distinct dialect regions seen in each partition.
train_dialects = {*df_train['dialect_region'].unique()}
test_dialects = {*df_test['dialect_region'].unique()}

# Print both sets, a blank separator line, then their intersection and the
# regions that occur in train but not in test.
for summary_line in (
    f'train_dialects: {train_dialects}',
    f'test_dialects: {test_dialects}',
    '',
    f'train_dialects & test_dialects: {train_dialects & test_dialects}',
    f'train_dialects - test_dialects: {train_dialects - test_dialects}',
):
    print(summary_line)

train_dialects: {'DR2', 'DR3', 'DR4'}
test_dialects: {'DR6', 'DR5', 'DR2', 'DR1', 'DR7', 'DR4', 'DR3', 'DR8'}

train_dialects & test_dialects: {'DR2', 'DR3', 'DR4'}
train_dialects - test_dialects: set()


# Train set speaker_id and dialect_region distribution

In [11]:
# Histogram of files-per-speaker: a single bucket means every speaker
# contributes the same number of files (uniform).
files_per_speaker = df_train['speaker_id'].value_counts()
files_per_speaker.value_counts()

10    168
Name: speaker_id, dtype: int64

In [12]:
# Fraction of train-partition files that belong to each dialect region.
dialect_share = df_train['dialect_region'].value_counts(normalize=True)
dialect_share

DR3    0.452381
DR4    0.404762
DR2    0.142857
Name: dialect_region, dtype: float64

# Split Dataset

In [13]:
# Row masks: membership in the original TRAIN partition, and membership in
# dialect region DR2, which is carved out of TRAIN as the validation set.
from_train_partition = df['test_or_train'] == 'TRAIN'
in_dr2 = df['dialect_region'] == 'DR2'

# Train keeps every TRAIN row outside DR2; valid takes the TRAIN rows in DR2.
df_train = df[from_train_partition & ~in_dr2].copy()
df_valid = df[from_train_partition & in_dr2].copy()

# The test set is the original TEST partition, unchanged.
df_test = df[df['test_or_train'].eq('TEST')].copy()


print(f"Train: {len(df_train)}, Valid: {len(df_valid)}, Test: {len(df_test)}")

Train: 1440, Valid: 240, Test: 1680


In [14]:
# Re-check speaker disjointness on the final train/test frames.
test_speakers = {*df_test['speaker_id'].unique()}
train_speakers = {*df_train['speaker_id'].unique()}

overlap = test_speakers.intersection(train_speakers)  # speakers in both splits

if not overlap:
    print("No overlap between test and train speaker IDs.")
else:
    print("Overlap found:", overlap)

No overlap between test and train speaker IDs.


In [15]:
# Dialect-region check on the final train/test frames. The sets get their own
# names so they no longer overwrite the speaker sets (`test_speakers` /
# `train_speakers`) built in the speaker-overlap cell.
test_dialect_regions = set(df_test['dialect_region'].unique())
train_dialect_regions = set(df_train['dialect_region'].unique())

overlap = test_dialect_regions & train_dialect_regions  # regions present in both splits

if overlap:
    print("Overlap found:", overlap)
else:
    print("No overlap between test and train dialect regions.")

Overlap found: {'DR3', 'DR4'}


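^ Expected: DR2 was moved to the validation split, so train keeps DR3 and DR4, and both of those regions also occur in the test set. The speaker sets checked in the previous cell remain disjoint.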

In [16]:
# Tag every frame (in place) with the name of the split it belongs to.
for split_label, split_frame in (
    ('train', df_train),
    ('test', df_test),
    ('valid', df_valid),
):
    split_frame['split'] = split_label

# Columns kept in the exported table, in output order.
columns = [
    'index',
    'split',
    'dialect_region',
    'speaker_id',
    'filename',
    'path_from_data_dir',
    'path_from_data_dir_windows',
]

# Stack train, test and valid (in that order) with a fresh 0..n-1 index,
# then keep only the selected columns.
df_stacked = pd.concat(objs=[df_train, df_test, df_valid], ignore_index=True)
df_final = df_stacked[columns]

# Fix Split (inspect the final table and save it to CSV)

In [17]:
# Display the combined table (train, test and valid rows with their `split` label).
df_final

Unnamed: 0,index,split,dialect_region,speaker_id,filename,path_from_data_dir,path_from_data_dir_windows
0,2.0,train,DR4,MMDM0,SI1311.PHN,TRAIN/DR4/MMDM0/SI1311.PHN,TRAIN\\DR4\\MMDM0\\SI1311.PHN
1,4.0,train,DR4,MMDM0,SX321.PHN,TRAIN/DR4/MMDM0/SX321.PHN,TRAIN\\DR4\\MMDM0\\SX321.PHN
2,9.0,train,DR4,MMDM0,SX51.PHN,TRAIN/DR4/MMDM0/SX51.PHN,TRAIN\\DR4\\MMDM0\\SX51.PHN
3,13.0,train,DR4,MMDM0,SX231.PHN,TRAIN/DR4/MMDM0/SX231.PHN,TRAIN\\DR4\\MMDM0\\SX231.PHN
4,19.0,train,DR4,MMDM0,SX141.PHN,TRAIN/DR4/MMDM0/SX141.PHN,TRAIN\\DR4\\MMDM0\\SX141.PHN
...,...,...,...,...,...,...,...
3355,8377.0,valid,DR2,MMAA0,SA2.PHN,TRAIN/DR2/MMAA0/SA2.PHN,TRAIN\\DR2\\MMAA0\\SA2.PHN
3356,8378.0,valid,DR2,MMAA0,SI2105.PHN,TRAIN/DR2/MMAA0/SI2105.PHN,TRAIN\\DR2\\MMAA0\\SI2105.PHN
3357,8385.0,valid,DR2,MMAA0,SA1.PHN,TRAIN/DR2/MMAA0/SA1.PHN,TRAIN\\DR2\\MMAA0\\SA1.PHN
3358,8388.0,valid,DR2,MMAA0,SX215.PHN,TRAIN/DR2/MMAA0/SX215.PHN,TRAIN\\DR2\\MMAA0\\SX215.PHN


In [18]:
# Write the final split table to the working directory; index=False leaves
# the positional row index out of the file.
df_final.to_csv(
    path_or_buf='darpa-timit-acousticphonetic-continuous-speech-cv.csv',
    index=False,
)