In [26]:
import numpy as np
import random
import pickle

from sklearn.preprocessing import StandardScaler

In [6]:
from lab3_proto import words2phones
from lab3_tools import path2info
from prondict import prondict

In [7]:
all_train_data = np.load('data/traindata.npz', allow_pickle=True)['traindata']

In [8]:
test_data = np.load('data/testdata.npz', allow_pickle=True)['testdata']

In [9]:
len(all_train_data)

8623

In [14]:
len(test_data)

8700

In [10]:
with open('stateList.pkl', 'rb') as file:
    stateList = pickle.load(file)

In [11]:
all_train_data[0].keys()
all_train_data[0]['filename']

'/home/tim/School/Speech_and_speaker_recognition/labs/tidigits/disc_4.1.1/tidigits/train/woman/es/o9275a.wav'

### Inspect some samples

In [12]:
for _ in range(3):  # Show 3 random samples
    sample = random.choice(all_train_data)
    filename = sample['filename']
    target_idxs = sample['targets']

    # Decode target indices into state labels
    decoded_states = [stateList[i] for i in target_idxs]

    # Retrieve word and phone transcriptions
    wordTrans = list(path2info(filename)[2])
    phoneTrans = words2phones(wordTrans, prondict, addSilence=True, addShortPause=True)

    print(f"\nFilename: {filename}")
    print(f"Words   : {wordTrans}")
    print(f"Phones  : {phoneTrans}")
    print(f"States  : {decoded_states}")  # Show start and end
    print(f"#States : {len(decoded_states)}")


Filename: /home/tim/School/Speech_and_speaker_recognition/labs/tidigits/disc_4.1.1/tidigits/train/man/ne/7o78523a.wav
Words   : ['7', 'o', '7', '8', '5', '2', '3']
Phones  : ['sil', 's', 'eh', 'v', 'ah', 'n', 'sp', 'ow', 'sp', 's', 'eh', 'v', 'ah', 'n', 'sp', 'ey', 't', 'sp', 'f', 'ay', 'v', 'sp', 't', 'uw', 'sp', 'th', 'r', 'iy', 'sp', 'sil']
States  : ['sil_0', 'sil_0', 'sil_0', 'sil_0', 'sil_0', 'sil_0', 'sil_0', 'sil_0', 'sil_0', 'sil_1', 'sil_1', 'sil_1', 'sil_1', 'sil_1', 'sil_1', 'sil_1', 'sil_1', 'sil_1', 'sil_1', 'sil_1', 'sil_2', 'sil_2', 's_0', 's_0', 's_0', 's_1', 's_2', 'eh_0', 'eh_0', 'eh_0', 'eh_0', 'eh_0', 'eh_0', 'eh_0', 'eh_1', 'eh_2', 'eh_2', 'eh_2', 'eh_2', 'eh_2', 'eh_2', 'v_0', 'v_1', 'v_1', 'v_2', 'ah_0', 'ah_0', 'ah_0', 'ah_0', 'ah_1', 'ah_1', 'ah_2', 'n_0', 'n_1', 'n_2', 'ow_0', 'ow_0', 'ow_0', 'ow_0', 'ow_0', 'ow_1', 'ow_1', 'ow_1', 'ow_1', 'ow_1', 'ow_1', 'ow_1', 'ow_2', 's_0', 's_0', 's_0', 's_0', 's_1', 's_2', 'eh_0', 'eh_0', 'eh_0', 'eh_0', 'eh_0', 'eh_0'

### 4.4 Training and Validation Sets

In [13]:
import random
from collections import defaultdict

# Seed for reproducibility
random.seed(42)

# Build a mapping: speaker_id -> {'gender': ..., 'samples': [...]}
speaker_data = defaultdict(lambda: {'gender': None, 'samples': []})

for item in all_train_data:
    path_parts = item['filename'].split('/')
    gender = path_parts[-3]   # 'man' or 'woman'
    speaker_id = path_parts[-2]  # e.g., 'es', 'gp' (speaker ID)

    speaker_data[speaker_id]['gender'] = gender
    speaker_data[speaker_id]['samples'].append(item)

# Separate speaker IDs by gender
male_speakers = [s for s, data in speaker_data.items() if data['gender'] == 'man']
female_speakers = [s for s, data in speaker_data.items() if data['gender'] == 'woman']

# Shuffle
random.shuffle(male_speakers)
random.shuffle(female_speakers)

# Compute split points
male_split = int(len(male_speakers) * 0.9)
female_split = int(len(female_speakers) * 0.9)

# Assign speakers to sets
train_speakers = male_speakers[:male_split] + female_speakers[:female_split]
val_speakers = male_speakers[male_split:] + female_speakers[female_split:]

# Create data sets
train_set = [item for s in train_speakers for item in speaker_data[s]['samples']]
val_set = [item for s in val_speakers for item in speaker_data[s]['samples']]

# Report
def report(dataset, name):
    men = sum(1 for x in dataset if '/man/' in x['filename'])
    women = sum(1 for x in dataset if '/woman/' in x['filename'])
    print(f"{name}: {len(dataset)} samples — Men: {men}, Women: {women}")

report(train_set, "Train set")
report(val_set, "Validation set")


Train set: 7699 samples — Men: 3773, Women: 3926
Validation set: 924 samples — Men: 462, Women: 462


### 4.5 Acoustic Context (Dynamic Features)

In [20]:
def stack_context_features(feats, context_size=3):
    """
    Stacks context_size*2 + 1 frames around each frame, using mirroring at boundaries.

    Args:
        feats: NxD array of feature vectors (N time steps, D features)
        context_size: Number of frames to include before and after the current frame

    Returns:
        Nx(D*(2*context_size+1)) array of stacked features
    """
    N, D = feats.shape
    padded = np.pad(feats, ((context_size, context_size), (0, 0)), mode='reflect')
    stacked = np.zeros((N, D * (2 * context_size + 1)))

    for i in range(N):
        context_window = padded[i:i + 2 * context_size + 1].reshape(-1)
        stacked[i] = context_window

    return stacked

def add_dynamic_features(data):
    for item in data:
        item['lmfcc_dynamic'] = stack_context_features(item['lmfcc'], context_size=3)
        item['mspec_dynamic'] = stack_context_features(item['mspec'], context_size=3)

### 4.6 Feature Standardization

In [22]:
def flatten_dataset(dataset, feature_key='lmfcc'):
    """
    Flattens a list of utterance-level dictionaries into a single feature matrix and target vector.

    Returns:
        - X: N × D feature matrix (e.g. stacked MFCCs or log-mel)
        - y: N-dimensional array of state indices
    """
    X_list = []
    y_list = []

    for utterance in dataset:
        X_list.append(utterance['lmfcc'])   # or 'stacked_features' if you've applied 4.5
        y_list.append(utterance['targets'])

    X = np.vstack(X_list)
    y = np.concatenate(y_list)

    return X, y

In [23]:
X_train, y_train = flatten_dataset(train_set)
X_val, y_val = flatten_dataset(val_set)
X_test, y_test = flatten_dataset(test_data)

In [24]:
print(f"Train set shape: {X_train.shape}, Targets shape: {y_train.shape}")
print(f"Validation set shape: {X_val.shape}, Targets shape: {y_val.shape}")
print(f"Test set shape: {X_test.shape}, Targets shape: {y_test.shape}")

Train set shape: (1340795, 13), Targets shape: (1340795,)
Validation set shape: (166597, 13), Targets shape: (166597,)
Test set shape: (1527014, 13), Targets shape: (1527014,)


In [27]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [28]:
# save dataset to file
np.savez('data/preprocessed/lmfcc_standard.npz', X_train=X_train_scaled, y_train=y_train,
         X_val=X_val_scaled, y_val=y_val, X_test=X_test_scaled, y_test=y_test)