In [1]:
import numpy as np
import pandas as pd
import os
import librosa
from tensorflow.keras.preprocessing.sequence import pad_sequences
#from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
#from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
import tensorflow as tf

In [2]:
# The dataset used is the common voice dataset from Mozilla. It is readily available on Kaggle

train_X_path = "/kaggle/input/common-voice/cv-valid-train"
train_y_path = "/kaggle/input/common-voice/cv-valid-train.csv"

test_X_path = "/kaggle/input/common-voice/cv-valid-test"
test_y_path = "/kaggle/input/common-voice/cv-valid-test.csv"

In [3]:
X_train = []
y_train = []
X_val = []
y_val= []
X_test = []
y_test = []


df = pd.read_csv(train_y_path)
x=0
for index, row in df.iterrows():
    X_train.append(os.path.join(train_X_path, row['filename']))
    y_train.append(row['text'])
    x = x + 1
    if x == 3000:
        break

df = pd.read_csv(test_y_path)
x=0
for index, row in df.iterrows():
    X_test.append(os.path.join(test_X_path, row['filename']))
    y_test.append(row['text'])
    x = x + 1
    if x == 500:
        break

In [4]:
# Config
SAMPLE_RATE = 16000
N_MELS = 80
MAX_AUDIO_LEN = 4000
MAX_TEXT_LEN = 133

In [5]:
def preprocess_audio_librosa(file_path):
    y, sr = librosa.load(file_path, sr=SAMPLE_RATE)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=N_MELS)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    # Pad or truncate spectrogram to fixed length
    if mel_spec_db.shape[1] < MAX_AUDIO_LEN: # Trim or Pad the Spectrogram to a fixed length
        mel_spec_db = np.pad(mel_spec_db, ((0, 0), (0, MAX_AUDIO_LEN - mel_spec_db.shape[1])))
    else:
        mel_spec_db = mel_spec_db[:, :MAX_AUDIO_LEN]
    return mel_spec_db.T  # Transpose to (time_steps, features)


# Tokenize and pad text
def preprocess_text(text, tokenizer, max_len=MAX_TEXT_LEN):
    sequence = tokenizer.texts_to_sequences([text])[0]
    padded_sequence = pad_sequences([sequence], maxlen=max_len, padding='post')[0]
    return padded_sequence


In [6]:
# Example preprocessing

audio_path = X_train[0]
text = y_train[0]

# Audio preprocessing
audio_features = preprocess_audio_librosa(audio_path)

# Text preprocessing
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts([text])
text_sequence = preprocess_text(text, tokenizer)

print(f"Audio shape: {audio_features.shape}")
print(f"Text sequence: {len(text_sequence)}")

Audio shape: (4000, 80)
Text sequence: 133


In [7]:
df = []
df_test = []

tokenizer = Tokenizer(char_level=True)
for i in y_train:
    tokenizer.fit_on_texts(i)
for i in y_test:
    tokenizer.fit_on_texts(i)

for i in range(0,len(X_train)):
    df.append([np.asarray(preprocess_audio_librosa(X_train[i])).astype('float32'), preprocess_text(y_train[i], tokenizer)])

for i in range(0,len(X_test)):
    df_test.append([np.asarray(preprocess_audio_librosa(X_test[i])).astype('float32'), preprocess_text(y_test[i], tokenizer)])

In [8]:
df = pd.DataFrame(df, columns = ['speech', 'text'])

In [9]:
# Extract data
X = np.stack(df['speech'].values).astype(np.float32)  # Shape: (batch_size, 4000, 80)
y = np.stack(df['text'].values).astype(np.int32)      # Shape: (batch_size, 133)

# Expand dimensions for target
y = np.expand_dims(y, axis=-1)                       # Shape: (batch_size, 133, 1)

# Create decoder input (shifted target)
decoder_input = np.zeros_like(y)                     # Same shape as target
decoder_input[:, 1:, :] = y[:, :-1, :]                # Shift right by 1

print(f"Audio shape: {X.shape}")
print(f"Decoder input shape: {decoder_input.shape}")
print(f"Text (target) shape: {y.shape}")

# Split data
X_train, X_test = X[:2500], X[2500:]
decoder_train, decoder_test = decoder_input[:2500], decoder_input[2500:]
y_train, y_test = y[:2500], y[2500:]

Audio shape: (3000, 4000, 80)
Decoder input shape: (3000, 133, 1)
Text (target) shape: (3000, 133, 1)


In [10]:
print(f"X_train type: {type(X_train)}, shape: {X_train.shape}")
print(f"decoder_train type: {type(decoder_train)}, shape: {decoder_train.shape}")
print(f"y_train type: {type(y_train)}, shape: {y_train.shape}")

X_train type: <class 'numpy.ndarray'>, shape: (2500, 4000, 80)
decoder_train type: <class 'numpy.ndarray'>, shape: (2500, 133, 1)
y_train type: <class 'numpy.ndarray'>, shape: (2500, 133, 1)


In [11]:
VOCAB_SIZE = 29   # Number of unique tokens in text
MAX_AUDIO_LENGTH = 4000
MFCC_FEATURES = 80
MAX_TEXT_LENGTH = 133

# Simulated DataFrame data
X = np.stack(df['speech'].values).astype(np.float32)  # (batch_size, 4000, 80)
y = np.stack(df['text'].values).astype(np.int32)      # (batch_size, 133)

# Expand dims for target
y = np.expand_dims(y, axis=-1)                       # (batch_size, 133, 1)

# Decoder input - shifted target
decoder_input = np.zeros_like(y)
decoder_input[:, 1:, :] = y[:, :-1, :]

decoder_train = np.clip(decoder_train, 0, VOCAB_SIZE - 1)
decoder_test = np.clip(decoder_test, 0, VOCAB_SIZE - 1)

# One-hot encode decoder input
decoder_input = to_categorical(decoder_input, num_classes=VOCAB_SIZE)

# Split data
#X_train, X_test = X[:2500], X[2500:]
#decoder_train, decoder_test = decoder_input[:2500], decoder_input[2500:]
#y_train, y_test = y[:2500], y[2500:]

# Encoder
encoder_inputs = Input(shape=(MAX_AUDIO_LENGTH, MFCC_FEATURES))
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)

# Discard encoder outputs, only keep states
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(MAX_TEXT_LENGTH, VOCAB_SIZE))
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# Dense layer for final output
decoder_dense = Dense(VOCAB_SIZE, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Seq2Seq Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [12]:
model.summary()

In [13]:
# Train model
history = model.fit(
    [X_train, decoder_train], y_train,
    validation_data=([X_test, decoder_test], y_test),
    epochs=100,
    batch_size=32
)

print("Training complete!")

Epoch 1/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 260ms/step - accuracy: 0.6289 - loss: 1.9298 - val_accuracy: 0.6875 - val_loss: 1.0953
Epoch 2/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 235ms/step - accuracy: 0.6920 - loss: 1.0797 - val_accuracy: 0.6983 - val_loss: 1.0563
Epoch 3/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 234ms/step - accuracy: 0.7019 - loss: 1.0472 - val_accuracy: 0.7083 - val_loss: 1.0225
Epoch 4/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 234ms/step - accuracy: 0.7140 - loss: 1.0031 - val_accuracy: 0.7146 - val_loss: 1.0000
Epoch 5/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 234ms/step - accuracy: 0.7126 - loss: 1.0035 - val_accuracy: 0.7165 - val_loss: 0.9848
Epoch 6/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 235ms/step - accuracy: 0.7192 - loss: 0.9803 - val_accuracy: 0.7215 - val_loss: 0.9684
Epoch 7/100
[1m