In [None]:
import tensorflow as tf
import librosa
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv2D, MaxPooling2D, Flatten

#  2. Prepare data

In [None]:
import kagglehub

# Fetch the latest published version of the corpus; the returned value is the
# local directory holding the downloaded files.
dataset_handle = "tommyngx/fluent-speech-corpus"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/tommyngx/fluent-speech-corpus?dataset_version_number=1...


100%|██████████| 1.44G/1.44G [00:08<00:00, 177MB/s]


Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/tommyngx/fluent-speech-corpus/versions/1


In [None]:
import shutil
import os

# Reuse the directory reported by kagglehub rather than hard-coding the cache
# location, so the copy keeps working when the dataset version (and with it
# the cache path) changes.
data_path = path
content_dir = '/content'

# dirs_exist_ok=True lets this cell be re-run against an already populated target.
shutil.copytree(data_path, content_dir, dirs_exist_ok=True)
print(f"Đã sao chép toàn bộ bộ dữ liệu vào thư mục {content_dir}")

Đã sao chép toàn bộ bộ dữ liệu vào thư mục /content


## **2.1 Data preprocessing**

In [None]:
import os
from glob import glob
import re
import keras
from keras import layers


In [None]:
import pandas as pd
base_path = "/content/fluent_speech_commands_dataset"


def _load_split(split_name):
    """Read one split's CSV and return a frame with `full_path` and `transcription`.

    `split_name` is the file-name prefix ("train", "valid" or "test") of
    `<base_path>/data/<split_name>_data.csv`.
    """
    frame = pd.read_csv(f"{base_path}/data/{split_name}_data.csv")
    # Strip stray whitespace from the column names.
    frame.columns = frame.columns.str.strip()
    # Keep only the audio path and its transcript.
    frame = frame[["path", "transcription"]].rename(columns={"path": "full_path"})
    # Turn the dataset-relative audio path into an absolute one.
    frame["full_path"] = frame["full_path"].map(lambda rel: f"{base_path}/{rel}")
    return frame


train_df = _load_split("train")
valid_df = _load_split("valid")
test_df = _load_split("test")

In [None]:
# Print a few random rows from every split as a sanity check.
for heading, frame in (
    (" Train samples:", train_df),
    ("\n Validation samples:", valid_df),
    ("\n Test samples:", test_df),
):
    print(heading)
    print(frame[["full_path", "transcription"]].sample(5))

 Train samples:
                                               full_path  \
9754   /content/fluent_speech_commands_dataset/wavs/s...   
6235   /content/fluent_speech_commands_dataset/wavs/s...   
6250   /content/fluent_speech_commands_dataset/wavs/s...   
17787  /content/fluent_speech_commands_dataset/wavs/s...   
5890   /content/fluent_speech_commands_dataset/wavs/s...   

                                  transcription  
9754   Decrease the temperature in the bathroom  
6235                           Put on the music  
6250        Switch on the lights in the bedroom  
17787                 Turn the temperature down  
5890   Increase the temperature in the washroom  

 Validation samples:
                                              full_path  \
595   /content/fluent_speech_commands_dataset/wavs/s...   
684   /content/fluent_speech_commands_dataset/wavs/s...   
1440  /content/fluent_speech_commands_dataset/wavs/s...   
1453  /content/fluent_speech_commands_dataset/wavs/s...   
2694  

In [None]:
# Vocabulary: every distinct character that occurs in the training transcripts.
characters = sorted(set("".join(train_df["transcription"])))
# Forward lookup (character -> integer id); the empty string is the OOV token.
char_to_num = keras.layers.StringLookup(
    vocabulary=characters, oov_token="", mask_token=None
)
# Inverse lookup (integer id -> character) built from the same vocabulary.
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    oov_token="",
    mask_token=None,
    invert=True,
)


In [None]:
frame_length = 100
frame_step = 50
fft_length = 256


def encode_single_sample(wav_file, label):
    """Turn one (wav path, transcript) pair into model-ready tensors.

    Args:
        wav_file: path of the WAV file to read.
        label: transcript string for that file.

    Returns:
        A tuple ``(spectrogram, label_ids, num_frames)``: the normalised
        square-root magnitude spectrogram, the transcript mapped through
        ``char_to_num`` character by character, and the size of the
        spectrogram's first axis.
    """
    raw_bytes = tf.io.read_file(wav_file)
    waveform, _ = tf.audio.decode_wav(raw_bytes)
    # Drop the trailing channel axis and make sure we work in float32.
    waveform = tf.cast(tf.squeeze(waveform, axis=-1), tf.float32)

    # Magnitude of the short-time Fourier transform, compressed with a square root.
    magnitudes = tf.abs(
        tf.signal.stft(
            waveform,
            frame_length=frame_length,
            frame_step=frame_step,
            fft_length=fft_length,
        )
    )
    features = tf.math.pow(magnitudes, 0.5)

    # Standardise along axis 1; the epsilon guards against division by zero.
    axis1_mean = tf.reduce_mean(features, 1, keepdims=True)
    axis1_std = tf.math.reduce_std(features, 1, keepdims=True)
    features = (features - axis1_mean) / (axis1_std + 1e-10)

    # Split the transcript into characters and map each one to its integer id.
    label_ids = char_to_num(tf.strings.unicode_split(label, input_encoding="UTF-8"))

    return features, label_ids, tf.shape(features)[0]


In [None]:
def filter_empty_samples(spectrogram, label, time_steps):
    """Predicate that keeps a sample only when it has at least one time step.

    `spectrogram` and `label` are accepted so the signature matches the
    three-element samples it is applied to; only `time_steps` is inspected.
    """
    has_frames = 0 < time_steps
    return has_frames

In [None]:
import tensorflow as tf
batch_size = 16


def create_dataset(df):
    """Build a padded, batched and prefetched dataset from a split DataFrame.

    Args:
        df: frame with a `full_path` column (audio files) and a
            `transcription` column (target text).

    Returns:
        A dataset of `(spectrogram_batch, label_batch)` pairs. Within a batch,
        spectrograms are padded with 0.0 and labels with -1.
    """

    def _drop_time_steps(spec, label, _):
        # The frame count is only needed by the emptiness filter.
        return spec, label

    wav_paths = list(df["full_path"])
    transcripts = list(df["transcription"])

    samples = tf.data.Dataset.from_tensor_slices((wav_paths, transcripts))
    samples = samples.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
    samples = samples.filter(filter_empty_samples)
    samples = samples.map(_drop_time_steps)

    batches = samples.padded_batch(
        batch_size,
        padded_shapes=([None, None], [None]),
        padding_values=(
            tf.constant(0.0, dtype=tf.float32),
            tf.constant(-1, dtype=tf.int64),
        ),
    )
    return batches.prefetch(buffer_size=tf.data.AUTOTUNE)

In [None]:
# One input pipeline per split, built in train / validation / test order.
train_dataset, val_dataset, test_dataset = (
    create_dataset(split_df) for split_df in (train_df, valid_df, test_df)
)


In [None]:
# Inspect the first training batch and decode three of its labels back to text.
for batch_idx, (specs, label_ids) in enumerate(train_dataset.take(1)):
    print(f"Batch {batch_idx}: x shape = {specs.shape}, y shape = {label_ids.shape}")
    for sample_no in range(1, 4):
        chars = num_to_char(label_ids[sample_no - 1])
        text = tf.strings.reduce_join(chars).numpy().decode("utf-8")
        print(f"Sample {sample_no} - Spectrogram shape: {specs[sample_no - 1].shape} - Label: {text}")

Batch 0: x shape = (16, 771, 129), y shape = (16, 25)
Sample 1 - Spectrogram shape: (771, 129) - Label: Change language
Sample 2 - Spectrogram shape: (771, 129) - Label: Resume
Sample 3 - Spectrogram shape: (771, 129) - Label: Turn the lights on


In [None]:
# Show three decoded examples from the first training batch.
for spec_batch, label_batch in train_dataset.take(1):
    print("Train dataset example:")
    for sample_no in range(1, 4):
        decoded_chars = num_to_char(label_batch[sample_no - 1])
        text = tf.strings.reduce_join(decoded_chars).numpy().decode("utf-8")
        print(f"Sample {sample_no} - Spectrogram shape: {spec_batch[sample_no - 1].shape} - Label: {text}")


Train dataset example:
Sample 1 - Spectrogram shape: (771, 129) - Label: Change language
Sample 2 - Spectrogram shape: (771, 129) - Label: Resume
Sample 3 - Spectrogram shape: (771, 129) - Label: Turn the lights on


In [None]:
def CTCLoss(y_true, y_pred):
    """CTC loss for label batches that were padded with -1.

    Args:
        y_true: integer character ids, one row per sample, padded with -1
            (the padding value used by `create_dataset`).
        y_pred: model output; axis 0 is the batch and axis 1 the time steps.

    Returns:
        The value returned by `tf.keras.backend.ctc_batch_cost`.
    """
    y_true = tf.cast(y_true, tf.int64)

    # Label length: count the real (non-padding) positions. This has to be
    # measured against the -1 padding marker *before* it is overwritten,
    # because id 0 is also what `char_to_num` emits for out-of-vocabulary
    # characters; counting `y_true != 0` would treat those as padding and
    # shorten the label.
    is_label = y_true != -1
    label_length = tf.math.reduce_sum(tf.cast(is_label, tf.int64), axis=1, keepdims=True)

    # Replace the -1 padding with a dummy label 0 so every id is non-negative.
    y_true = tf.where(is_label, y_true, tf.constant(0, dtype=tf.int64))

    batch_size = tf.shape(y_pred)[0]
    time_steps = tf.shape(y_pred)[1]

    # Input length: every sample is given all (padded) time steps.
    input_length = tf.fill([batch_size, 1], time_steps)

    return tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)


**3. Model Definition**

In [None]:
def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):
    """Build and compile the convolutional + bidirectional-GRU model named "DeepSpeech_2".

    Args:
        input_dim: size of the last axis of the input spectrogram.
        output_dim: number of character classes, not counting the CTC blank.
        rnn_layers: number of stacked bidirectional GRU layers.
        rnn_units: units per GRU direction; the intermediate dense layer uses
            twice this many.

    Returns:
        A `keras.Model` compiled with Adam (learning rate 1e-4) and `CTCLoss`.
    """
    input_spectrogram = layers.Input((None, input_dim), name="source")
    # Add a trailing channel axis so the 2D convolutions can be applied.
    x = layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram)

    # Convolution block 1
    x = layers.Conv2D(32, [11, 31], [2, 2], padding="same", use_bias=False, name="conv_1")(x)
    x = layers.BatchNormalization(name="conv_1_bn")(x)
    x = layers.ReLU(name="conv_1_relu")(x)

    # Convolution block 2
    x = layers.Conv2D(32, [11, 21], [1, 2], padding="same", use_bias=False, name="conv_2")(x)
    x = layers.BatchNormalization(name="conv_2_bn")(x)
    x = layers.ReLU(name="conv_2_relu")(x)

    # Merge the last two axes so the recurrent layers receive one vector per step.
    x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)

    for i in range(1, rnn_layers + 1):
        recurrent = layers.GRU(rnn_units, return_sequences=True, reset_after=True, name=f"gru_{i}", implementation=2)
        x = layers.Bidirectional(recurrent, name=f"bidirectional_{i}")(x)
        # Dropout between recurrent layers, but not after the last one.
        if i < rnn_layers:
            x = layers.Dropout(0.5)(x)

    x = layers.Dense(rnn_units * 2, name="dense_intermediate")(x)
    x = layers.ReLU(name="dense_relu")(x)
    x = layers.Dropout(0.5)(x)

    # One extra class for the CTC blank token. The width is taken from the
    # `output_dim` argument instead of the global lookup layer, so the
    # parameter is honoured rather than silently ignored.
    vocab_size = output_dim + 1
    output = layers.Dense(units=vocab_size, activation="softmax", name="output")(x)

    model = keras.Model(input_spectrogram, output, name="DeepSpeech_2")
    model.compile(optimizer=keras.optimizers.Adam(1e-4), loss=CTCLoss)
    return model


Build the model and print its summary

In [None]:
# 129 matches the last axis of the spectrogram batches printed earlier.
vocabulary = char_to_num.get_vocabulary()
model = build_model(
    input_dim=129,
    output_dim=len(vocabulary),
    rnn_units=256,
)
model.summary(line_length=110)

In [None]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0


In [None]:
def decode_batch_predictions(pred):
    """Greedy-decode a batch of model outputs into a list of strings.

    Args:
        pred: batch of predictions; axis 0 is the batch and axis 1 the time steps.

    Returns:
        One decoded UTF-8 string per item in the batch.
    """
    # Every sample is decoded over the full number of time steps.
    input_len = np.full(pred.shape[0], pred.shape[1], dtype=np.float64)
    # Greedy search; keep the first (only) decoded path.
    decoded = tf.keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)
    best_paths = decoded[0][0]
    # Map each id sequence back to characters and join them into text.
    return [
        tf.strings.reduce_join(num_to_char(id_sequence)).numpy().decode("utf-8")
        for id_sequence in best_paths
    ]


In [None]:
from jiwer import wer
class CallbackEval(keras.callbacks.Callback):
    """Report the word error rate (WER) on a dataset after every epoch.

    The weights of the epoch with the lowest WER are kept in memory and are
    also written to `weights_path` as soon as they are found, so a training
    run that is interrupted still leaves its best checkpoint on disk. When
    training ends normally the model is reset to those weights.

    Args:
        dataset: iterable of `(spectrogram_batch, label_batch)` pairs to evaluate on.
        max_examples: upper bound on the target/prediction pairs printed per epoch.
        weights_path: file the best weights are written to.
    """

    def __init__(self, dataset, max_examples=5, weights_path="best_model.weights.h5"):
        super().__init__()
        self.dataset = dataset
        self.max_examples = max_examples
        self.weights_path = weights_path
        self.best_wer = float("inf")  # Lowest WER seen so far
        self.best_weights = None

    def on_epoch_end(self, epoch: int, logs=None):
        """Decode the whole dataset, compute WER, checkpoint on improvement, print examples."""
        predictions = []
        targets = []

        for batch in self.dataset:
            X, y = batch
            batch_pred = self.model.predict(X, verbose=0)
            batch_text = decode_batch_predictions(batch_pred)
            predictions.extend(batch_text)

            # Map the label ids back to text, one string per sample.
            label_texts = tf.strings.reduce_join(num_to_char(y), axis=-1)
            label_texts = label_texts.numpy()
            label_texts = [t.decode("utf-8", errors="ignore").strip() for t in label_texts]
            targets.extend(label_texts)

        wer_score = wer(targets, predictions)

        # Expose the score through `logs` in case another callback
        # (e.g. ModelCheckpoint) wants to monitor it.
        if logs is not None:
            logs["wer"] = wer_score

        print("=" * 100)
        print(f"Epoch {epoch + 1} - WER: {wer_score:.4f} - Accuracy: {1 - wer_score:.4f}")
        print("-" * 100)

        # Keep the best model according to WER. Writing to disk right away makes
        # the message below true and protects against interrupted runs, where
        # on_train_end may never be reached.
        if wer_score < self.best_wer:
            self.best_wer = wer_score
            self.best_weights = self.model.get_weights()
            self.model.save_weights(self.weights_path)
            print(f"Saved best model (WER: {wer_score:.4f})")

        # Print a few randomly chosen examples.
        num_samples = min(len(predictions), self.max_examples)
        example_indices = np.random.choice(len(predictions), num_samples, replace=False)
        for i in example_indices:
            print(f"Target    : {targets[i]}")
            print(f"Prediction: {predictions[i]}")
            print("-" * 100)

    def on_train_end(self, logs=None):
        """Restore the best weights into the model and make sure they are on disk."""
        if self.best_weights is not None:
            self.model.set_weights(self.best_weights)
            self.model.save_weights(self.weights_path)


In [None]:
# Batch shapes vary because each batch is padded to its own longest sample.
for x, y in train_dataset.take(5):
    print(f"Spectrogram shape: {x.shape}, Label shape: {y.shape}")


Spectrogram shape: (16, 771, 129), Label shape: (16, 25)
Spectrogram shape: (16, 1009, 129), Label shape: (16, 35)
Spectrogram shape: (16, 979, 129), Label shape: (16, 36)
Spectrogram shape: (16, 1247, 129), Label shape: (16, 40)
Spectrogram shape: (16, 1574, 129), Label shape: (16, 47)


#  Training

In [None]:
# Evaluate WER on the validation split at the end of every epoch.
eval_callback = CallbackEval(val_dataset)

history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=50,
    callbacks=[eval_callback],
)

Epoch 1/50


KeyboardInterrupt: 

# Save model

In [None]:
# Persist the weights the model currently holds.
last_epoch_weights_path = "model_last_epoch.weights.h5"
model.save_weights(last_epoch_weights_path)

In [None]:
# Save the complete model (not just its weights) to a single file.
full_model_path = "full_last_epoch_model.h5"
model.save(full_model_path)

In [None]:
# Load the weights file written by the evaluation callback before running inference.
best_weights_path = "/content/best_model.weights.h5"
model.load_weights(best_weights_path)

  saveable.load_own_variables(weights_store.get(inner_path))


# Inference và predict

In [None]:
frame_length = 100
frame_step = 50
fft_length = 256

def predict_single_wav1(file_path):
    """Transcribe a single audio file with the trained model.

    Args:
        file_path: path of the audio file to transcribe.

    Returns:
        The decoded transcript, or an empty string when the clip is too short
        to produce a single STFT frame.
    """
    target_sample_rate = 16000
    audio, _ = librosa.load(file_path, sr=target_sample_rate)  # resample to 16 kHz

    # Convert to a tensor for preprocessing.
    audio = tf.convert_to_tensor(audio, dtype=tf.float32)

    # Build the spectrogram the same way as during training.
    stft = tf.signal.stft(audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length)

    # Training drops samples with zero frames (see filter_empty_samples); do the
    # same here instead of feeding an empty sequence to the model.
    if stft.shape[0] == 0:
        return ""

    spectrogram = tf.abs(stft)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    means = tf.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)

    # Add the batch dimension expected by the model.
    spectrogram = tf.expand_dims(spectrogram, axis=0)

    # verbose=0 keeps per-file progress bars from flooding the output when
    # many files are transcribed in a loop.
    prediction = model.predict(spectrogram, verbose=0)
    decoded = decode_batch_predictions(prediction)

    return decoded[0]


In [None]:
import zipfile
# The context manager closes the archive even if extraction raises.
with zipfile.ZipFile('/content/wav.zip') as wav_archive:
    wav_archive.extractall('/content')

In [None]:
import glob

def predict_multiple_wavs(file_paths):
    """Transcribe several audio files.

    Args:
        file_paths: iterable of audio file paths.

    Returns:
        A list of `(file_path, transcript)` tuples in input order.
    """
    return [(wav_path, predict_single_wav1(wav_path)) for wav_path in file_paths]

# Collect every WAV file. glob returns paths in arbitrary order, so sort them
# to make the printed listing reproducible between runs.
file_paths = sorted(glob.glob("/content/wav/*.wav"))

predictions = predict_multiple_wavs(file_paths)

for file_path, transcript in predictions:
    print(f"{file_path} → {transcript}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 916ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 205ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 189ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 167ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 216ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 162ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 195ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 231ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m