In [None]:
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor
from tokenizers.processors import TemplateProcessing

import torch

# đọc file âm thanh
import soundfile as sf
import pandas as pd
import librosa

In [61]:
# Load the tokenizer and model
# model_dir = "vitouphy/wav2vec2-xls-r-300m-phoneme"
model_dir = "../model/kaggle/working/phoneme_recognition"
processor = Wav2Vec2Processor.from_pretrained(model_dir)
model = Wav2Vec2ForCTC.from_pretrained(model_dir)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

In [62]:
# lấy bảng mã âm vị từ file csv
vocab = processor.tokenizer.get_vocab()
vocab

{'aa': 1,
 'ae': 2,
 'ah': 3,
 'aw': 4,
 'ay': 5,
 'b': 6,
 'ch': 7,
 'd': 8,
 'dh': 9,
 'dx': 10,
 'eh': 11,
 'er': 12,
 'ey': 13,
 'f': 14,
 'g': 15,
 'h#': 16,
 'hh': 17,
 'ih': 18,
 'iy': 19,
 'jh': 20,
 'k': 21,
 'l': 22,
 'm': 23,
 'n': 24,
 'ng': 25,
 'ow': 26,
 'oy': 27,
 'p': 28,
 'r': 29,
 's': 30,
 'sh': 31,
 't': 32,
 'th': 33,
 'uh': 34,
 'uw': 35,
 'v': 36,
 'w': 37,
 'y': 38,
 'z': 39,
 '|': 0,
 '[UNK]': 40,
 '[PAD]': 41,
 '<s>': 42,
 '</s>': 43,
 '<unk>': 44,
 '<pad>': 45}

In [70]:
# Đọc file âm thanh (16kHz, mono)
audio_path = "../temp/output.wav"
speech, sample_rate = sf.read(audio_path)

if sample_rate != 16000:
    speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
    sample_rate = 16000

# Chuyển đổi âm thanh thành input features
inputs = processor(speech, sampling_rate=sample_rate, return_tensors="pt", padding=True)
print(inputs['input_values'])
print(pd.DataFrame(inputs))
input_values = inputs.input_values.to(device)

tensor([[-0.1185, -0.1185, -0.1185,  ..., -0.1185, -0.1185, -0.1185]])
                0
0    input_values
1  attention_mask


In [87]:
# Forward pass
with torch.no_grad():
    logits = model(input_values).logits

# Dự đoán ID token
predicted_ids = torch.argmax(logits, dim=-1)

# Decode ra text và tách thành mảng token
predicted_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
predicted_text = predicted_text.replace("h#", "")
predicted_text = predicted_text.replace("[PAD]", "")
predicted_text = predicted_text.replace(" ", "")


# In ra kết quả
print("predicted_text:", predicted_text)


predicted_text: dawnlowdxihng


In [88]:
import pronouncing
import re

def sentence_to_phonemes(sentence):
    words = sentence.lower().split()
    phoneme_sequence = ""

    for word in words:
        phones = pronouncing.phones_for_word(word)
        if phones:
            # Lấy cách phát âm đầu tiên và tách âm vị
            phonemes = phones[0].split()
            # Loại bỏ số trọng âm (0, 1, 2)
            cleaned_phonemes = [re.sub(r'\d', '', p) for p in phonemes]
            phoneme_sequence  = phoneme_sequence.join(cleaned_phonemes)
        else:
            # Nếu không tìm thấy âm vị
            phoneme_sequence.append(f"[UNK:{word}]")

    return phoneme_sequence.lower()

# 🔎 Ví dụ sử dụng
sentence = "Downloading"
phonemes = sentence_to_phonemes(sentence)
print(phonemes)
print("Predicted Text:", predicted_text)


dawnlowdihng
Predicted Text: dawnlowdxihng


In [85]:
def compare_phonemes_verbose(target, predicted):
    # chuyển đổi các âm vị thành chữ thường
    target = [p.lower() for p in target]

    print(f"Target: {target}")
    print(f"Predicted: {predicted}")

    # Danh sách các cặp tương đương
    equivalent_pairs = {
        ("ih", "ah"), ("ah", "ih"),
        ("ih", "eh"), ("eh", "ih"),
        ("ih", "iy"), ("iy", "ih")
    }

    min_len = min(len(target), len(predicted))
    target = target[:min_len]
    predicted = predicted[:min_len]

    correct = 0
    total = min_len
    mistakes = []

    for i, (t, p) in enumerate(zip(target, predicted)):
        if t == p or (t, p) in equivalent_pairs:
            correct += 1
        else:
            mistakes.append((i, t, p))

    accuracy = correct / total if total > 0 else 0

    print(f"\n🔍 Tổng số: {total}, Đúng: {correct}, Sai: {len(mistakes)}")
    print(f"🎯 Accuracy: {accuracy * 100:.2f}%")

    if mistakes:
        print("\n❌ Vị trí sai:")
        for i, t, p in mistakes:
            print(f"  Vị trí {i}: expected '{t}', predicted '{p}'")

    return accuracy


In [None]:
from jiwer import cer
cer_value = cer(phonemes, predicted_text)
print(cer_value)  

0.08333333333333333


Target: ['d', 'a', 'w', 'n', 'l', 'o', 'w', 'd', 'i', 'h', 'n', 'g']
Predicted: ['dawn', 'lowdxihng']

🔍 Tổng số: 2, Đúng: 0, Sai: 2
🎯 Accuracy: 0.00%

❌ Vị trí sai:
  Vị trí 0: expected 'd', predicted 'dawn'
  Vị trí 1: expected 'a', predicted 'lowdxihng'


0.0