In [1]:
import torch
import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2FeatureExtractor
from scipy.spatial.distance import cosine
import librosa
from dataclasses import dataclass
from typing import List, Dict, Tuple
import warnings
warnings.filterwarnings('ignore')


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(torch.cuda.is_available())
print(torch.cuda.get_device_name())
print(torch.cuda.get_device_capability())


True
NVIDIA GeForce RTX 5080
(12, 0)


In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

Device: cuda


In [4]:
def load_audio(path, sr=16000):
    y, _ = librosa.load(path, sr=sr, mono=True)
    # chuẩn hóa biên độ để ổn định so sánh
    if np.max(np.abs(y)) > 0:
        y = y / np.max(np.abs(y))
    return y

In [None]:
# load model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")

ref_audio_path = "./audio_files/word_february.mp3"

ref_audio = load_audio(ref_audio_path)

# tokenize
input_values = processor(ref_audio, return_tensors="pt").input_values

# retrieve logits
with torch.no_grad():
    logits = model(input_values).logits

# take argmax and decode
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
# => should give ['m ɪ s t ɚ k w ɪ l t ɚ ɹ ɪ z ð ɪ ɐ p ɑː s əl ʌ v ð ə m ɪ d əl k l æ s ᵻ z æ n d w iː ɑːɹ ɡ l æ d t ə w ɛ l k ə m h ɪ z ɡ ɑː s p əl']


TypeError: Received a bool for argument tokenizer, but a PreTrainedTokenizerBase was expected.

In [None]:
# def get_phoneme_embeddings(audio: np.ndarray) -> Tuple[torch.Tensor, str]:
#     """Extract phoneme-level embeddings and predicted phonemes from audio."""
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
#     processor = Wav2Vec2Processor.from_pretrained("bookbot/wav2vec2-ljspeech-gruut")
#     model = Wav2Vec2ForCTC.from_pretrained("bookbot/wav2vec2-ljspeech-gruut")
#     model.to(device)  # Di chuyển model lên GPU
#     model.eval()
    
#     # Prepare input và di chuyển lên GPU
#     inputs = processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
#     inputs = {k: v.to(device) for k, v in inputs.items()}  # Chuyển tất cả inputs lên GPU
    
#     with torch.inference_mode():  # Tối ưu hơn no_grad cho inference
#         # Get hidden states (embeddings)
#         outputs = model(**inputs, output_hidden_states=True)
#         hidden_states = outputs.hidden_states[-1]  # Last layer, vẫn trên GPU
        
#         # Get phoneme predictions
#         logits = outputs.logits
#         predicted_ids = torch.argmax(logits, dim=-1)
#         # predicted_ids_cpu = predicted_ids.cpu()  # Chuyển về CPU cho decode
#         # phonemes = processor.batch_decode(predicted_ids_cpu)[0]

#     return hidden_states.squeeze(0)

In [None]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# processor = Wav2Vec2Processor.from_pretrained("bookbot/wav2vec2-ljspeech-gruut")
# model = Wav2Vec2ForCTC.from_pretrained("bookbot/wav2vec2-ljspeech-gruut").to(device).eval()

# def get_phoneme_embeddings_fast(audio: np.ndarray):
#     inputs = processor(audio, sampling_rate=16000, return_tensors="pt", padding=False)
#     input_values = inputs.input_values.to(device, non_blocking=True)
#     attention_mask = inputs.get("attention_mask")
#     if attention_mask is not None:
#         attention_mask = attention_mask.to(device, non_blocking=True)

#     with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=torch.cuda.is_available()), \
#          torch.inference_mode():
#         out = model(input_values, attention_mask=attention_mask)  # không xin hidden_states
#         last = out.last_hidden_state.squeeze(0)                   # [T, H]
#         ids = torch.argmax(out.logits, dim=-1).cpu().numpy()      # decode trên CPU
#     phonemes = processor.batch_decode(ids)[0]
#     return last, phonemes
