In [1]:
import os
import numpy as np
import torch
import librosa
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch.nn.functional as F

# Optional dependencies: frame-level DTW needs both fastdtw and scipy.
# USE_DTW records whether both imports succeeded so later code can skip the
# DTW step instead of failing.
try:
    from fastdtw import fastdtw
    from scipy.spatial.distance import cosine as cosine_dist
except Exception:
    USE_DTW = False
else:
    USE_DTW = True

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# compare_wav2vec_similarity.py

# Default checkpoint name handed to from_pretrained() in cal_similarity.
MODEL_NAME = "facebook/wav2vec2-base-960h"   # or "facebook/wav2vec2-base"
# Sampling rate (Hz) used when loading audio and when calling the processor.
TARGET_SR = 16000

def load_audio(path, sr=TARGET_SR):
    """Load an audio file as a mono waveform and peak-normalize it.

    Args:
        path: Audio file path, passed to ``librosa.load``.
        sr: Sampling rate requested from ``librosa.load`` (defaults to
            ``TARGET_SR``).

    Returns:
        The waveform from ``librosa.load`` divided by its peak absolute
        value when that peak is positive. Empty or all-zero waveforms are
        returned unchanged.
    """
    y, _ = librosa.load(path, sr=sr, mono=True)
    # Normalize the amplitude so comparisons between clips are more stable.
    # np.max raises ValueError on a zero-size array, so an empty clip is
    # treated as having no peak and is returned as-is.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y = y / peak
    return y

@torch.no_grad()
def wav2vec2_embeddings(waveform, processor, model, device="cpu"):
    """
    Run a waveform through the processor and the model and collect embeddings.

    Returns a pair:
      - clip-level embedding (mean-pooled over time) [hidden_size]
      - frame-level embeddings [T, hidden_size]
    """
    encoded = processor(
        waveform,
        sampling_rate=TARGET_SR,
        return_tensors="pt",
        padding=False,
    )
    model_output = model(encoded.input_values.to(device))
    # last_hidden_state: [B, T, H] -> [T, H], moved to the CPU
    frame_embeddings = model_output.last_hidden_state.squeeze(0).cpu()
    # Average over the time axis -> [H]
    clip_embedding = frame_embeddings.mean(dim=0)
    return clip_embedding, frame_embeddings

def cosine_sim(a, b):
    """Return the cosine similarity of two vectors as a Python float in [-1, 1]."""
    # Lift each vector to a [1, H] row and scale it to unit L2 norm.
    unit_a = F.normalize(a[None], dim=1)
    unit_b = F.normalize(b[None], dim=1)
    # The product of the two unit rows is a [1, 1] tensor holding the cosine.
    similarity = torch.matmul(unit_a, unit_b.T)
    return float(similarity.item())

def dtw_frame_similarity(F1, F2):
    """
    Compare two frame-level sequences with DTW to compensate for differences
    in speed (faster/slower delivery).

    Returns the DTW distance as a float (smaller means more similar), or
    None when USE_DTW is false.
    """
    if USE_DTW:
        # Per-frame cost is the cosine distance, i.e. 1 - cosine similarity.
        total_cost, _path = fastdtw(F1.numpy(), F2.numpy(), dist=cosine_dist)
        return float(total_cost)
    return None

def cal_similarity(f1, f2, model_name=MODEL_NAME, device=None):
    """Compare two audio files with wav2vec 2.0 embeddings.

    Prints a short report and also returns the computed metrics so callers
    can use them programmatically.

    Args:
        f1: Path of the first audio file.
        f2: Path of the second audio file.
        model_name: Name passed to ``from_pretrained`` for both the processor
            and the model.
        device: Device the model runs on. When falsy, "cuda" is used if
            ``torch.cuda.is_available()`` and "cpu" otherwise.

    Returns:
        dict with two entries:
          - "cosine_similarity": clip-level cosine similarity (float).
          - "dtw_distance": frame-level DTW distance (float), or None when
            ``dtw_frame_similarity`` returns None.
    """
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")
    print("Loading model...")
    # Note: the processor and model are loaded on every call.
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    model = Wav2Vec2Model.from_pretrained(model_name).to(device).eval()

    # Read audio
    y1 = load_audio(f1)
    y2 = load_audio(f2)
    print(f"[1] {os.path.basename(f1)}: {len(y1)/TARGET_SR:.2f}s")
    print(f"[2] {os.path.basename(f2)}: {len(y2)/TARGET_SR:.2f}s")

    # Embeddings
    emb1, F1 = wav2vec2_embeddings(y1, processor, model, device)
    emb2, F2 = wav2vec2_embeddings(y2, processor, model, device)

    # Clip-level cosine similarity
    sim = cosine_sim(emb1, emb2)
    print(f"Clip cosine similarity (−1..1): {sim:.4f}")

    # Optional: DTW distance (frame-level)
    dtw_dist = dtw_frame_similarity(F1, F2)
    if dtw_dist is not None:
        print(f"Frame-level DTW cosine distance (lower = more similar): {dtw_dist:.2f}")
    else:
        print("Install fastdtw & scipy for DTW: pip install fastdtw scipy")

    # Hand the metrics back instead of implicitly returning None.
    return {"cosine_similarity": sim, "dtw_distance": dtw_dist}

In [4]:
f1 = './audio_01.mp3'  # Change this to the path of your first audio file
f2 = './audio_04.mp3'  # Change this to the path of your second audio file
# cal_similarity prints its own report, so the call is not wrapped in print();
# doing so would only echo the function's return value on an extra line.
cal_similarity(f1, f2, model_name=MODEL_NAME)

Device: cpu
Loading model...


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[1] audio_01.mp3: 3.14s
[2] audio_04.mp3: 3.12s
Clip cosine similarity (−1..1): 0.9842
Frame-level DTW cosine distance (lower = more similar): 13.58
None
