In [1]:
from transformers import Wav2Vec2Processor, Data2VecAudioModel
import torch
from datasets import load_dataset

dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
dataset = dataset.sort("id")
sampling_rate = dataset.features["audio"].sampling_rate

processor = Wav2Vec2Processor.from_pretrained("facebook/data2vec-audio-base-960h")
model = Data2VecAudioModel.from_pretrained("facebook/data2vec-audio-base-960h")

# audio file is decoded on the fly
inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state
list(last_hidden_states.shape)

  from .autonotebook import tqdm as notebook_tqdm
Reusing dataset librispeech_asr_demo (/home/valeriopuglisi/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr_demo/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b)
Some weights of the model checkpoint at facebook/data2vec-audio-base-960h were not used when initializing Data2VecAudioModel: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing Data2VecAudioModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Data2VecAudioModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[1, 292, 768]

In [2]:
last_hidden_states.shape
print(last_hidden_states)



tensor([[[-0.9335,  0.3089, -0.6611,  ..., -0.1616,  0.2516,  0.3117],
         [-0.9335,  0.3089, -0.6611,  ..., -0.1616,  0.2516,  0.3117],
         [-0.9334,  0.3089, -0.6611,  ..., -0.1616,  0.2516,  0.3117],
         ...,
         [-1.0996, -0.1713, -0.2078,  ..., -0.2466, -0.0577, -0.0303],
         [-1.1000, -0.1701, -0.2100,  ..., -0.2475, -0.0610, -0.0314],
         [-1.1015, -0.1651, -0.2113,  ..., -0.2483, -0.0623, -0.0311]]])


In [3]:
from transformers import Wav2Vec2FeatureExtractor, Data2VecAudioForAudioFrameClassification
from datasets import load_dataset
import torch

dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
dataset = dataset.sort("id")
sampling_rate = dataset.features["audio"].sampling_rate

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("hf-internal-testing/tiny-random-data2vec-audio-frame")
model = Data2VecAudioForAudioFrameClassification.from_pretrained("hf-internal-testing/tiny-random-data2vec-audio-frame")

# audio file is decoded on the fly
inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt", sampling_rate=sampling_rate)
with torch.no_grad():
    logits = model(**inputs).logits

probabilities = torch.sigmoid(logits[0])
# labels is a one-hot array of shape (num_frames, num_speakers)
labels = (probabilities > 0.5).long()
labels[0].tolist()

Reusing dataset librispeech_asr_demo (/home/valeriopuglisi/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr_demo/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b)


[1, 1]

# DATA2VEC FEATURE EXTRACTION

In [2]:
from transformers import Wav2Vec2FeatureExtractor, Data2VecAudioForXVector
from datasets import load_dataset
import torch

dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
dataset = dataset.sort("id")
sampling_rate = dataset.features["audio"].sampling_rate

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("hf-internal-testing/tiny-random-data2vec-xvector")
model = Data2VecAudioForXVector.from_pretrained("hf-internal-testing/tiny-random-data2vec-xvector")

# audio file is decoded on the fly
audio_files = [d["array"] for d in dataset[:2]["audio"]]


inputs = feature_extractor(audio_files, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
with torch.no_grad():
    embeddings = model(**inputs).embeddings

embeddings = torch.nn.functional.normalize(embeddings, dim=-1).cpu()

# the resulting embeddings can be used for cosine similarity-based retrieval
cosine_sim = torch.nn.CosineSimilarity(dim=-1)
similarity = cosine_sim(embeddings[0], embeddings[1])
threshold = 0.7  # the optimal threshold is dataset-dependent
if similarity < threshold:
    print("Speakers are not the same!")
else:
    print("Speakers are the same!")
round(similarity.item(), 2)

Reusing dataset librispeech_asr_demo (/home/valeriopuglisi/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr_demo/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b)


Speakers are the same!


1.0

In [2]:
from transformers import Wav2Vec2FeatureExtractor, Data2VecAudioForXVector
from datasets import load_dataset
import torch
import torchaudio
import torchaudio.functional as TAF


dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
dataset = dataset.sort("id")
sampling_rate = dataset.features["audio"].sampling_rate


source_1_waveform_path = "/storage/data_itcoin/svoice_space/DeepLearningAudioAnalyzer/audio-be/common_voice_it_17415780.wav"
source_2_waveform_path= "/storage/data_itcoin/svoice_space/DeepLearningAudioAnalyzer/audio-be/common_voice_en_1047.wav"
source_1_waveform, sample_rate_1 = torchaudio.load(source_1_waveform_path)
source_2_waveform, sample_rate_2 = torchaudio.load(source_2_waveform_path)
resample_rate = 16000
source_1_waveform = TAF.resample(source_1_waveform, sample_rate_1, resample_rate)
source_2_waveform = TAF.resample(source_2_waveform, sample_rate_1, resample_rate)
audio_files=[source_1_waveform, source_2_waveform]
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("hf-internal-testing/tiny-random-data2vec-xvector")
model = Data2VecAudioForXVector.from_pretrained("hf-internal-testing/tiny-random-data2vec-xvector")
inputs = feature_extractor(audio_files, sampling_rate=sampling_rate, return_tensors="pt", padding=True)

with torch.no_grad():
    embeddings = model(**inputs).embeddings
embeddings = torch.nn.functional.normalize(embeddings, dim=-1).cpu()

# the resulting embeddings can be used for cosine similarity-based retrieval
cosine_sim = torch.nn.CosineSimilarity(dim=-1)
similarity = cosine_sim(embeddings[0], embeddings[1])
threshold = 0.7  # the optimal threshold is dataset-dependent

if similarity < threshold:
    print("Speakers are not the same!")
else:
    print("Speakers are the same!")
round(similarity.item(), 2)

Reusing dataset librispeech_asr_demo (/home/valeriopuglisi/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr_demo/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b)
  return np.array(obj)
  return np.array(obj)


ValueError: Unable to create tensor, you should probably activate padding with 'padding=True' to have batched tensors with the same length.