In [1]:
from wav2vec2 import Wav2Vec2Config, Wav2Vec2ForCTC
from transformers import Wav2Vec2ForCTC as HFWav2Vec2ForCTC

import tensorflow as tf
import torch
import numpy as np

def get_difference(tf_out, hf_out):
    return np.max(tf_out.numpy() - hf_out.numpy())

In [2]:
# np.random.seed(0)
# seqlen = 41000
# array = np.random.rand(1*seqlen).reshape(1, seqlen)
import soundfile as sf
def fetch_audio(f):
    audio, samplerate = sf.read(f)
    print({"sample_rate": samplerate})
    return audio[None, :41000]

array = fetch_audio("../data/sample.wav")

batch = tf.convert_to_tensor(array, dtype=tf.float32)
hf_batch = torch.from_numpy(array).float()

batch, hf_batch

{'sample_rate': 16000}


(<tf.Tensor: shape=(1, 41000), dtype=float32, numpy=
 array([[ 3.0517578e-05, -3.0517578e-05,  6.1035156e-05, ...,
          5.4931641e-04,  7.9345703e-04,  8.2397461e-04]], dtype=float32)>,
 tensor([[ 3.0518e-05, -3.0518e-05,  6.1035e-05,  ...,  5.4932e-04,
           7.9346e-04,  8.2397e-04]]))

In [3]:
tf_model = Wav2Vec2ForCTC.from_pretrained("wav2vec2-base-960h", input_shape=batch.shape)

Loading weights locally from `wav2vec2-base-960h`
Total number of loaded variables: 212


In [4]:
hf_model = HFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
tf_out = tf_model(batch, training=False)["logits"]

In [6]:
with torch.no_grad():
    hf_out = hf_model(hf_batch)["logits"]

In [7]:
tf_out.shape, hf_out.shape

(TensorShape([1, 127, 32]), torch.Size([1, 127, 32]))

In [8]:
print("difference in logits:", get_difference(tf_out, hf_out))

difference in logits: 0.0013465881


In [9]:
def tf_forward(*args, **kwargs):
    return tf_model(*args, **kwargs)
tf_forward = tf.function(tf_forward, autograph=True, jit_compile=True)

In [14]:
tf_out = tf_forward(batch, training=False)["logits"]

In [15]:
print("difference in graph based model logits:", get_difference(tf_out, hf_out))

difference in graph based model logits: 0.0013968945
