In [1]:
import sys
sys.path.append("../src")

In [2]:
from wav2vec2 import Wav2Vec2Config, Wav2Vec2ForCTC, Wav2Vec2Processer
from transformers import Wav2Vec2ForCTC as HFWav2Vec2ForCTC

import tensorflow as tf
import torch
import numpy as np

def get_difference(tf_out, hf_out):
    return np.max(tf_out.numpy() - hf_out.numpy())

In [3]:
batch, _ = tf.audio.decode_wav(tf.io.read_file("../data/sample.wav"))
processor = Wav2Vec2Processer(is_tokenizer=False)
batch = tf.transpose(batch, perm=(1, 0))

batch = tf.concat([batch, batch], axis=0)

batch = processor(batch)
hf_batch = torch.from_numpy(batch.numpy()).float()

batch, hf_batch

(<tf.Tensor: shape=(2, 46797), dtype=float32, numpy=
 array([[ 0.00455413, -0.00263517,  0.00814878, ..., -0.00263517,
         -0.01701376, -0.02779771],
        [ 0.00455413, -0.00263517,  0.00814878, ..., -0.00263517,
         -0.01701376, -0.02779771]], dtype=float32)>,
 tensor([[ 0.0046, -0.0026,  0.0081,  ..., -0.0026, -0.0170, -0.0278],
         [ 0.0046, -0.0026,  0.0081,  ..., -0.0026, -0.0170, -0.0278]]))

In [4]:
tf_model = Wav2Vec2ForCTC.from_pretrained("/Users/vasudevgupta/Local/wav2vec2/wav2vec2-base-960h", input_shape=batch.shape)

Loading weights locally from `/Users/vasudevgupta/Local/wav2vec2/wav2vec2-base-960h`
Total number of loaded variables: 212


In [5]:
hf_model = HFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
tf_out = tf_model(batch, training=False)["logits"]

In [7]:
with torch.no_grad():
    hf_out = hf_model(hf_batch)["logits"]

In [8]:
tf_out.shape, hf_out.shape

(TensorShape([2, 145, 32]), torch.Size([2, 145, 32]))

In [9]:
print("difference in logits:", get_difference(tf_out, hf_out))

difference in logits: 0.003186226


In [10]:
def tf_forward(*args, **kwargs):
    return tf_model(*args, **kwargs)
tf_forward = tf.function(tf_forward, autograph=True, jit_compile=True)

In [16]:
tf_out = tf_forward(batch, training=False)["logits"]

In [17]:
print("difference in graph based model logits:", get_difference(tf_out, hf_out))

difference in graph based model logits: 0.0023155212


In [5]:
tf.saved_model.save(tf_model, "dummy")


FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.

FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.
INFO:tensorflow:Assets written to: dummy/assets
INFO:tensorflow:Assets written to: dummy/assets


In [7]:
model = tf.saved_model.load("dummy")

In [9]:
@tf.function(autograph=True, jit_compile=True)
def tf_foward(*args, **kwargs):
    return model(*args, **kwargs)

In [10]:
tf_foward(batch)

{'logits': <tf.Tensor: shape=(2, 145, 32), dtype=float32, numpy=
 array([[[ 13.001484 , -26.905651 , -26.621664 , ...,  -6.4049616,
           -8.698974 ,  -7.6948757],
         [ 13.019491 , -26.890331 , -26.60231  , ...,  -6.399632 ,
           -8.719545 ,  -7.7269363],
         [ 13.018565 , -26.89125  , -26.603218 , ...,  -6.4000196,
           -8.720039 ,  -7.727183 ],
         ...,
         [ 13.018611 , -26.891388 , -26.603424 , ...,  -6.399854 ,
           -8.719286 ,  -7.728012 ],
         [ 13.016644 , -26.893538 , -26.605778 , ...,  -6.3998675,
           -8.715607 ,  -7.730587 ],
         [ 13.01872  , -26.890413 , -26.602463 , ...,  -6.399314 ,
           -8.718278 ,  -7.7274504]],
 
        [[ 13.001543 , -26.905685 , -26.6217   , ...,  -6.4050837,
           -8.699371 ,  -7.6951613],
         [ 13.0193405, -26.8908   , -26.60276  , ...,  -6.3998156,
           -8.72005  ,  -7.7267733],
         [ 13.018505 , -26.890938 , -26.602905 , ...,  -6.3999214,
           -8.72013

In [1]:
import sys
sys.path.append("../src")

from wav2vec2.tensorflow_addons import Conv1DWithWeightNorm
import torch.nn as nn
import tensorflow as tf
import numpy as np
import torch

In [2]:
bsz = 2
seqlen = 128
c_in = 32

array = np.random.uniform(size=(bsz, seqlen, c_in))
torch_tensor = torch.from_numpy(array.reshape(bsz, c_in, seqlen)).float()
tf_tensor = tf.convert_to_tensor(array, dtype=tf.float32)

tf_tensor

(TensorShape([2, 128, 32]), torch.Size([2, 32, 128]))

In [14]:
filters = 32
kernal_size = 3
padding = 1
num_groups = 2

tf_layer = Conv1DWithWeightNorm(filters, kernal_size, padding=padding, groups=num_groups)

torch_layer = nn.Conv1d(c_in, filters, kernal_size, padding=padding, groups=num_groups)
torch_layer = nn.utils.weight_norm(torch_layer, dim=2)

In [16]:
torch_layer.weight_v.data = torch.tensor(np.transpose(tf_layer.variables[1].numpy(), axes=(2, 1, 0)))
torch_layer.weight_v.shape, tf_layer.variables[1].shape

(torch.Size([32, 16, 3]), TensorShape([3, 16, 32]))

In [17]:
torch_layer.bias.data = torch.tensor(tf_layer.variables[0].numpy())
torch_layer.bias.shape, tf_layer.variables[0].shape

(torch.Size([32]), TensorShape([32]))

In [18]:
torch_layer.weight_g.data = torch.tensor(np.transpose(tf_layer.variables[2].numpy(), axes=(2, 1, 0)))
torch_layer.weight_g.shape, tf_layer.variables[2].shape

(torch.Size([1, 1, 3]), TensorShape([3, 1, 1]))

In [26]:
np.max(torch_layer.weight_v.data.numpy() - np.transpose(tf_layer.variables[1].numpy(), axes=(2, 1, 0)))

0.0

In [27]:
np.max(torch_layer.weight_g.data.numpy() - np.transpose(tf_layer.variables[2].numpy(), axes=(2, 1, 0)))

0.0

In [29]:
np.max(torch_layer.bias.data.numpy() - tf_layer.variables[0].numpy())

0.0

In [19]:
torch_layer(torch_tensor).reshape(bsz, seqlen, filters)

tensor([[[ 0.1787,  0.7230,  0.5517,  ...,  0.7510,  0.7255,  0.6659],
         [ 0.7125,  0.6189,  1.0440,  ...,  0.6942,  0.6147,  0.9643],
         [ 1.2329,  0.8827,  0.4978,  ...,  0.6464,  0.4920,  0.7446],
         ...,
         [-0.0707,  0.0908,  0.4706,  ..., -0.1495,  0.2757,  0.4261],
         [-0.0257,  0.1898, -0.0582,  ..., -0.3518,  0.1397,  0.0668],
         [ 0.0838, -0.0829, -0.1598,  ...,  0.0656,  0.3036,  0.0909]],

        [[ 0.1115,  0.3418,  0.8315,  ...,  0.7629,  0.9815,  1.0269],
         [ 1.0417,  0.8232,  0.9397,  ...,  1.0665,  0.7630,  0.4970],
         [ 0.9158,  0.8062,  0.6307,  ...,  0.9875,  0.5777,  0.8125],
         ...,
         [ 0.7536, -0.0471,  0.2242,  ..., -0.0166,  0.3181, -0.1743],
         [ 0.3607, -0.1040,  0.1540,  ..., -0.2781, -0.3233,  0.0557],
         [ 0.3777,  0.2054,  0.2909,  ..., -0.0480,  0.3015, -0.2903]]],
       grad_fn=<ViewBackward>)

In [20]:
tf_layer(tf_tensor)

<tf.Tensor: shape=(2, 128, 32), dtype=float32, numpy=
array([[[ 0.25438696,  0.28840485, -0.23035572, ...,  0.63323516,
         -0.43268305,  0.04219631],
        [ 0.3234304 ,  0.01674652, -0.1909513 , ...,  0.19587451,
         -0.26870194,  0.08470433],
        [ 0.70794016,  0.40538904,  0.13454597, ...,  0.31508467,
         -0.75723   ,  0.5067508 ],
        ...,
        [ 0.93924737,  0.50122154,  0.07898648, ...,  0.6624937 ,
         -0.2068775 , -0.15866959],
        [ 0.7211016 ,  0.3214118 ,  0.06242212, ...,  0.47061378,
         -0.6939048 ,  0.1447068 ],
        [ 0.8113076 , -0.13959275, -0.25087345, ...,  0.30376172,
          0.09948477, -0.11622235]],

       [[ 0.16744787,  0.64003223, -0.0977418 , ...,  0.47504246,
         -0.33441946, -0.16122884],
        [ 0.55097157,  0.13681547, -0.44116858, ...,  0.744997  ,
         -0.5349021 ,  0.05508609],
        [ 1.066014  ,  0.24528717, -0.07021946, ...,  0.33745512,
         -0.11795261,  0.10919161],
        ...,
