In [None]:
import importlib

if importlib.util.find_spec('transformers') is None:
    print('Installing transformers...')
    ! pip install transformers
    ! pip install pyctcdecode
    ! pip install https://github.com/kpu/kenlm/archive/master.zip
else:
    print('libraries is already installed')

In [None]:
from pathlib import Path
# from scipy.io import wavfile
# import scipy.signal
import pandas as pd
from tqdm.auto import tqdm
# import seaborn as sns
# import matplotlib.pyplot as plt
# from collections import Counter
import numpy as np
# import os
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score, classification_report
# from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
# from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Dropout, Conv2D, MaxPooling2D, Flatten, BatchNormalization
# from tensorflow.keras.models import Sequential
# from sklearn.preprocessing import LabelEncoder
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.callbacks import EarlyStopping
# from tensorflow.keras.regularizers import l1, l2
from transformers import AutoProcessor, TFWav2Vec2Model
from tensorflow.keras import mixed_precision

## Mount Drive

In [None]:
def is_running_on_colab():
    """Report whether this notebook is executing inside Google Colab.

    Returns:
        bool: True when the ``google.colab`` package can be imported,
        False when that import raises ``ImportError``.
    """
    try:
        # Imported purely as an availability probe; the module itself is unused.
        import google.colab
    except ImportError:
        return False
    else:
        return True


# Evaluated once; later cells branch on this flag to choose the data location.
ON_COLAB = is_running_on_colab()
ON_COLAB

True

In [None]:
# Pick the folder that holds the intermediate .npy artefacts: a relative project
# path for local runs, or a Google Drive folder (mounted here) when on Colab.
if not ON_COLAB:
    intermediate_folder = Path('..', 'data', 'intermediate')
else:
    from google.colab import drive

    drive.mount('/content/gdrive')
    intermediate_folder = Path('/content/gdrive/MyDrive/Colab Notebooks/Speech recognition')
    # Alternative Drive location:
    # intermediate_folder = Path('/content/gdrive/MyDrive/Temp/Speech recognition project')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Read data

In [None]:
# Sampling rate handed to the processor (as `sampling_rate`) for every data split.
SAMPLE_RATE = 16_000

In [None]:
# Checkpoint used for both the input processor and the TF wav2vec2 encoder.
# Alternative checkpoint: 'jonatasgrosman/wav2vec2-large-xlsr-53-english'
model_name = 'facebook/wav2vec2-base-960h'

# The processor is loaded first, then the model, both from the same checkpoint.
processor, model = (
    AutoProcessor.from_pretrained(model_name),
    TFWav2Vec2Model.from_pretrained(model_name),
)


TFWav2Vec2Model has backpropagation operations that are NOT supported on CPU. If you wish to train/fine-tine this model, you need a GPU or a TPU
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFWav2Vec2Model: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing TFWav2Vec2Model from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFWav2Vec2Model from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFWav2Vec2Model were not initialized from the PyTorch model and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inf

In [None]:
# Load the raw training audio array saved by an earlier pipeline step.
train_audio_path = intermediate_folder / 'train_main_1_sec_audio.npy'
X_train = np.load(train_audio_path)
X_train.shape

(33566, 16000)

In [None]:
# Run the training audio through the processor and keep only its
# `input_values` tensor; the raw array is dropped afterwards to free memory.
X_train_input = processor(
    X_train,
    sampling_rate=SAMPLE_RATE,
    return_tensors="tf",
)["input_values"]
del X_train
X_train_input.shape

TensorShape([33566, 16000])

In [None]:
@tf.function(jit_compile=True)
def predict_fn(model, input_data):
    """Call ``model`` on ``input_data`` and return its last hidden state.

    The function is wrapped in ``tf.function`` with ``jit_compile=True``.

    Args:
        model: Callable whose result can be indexed with the key
            ``'last_hidden_state'``.
        input_data: Input passed unchanged to ``model``.

    Returns:
        The ``'last_hidden_state'`` entry of the model's output.
    """
    outputs = model(input_data)
    return outputs['last_hidden_state']

In [None]:
# Switch the global Keras dtype policy to 'mixed_float16'.
# NOTE(review): `model` was already instantiated in an earlier cell. Keras layers
# normally take their dtype policy at construction time, so this call may not
# change how that model computes -- confirm whether the policy was meant to be
# set before the model is loaded.
mixed_precision.set_global_policy('mixed_float16')

In [None]:
# Embed the training clips one at a time: each row is reshaped into a
# single-item batch, passed through the model, and the per-clip results are
# concatenated along the first axis.
X_train_output = np.concatenate([
    predict_fn(model, tf.reshape(X_train_input[row], (1, -1)))
    for row in tqdm(range(len(X_train_input)))
])
# The processed inputs are no longer needed once the embeddings exist.
del X_train_input
X_train_output.shape

  0%|          | 0/33566 [00:00<?, ?it/s]

(33566, 49, 768)

In [None]:
# BATCH_SIZE = 32  # Define your batch size

# X_train_output_list = []
# for i in tqdm(range(0, len(X_train_input), BATCH_SIZE)):
#     batch_data = X_train_input[i : i + BATCH_SIZE]
#     batch_pred = predict_fn(model, batch_data)
#     X_train_output_list.append(batch_pred)

# # Combine all batch predictions
# X_train_output = np.concatenate(X_train_output_list)
# # del X_train_input
# # del X_train_output_list
# X_train_output.shape

In [None]:
# Persist the training embeddings, then release the in-memory copy.
train_emb_path = intermediate_folder / 'train_main_1_sec_wav2vec_emb.npy'
np.save(train_emb_path, X_train_output)
del X_train_output

In [None]:
# Load the raw validation audio array saved by an earlier pipeline step.
val_audio_path = intermediate_folder / 'val_main_1_sec_audio.npy'
X_val = np.load(val_audio_path)
X_val.shape

(4619, 16000)

In [None]:
# Run the validation audio through the processor and keep only its
# `input_values` tensor; the raw array is dropped afterwards to free memory.
X_val_input = processor(
    X_val,
    sampling_rate=SAMPLE_RATE,
    return_tensors="tf",
)["input_values"]
del X_val
X_val_input.shape

TensorShape([4619, 16000])

In [None]:
# X_val_output = model.predict(X_val_input, batch_size=BATCH_SIZE)
# del X_val_input
# X_val_output.shape

In [None]:
# Embed the validation clips one at a time: each row is reshaped into a
# single-item batch, passed through the model, and the per-clip results are
# concatenated along the first axis.
X_val_output = np.concatenate([
    predict_fn(model, tf.reshape(X_val_input[row], (1, -1)))
    for row in tqdm(range(len(X_val_input)))
])
# The processed inputs are no longer needed once the embeddings exist.
del X_val_input
X_val_output.shape

  0%|          | 0/4619 [00:00<?, ?it/s]

(4619, 49, 768)

In [None]:
# Persist the validation embeddings, then release the in-memory copy.
val_emb_path = intermediate_folder / 'val_main_1_sec_wav2vec_emb.npy'
np.save(val_emb_path, X_val_output)
del X_val_output

In [None]:
# Load the raw test audio array saved by an earlier pipeline step.
test_audio_path = intermediate_folder / 'test_main_1_sec_audio.npy'
X_test = np.load(test_audio_path)
X_test.shape

(4689, 16000)

In [None]:
# Run the test audio through the processor and keep only its
# `input_values` tensor; the raw array is dropped afterwards to free memory.
X_test_input = processor(
    X_test,
    sampling_rate=SAMPLE_RATE,
    return_tensors="tf",
)["input_values"]
del X_test
X_test_input.shape

TensorShape([4689, 16000])

In [None]:
# X_test_output = model.predict(X_test_input, batch_size=BATCH_SIZE)
# del X_test_input
# X_test_output.shape

In [None]:
# Embed the test clips one at a time: each row is reshaped into a
# single-item batch, passed through the model, and the per-clip results are
# concatenated along the first axis.
X_test_output = np.concatenate([
    predict_fn(model, tf.reshape(X_test_input[row], (1, -1)))
    for row in tqdm(range(len(X_test_input)))
])
# The processed inputs are no longer needed once the embeddings exist.
del X_test_input
X_test_output.shape

  0%|          | 0/4689 [00:00<?, ?it/s]

(4689, 49, 768)

In [None]:
# Persist the test embeddings, then release the in-memory copy.
test_emb_path = intermediate_folder / 'test_main_1_sec_wav2vec_emb.npy'
np.save(test_emb_path, X_test_output)
del X_test_output