## Install required tools

In [None]:
# Installing whisper package
!pip install git+https://github.com/openai/whisper.git

#Insalloing datasets
!pip install datasets

# Cloning whisper repository
!git clone https://github.com/openai/whisper.git

# Downloading a sample audio file
!wget https://huggingface.co/datasets/osanseviero/dummy_ja_audio/resolve/main/result.flac

## Generate Whisper TFLite model

In [None]:
import tensorflow as tf


# Importing necessary classes from transformers
from transformers import AutoProcessor, TFWhisperForConditionalGeneration, GenerationConfig

# Importing necessary functions from datasets
from datasets import load_dataset


# Forced-decoder token pairs, shared by the eager generate() call below and by
# the exported serving function.
# NOTE(review): Hugging Face `forced_decoder_ids` entries are normally
# [decoding_position, token_id]; both slots here look like token ids - confirm
# these pairs have the intended effect before relying on them.
force_token_map = [[50258, 50266], [50359, 50363]]

# Generation settings carrying the forced-token pairs
generation_config = GenerationConfig(force_token_map=force_token_map)

# Checkpoint to export. The artifact written later in this notebook is named
# `whisper-base.tflite` and the recorded run loaded `openai/whisper-base`, so
# the base checkpoint is used here to keep the model and the file name in agreement.
MODEL_CHECKPOINT = "openai/whisper-base"

# Feature extractor + tokenizer, and the TensorFlow Whisper model, from the same checkpoint
processor = AutoProcessor.from_pretrained(MODEL_CHECKPOINT)
model = TFWhisperForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)

# Small dummy LibriSpeech split used for a quick sanity check
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

# Build the model inputs. The sampling rate is passed explicitly: without it the
# processor warns that a mismatched rate could cause silent errors.
sample_audio = ds[0]["audio"]
inputs = processor(
    sample_audio["array"],
    sampling_rate=sample_audio["sampling_rate"],
    return_tensors="tf",
)
input_features = inputs.input_features

# Sanity-check transcription with the eager (non-TFLite) model
generated_ids = model.generate(input_ids=input_features, generation_config=generation_config)
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(transcription)

# Wrapper module that exposes the model's generate() call as a serving function
class GenerateModel(tf.Module):
    """tf.Module wrapper around a model so that `generate` can be exported.

    The wrapped model is stored on `self.model`; `serving` is the traced entry
    point used as the SavedModel signature.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    @tf.function(
        input_signature=[
            tf.TensorSpec(shape=(1, 80, 3000), dtype=tf.float32, name="input_ids"),
        ]
    )
    def serving(self, input_ids):
        """Run generation on one float32 input of shape (1, 80, 3000).

        Uses the module-level `force_token_map` as `forced_decoder_ids` and
        returns the generated ids under the "sequences" key.
        """
        sequences = self.model.generate(input_ids, forced_decoder_ids=force_token_map)
        return {"sequences": sequences}

# Output locations: SavedModel directory and final TFLite file
saved_model_dir = '/content/tf'
tflite_model_path = '/content/whisper-base.tflite'

# Wrap the model and export it as a SavedModel, with `serving` as the default signature
generate_model = GenerateModel(model=model)
tf.saved_model.save(
    generate_model,
    saved_model_dir,
    signatures={"serving_default": generate_model.serving},
)

# Convert the SavedModel to TFLite with the converter's default optimizations enabled
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model = converter.convert()

# Write the converted model to disk
with open(tflite_model_path, 'wb') as tflite_file:
    tflite_file.write(tflite_model)

All model checkpoint layers were used when initializing TFWhisperForConditionalGeneration.

All the layers of TFWhisperForConditionalGeneration were initialized from the model checkpoint at openai/whisper-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFWhisperForConditionalGeneration for predictions without further training.
It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


 Mr. Quilter is the apostle of the middle classes, and we are


  return py_builtins.overload_of(f)(*args)


## Run the inference on Whisper TFLite model

In [None]:
# Import necessary libraries
import whisper
import numpy as np
from timeit import default_timer as timer


# Path of the TFLite model written by the conversion cell
tflite_model_path = '/content/whisper-base.tflite'

# Load the model into a TFLite interpreter and allocate its tensors
interpreter = tf.lite.Interpreter(tflite_model_path)
interpreter.allocate_tensors()

# Tensor indices of the first input and the first output
input_tensor = interpreter.get_input_details()[0]['index']
output_tensor = interpreter.get_output_details()[0]['index']

# The timer covers feature extraction, interpreter invocation and token decoding
inference_start = timer()

# Compute the log-mel spectrogram of the test audio shipped with the cloned repository
print('Calculating mel spectrogram...')
mel_from_file = whisper.audio.log_mel_spectrogram('/content/whisper/tests/jfk.flac')

# Pad or trim to the fixed number of frames, then add the batch dimension
input_data = whisper.audio.pad_or_trim(mel_from_file, whisper.audio.N_FRAMES)
input_data = np.expand_dims(input_data, 0)

# Run the TFLite model
print("Invoking interpreter ...")
interpreter.set_tensor(input_tensor, input_data)
interpreter.invoke()

# Generated token ids, one row per batch item
output_data = interpreter.get_tensor(output_tensor)

# Tokenizer used to turn token ids back into text
wtokenizer = whisper.tokenizer.get_tokenizer(True, language="ja")

print("Converting tokens ...")
for token in output_data:
    # Keep only text tokens: drop the -100 padding values and every special
    # token (ids at or above the end-of-text id). Filtering by hand instead of
    # passing `skip_special_tokens=True` keeps this working with current
    # Whisper releases, whose tiktoken-based decode() rejects that keyword.
    # Ids are converted to plain Python ints for the tokenizer.
    text_token_ids = [int(t) for t in token if t != -100 and t < wtokenizer.eot]
    text = wtokenizer.decode(text_token_ids)
    print(text)

print("\nInference took {:.2f}s ".format(timer() - inference_start))

Calculating mel spectrogram...
Invoking interpreter ...
Converting tokens ...
 And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.

Inference took 12.02s 


In [None]:
# List the current directory (long format, including hidden entries)
%ls -la

total 122808
drwxr-xr-x 1 root root      4096 Jan 26 00:57 ./
drwxr-xr-x 1 root root      4096 Jan 26 00:47 ../
drwxr-xr-x 4 root root      4096 Jan 24 14:37 .config/
-rw-r--r-- 1 root root     70166 Oct  7 14:23 result.flac
-rw-r--r-- 1 root root     70166 Oct  7 14:23 result.flac.1
drwxr-xr-x 1 root root      4096 Jan 24 14:38 sample_data/
drwxr-xr-x 4 root root      4096 Jan 26 00:56 tf/
drwxr-xr-x 8 root root      4096 Jan 26 00:48 whisper/
-rw-r--r-- 1 root root 125580904 Jan 26 00:57 whisper-base.tflite
