In [1]:
# Copy the zipped model from Google Drive into /content.
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount cell is visible here; confirm.
!cp /content/drive/MyDrive/yesno_zoomasr_model.zip /content

In [2]:
!unzip /content/yesno_zoomasr_model.zip

Archive:  /content/yesno_zoomasr_model.zip
   creating: content/model/
 extracting: content/model/tokenizer_vocab.txt  
   creating: content/model/zoomasr/
 extracting: content/model/zoomasr/fingerprint.pb  
   creating: content/model/zoomasr/variables/
  inflating: content/model/zoomasr/variables/variables.data-00000-of-00001  
  inflating: content/model/zoomasr/variables/variables.index  
  inflating: content/model/zoomasr/saved_model.pb  
  inflating: content/model/zoomasr/keras_metadata.pb  
   creating: content/model/zoomasr/assets/


In [4]:

import json
import tensorflow as tf
from tensorflow import keras

# Load the tokenizer vocabulary from the plain-text vocabulary file.
# The file contains Sinhala characters, so decode it explicitly as UTF-8 instead of
# relying on the platform's default text encoding.
with open("content/model/tokenizer_vocab.txt", "r", encoding="utf-8") as file:
    idx_to_char = file.read()

# Every individual character of the file becomes one vocabulary entry, in file order.
# NOTE(review): this also splits the literal "<UNK>" marker into five separate entries
# ('<', 'U', 'N', 'K', '>'). The resulting index order presumably matches what the
# model was trained with, so it is deliberately left as is — confirm before changing.
characters = list(idx_to_char)
# Mapping characters to integers
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
# Mapping integers back to original characters
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)

print(
    f"The vocabulary is: {char_to_num.get_vocabulary()} "
    f"(size ={char_to_num.vocabulary_size()})"
)

import numpy as np
# STFT parameters consumed by encode_aud (passed straight to tf.signal.stft).
# Window length, in samples.
frame_length = 256
# Hop size: number of samples to step between consecutive windows.
frame_step = 160
# Size of the FFT applied to each window.
# If not provided, tf.signal.stft uses the smallest power of 2 enclosing frame_length.
fft_length = 384


def encode_aud(wav_file):
    """
    Encode a WAV file as a normalized magnitude spectrogram.

    The STFT is computed with the module-level ``frame_length``, ``frame_step``
    and ``fft_length`` settings.

    params : wav_file - path to a WAV audio file
    return : 2-D float tensor (time frames x frequency bins); each frame is
             normalized to zero mean / unit standard deviation across its bins
    """
    file = tf.io.read_file(wav_file)
    # Request exactly one channel: mono files decode exactly as before, while for
    # multi-channel files the additional channels are ignored instead of making
    # the squeeze below raise.
    audio, _ = tf.audio.decode_wav(file, desired_channels=1)
    # Drop the (now always size-1) channel axis -> 1-D waveform.
    audio = tf.squeeze(audio, axis=-1)
    audio = tf.cast(audio, tf.float32)
    spectrogram = tf.signal.stft(
        audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length
    )
    # Keep only the magnitude of the complex STFT, then compress it with a square root.
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    # Normalize along axis 1 (the frequency bins of each frame); the small epsilon
    # guards against division by zero for constant frames.
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)

    return spectrogram


def decode_batch_predictions(pred):
    """
    Decode a batch of network outputs into text.

    params : pred - batch of model predictions; axis 0 is the batch and
             axis 1 the time steps
    return : list with one decoded string per batch element
    """
    batch_size, time_steps = pred.shape[0], pred.shape[1]
    # Every sequence in the batch is decoded over its full number of time steps.
    input_len = np.ones(batch_size) * time_steps
    # Greedy search; for more complex tasks beam search could be used instead.
    decoded, _ = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)
    best_paths = decoded[0]
    # Map each index sequence back to characters and join them into one string.
    return [
        tf.strings.reduce_join(num_to_char(label_ids)).numpy().decode("utf-8")
        for label_ids in best_paths
    ]


def CTCLoss(y_true, y_pred):
    """
    Training-time CTC loss.

    Every sample in the batch is given the same input length (axis 1 of
    ``y_pred``) and the same label length (axis 1 of ``y_true``).

    params : y_true - batch of label sequences
             y_pred - batch of network outputs
    return : the value of keras.backend.ctc_batch_cost for the batch
    """
    n_batch = tf.cast(tf.shape(y_true)[0], dtype="int64")
    time_steps = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_steps = tf.cast(tf.shape(y_true)[1], dtype="int64")

    # Column of ones used to broadcast the scalar lengths to shape (batch, 1).
    per_sample = tf.ones(shape=(n_batch, 1), dtype="int64")

    return keras.backend.ctc_batch_cost(
        y_true, y_pred, time_steps * per_sample, label_steps * per_sample
    )




# Restore the saved model from the extracted archive. CTCLoss is made available
# under the name 'CTCLoss' for the duration of the load via the custom object scope.
# NOTE(review): the path is relative, so this presumably relies on the working
# directory being the one the archive was unzipped in — confirm.
with keras.utils.custom_object_scope({'CTCLoss': CTCLoss}):
    loaded_model = tf.keras.models.load_model("content/model/zoomasr")




The vocabulary is: ['', '<', 'U', 'N', 'K', '>', 'ැ', ' ', 'ඔ', 'ව', '්', 'න', 'හ'] (size =13)


In [6]:

# -----------------------------------------------------------
# Set the path of the WAV file to transcribe, then run this cell.
# -----------------------------------------------------------
aud = "/content/0_0_1_0_0_0_1_0.wav"

# Turn the file into spectrogram features and add a leading batch axis of size 1.
aud = tf.expand_dims(encode_aud(aud), axis=0)
# Run the model on the batch, then decode and detokenize its output into text.
pred = decode_batch_predictions(loaded_model.predict(aud))
print(pred)


['නැහැ නැහැ ඔව් නැහැ නැහැ නැහැ ඔව් නැහැ']
