<a href="https://colab.research.google.com/github/uigiporc/icon-sr/blob/main/Speech_to_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initializing the environment


## Check VM region

## **`If the region is not us-central1, don't proceed with the model `**

In [None]:
import requests
import ipaddress
import time

def check_region(ip):
    response = requests.get("https://www.gstatic.com/ipranges/cloud.json")
    data = response.json()
    try:
      for prefix in data['prefixes']:
        if 'ipv4Prefix' in prefix:
          if ipaddress.IPv4Address(ip) in ipaddress.IPv4Network(prefix['ipv4Prefix']):
            print(f"Found network address for {ip}: {prefix['ipv4Prefix']} Region: {prefix['scope']}")

            #Whenever we don't get a TPU in US-CENTRAL1, we kill the process for data transfer costs reasons, since the bucket is located in that region
            if 'us-central1' not in prefix['scope']:
              raise ValueError(f"Region found: {prefix['scope']}, Desidered region: us-central1")
            return
      raise ValueError(f"Can't find network for this machine with ip {ip}")
    except Exception as e:
      print(e)
      print("Killing the process in 5 seconds.")
      time.sleep(5)
      !kill -9 -1

# Ask an external echo service for this VM's public IP. The timeout keeps the
# cell from hanging, and strip() removes any trailing newline so the address
# parses cleanly inside check_region.
ip = requests.get('https://ipecho.net/plain', timeout=30).text.strip()
check_region(ip)

Found network address for 35.188.133.199: 35.188.128.0/18 Region: us-central1


## Resolve dependencies

In [None]:
!pip install jiwer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
from jiwer import wer
import os

In [None]:
# Per-replica batch size. The effective batch size is BATCH_SIZE when training
# on CPU/GPU and BATCH_SIZE multiplied by the number of TPU replicas
# (e.g. BATCH_SIZE*8) when training on TPU.
BATCH_SIZE = 16

# Set to True if using a Google Cloud Bucket as your dataset source. Mandatory when training on TPU.
USE_GCS = True

# Base bucket path (gs:// URI) under which the datasets and checkpoints are stored
GCS_BUCKET_PATH = "gs://progetto-icon-cucinotta-porcelli"

# Google Cloud project name used to sync permissions (passed to `gcloud config set project`)
GCS_PROJECT_NAME = 'progetto-icon'

## Initialize backend (TPU/GPU)

In [None]:
# True only when a TPU was detected AND a GCS bucket is configured.
TPU_FLAG = False

try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
except ValueError:
  # TPUClusterResolver raises ValueError when no TPU can be resolved.
  print('No TPU found. Falling back to GPU')
else:
  # Explicit check instead of `assert`: asserts are stripped under `python -O`,
  # and the error must not be swallowed by the `except ValueError` above.
  if not USE_GCS:
    raise ValueError('A Google Cloud Storage bucket must be specified when using a TPU. Local files are not supported.')
  TPU_FLAG = True
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])

if TPU_FLAG:
  tf.config.experimental_connect_to_cluster(tpu)
  tf.tpu.experimental.initialize_tpu_system(tpu)
  tpu_strategy = tf.distribute.TPUStrategy(tpu)
else:
  gpus = tf.config.list_physical_devices('GPU')
  if gpus:
    try:
      # Currently, memory growth needs to be the same across GPUs
      for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
      # Listing logical devices initializes the GPUs, so it must happen only
      # after memory growth has been configured on every physical GPU.
      logical_gpus = tf.config.experimental.list_logical_devices('GPU')
      print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
      # Memory growth must be set before GPUs have been initialized
      print(e)
  else:
    print('No GPU found. Falling back to CPU (expect very long training times)')

### Authenticate to Google Cloud and set a project
### Mount Google Drive as the dataset source when GCS is not used (`USE_GCS = False`)

In [None]:
from google.colab import drive, auth

print("Authenticating Colab session:")
auth.authenticate_user()

if USE_GCS:
  print("Authenticating to Google Cloud to use GCS:")
  #https://stackoverflow.com/questions/61448884/how-to-connect-to-private-storage-bucket-using-the-google-colab-tpu
  !gcloud config set project {GCS_PROJECT_NAME}
  project_name = !gcloud config get-value project
  print(f"Connected to {project_name} project")
  try: 
    tf.io.gfile.exists(GCS_BUCKET_PATH)
  except:
    raise ValueError('Bucket name does not exist')
else:
  print("Connecting to Google Drive:")
  drive.mount('/content/drive')
  # make space and create a local folder for the dataset for better I/O performances
  !rm -r sample_data/
  !mdkir dataset
  !mkdir checkpoints

Authenticating Colab session:
Authenticating to Google Cloud to use GCS:
Updated property [core/project].
Connected to ['progetto-icon'] project


# Processing functions

## Dataset Utilities

In [None]:
def parse_tfr_audio_element(element):
    """Decode one serialized TFRecord example into a (spectrogram, text) pair.

    The record is expected to carry two scalar string features, 'spectrogram'
    and 'text', each holding a serialized tensor.

    Args:
        element: a serialized example, as yielded by a TFRecordDataset.

    Returns:
        Tuple (spectrogram, text): the spectrogram deserialized as a float32
        tensor and the text deserialized as an int64 tensor.
    """
    # Both features are stored as scalar byte strings.
    schema = {
        key: tf.io.FixedLenFeature([], tf.string)
        for key in ('spectrogram', 'text')
    }
    parsed = tf.io.parse_single_example(element, schema)

    return (
        tf.io.parse_tensor(parsed['spectrogram'], tf.float32),
        tf.io.parse_tensor(parsed['text'], tf.int64),
    )

def load_dataset(train_files, batch_size=BATCH_SIZE, buf_size=5000):
  """Build the input pipeline for a list of TFRecord files.

  Records are parsed with parse_tfr_audio_element, shuffled, padded and batched,
  then prefetched.

  Args:
      train_files: TFRecord file names to read.
      batch_size: number of examples per batch; incomplete final batches are dropped.
      buf_size: shuffle buffer size. Shuffling is only uniform when this is at
          least the number of examples; larger buffers need more RAM.

  Returns:
      A tf.data.Dataset of (spectrogram, label) batches, padded to
      [3328, 128] and [640] respectively.
  """
  # Reading order does not matter for us, so trade determinism for speed.
  pipeline_options = tf.data.Options()
  pipeline_options.experimental_deterministic = False

  # No .cache(): large datasets do not fit in memory.
  return (
      tf.data.TFRecordDataset(filenames=train_files)
      .with_options(pipeline_options)
      .map(parse_tfr_audio_element, num_parallel_calls=tf.data.AUTOTUNE)
      .shuffle(buffer_size=buf_size)
      # Pad to the closest multiple of 128 that fits our data for better TPU
      # performance: 3328 timesteps for spectrograms, 640 characters for labels.
      .padded_batch(batch_size, padded_shapes=([3328,128], [640]), drop_remainder=True)
      # Overlap preprocessing with model execution.
      .prefetch(tf.data.AUTOTUNE)
  )

## Load datasets

In [None]:
# The set of characters accepted in the transcription.
characters = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ' ")
# Mapping characters to integers
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
# Mapping integers back to original characters
num_to_char = keras.layers.StringLookup(vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True)

batch_size = BATCH_SIZE

# File lists are sorted so that the train/eval split below is identical on every
# run (glob order is not guaranteed), which matters when resuming from a checkpoint.
if USE_GCS:
  dataset_filenames = sorted(tf.io.gfile.glob(GCS_BUCKET_PATH + "/librispeech-train-360-clean/librispeech-train-clean-360.tfrecord*"))
  test_files = sorted(tf.io.gfile.glob(GCS_BUCKET_PATH + "/test-clean/librispeech-test-clean.tfrecord*"))
else:
  drive_filenames = tf.io.gfile.glob("/content/drive/MyDrive/Datasets/Librispeech/dev-clean/*.tfrecord")
  # Copy file by file: interpolating the Python list into a shell `cp` would
  # pass its repr ("['a', 'b']") instead of the individual paths.
  tf.io.gfile.makedirs("/content/dataset")
  for drive_file in drive_filenames:
    tf.io.gfile.copy(drive_file,
                     os.path.join("/content/dataset", os.path.basename(drive_file)),
                     overwrite=True)
  dataset_filenames = sorted(tf.io.gfile.glob("/content/dataset/*.tfrecord"))
  # No test split is available on Drive; define the name so the cell still runs.
  test_files = []

# 80/20 split for train/eval dataset
split_ind = int(0.8 * len(dataset_filenames))
training_files, eval_files = dataset_filenames[:split_ind], dataset_filenames[split_ind:]

print("Train TFRecord Files:", len(training_files))
print("Validation TFRecord Files:", len(eval_files))
print("Test TFRecord Files", len(test_files))

if TPU_FLAG:
  print(f"Detected TPU with {tpu_strategy.num_replicas_in_sync} cores, increasing batch size for efficiency")
  batch_size = BATCH_SIZE * tpu_strategy.num_replicas_in_sync

ds_train = load_dataset(training_files, batch_size)
ds_eval = load_dataset(eval_files, batch_size)
# ds_test is None when there are no test files to read.
ds_test = load_dataset(test_files, batch_size) if test_files else None

print(f"Effective batch size: {batch_size}")

Train TFRecord Files: 80
Validation TFRecord Files: 20
Test TFRecord Files 10
Detected TPU with 8 cores, increasing batch size for efficiency
Effective batch size: 128


### Test if the dataset has been processed and loaded correctly

In [None]:
#for batch in ds_train.take(1):
#    print(batch)
#    spectrogram = batch[0][0].numpy()
#    print(batch[0][0])
#    spectrogram = np.array([np.trim_zeros(x) for x in np.transpose(spectrogram)])
#    label = batch[1][0]
#    # Spectrogram
#    print(label)

## Custom subclass of ModelCheckpoint
At the end of each epoch, it calculates and prints the Word Error Rate, then saves the model as a TF-format checkpoint.

In [None]:
# A utility function to decode the output of the network
def decode_batch_predictions(pred):
    """Turn a batch of network outputs into a list of transcription strings.

    Args:
        pred: batch of per-timestep class probabilities, indexed as
            (batch, time, classes).

    Returns:
        A list with one decoded UTF-8 string per batch element.
    """
    # Every sequence in the batch is decoded over its full time dimension.
    sequence_lengths = np.ones(pred.shape[0]) * pred.shape[1]
    # Greedy search. For complex tasks, beam search can be used instead.
    decoded = keras.backend.ctc_decode(
        pred, input_length=sequence_lengths, greedy=True
    )[0][0]
    # Map the integer sequences back to characters and join them into text.
    return [
        tf.strings.reduce_join(num_to_char(sequence)).numpy().decode("utf-8")
        for sequence in decoded
    ]

class WERCheckpoint(keras.callbacks.ModelCheckpoint):
  """ModelCheckpoint that evaluates the Word Error Rate before saving.

  Whenever a save is due, the model transcribes a few batches of `dataset`,
  the WER against the reference labels is computed and (if verbose) printed
  together with two random sample transcriptions. With `save_best_only=True`
  the model is saved only when the WER improves on the best value seen so far.
  """

  def __init__(self, filepath, dataset, verbose=0, save_best_only=False,
    save_weights_only=False, mode='auto', save_freq='epoch',
    options=None):
    """Create the callback.

    Args:
        filepath: where to save the model (forwarded to ModelCheckpoint).
        dataset: tf.data.Dataset of (spectrogram, label) batches used to
          compute the WER.
        verbose, save_best_only, save_weights_only, mode, save_freq, options:
          forwarded unchanged to keras.callbacks.ModelCheckpoint.
    """
    super().__init__(filepath = filepath,
                         verbose = verbose,
                         save_best_only = save_best_only,
                         save_weights_only = save_weights_only,
                         mode = mode,
                         save_freq = save_freq,
                         options = options)
    self.dataset = dataset

  def _print_wer(self, wer, predictions, targets):
    """Print the WER followed by two randomly chosen target/prediction pairs."""
    print("-" * 100)
    print(f"Word Error Rate: {wer:.4f}")
    print("-" * 100)
    # np.random.randint(0, 0, ...) raises, so skip the samples when there are none.
    if not predictions:
      return
    for i in np.random.randint(0, len(predictions), 2):
      print(f"Target    : {targets[i]}")
      print(f"Prediction: {predictions[i]}")
      print("-" * 100)

  def _transcribe_samples(self, num_batches=5):
    """Return (predictions, targets) as lists of strings for `num_batches` batches."""
    predictions = []
    targets = []
    for spectrograms, labels in self.dataset.take(num_batches):
      # Use the model attached to this callback rather than a notebook global.
      pred = self.model.predict(spectrograms)
      predictions.extend(decode_batch_predictions(pred))
      for label in labels:
        targets.append(
            tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8"))
    return predictions, targets

  def _write_checkpoint(self, filepath):
    """Save the weights or the full model to `filepath`, per `save_weights_only`."""
    if self.save_weights_only:
      self.model.save_weights(
          filepath, overwrite=True, options=self._options)
    else:
      self.model.save(filepath, overwrite=True, options=self._options)

  def _save_model(self, epoch, batch, logs):
    """Saves the model.

    Args:
        epoch: the epoch this iteration is in.
        batch: the batch this iteration is in. `None` if the `save_freq`
          is set to `epoch`.
        logs: the `logs` dict passed in to `on_batch_end` or `on_epoch_end`.

    Raises:
        IOError: if `filepath` is an existing directory, or for any other
          I/O failure while saving.
    """
    logs = logs or {}

    # Act only when the saving interval is reached.
    if not (isinstance(self.save_freq, int) or
            self.epochs_since_last_save >= self.period):
      return

    self.epochs_since_last_save = 0
    filepath = self._get_file_path(epoch, batch, logs)

    try:
      # The WER is needed to select the best model or to report progress;
      # otherwise the (expensive) transcription pass is skipped.
      current = None
      predictions, targets = [], []
      if self.save_best_only or self.verbose > 0:
        predictions, targets = self._transcribe_samples()
        current = wer(targets, predictions)

      if self.save_best_only:
        if self.monitor_op(current, self.best):
          if self.verbose > 0:
            print('\nEpoch %05d: WER improved from %0.5f to %0.5f,'
                  ' saving model to %s' % (epoch + 1,
                                           self.best, current, filepath))
            self._print_wer(current, predictions, targets)
          # Tracking the best value and saving must not depend on verbosity.
          self.best = current
          self._write_checkpoint(filepath)
        elif self.verbose > 0:
          print('\nEpoch %05d: WER did not improve from %0.5f' %
                (epoch + 1, self.best))
          self._print_wer(current, predictions, targets)
      else:
        if self.verbose > 0:
          print('\nEpoch %05d: saving model to %s' % (epoch + 1, filepath))
          self._print_wer(current, predictions, targets)
        self._write_checkpoint(filepath)

      self._maybe_remove_file()
    except IsADirectoryError as e:  # h5py 3.x
      raise IOError('Please specify a non-directory filepath for '
                    'ModelCheckpoint. Filepath used is an existing '
                    'directory: {}'.format(filepath)) from e
    except IOError as e:  # h5py 2.x
      # `e.errno` appears to be `None` so checking the content of `e.args[0]`.
      if 'is a directory' in str(e.args[0]).lower():
        raise IOError('Please specify a non-directory filepath for '
                      'ModelCheckpoint. Filepath used is an existing '
                      'directory: {}'.format(filepath)) from e
      # Re-throw the error for any other causes.
      raise

## Defining the Callbacks

In [None]:
# Checkpoints go to the bucket when GCS is in use, to Google Drive otherwise.
# `{epoch:03d}` is filled in by the callback with the zero-padded epoch number.
checkpoint_filepath = (
    GCS_BUCKET_PATH + '/360h_checkpoints/saved_cloud_model.{epoch:03d}'
    if USE_GCS
    else '/content/drive/MyDrive/Datasets/Librispeech/dev-clean/checkpoint/saved_model.{epoch:03d}'
)

# Report the WER on the validation set and save the full model at every save point.
custom_cb = WERCheckpoint(
    filepath=checkpoint_filepath,
    dataset=ds_eval,
    verbose=1,
    save_best_only=False,
    save_weights_only=False,
)

# Training

### Define custom loss function based on Connectionist Temporal Classification (CTC)

In [None]:
def CTCLossNew(y_true, y_pred):
    """Connectionist Temporal Classification loss for softmax model outputs.

    Args:
        y_true: batch of integer label sequences, indexed as (batch, label_steps).
        y_pred: batch of per-timestep class probabilities, indexed as
            (batch, time, classes).

    Returns:
        The per-example CTC loss with a trailing axis added, i.e. shape (batch, 1).
    """
    labels = tf.cast(y_true, tf.int32)
    pred_shape = tf.shape(y_pred)
    label_shape = tf.shape(labels)

    # Every sequence is treated as spanning the full (padded) length.
    logit_length = tf.fill([pred_shape[0]], pred_shape[1])
    label_length = tf.fill([label_shape[0]], label_shape[1])

    # tf.nn.ctc_loss expects logits rather than probabilities: move the softmax
    # output to log space (epsilon avoids log(0)) and make it time-major.
    time_major_log_probs = tf.math.log(tf.transpose(y_pred, perm=[1, 0, 2]) + 1e-7)

    per_example_loss = tf.nn.ctc_loss(
        labels=labels,
        logits=time_major_log_probs,
        label_length=label_length,
        logit_length=logit_length,
        logits_time_major=True,
        # The last class is the CTC blank.
        blank_index=-1,
    )

    return tf.expand_dims(per_example_loss, 1)

### Define model and how to load previous checkpoint

In [None]:
def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):
    """Build and compile a model similar to DeepSpeech2.

    Architecture: two Conv2D + BatchNorm + ReLU blocks, a stack of
    bidirectional GRUs with dropout in between, a dense ReLU layer and a
    softmax classifier with `output_dim + 1` classes.

    Args:
        input_dim: number of features per spectrogram timestep.
        output_dim: number of output classes, not counting the extra one
            added by the classification layer.
        rnn_layers: number of bidirectional GRU layers.
        rnn_units: units per GRU direction.

    Returns:
        A keras.Model named "DeepSpeech_2", compiled with Adam (lr 1e-4) and
        CTCLossNew. The input length is fixed to 3328 timesteps.
    """
    input_spectrogram = layers.Input((3328, input_dim), name="input")
    # Add a channel axis so that 2D convolutions can be applied.
    x = layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram)

    # (layer name, kernel size, strides) of each convolutional block.
    conv_blocks = (
        ("conv_1", [11, 41], [2, 2]),
        ("conv_2", [11, 21], [1, 2]),
    )
    for conv_name, kernel, stride in conv_blocks:
        x = layers.Conv2D(
            filters=32,
            kernel_size=kernel,
            strides=stride,
            padding="same",
            use_bias=False,
            name=conv_name,
        )(x)
        x = layers.BatchNormalization(name=f"{conv_name}_bn")(x)
        x = layers.ReLU(name=f"{conv_name}_relu")(x)

    # Merge the feature and channel axes to feed the recurrent layers.
    x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)

    for i in range(1, rnn_layers + 1):
        gru = layers.GRU(
            units=rnn_units,
            activation="tanh",
            recurrent_activation="sigmoid",
            use_bias=True,
            return_sequences=True,
            reset_after=True,
            name=f"gru_{i}",
        )
        x = layers.Bidirectional(
            gru, name=f"bidirectional_{i}", merge_mode="concat"
        )(x)
        # Dropout between recurrent layers, but not after the last one.
        if i != rnn_layers:
            x = layers.Dropout(rate=0.5)(x)

    x = layers.Dense(units=rnn_units * 2, name="dense_1")(x)
    x = layers.ReLU(name="dense_1_relu")(x)
    x = layers.Dropout(rate=0.5)(x)

    # One extra output unit on top of output_dim.
    output = layers.Dense(units=output_dim + 1, activation="softmax")(x)

    model = keras.Model(input_spectrogram, output, name="DeepSpeech_2")
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-4),
        loss=CTCLossNew,
    )
    return model

# This function loads the model (including the state of the optimizer) and weights
def get_model_from_cp(path):
  """Restore the most recent checkpoint found in `path`, or build a new model.

  Checkpoints are recognised by a numeric suffix after the last dot of their
  name (e.g. `saved_cloud_model.078`, with or without a trailing slash); the
  one with the highest number is loaded, including the optimizer state.

  Args:
      path: directory (local or gs://) that contains the saved checkpoints.

  Returns:
      Tuple (model, epochs): the restored or freshly built model, and the
      epoch number of the restored checkpoint (0 for a new model).
  """
  # listdir raises on a missing directory; treat that like an empty one.
  entries = tf.io.gfile.listdir(path) if tf.io.gfile.exists(path) else []

  # Pick the newest checkpoint explicitly: listing order is not guaranteed.
  latest_epoch, latest_name = None, None
  for name in entries:
    epoch = _checkpoint_epoch(name)
    if epoch is not None and (latest_epoch is None or epoch > latest_epoch):
      latest_epoch, latest_name = epoch, name

  if latest_name is not None:
    print(f"Checkpoint found. Loading...")
    model = keras.models.load_model(os.path.join(path, latest_name), custom_objects={ 'CTCLossNew': CTCLossNew })
    epochs = latest_epoch
    print(f"Restored model from checkpoint {epochs}")
  else:
    print(f"Could not find checkpoint in {path}. Building new model:")
    model = build_model(input_dim=128, output_dim=char_to_num.vocabulary_size(), rnn_units=256)
    epochs = 0

  return model, epochs


def _checkpoint_epoch(name):
  """Return the epoch encoded after the last '.' of `name`, or None if there is none."""
  # Directory entries may be listed with a trailing slash, depending on the filesystem.
  suffix = name.rstrip('/').rpartition('.')[2]
  return int(suffix) if suffix.isdecimal() else None

In [None]:
# Restore from the directory the checkpoint callback writes to, so that saving
# and loading always agree (bucket when USE_GCS is set, Google Drive otherwise).
checkpoint_dir = os.path.dirname(checkpoint_filepath)

if TPU_FLAG:
  # we need to instantiate and compile the model, optimizer and metrics in the TPU scope
  with tpu_strategy.scope():
    print("Compiling model for TPU")
    model, last_epoch = get_model_from_cp(checkpoint_dir)
else:
  print("Compiling model for GPU/CPU")
  model, last_epoch = get_model_from_cp(checkpoint_dir)

model.summary()

Compiling model for TPU
Checkpoint found. Loading...
Restored model from checkpoint 78
Model: "DeepSpeech_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 3328, 128)]       0         
_________________________________________________________________
expand_dim (Reshape)         (None, 3328, 128, 1)      0         
_________________________________________________________________
conv_1 (Conv2D)              (None, 1664, 64, 32)      14432     
_________________________________________________________________
conv_1_bn (BatchNormalizatio (None, 1664, 64, 32)      128       
_________________________________________________________________
conv_1_relu (ReLU)           (None, 1664, 64, 32)      0         
_________________________________________________________________
conv_2 (Conv2D)              (None, 1664, 32, 32)      236544    
_________________________________

# Optional monitoring with TensorBoard
### Refer to **[THIS LINK](https://colab.research.google.com/github/tensorflow/tpu/blob/master/tools/colab/profiling_tpus_in_colab.ipynb#scrollTo=erj2-mPpvqbG)** for proper usage

In [None]:
#https://colab.research.google.com/github/tensorflow/tpu/blob/master/tools/colab/profiling_tpus_in_colab.ipynb
!pip install -U tensorboard-plugin-profile
%load_ext tensorboard

# Get TPU profiling service address. This address will be needed for capturing
# profile information with TensorBoard in the following steps.
service_addr = tpu.get_master().replace(':8470', ':8466')
print(service_addr)

%tensorboard --logdir logs

## Train model

In [None]:
# Train for 20 more epochs on top of those already completed by the restored checkpoint.
epochs = last_epoch + 20
# Log the device on which every operation is placed (very verbose output).
tf.debugging.set_log_device_placement(True)
# Resume training from `last_epoch`; custom_cb is invoked by Keras during training.
history = model.fit(
    ds_train,
    validation_data=ds_eval,
    callbacks=[custom_cb],
    initial_epoch=last_epoch,
    epochs=epochs,
)

# Test model



## Test WER

In [None]:
def _chunk_error_counts(targets, predictions):
  """Return (word errors, reference words) for one chunk of transcriptions.

  The WER of a chunk is errors / reference words, so multiplying it back by
  the reference word count recovers the number of word errors.
  """
  reference_words = sum(len(target.split()) for target in targets)
  return round(wer(targets, predictions) * reference_words), reference_words


if TPU_FLAG:
  with tpu_strategy.scope():
    # We don't need to compile the model when doing inference
    model = keras.models.load_model(GCS_BUCKET_PATH + '/360h_checkpoints/saved_cloud_model.079', compile=False)
    number_of_examples = 0
    # Accumulate errors and reference words, not per-chunk WERs: averaging chunk
    # WERs weighted by utterance count does not give the WER of the whole set.
    total_errors = 0
    total_words = 0
    predictions = []
    targets = []
    # test the whole dataset
    for i, batch in enumerate(ds_test):
      print(f"Processing batch #{i}")

      X, y = batch
      pred = model.predict(X)
      predictions.extend(decode_batch_predictions(pred))
      for label in y:
        label = (tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8"))
        targets.append(label)

      #don't fill the RAM with predictions
      if len(targets) >= 512:
        chunk_errors, chunk_words = _chunk_error_counts(targets, predictions)
        total_errors += chunk_errors
        total_words += chunk_words
        number_of_examples += len(targets)

        #unload the RAM
        predictions.clear()
        targets.clear()

    # length of test dataset is not an exact multiple of 512
    if targets:
      chunk_errors, chunk_words = _chunk_error_counts(targets, predictions)
      total_errors += chunk_errors
      total_words += chunk_words
      number_of_examples += len(targets)

    print(f"Number of examples in test dataset: {number_of_examples}")
    if total_words:
      final_wer = total_errors / total_words
      print(f"WER on test: {final_wer}")
    else:
      # Avoid a ZeroDivisionError when the test dataset yields no batches.
      print("WER on test: not available (no reference words were processed)")