In [5]:
%%capture
!pip install datasets==1.18.3
!pip install transformers==4.17.0
!pip install jiwer

In [6]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
%%capture
!apt install git-lfs

# Preparing Dataset

In [8]:
from datasets import load_dataset, load_metric

timit = load_dataset("timit_asr")

timit = timit.remove_columns(["phonetic_detail", "word_detail", "dialect_region", "id", "sentence_type", "speaker_id"])

Downloading:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

Downloading and preparing dataset timit_asr/clean (download: 828.75 MiB, generated: 7.90 MiB, post-processed: Unknown size, total: 836.65 MiB) to /root/.cache/huggingface/datasets/timit_asr/clean/2.0.1/b11b576ddcccbcefa7c9f0c4e6c2a43756f3033adffe0fb686aa61043d0450ad...


Downloading:   0%|          | 0.00/869M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset timit_asr downloaded and prepared to /root/.cache/huggingface/datasets/timit_asr/clean/2.0.1/b11b576ddcccbcefa7c9f0c4e6c2a43756f3033adffe0fb686aa61043d0450ad. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
import re
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]'

def remove_special_characters(batch):
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower() + " "
    return batch

In [10]:
timit = timit.map(remove_special_characters)



0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [11]:
import IPython.display as ipd
import numpy as np
import random

rand_int = random.randint(0, len(timit["train"]))

print(timit["train"][rand_int]["text"])
ipd.Audio(data=np.asarray(timit["train"][rand_int]["audio"]["array"]), autoplay=True, rate=16000)

a lone star shone in the early evening sky 


# Evaluate Models

In [13]:
import torch, torchaudio, copy
from transformers import Wav2Vec2Processor

device = "cpu"
#Will be using cpu as quantized models have limited support on gpu currently
print(device)

torch.backends.quantized.engine = 'qnnpack'

traced_model_path = "/content/sample_data/wav2vec2_trace.pt"
scripted_quantized = torch.jit.load(traced_model_path).to(device)

processor = Wav2Vec2Processor.from_pretrained("patrickvonplaten/wav2vec2-base-timit-demo-google-colab")

cpu


Downloading:   0%|          | 0.00/215 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/217 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
def map_to_result2(batch_tensor, model):

  memory_usage = 0
  # torch.no_grad() ensures no gradients are changed during inferencing
  with torch.no_grad():
      # Since we are using raw waveform as input, no need to process through the processor
      logits = model(batch_tensor.unsqueeze(0))["logits"]
      memory_usage = get_memory_usage()

  pred_ids = torch.argmax(logits, dim=-1)
  predicted_text = processor.batch_decode(pred_ids)[0]


  return predicted_text, memory_usage



In [15]:
def calculate_wer(reference, hypothesis):
  ref_words = reference.split()
  hyp_words = hypothesis.split()
  # Counting the number of substitutions, deletions, and insertions
  substitutions = sum(1 for ref, hyp in zip(ref_words, hyp_words) if ref != hyp)
  #zip will tie the words together and generates 1 for every word difference

  deletions = len(ref_words) - len(hyp_words)
  insertions = len(hyp_words) - len(ref_words)
  # Total number of words in the reference text
  total_words = len(ref_words)
  # Calculating the Word Error Rate (WER)
  wer = (substitutions + deletions + insertions) / total_words
  #print(substitutions)
  #print(deletions)
  #print(insertions)
  #print(len(ref_words))
  #print(wer*total_words)

  return wer

reference_text = "We wanted people to know that we've got something brand new and essentially this product is uh what we call disruptive changes the way that people interact with technology"
hypothesis_text = "We wanted people to know that to me where i know and essentially this product is uh what we call scripted changes the way that people are rapid technology"

wer_score = calculate_wer(reference_text, hypothesis_text)
print("Word Error Rate (WER):", wer_score)



Word Error Rate (WER): 0.27586206896551724


In [21]:
import psutil

# Function to get CPU memory usage
def get_memory_usage():
    # Get virtual memory usage
    virtual_memory = psutil.virtual_memory()
    total_memory = virtual_memory.total / (1024 ** 3)  # Convert bytes to gigabytes
    available_memory = virtual_memory.available / (1024 ** 3)  # Convert bytes to gigabytes
    used_memory = virtual_memory.used / (1024 ** 3)  # Convert bytes to gigabytes

    #print(f"Total Memory: {total_memory:.2f} GB")
    #print(f"Available Memory: {available_memory:.2f} GB")
    #print(f"Used Memory: {used_memory:.2f} GB")
    return used_memory

# Call the function to get memory usage
get_memory_usage()


2.7454910278320312

In [17]:
def inference(inference_index, model):

    wav_file = np.asarray(timit["test"][inference_index]["audio"]["array"])
    reference_text = timit["test"][inference_index]["text"]

    input = processor(wav_file, sampling_rate=target_sample_rate, return_tensors="pt", padding=True).to(device)
    input_tensor = input["input_values"].squeeze().to(device)

    #input_values = torch.tensor(input["input_values"], device=device).unsqueeze(0)
    #print("classic tensor: ", input_tensor)
    #print("torch tensor: ", input_values)

    predicted_text, memory_usage = map_to_result2(input_tensor, model)

    return reference_text, predicted_text, memory_usage

In [23]:
from tqdm.notebook import tqdm

def evaluateModel(total_samples, model):

  list_error_index = []
  total_wer = 0
  total_memory_usage = 0

  for current_index in tqdm(range(total_samples)):
    reference_text, predicted_text, memory_usage = inference(current_index, model)
    #predicted_text = map_to_result(input, scripted_quantized)
    #predicted_text = predicted_text + " "

    #print("reference text: ", reference_text)
    #print("predicted text: ", predicted_text)
    comparison_results = calculate_wer(reference_text, predicted_text)
    total_wer += comparison_results
    total_memory_usage += memory_usage

    if(comparison_results != 0):
      list_error_index.append(current_index)

  print("wer: ", total_wer/total_samples)
  print("Evaluated on {:.3f} samples".format(total_samples))
  print(f"Average memory usage: {total_memory_usage/total_samples:.2f} GB")

  return list_error_index



In [30]:
from transformers import Wav2Vec2ForCTC

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = "cpu"

target_sample_rate = processor.feature_extractor.sampling_rate

model = Wav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-base-timit-demo-google-colab").to(device)

#total_sample = timit["test"].num_rows
total_sample = 100

error_list_large = evaluateModel(total_sample, model)

print(error_list_large)

  0%|          | 0/100 [00:00<?, ?it/s]

wer:  0.23501920301920304
Evaluated on 100.000 samples
Average memory usage: 3.95 GB
[0, 4, 5, 6, 7, 8, 9, 13, 15, 16, 17, 18, 19, 20, 21, 25, 26, 27, 28, 29, 30, 31, 32, 36, 37, 38, 39, 40, 41, 42, 45, 46, 47, 48, 49, 50, 51, 55, 56, 57, 58, 59, 60, 61, 62, 63, 66, 68, 69, 71, 72, 75, 76, 77, 81, 83, 85, 86, 87, 88, 89, 90, 92, 93, 96, 98, 99]


In [73]:
target_index = 9

reference_text, predicted_text = inference(target_index, model)

print("reference_text: ", reference_text)
print("predicted_text: ", predicted_text)

reference_text:  this group is secularist and their program tends to be technological 
predicted_text:  this group is secgularist and their progream tends to be technological


In [31]:
target_sample_rate = processor.feature_extractor.sampling_rate

total_wer = 0
total_sample = 100
#total_sample = timit["test"].num_rows

device = "cpu"

error_list_quantized = evaluateModel(total_sample, scripted_quantized)




  0%|          | 0/100 [00:00<?, ?it/s]

wer:  0.24199711399711393
Evaluated on 100.000 samples
Average memory usage: 4.34 GB
