<a href="https://colab.research.google.com/github/xiaoyufan/speech-data-augmentation/blob/main/inference_and_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

### Create logger

In [183]:
from importlib import reload
import logging
import sys

# Minimum severity this notebook's logger emits; records below it are dropped.
LOGGING_LEVEL = 'INFO'

logger = logging.getLogger('baseline')
logger.setLevel(LOGGING_LEVEL)

# Make the cell idempotent without reloading the `logging` module:
# getLogger() returns the same object on every run, so drop any handler a
# previous execution attached before adding a fresh one.
logger.handlers.clear()
# This logger owns its only handler; do not also pass records to ancestor loggers.
logger.propagate = False

formatter = logging.Formatter('[%(asctime)s - logger %(name)s - %(levelname)s] %(message)s')

# Write to stdout so records show up in the cell output.
ch = logging.StreamHandler(sys.stdout)
ch.setFormatter(formatter)
logger.addHandler(ch)

# Smoke test: with LOGGING_LEVEL = 'INFO' only the second message is printed.
logger.debug('debug test')
logger.info('info test')

[2020-12-17 01:32:08,099 - logger baseline - INFO] info test


### Mount Google Drive

In [184]:
from google.colab import drive
# Mount Google Drive at /content/drive; PROJECT_ROOT_PATH in the configuration
# section points inside this mount.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Configurations

#### Get notebook's start time

In [185]:
from datetime import datetime, tzinfo
import pytz

# Start time of this run in the US/Eastern zone, rendered as YYYYMMDD-HHMMSS.
_eastern = pytz.timezone('US/Eastern')
_started_at = datetime.now(tz=_eastern)
NB_RUN_TIME = _started_at.strftime('%Y%m%d-%H%M%S')
logger.info(f'Notebook started at {NB_RUN_TIME}.')

[2020-12-17 01:32:08,443 - logger baseline - INFO] Notebook started at 20201216-203208.


#### Paths

In [186]:
# Sub-path (below xiaoyu-output/) of the experiment whose model is evaluated.
EXPERIMENT_NAME = 'indian_added/20201216-185720-b8-e80'
# Project folder inside the mounted Google Drive.
PROJECT_ROOT_PATH = '/content/drive/MyDrive/nlp-project'

# Test data: the wav files live under WAV_TEST_DIR, and TEST_FILES_PATH is a CSV
# from which the `wav_filename` and `transcript` columns are read.
DATASET_PATH = f'{PROJECT_ROOT_PATH}/cmu_arctic'
TEST_DIR = f'{DATASET_PATH}/test'
WAV_TEST_DIR = f'{TEST_DIR}/audio'
TEST_FILES_PATH = f'{TEST_DIR}/test_files.csv'

# Everything produced by this notebook goes under INFERENCE_OUTPUT_DIR.
INFERENCE_DIR = f'{PROJECT_ROOT_PATH}/xiaoyu-output/{EXPERIMENT_NAME}'
# The inference log is written here first and then copied to INFERENCE_OUTPUT_DIR.
INFERENCE_LOG_LOCAL_PATH = '/content/inference.log'
INFERENCE_OUTPUT_DIR = f'{INFERENCE_DIR}/inferences'
INFERENCE_RESULTS_PATH = f'{INFERENCE_OUTPUT_DIR}/results.csv'
EVALUATION_RESULTS_PATH = f'{INFERENCE_OUTPUT_DIR}/evaluation.json'
# Model file passed to `deepspeech --model`.
MODEL_PATH = f'{INFERENCE_DIR}/models/output_graph.pb'

# Directory the DeepSpeech repository is cloned into.
DEEPSPEECH_PATH = '/content/DeepSpeech'

### Create missing directories

In [187]:
# Quote the interpolated path so a directory name containing spaces reaches
# mkdir as a single argument.
!mkdir -p "$INFERENCE_OUTPUT_DIR"

### Install packages

In [188]:
%%bash -s "$DEEPSPEECH_PATH"
DEEPSPEECH_PATH=$1

# Clone the v0.9.2 branch only when the checkout does not exist yet; later runs reuse it.
if [ ! -d "$DEEPSPEECH_PATH" ] ; then
  git clone --branch v0.9.2 https://github.com/mozilla/DeepSpeech "$DEEPSPEECH_PATH"
fi

# Abort if the checkout is missing (e.g. the clone failed) so the editable
# install below never runs against whatever the current directory happens to be.
cd "$DEEPSPEECH_PATH" || exit 1
pip install --upgrade pip==20.2.2 wheel==0.34.2 setuptools==49.6.0
# Editable install of the package found in the checkout.
pip install --upgrade -e .

# pip uninstall tensorflow -y
pip install --upgrade tensorflow==1.15.4
pip install tensorflow-gpu==1.15.4

# Install other python packages
pip install pandas
# tensorflow 1.15.4 requires numpy<1.19.0,>=1.16.0, but you'll have numpy 1.19.4 which is incompatible.
pip install --upgrade numpy==1.16.0

Requirement already up-to-date: pip==20.2.2 in /usr/local/lib/python3.6/dist-packages (20.2.2)
Requirement already up-to-date: wheel==0.34.2 in /usr/local/lib/python3.6/dist-packages (0.34.2)
Requirement already up-to-date: setuptools==49.6.0 in /usr/local/lib/python3.6/dist-packages (49.6.0)
Obtaining file:///content/DeepSpeech
Installing collected packages: deepspeech-training
  Attempting uninstall: deepspeech-training
    Found existing installation: deepspeech-training 0.9.2
    Can't uninstall 'deepspeech-training'. No files were found to uninstall.
  Running setup.py develop for deepspeech-training
Successfully installed deepspeech-training
Requirement already up-to-date: tensorflow==1.15.4 in /usr/local/lib/python3.6/dist-packages (1.15.4)
Requirement already up-to-date: numpy==1.16.0 in /usr/local/lib/python3.6/dist-packages (1.16.0)


In [189]:
%%bash

# Install the deepspeech-gpu package pinned to 0.9.2, the same version as the
# DeepSpeech branch cloned during setup.
pip install --upgrade deepspeech-gpu==0.9.2

Requirement already up-to-date: deepspeech-gpu==0.9.2 in /usr/local/lib/python3.6/dist-packages (0.9.2)


# Inference

## Run inference

In [190]:
%%bash -s "$MODEL_PATH" "$WAV_TEST_DIR" "$INFERENCE_LOG_LOCAL_PATH" "$INFERENCE_OUTPUT_DIR"
# Positional arguments come from the cell-magic line above, in the same order.
MODEL_PATH=$1
WAV_TEST_DIR=$2
INFERENCE_LOG_LOCAL_PATH=$3
INFERENCE_OUTPUT_DIR=$4

# Start from an empty log. -f keeps the first run (no log yet) from reporting an error.
rm -f "$INFERENCE_LOG_LOCAL_PATH"

# With nullglob an unmatched pattern expands to nothing, so the loop is skipped
# instead of running deepspeech once on the literal pattern text.
shopt -s nullglob

# Match wav files exactly one directory below WAV_TEST_DIR (<category>/<file>.wav).
# bash only treats `**` as recursive when globstar is enabled, so a single `*`
# states the one-level match explicitly.
for file in "$WAV_TEST_DIR"/*/*.wav; do
  echo "=========================="
  # The log-parsing cell recognises files by this exact "Running inference on " prefix.
  echo "Running inference on $file"
  deepspeech --model "$MODEL_PATH" --audio "$file"
# Merge stderr into stdout so everything deepspeech prints is captured in the log.
done 2>&1 | tee -a "$INFERENCE_LOG_LOCAL_PATH"

# Keep a copy of the log next to the other outputs of this experiment.
cp "$INFERENCE_LOG_LOCAL_PATH" "$INFERENCE_OUTPUT_DIR/inference.log"

Running inference on /content/drive/MyDrive/nlp-project/cmu_arctic/test/audio/female_40/arctic_a0161.wav
2020-12-17 01:32:29.355128: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
TensorFlow: v2.3.0-6-g23ad988
DeepSpeech: v0.9.2-0-gb2920c7
2020-12-17 01:32:29.478676: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2020-12-17 01:32:29.480361: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
2020-12-17 01:32:29.513308: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so retu

## Collect hypotheses

In [191]:
%%bash

# jiwer is imported by later cells (text normalisation and jiwer.compute_measures).
# NOTE(review): the version is not pinned, so a re-run may install a different
# release than the one these results were produced with — consider pinning.
pip install jiwer



In [192]:
import jiwer
import re

# jiwer clean-up pipeline applied to every hypothesis once punctuation has been
# replaced by spaces; the steps run in the order listed.
_NORMALISATION_STEPS = [
  jiwer.ToLowerCase(),
  jiwer.RemoveWhiteSpace(replace_by_space=True),
  jiwer.RemoveMultipleSpaces(),
  jiwer.Strip(),
]
transformation = jiwer.Compose(_NORMALISATION_STEPS)

# Characters that are blanked out of a raw hypothesis. Inside the class, `,-.`
# is a range from ',' to '.', which covers exactly ',', '-' and '.'.
# The apostrophe is not part of the class, so it is left in place.
PUNCTUATIONS_TO_REMOVE = re.compile(r'[!"#$%&()*\+,-./\\:;<=>?@\[\]^_`{|}~]')

def process_hypothesis(raw):
  """Normalise one raw hypothesis string.

  Every character matched by PUNCTUATIONS_TO_REMOVE is replaced by a single
  space, and the result is passed through `transformation`.

  Args:
    raw: the hypothesis text as read from the inference log.

  Returns:
    Whatever `transformation` returns for the punctuation-free text.
  """
  without_punctuation = PUNCTUATIONS_TO_REMOVE.sub(' ', raw)
  return transformation(without_punctuation)

In [193]:
import pandas

from pathlib import Path

# Column order of the results CSV; each tuple appended to `data` follows it.
RESULTS_FILE_COLUMNS = ['wav_filename', 'hypothesis', 'ground_truth', 'category']

test_files_df = pandas.read_csv(TEST_FILES_PATH)

data = []
# Name of the wav file whose transcript has not been seen yet. It is set by a
# "Running inference on ..." line and cleared once its record has been handled.
wav_filename = None

with open(INFERENCE_LOG_LOCAL_PATH) as log_file:
  for line in log_file:
    # Remove the trailing newline character
    line = line.rstrip()

    # Find wav filename
    if line.startswith('Running inference on'):
      wav_filename = line.replace('Running inference on ', '')
      continue

    # Only an "Inference took ..." line marks a record; the predicted text is
    # expected on the line right after it.
    if not line.startswith('Inference took'):
      continue

    # Start every record from a clean slate so nothing leaks in from the
    # previous file (or from an earlier run of this cell).
    hypothesis = None
    ground_truth = None
    category = None

    # next() returns None when the log ends right after the marker line.
    raw_hypothesis = next(log_file, None)
    if raw_hypothesis is not None:
      hypothesis = process_hypothesis(raw_hypothesis)

    if wav_filename:
      transcripts = test_files_df.loc[test_files_df['wav_filename'] == wav_filename, 'transcript']
      # A file that is not listed in the test CSV simply has no ground truth.
      if not transcripts.empty:
        ground_truth = transcripts.iloc[0]

      # The category is the name of the directory that contains the wav file.
      category = Path(wav_filename).parent.name

    logger.debug(f'wav_filename: {wav_filename}, hypothesis: {hypothesis}, ground_truth: {ground_truth}, category: {category}')

    if hypothesis and ground_truth and category:
      data.append((wav_filename, hypothesis, ground_truth, category))
    else:
      logger.warning('Data is incomplete. Not writing to the result file')

    wav_filename = None

# Build and write the results once, after the whole log has been read.
df = pandas.DataFrame(data=data, columns=RESULTS_FILE_COLUMNS)
df.to_csv(INFERENCE_RESULTS_PATH, index=False)

# Evaluation

## Run evaluation

In [194]:
def levenshtein(a, b):
  """Return the Levenshtein (edit) distance between sequences a and b.

  Insertions, deletions and substitutions each cost 1. Only one row of the
  dynamic-programming table is kept at a time, and that row is sized by the
  shorter of the two inputs.
  """
  # Let `a` be the shorter sequence; the distance is symmetric.
  if len(a) > len(b):
    a, b = b, a

  # Distances from the empty prefix of `b` to every prefix of `a`.
  row = list(range(len(a) + 1))

  for i, item_b in enumerate(b, start=1):
    previous_row = row
    # Turning the first i items of `b` into an empty prefix takes i edits.
    row = [i]
    for j, item_a in enumerate(a, start=1):
      substitution = previous_row[j - 1]
      if item_a != item_b:
        substitution += 1
      row.append(min(previous_row[j] + 1, row[j - 1] + 1, substitution))

  return row[-1]

def compute_cer(ground_truth, hypothesis):
  """Compute the character error rate of `hypothesis` against `ground_truth`.

  Each argument is an iterable of strings (a single string also works). The
  items are concatenated without a separator, and the two resulting strings
  are compared as a whole.

  Args:
    ground_truth: reference text(s).
    hypothesis: predicted text(s).

  Returns:
    The Levenshtein distance between the two concatenated strings divided by
    the length of the concatenated reference.

  Raises:
    ValueError: if the reference contains no characters, in which case the
      rate is undefined.
  """
  ground_truth_str = ''.join(ground_truth)
  # Fail with a clear message instead of a bare ZeroDivisionError.
  if not ground_truth_str:
    raise ValueError('Cannot compute CER: the ground truth is empty.')

  hypothesis_str = ''.join(hypothesis)
  return levenshtein(ground_truth_str, hypothesis_str) / len(ground_truth_str)

In [195]:
import jiwer
import json
import pandas

# Subsets of the results to score. 'all' means every row; any other entry is
# matched against the `category` column of the results file.
CATEGORIES = ['all', 'female_40', 'male_40', 'indian_male_40']

evaluation = {}

# Read every column as plain text and switch off NA detection, so a transcript
# such as "null" or "nan" stays a string instead of turning into a float NaN
# (which neither ''.join() nor jiwer can work with).
results_df = pandas.read_csv(INFERENCE_RESULTS_PATH, dtype=str, keep_default_na=False)

for category in CATEGORIES:
  if category == 'all':
    rows = results_df
  else:
    rows = results_df[results_df['category'].eq(category)]

  ground_truth = rows['ground_truth'].tolist()
  hypothesis = rows['hypothesis'].tolist()

  # Start from jiwer's measures and add the character error rate under 'cer'.
  measures = jiwer.compute_measures(ground_truth, hypothesis)
  measures['cer'] = compute_cer(ground_truth, hypothesis)
  evaluation[category] = measures

logger.info('Evaluation:')
logger.info(json.dumps(evaluation, indent=2))

# Store the same report next to the inference results.
with open(EVALUATION_RESULTS_PATH, 'w') as file:
  json.dump(evaluation, file, indent=2)

[2020-12-17 01:47:07,165 - logger baseline - INFO] Evaluation:
[2020-12-17 01:47:07,166 - logger baseline - INFO] {
  "all": {
    "wer": 0.8226744186046512,
    "mer": 0.8101145038167938,
    "wil": 0.959902756514625,
    "wip": 0.040097243485375,
    "hits": 199,
    "substitutions": 742,
    "deletions": 91,
    "insertions": 16,
    "cer": 0.36133333333333334
  },
  "female_40": {
    "wer": 0.8313953488372093,
    "mer": 0.8194842406876791,
    "wil": 0.9633720930232558,
    "wip": 0.03662790697674419,
    "hits": 63,
    "substitutions": 247,
    "deletions": 34,
    "insertions": 5,
    "cer": 0.3617142857142857
  },
  "male_40": {
    "wer": 0.7994186046511628,
    "mer": 0.7834757834757835,
    "wil": 0.9481768590295722,
    "wip": 0.05182314097042779,
    "hits": 76,
    "substitutions": 241,
    "deletions": 27,
    "insertions": 7,
    "cer": 0.31542857142857145
  },
  "indian_male_40": {
    "wer": 0.8401162790697675,
    "mer": 0.828080229226361,
    "wil": 0.967090829311