<a href="https://colab.research.google.com/github/xiaoyufan/speech-data-augmentation/blob/main/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocess data

## Setup

### Create logger

In [102]:
from importlib import reload
import logging
import sys

# Re-execute the logging module before configuring anything
# (presumably to discard logging state left over from an earlier run of this cell — confirm).
reload(logging)

LOGGING_LEVEL = 'INFO'

# Build the stdout handler completely (stream + format) before attaching it.
formatter = logging.Formatter('[%(asctime)s - logger %(name)s - %(levelname)s] %(message)s')
ch = logging.StreamHandler(sys.stdout)
ch.setFormatter(formatter)

# Notebook-wide logger; records below LOGGING_LEVEL are dropped.
logger = logging.getLogger('baseline')
logger.setLevel(LOGGING_LEVEL)
logger.addHandler(ch)

# Smoke test: with LOGGING_LEVEL = 'INFO' only the second message is emitted.
logger.debug('debug test')
logger.info('info test')

[2020-12-16 23:57:17,912 - logger baseline - INFO] info test


### Mount Google Drive

In [103]:
# Mount Google Drive at /content/drive; every project path configured below lives under it.
# force_remount=True presumably re-mounts even when the drive is already mounted — confirm in the Colab docs.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


### Configurations

#### Get notebook's start time

In [104]:
from datetime import datetime, tzinfo
import pytz

# Timestamp (US/Eastern, formatted YYYYmmdd-HHMMSS) taken once when this cell runs.
# It is reused below as part of OUTPUT_PATH and as the name of the training log file.
NB_RUN_TIME = datetime.now(tz=pytz.timezone('US/Eastern')).strftime('%Y%m%d-%H%M%S')
logger.info(f'Notebook started at {NB_RUN_TIME}.')

[2020-12-16 23:57:20,857 - logger baseline - INFO] Notebook started at 20201216-185720.


#### Hyperparameters

In [105]:
# Forwarded to DeepSpeech.py as --train_batch_size in the training cell.
TRAIN_BATCH_SIZE = 8
# Forwarded to DeepSpeech.py as --epochs in the training cell.
EPOCHS = 80

#### Paths

In [106]:
# Experiment identifier; it selects the training directory (train_<name>) and the output sub-folder.
EXPERIMENT_NAME = 'indian_added'

# Project root on the mounted Google Drive and the dataset folder inside it.
PROJECT_ROOT_PATH = '/content/drive/MyDrive/nlp-project'
DATASET_PATH = f'{PROJECT_ROOT_PATH}/cmu_arctic'

# Training split: the wav files are searched under WAV_TRAIN_DIR and the generated
# files list (CSV) is written to TRAIN_FILES_PATH.
TRAIN_DIR = f'{DATASET_PATH}/train_{EXPERIMENT_NAME}'
TRAIN_FILES_PATH = f'{TRAIN_DIR}/train_files.csv'
WAV_TRAIN_DIR = f'{TRAIN_DIR}/audio'

# Test split: same layout as the training split, shared by all experiments.
TEST_DIR = f'{DATASET_PATH}/test'
TEST_FILES_PATH = f'{TEST_DIR}/test_files.csv'
WAV_TEST_DIR = f'{TEST_DIR}/audio'

# Per-run output folder (checkpoints, exported models, summaries, log); the run timestamp,
# batch size and epoch count are encoded in the folder name.
OUTPUT_PATH = f'{PROJECT_ROOT_PATH}/xiaoyu-output/{EXPERIMENT_NAME}/{NB_RUN_TIME}-b{TRAIN_BATCH_SIZE}-e{EPOCHS}'

# Local directory the DeepSpeech repository is cloned into by the install cell.
DEEPSPEECH_PATH = '/content/DeepSpeech'

#### Other configurations

In [107]:
# Passed to DeepSpeech.py as --log_level in the training cell (kept as a string because it is
# handed to the bash cell as a positional argument).
DEEPSPEECH_LOG_LEVEL = '1'
# When True, transform_data() regenerates the train/test files lists even if they already exist.
FORCE_TRANSFORM_DATA = False

### Install packages

In [108]:
%%bash -s "$DEEPSPEECH_PATH"
# Positional argument 1 (supplied by the %%bash -s magic line) is the DeepSpeech checkout directory.
DEEPSPEECH_PATH=$1

# Clone the pinned DeepSpeech release only when the checkout does not exist yet.
# The path is quoted so a directory name containing spaces is not split into several arguments.
if [ ! -d "$DEEPSPEECH_PATH" ] ; then
  git clone --branch v0.9.2 https://github.com/mozilla/DeepSpeech "$DEEPSPEECH_PATH"
fi

# Abort if the checkout is missing (e.g. the clone failed); otherwise `pip install -e .`
# below would run against whatever the current directory happens to be.
cd "$DEEPSPEECH_PATH" || exit 1
pip install --upgrade pip==20.2.2 wheel==0.34.2 setuptools==49.6.0
# Editable install of the DeepSpeech training package from the checkout.
pip install --upgrade -e .

# pip uninstall tensorflow -y
pip install --upgrade tensorflow==1.15.4
pip install tensorflow-gpu==1.15.4

# Install other packages
pip install pandas
# tensorflow 1.15.4 requires numpy<1.19.0,>=1.16.0, but you'll have numpy 1.19.4 which is incompatible.
# numpy is pinned last so that no earlier install can pull in a newer, incompatible version.
pip install --upgrade numpy==1.16.0

Requirement already up-to-date: pip==20.2.2 in /usr/local/lib/python3.6/dist-packages (20.2.2)
Requirement already up-to-date: wheel==0.34.2 in /usr/local/lib/python3.6/dist-packages (0.34.2)
Requirement already up-to-date: setuptools==49.6.0 in /usr/local/lib/python3.6/dist-packages (49.6.0)
Obtaining file:///content/DeepSpeech
Installing collected packages: deepspeech-training
  Attempting uninstall: deepspeech-training
    Found existing installation: deepspeech-training 0.9.2
    Can't uninstall 'deepspeech-training'. No files were found to uninstall.
  Running setup.py develop for deepspeech-training
Successfully installed deepspeech-training
Requirement already up-to-date: tensorflow==1.15.4 in /usr/local/lib/python3.6/dist-packages (1.15.4)
Requirement already up-to-date: numpy==1.16.0 in /usr/local/lib/python3.6/dist-packages (1.16.0)


### Check the TensorFlow version and whether it runs on a GPU

In [109]:
import tensorflow as tf
# Report the installed TensorFlow version.
logger.info(f'tensorflow version: {tf.__version__}')

# tf.test.gpu_device_name() is falsy when TensorFlow reports no GPU device name.
if not tf.test.gpu_device_name():
    logger.info("Not using GPU")
else:
    logger.info('Using Default GPU Device: {}'.format(tf.test.gpu_device_name()))

[2020-12-16 23:57:37,511 - logger baseline - INFO] tensorflow version: 1.15.4
[2020-12-16 23:57:37,522 - logger baseline - INFO] Using Default GPU Device: /device:GPU:0


## Transform CMU arctic dataset into DeepSpeech format

### Preprocess transcripts

In [110]:
%%bash

# jiwer supplies the text transforms used by the transcript preprocessing cell below.
# NOTE(review): the version is not pinned, unlike the packages installed earlier.
pip install jiwer



In [111]:
import jiwer
import re

# Text normalisation pipeline applied by preprocess_transcript after punctuation removal.
# Judging by the transform names it lowercases, turns whitespace characters into spaces,
# collapses repeated spaces and strips both ends — confirm against the jiwer documentation.
transformation = jiwer.Compose([
  jiwer.ToLowerCase(),
  jiwer.RemoveWhiteSpace(replace_by_space=True),
  jiwer.RemoveMultipleSpaces(),
  jiwer.Strip(),
]) 

# Character class of the punctuation that preprocess_transcript replaces with a space.
# Note: `,-.` inside the class is a *range* from ',' to '.', which contains exactly the
# three characters ',', '-' and '.', so all three are matched. The apostrophe (') is not
# part of the class and is therefore kept in the transcript.
PUNCTUATIONS_TO_REMOVE = re.compile(r'[!"#$%&()*\+,-./\\:;<=>?@\[\]^_`{|}~]')


def preprocess_transcript(raw):
  """Normalise a raw transcript string.

  Every character matched by PUNCTUATIONS_TO_REMOVE is replaced by a single
  space, then the result is run through the module-level `transformation`
  pipeline.

  Args:
    raw: transcript text as read from the data file.

  Returns:
    Whatever `transformation` returns for the punctuation-free text.
  """
  without_punctuation = PUNCTUATIONS_TO_REMOVE.sub(' ', raw)
  return transformation(without_punctuation)

### Generate data files

In [112]:
# Importer

import os
import pandas
import sys

from pathlib import Path


def load_data_file(data_file_path=None):
  """Load the utterance-id -> raw-transcript mapping from the CMU Arctic data file.

  Each non-blank line is expected to look like::

    ( arctic_a0001 "Author of the danger trail, Philip Steels, etc." )

  i.e. the utterance id is the second whitespace-separated token before the
  first double quote, and the transcript is the text between the first pair
  of double quotes.

  Args:
    data_file_path: path of the data file. Defaults to
      `{DATASET_PATH}/cmuarctic.data.txt` when omitted.

  Returns:
    dict mapping utterance id (str) to raw transcript (str).

  Raises:
    ValueError: if a non-blank line has no double quote or no utterance id.
  """
  if data_file_path is None:
    data_file_path = f'{DATASET_PATH}/cmuarctic.data.txt'

  data_file = {}

  # Explicit encoding so the result does not depend on the platform default.
  with open(data_file_path, encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
      # Blank lines (e.g. a trailing empty line) carry no entry; skip them
      # instead of failing on the index lookups below.
      if not line.strip():
        continue

      quoted_parts = line.split('"')
      # split() without an argument tolerates tabs and repeated spaces.
      header_tokens = quoted_parts[0].split()

      if len(quoted_parts) < 2 or len(header_tokens) < 2:
        raise ValueError(
            f'Malformed line {line_number} in {data_file_path}: {line!r}')

      data_file[header_tokens[1]] = quoted_parts[1]

  return data_file


def generate_files_list(wav_dir, output_path):
  """Write a CSV files list for every .wav file found under `wav_dir`.

  The CSV has the columns `wav_filename` (path of the wav file as produced by
  `Path(wav_dir).rglob`), `wav_filesize` (size in bytes) and `transcript`
  (the preprocessed transcript looked up by the file's base name).

  Args:
    wav_dir: directory searched recursively for `*.wav` files.
    output_path: path of the CSV file to write.

  Raises:
    KeyError: if a wav file has no entry in the transcript data file.
  """
  COLUMNS = ['wav_filename', 'wav_filesize', 'transcript']

  data_file = load_data_file()

  files_list_data = []
  for path in Path(wav_dir).rglob('*.wav'):
    # path.stem drops only the final extension. The previous
    # name.replace(suffix, '') also removed any earlier occurrence of the
    # suffix text inside the file name, producing a wrong lookup key.
    utterance_id = path.stem
    if utterance_id not in data_file:
      raise KeyError(f'No transcript found for {path} (utterance id {utterance_id!r})')

    transcript = preprocess_transcript(data_file[utterance_id])

    logger.debug(f'Wav: {path.relative_to(wav_dir)}; Transcript: {transcript}')

    files_list_data.append((str(path), path.stat().st_size, transcript))

  df = pandas.DataFrame(data=files_list_data, columns=COLUMNS)
  df.to_csv(output_path, index=False)


def _generate_files_list_if_needed(split_label, wav_dir, files_path):
  """Generate the files list for one split unless it exists and regeneration is not forced."""
  if os.path.exists(files_path) and not FORCE_TRANSFORM_DATA:
    logger.info(f'Skipping transforming {split_label} data. Data file {files_path} already exist')
    return

  generate_files_list(wav_dir=wav_dir, output_path=files_path)
  logger.info(f'{split_label.capitalize()} data files generated at {files_path}')


def transform_data():
  """Generate the train and test files lists.

  A split is skipped when its files list already exists, unless
  FORCE_TRANSFORM_DATA is set. The train split is always handled before the
  test split.
  """
  _generate_files_list_if_needed('train', WAV_TRAIN_DIR, TRAIN_FILES_PATH)
  _generate_files_list_if_needed('test', WAV_TEST_DIR, TEST_FILES_PATH)

In [113]:
# Create the train/test files lists (skipped for a split whose CSV already exists,
# unless FORCE_TRANSFORM_DATA is True).
transform_data()

[2020-12-16 23:57:44,206 - logger baseline - INFO] Train data files generated at /content/drive/MyDrive/nlp-project/cmu_arctic/train_indian_added/train_files.csv.
[2020-12-16 23:57:44,306 - logger baseline - INFO] Test data files generated at /content/drive/MyDrive/nlp-project/cmu_arctic/test/test_files.csv.


# Train a baseline model

Train a DeepSpeech model on the CMU Arctic data files generated above (`TRAIN_FILES_PATH` for training, `TEST_FILES_PATH` for testing).

In [114]:
%%bash -s "$DEEPSPEECH_PATH" "$TRAIN_FILES_PATH" "$TEST_FILES_PATH" "$OUTPUT_PATH" "$DEEPSPEECH_LOG_LEVEL" "$NB_RUN_TIME" "$DATASET_PATH" "$TRAIN_BATCH_SIZE" "$EPOCHS"

# Positional arguments, in the order given on the %%bash -s magic line.
DEEPSPEECH_PATH=$1
TRAIN_FILES_PATH=$2
TEST_FILES_PATH=$3
OUTPUT_PATH=$4
DEEPSPEECH_LOG_LEVEL=$5
NB_RUN_TIME=$6
DATASET_PATH=$7
TRAIN_BATCH_SIZE=$8
EPOCHS=$9

# Abort if the DeepSpeech checkout is missing; DeepSpeech.py is run relative to it.
cd "$DEEPSPEECH_PATH" || exit 1

# Run log, created inside the DeepSpeech checkout and copied to OUTPUT_PATH at the end.
LOG_FILE="$NB_RUN_TIME.log"

echo "===== Configurations =====
Train data files used: $TRAIN_FILES_PATH
TRAIN_BATCH_SIZE: $TRAIN_BATCH_SIZE
EPOCHS: $EPOCHS
" 2>&1 | tee -a "$LOG_FILE"

python DeepSpeech.py \
  --alphabet_config_path "$DATASET_PATH/alphabet.txt" \
  --train_files "$TRAIN_FILES_PATH" \
  --test_files "$TEST_FILES_PATH" \
  --checkpoint_dir "$OUTPUT_PATH/checkpoints" \
  --export_dir "$OUTPUT_PATH/models" \
  --train_batch_size "$TRAIN_BATCH_SIZE" \
  --epochs "$EPOCHS" \
  --log_level "$DEEPSPEECH_LOG_LEVEL" \
  --test_output_file "$OUTPUT_PATH/test-output.txt" \
  --summary_dir "$OUTPUT_PATH/summary" 2>&1 | tee -a "$LOG_FILE"
# The pipeline's own status is tee's, so read python's exit status from PIPESTATUS
# immediately, before any other command overwrites it.
TRAIN_STATUS=${PIPESTATUS[0]}

# Make sure the destination exists even if training stopped before creating it,
# so the log is preserved for failed runs too.
mkdir -p "$OUTPUT_PATH"
cp "$LOG_FILE" "$OUTPUT_PATH/log.txt"

# Propagate a training failure to the notebook instead of reporting success.
exit "$TRAIN_STATUS"

===== Configurations =====
Train data files used: /content/drive/MyDrive/nlp-project/cmu_arctic/train_indian_added/train_files.csv
TRAIN_BATCH_SIZE: 8
EPOCHS: 80

2020-12-16 23:57:46.504689: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
I1216 23:57:46.988025 140326600705920 utils.py:141] NumExpr defaulting to 4 threads.
Instructions for updating:
Use `tf.compat.v1.data.get_output_types(iterator)`.
W1216 23:57:47.925571 140326600705920 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/iterator_ops.py:347: Iterator.output_types (from tensorflow.python.data.ops.iterator_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.compat.v1.data.get_output_types(iterator)`.
Instructions for updating:
Use `tf.compat.v1.data.get_output_shapes(iterator)`.
W1216 23:57:47.92