<a href="https://colab.research.google.com/github/xiaoyufan/speech-data-augmentation/blob/main/train_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocess data

## Setup

### Create logger

In [81]:
from importlib import reload
import logging
import sys

# Re-import the logging module before configuring it.
# NOTE(review): presumably this resets logging state so that re-running the
# cell does not stack a second handler on the 'baseline' logger - confirm.
reload(logging)

LOGGING_LEVEL = 'DEBUG'

# Build the stdout handler first, then attach it to the notebook's logger.
formatter = logging.Formatter(
    fmt='[%(asctime)s - logger %(name)s - %(levelname)s] %(message)s',
)
ch = logging.StreamHandler(stream=sys.stdout)
ch.setFormatter(formatter)

logger = logging.getLogger('baseline')
logger.setLevel(LOGGING_LEVEL)
logger.addHandler(ch)

# Smoke test: emit one record at DEBUG and one at INFO.
logger.debug('debug test')
logger.info('info test')

[2020-12-09 21:45:01,643 - logger baseline - DEBUG] debug test
[2020-12-09 21:45:01,644 - logger baseline - INFO] info test


### Mount Google Drive

In [82]:
# Mount Google Drive at /content/drive. The project and dataset paths
# configured later in this notebook all live under this mount point.
# NOTE(review): force_remount=True presumably remounts even when Drive is
# already mounted, which keeps the cell re-runnable - confirm against Colab docs.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


### Configurations

#### Configure mode

In [83]:
# Supported run modes, keyed by the name used to select one below.
MODES = dict(
  PLAYGROUND='playground',
  PRODUCTION='production',
)

# Active mode: set to MODES['PRODUCTION'] or MODES['PLAYGROUND'].
MODE = MODES['PRODUCTION']
logger.info(f'Notebook runs in mode {MODE}.')

[2020-12-09 21:45:03,113 - logger baseline - INFO] Notebook runs in mode production.


#### Get notebook's start time

In [84]:
from datetime import datetime, tzinfo
import pytz

# Start time of this run in the US/Eastern timezone, formatted as
# YYYYmmdd-HHMMSS; it is used further down to name this run's output folder.
_eastern_tz = pytz.timezone('US/Eastern')
_started_at = datetime.now(tz=_eastern_tz)
NB_RUN_TIME = _started_at.strftime('%Y%m%d-%H%M%S')
logger.info(f'Notebook started at {NB_RUN_TIME}.')

[2020-12-09 21:45:03,121 - logger baseline - INFO] Notebook started at 20201209-164503.


#### Other configurations

In [85]:
# Value forwarded to DeepSpeech's --log_level flag by the training cell.
DEEPSPEECH_LOG_LEVEL = '1'
# Directory the DeepSpeech repository is cloned into.
DEEPSPEECH_PATH = '/content/DeepSpeech'
# Project folder on the mounted Google Drive, and the dataset folder inside it.
DRIVE_PROJECT_ROOT_PATH = '/content/drive/MyDrive/nlp-project'
DATASET_PATH = DRIVE_PROJECT_ROOT_PATH + '/tensorflow-speech-recognition-challenge'
# NOTE(review): USE_GPU is not read by any cell visible in this notebook
# section - confirm whether it is still needed.
USE_GPU = True

#### Set input and output paths

In [86]:
TRAIN_DIR = 'train'
TEST_DIR = 'test'
PLAYGROUND_DIR = 'playground'

# Choose the dataset sub-directories and the Drive output folder for the
# active mode. Any mode other than production uses the playground layout,
# where the training and testing lists share one directory.
if MODE == MODES['PRODUCTION']:
  _train_subdir, _test_subdir = TRAIN_DIR, TEST_DIR
  _output_subdir = 'xiaoyu-baseline'
else:
  _train_subdir = _test_subdir = PLAYGROUND_DIR
  _output_subdir = 'xiaoyu-playground'

# CSV files lists consumed by DeepSpeech (--train_files / --test_files).
TRAIN_FILES_PATH = f'{DATASET_PATH}/{_train_subdir}/training_files.csv'
TEST_FILES_PATH = f'{DATASET_PATH}/{_test_subdir}/testing_files.csv'

# For now, we don't use test/audio because its audio files don't have transcriptions.
# Instead, we split train/audio into training set and testing set.
WAV_TRAIN_DIR = f'{DATASET_PATH}/{_train_subdir}/audio'

# Each notebook run writes into its own timestamped folder.
OUTPUT_PATH = f'{DRIVE_PROJECT_ROOT_PATH}/{_output_subdir}/{NB_RUN_TIME}'

### Install packages

In [87]:
%%bash -s "$DEEPSPEECH_PATH"
# Directory for the DeepSpeech checkout (first positional argument of the cell).
DEEPSPEECH_PATH=$1

# Clone the pinned DeepSpeech release only when the checkout does not exist yet.
if [ ! -d "$DEEPSPEECH_PATH" ] ; then
  git clone --branch v0.9.2 https://github.com/mozilla/DeepSpeech "$DEEPSPEECH_PATH"
fi

# Stop here if the checkout is missing (e.g. the clone failed); otherwise the
# editable install below would run against whatever the current directory is.
cd "$DEEPSPEECH_PATH" || exit 1
pip install --upgrade pip==20.2.2 wheel==0.34.2 setuptools==49.6.0
# Install the DeepSpeech training package from the checkout in editable mode.
pip install --upgrade -e .

# pip uninstall tensorflow -y
pip install --upgrade tensorflow==1.15.4
pip install tensorflow-gpu==1.15.4

# Install other packages
pip install pandas
# tensorflow 1.15.4 requires numpy<1.19.0,>=1.16.0, but you'll have numpy 1.19.4 which is incompatible.
pip install --upgrade numpy==1.16.0

Requirement already up-to-date: pip==20.2.2 in /usr/local/lib/python3.6/dist-packages (20.2.2)
Requirement already up-to-date: wheel==0.34.2 in /usr/local/lib/python3.6/dist-packages (0.34.2)
Requirement already up-to-date: setuptools==49.6.0 in /usr/local/lib/python3.6/dist-packages (49.6.0)
Obtaining file:///content/DeepSpeech
Installing collected packages: deepspeech-training
  Attempting uninstall: deepspeech-training
    Found existing installation: deepspeech-training 0.9.2
    Can't uninstall 'deepspeech-training'. No files were found to uninstall.
  Running setup.py develop for deepspeech-training
Successfully installed deepspeech-training
Requirement already up-to-date: tensorflow==1.15.4 in /usr/local/lib/python3.6/dist-packages (1.15.4)
Requirement already up-to-date: numpy==1.16.0 in /usr/local/lib/python3.6/dist-packages (1.16.0)


### Verify tensorflow can run on GPU

In [88]:
import tensorflow as tf
logger.info(f'tensorflow version: {tf.__version__}')

# Query the GPU device name once; a falsy name is treated as "no GPU".
_gpu_device = tf.test.gpu_device_name()
if not _gpu_device:
    logger.info("Not using GPU")
else:
    logger.info(f'Using Default GPU Device: {_gpu_device}')

[2020-12-09 21:45:22,168 - logger baseline - INFO] tensorflow version: 1.15.4
[2020-12-09 21:45:22,186 - logger baseline - INFO] Using Default GPU Device: /device:GPU:0


## Transform tensorflow-challenge dataset into DeepSpeech format

In [89]:
# Importer

import os
import sys
import pandas

from pathlib import Path


def load_testing_files():
    """Read the dataset's testing list into a list of strings.

    Opens ``{DATASET_PATH}/train/testing_list.txt`` and returns its lines in
    file order, each with trailing whitespace (including the newline) removed.
    """
    testing_files = []
    with open(f'{DATASET_PATH}/train/testing_list.txt') as listing:
        for raw_line in listing:
            testing_files.append(raw_line.rstrip())
    return testing_files


def generate_files_list(testing_files):
    """Write the training and testing CSV files lists for the wav dataset.

    Walks ``WAV_TRAIN_DIR`` recursively for ``*.wav`` files and assigns each
    one to the testing set when its path relative to ``WAV_TRAIN_DIR`` (with
    ``/`` separators) is listed in ``testing_files``, otherwise to the
    training set. The first component of that relative path is used as the
    transcript. Training files whose transcript is ``_background_noise_`` are
    left out.

    Does nothing if both ``TRAIN_FILES_PATH`` and ``TEST_FILES_PATH`` already
    exist.

    Args:
        testing_files: Iterable of wav paths relative to ``WAV_TRAIN_DIR``
            that belong to the testing set.

    Returns:
        None. The result is two CSV files with the columns ``wav_filename``,
        ``wav_filesize`` and ``transcript``, written to ``TRAIN_FILES_PATH``
        and ``TEST_FILES_PATH``.
    """
    if (os.path.exists(TRAIN_FILES_PATH) and
            os.path.exists(TEST_FILES_PATH)):
        logger.info(f'Skipping transforming data. Data files {TRAIN_FILES_PATH} and {TEST_FILES_PATH} already exist. ')
        return

    COLUMNS = ['wav_filename', 'wav_filesize', 'transcript']
    # A set makes the per-file membership test O(1) instead of scanning the
    # whole testing list for every wav file.
    testing_set = set(testing_files)
    training_data = []
    testing_data = []

    for path in Path(WAV_TRAIN_DIR).rglob('*.wav'):
        relative_path = path.relative_to(WAV_TRAIN_DIR)
        # Compare with forward slashes regardless of the OS path separator,
        # since the transcript lookup and the testing list both assume '/'.
        wav_path_relative_to_wav_dir = relative_path.as_posix()
        wav_set = 'testing' if wav_path_relative_to_wav_dir in testing_set else 'training'

        wav_filename = path.name
        transcript = relative_path.parts[0]

        logger.debug(
            f'Wav: {wav_path_relative_to_wav_dir}; Dataset: {wav_set}; Transcript: {transcript}')

        if wav_set == 'training' and transcript == '_background_noise_':
            logger.debug(f'SKipping adding {wav_filename} to files list.')
            continue

        data = (str(path),
                path.stat().st_size,
                transcript)

        if wav_set == 'training':
            training_data.append(data)
        else:
            testing_data.append(data)

    outputs = (
        ('Train', training_data, TRAIN_FILES_PATH),
        ('Test', testing_data, TEST_FILES_PATH),
    )
    for label, rows, csv_path in outputs:
        # Create the destination directory first so to_csv does not fail on a
        # directory that has not been created yet.
        os.makedirs(os.path.dirname(csv_path) or '.', exist_ok=True)
        pandas.DataFrame(data=rows, columns=COLUMNS).to_csv(csv_path, index=False)
        # Log the path that was actually written for this set.
        logger.info(f'{label} data files generated at {csv_path}.')


def transform_data():
    """Generate the dataset's files lists.

    Loads the testing files list with ``load_testing_files`` and passes it to
    ``generate_files_list``.
    """
    generate_files_list(load_testing_files())


transform_data()

[2020-12-09 21:45:22,249 - logger baseline - INFO] Skipping transforming data. Data files /content/drive/MyDrive/nlp-project/tensorflow-speech-recognition-challenge/train/training_files.csv and /content/drive/MyDrive/nlp-project/tensorflow-speech-recognition-challenge/test/testing_files.csv already exist. 


# Train a baseline model

Train a DeepSpeech model on the Kaggle TensorFlow Speech Recognition Challenge dataset to establish the baseline.

In [None]:
%%bash -s "$DEEPSPEECH_PATH" "$TRAIN_FILES_PATH" "$TEST_FILES_PATH" "$OUTPUT_PATH" "$DEEPSPEECH_LOG_LEVEL"
# Positional arguments forwarded from the notebook's Python variables.
DEEPSPEECH_PATH=$1
TRAIN_FILES_PATH=$2
TEST_FILES_PATH=$3
OUTPUT_PATH=$4
DEEPSPEECH_LOG_LEVEL=$5

# Stop here if the DeepSpeech checkout is missing, rather than trying to run
# DeepSpeech.py from whatever the current directory is.
cd "$DEEPSPEECH_PATH" || exit 1
# Run DeepSpeech with the generated CSV files lists. Checkpoints, exported
# models, the test output file and summaries all go under $OUTPUT_PATH.
python DeepSpeech.py \
  --alphabet_config_path data/alphabet.txt \
  --train_files "$TRAIN_FILES_PATH" \
  --test_files "$TEST_FILES_PATH" \
  --checkpoint_dir "$OUTPUT_PATH/checkpoints" \
  --export_dir "$OUTPUT_PATH/models" \
  --epochs 5 \
  --log_level "$DEEPSPEECH_LOG_LEVEL" \
  --test_output_file "$OUTPUT_PATH/test-output.txt" \
  --summary_dir "$OUTPUT_PATH/summary"