# Preprocessing data

## Install packages

In [1]:
!pip install python-dotenv
!pip install pandas

You should consider upgrading via the '/home/blake/anaconda3/envs/deepspeech-train/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/blake/anaconda3/envs/deepspeech-train/bin/python -m pip install --upgrade pip' command.[0m


## Load environment variables

In [2]:
from dotenv import load_dotenv
import os

load_dotenv()

def env_flag(name, default=False):
    """Read a boolean switch from the environment.

    ``bool(os.getenv(name))`` is True for every non-empty string, so a setting
    such as ``USE_GPU=False`` or ``USE_GPU=0`` would turn the flag on. This
    helper recognises the usual "off" spellings explicitly.

    Args:
        name: Name of the environment variable to read.
        default: Value returned when the variable is not set at all.

    Returns:
        ``default`` when the variable is unset; False when it is empty or one
        of ``0``, ``false``, ``no``, ``n``, ``off`` (case-insensitive,
        surrounding whitespace ignored); True for any other value.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() not in {'', '0', 'false', 'no', 'n', 'off'}


# Notebook settings, read from the process environment (load_dotenv() is
# called just before this). os.getenv returns None for any name that is unset.
DATASET_PATH = os.getenv('DATASET_PATH')
DEEPSPEECH_LOG_LEVEL = os.getenv('DEEPSPEECH_LOG_LEVEL')
DEEPSPEECH_PATH = os.getenv('DEEPSPEECH_PATH')
LOGGING_LEVEL = os.getenv('LOGGING_LEVEL')
USE_GPU = env_flag('USE_GPU')

## Create logger

In [3]:
from importlib import reload
import logging
import sys

# Re-import the logging module before configuring it -- presumably so that
# re-running this cell starts from a fresh logging state instead of stacking
# another handler onto the existing 'baseline' logger (TODO confirm intent).
reload(logging)

# Handler: formatted records are written to stdout, i.e. the cell output.
formatter = logging.Formatter('[%(asctime)s - logger %(name)s - %(levelname)s] %(message)s')
ch = logging.StreamHandler(sys.stdout)
ch.setFormatter(formatter)

# Notebook-wide logger; its verbosity comes from the LOGGING_LEVEL setting.
logger = logging.getLogger('baseline')
logger.setLevel(LOGGING_LEVEL)
logger.addHandler(ch)

# Emit one record per level to show which levels get through.
logger.debug('debug test')
logger.info('info test')

[2020-12-08 13:35:07,850 - logger baseline - DEBUG] debug test
[2020-12-08 13:35:07,852 - logger baseline - INFO] info test


## Test GPU

In [4]:
import tensorflow as tf
# Record which TensorFlow build this kernel is running.
logger.info('tensorflow version: {}'.format(tf.__version__))

# A falsy (empty) device name is reported as "Not using GPU".
gpu_device_name = tf.test.gpu_device_name()
if not gpu_device_name:
    logger.info("Not using GPU")
else:
    logger.info(f'Using Default GPU Device: {gpu_device_name}')

[2020-12-08 13:35:09,144 - logger baseline - INFO] tensorflow version: 1.15.4
[2020-12-08 13:35:09,152 - logger baseline - INFO] Not using GPU


## Transform tensorflow-challenge dataset into DeepSpeech format

In [5]:
# Source audio. Only train/audio is used for now: the clips under test/audio
# come without transcriptions, so train/audio is split into a training set and
# a testing set instead.
WAV_TRAIN_DIR = f'{DATASET_PATH}/train/audio'

# CSV file lists (one per split) produced from WAV_TRAIN_DIR.
TRAIN_FILES_PATH = f'{DATASET_PATH}/train/training_files.csv'
TEST_FILES_PATH = f'{DATASET_PATH}/test/testing_files.csv'

In [6]:
# Importer: builds the DeepSpeech CSV file lists from the wav files under WAV_TRAIN_DIR

import os
import sys
import pandas

from pathlib import Path


def load_testing_files():
    """Read ``train/testing_list.txt`` from the dataset folder.

    Returns:
        A list with one string per line of the file, in file order, with
        trailing whitespace (including the newline) removed from each line.
    """
    with open(f'{DATASET_PATH}/train/testing_list.txt') as listing:
        # Materialise the list while the file is still open.
        return list(map(str.rstrip, listing))


def generate_files_list(testing_files):
    """Write the training and testing CSV file lists for DeepSpeech.

    Recursively collects every ``*.wav`` file under ``WAV_TRAIN_DIR``. The
    transcript of a file is the name of its first path component relative to
    ``WAV_TRAIN_DIR``. A file whose relative path appears in ``testing_files``
    goes to ``TEST_FILES_PATH``; every other file goes to ``TRAIN_FILES_PATH``,
    except training files under ``_background_noise_``, which are left out.

    If both CSV files already exist, nothing is regenerated and the function
    only logs that it is skipping the work.

    Args:
        testing_files: Iterable of wav paths relative to ``WAV_TRAIN_DIR``,
            written with ``/`` separators (``<label>/<file>.wav``).

    Returns:
        None. The results are the two CSV files with the columns
        ``wav_filename``, ``wav_filesize`` and ``transcript``.
    """
    if os.path.exists(TRAIN_FILES_PATH) and os.path.exists(TEST_FILES_PATH):
        logger.info(f'Skipping transforming data. Data files {TRAIN_FILES_PATH} and {TEST_FILES_PATH} already exist. ')
        return

    COLUMNS = ['wav_filename', 'wav_filesize', 'transcript']
    # A set makes the per-file membership test O(1); scanning the original
    # list once per wav file is quadratic in practice.
    testing_set = set(testing_files)
    training_data = []
    testing_data = []

    for path in Path(WAV_TRAIN_DIR).rglob('*.wav'):
        relative_path = path.relative_to(WAV_TRAIN_DIR)
        # as_posix() always uses '/', so the lookup matches the entries of
        # testing_files whatever the platform's native separator is.
        wav_path_relative_to_wav_dir = relative_path.as_posix()
        transcript = relative_path.parts[0]
        wav_set = 'testing' if wav_path_relative_to_wav_dir in testing_set else 'training'

        logger.debug(
            f'Wav: {wav_path_relative_to_wav_dir}; Dataset: {wav_set}; Transcript: {transcript}')

        # Background-noise clips are kept out of the training list.
        if wav_set == 'training' and transcript == '_background_noise_':
            logger.debug(f'SKipping adding {path.name} to files list.')
            continue

        data = (str(path), path.stat().st_size, transcript)
        if wav_set == 'training':
            training_data.append(data)
        else:
            testing_data.append(data)

    training_df = pandas.DataFrame(data=training_data, columns=COLUMNS)
    training_df.to_csv(TRAIN_FILES_PATH, index=False)
    logger.info(f'Train data files generated at {TRAIN_FILES_PATH}.')

    testing_df = pandas.DataFrame(data=testing_data, columns=COLUMNS)
    testing_df.to_csv(TEST_FILES_PATH, index=False)
    # Report the testing path here (the message used to print the training path).
    logger.info(f'Test data files generated at {TEST_FILES_PATH}.')


def transform_data():
    """Load the list of testing files and generate the CSV file lists from it."""
    generate_files_list(load_testing_files())


# Build the CSV file lists now. generate_files_list returns early (logging a
# "Skipping" message) when both CSV files already exist.
transform_data()

[2020-12-08 13:35:09,192 - logger baseline - INFO] Skipping transforming data. Data files /mnt/c/Users/tohru/Desktop/Accented Speech Recognition/Project - Accented Speech Recognition/Dataset/tensorflow-speech-recognition-challenge/train/training_files.csv and /mnt/c/Users/tohru/Desktop/Accented Speech Recognition/Project - Accented Speech Recognition/Dataset/tensorflow-speech-recognition-challenge/test/testing_files.csv already exist. 


In [None]:
# Run DeepSpeech.py on the generated CSV file lists. The $NAME placeholders
# refer to the notebook variables DEEPSPEECH_PATH, TRAIN_FILES_PATH,
# TEST_FILES_PATH and DEEPSPEECH_LOG_LEVEL, which IPython substitutes before
# handing the line to the shell. The double quotes keep paths that contain
# spaces together as single arguments.
# NOTE(review): USE_GPU is read from the environment but is not passed to this
# command in the cells shown here -- confirm whether that is intended.
!python "$DEEPSPEECH_PATH/DeepSpeech.py" --alphabet_config_path="$DEEPSPEECH_PATH/data/alphabet.txt" --train_files "$TRAIN_FILES_PATH" --test_files "$TEST_FILES_PATH" --log_level "$DEEPSPEECH_LOG_LEVEL"

2020-12-08 13:35:10.784901: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2020-12-08 13:35:10.789517: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2592005000 Hz
2020-12-08 13:35:10.790930: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55b8efcda470 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-12-08 13:35:10.790972: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
Instructions for updating:
Use `tf.compat.v1.data.get_output_types(iterator)`.
W1208 13:35:12.203429 140435406763840 deprecation.py:323] From /home/blake/anaconda3/envs/deepspeech-train/lib/python3.6/site-packages/tensorflow_core/python/data/ops/iterator_ops.py:347: Iterator.output_types (from tensorflow.python.data.ops.iterator_ops) is deprecated and will be removed in a future version

I Loading variable from checkpoint: layer_6/weights/Adam_1
I Loading variable from checkpoint: learning_rate
I STARTING Optimization
Epoch 0 |   Training | Elapsed Time: 0:16:03 | Steps: 256 | Loss: 12.348753    