In [1]:
import numpy as np
import matplotlib.pyplot as plt
import nltk
import csv
import os
import tensorflow as tf
# Show the installed TensorFlow version in the cell output.
print(tf.__version__)

2.1.0


In [2]:
import tensorflow as tf
from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.callbacks import Callback
from tensorflow.python.keras.layers import *
from tensorflow.python.keras.optimizers import Adam, SGD
from tensorflow.python.keras.activations import relu
from tensorflow.python.keras.metrics import categorical_accuracy, mean_squared_error
from tensorflow.python.keras.callbacks import BaseLogger, ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.initializers import Ones, Zeros, glorot_normal
from tensorflow.python.framework import tensor_shape
# from data_genration import DataGenerator

import numpy as np



In [3]:
def clipped_relu(x):
    """Apply the Keras ``relu`` activation to ``x`` with its output capped at 20.

    Args:
        x: Value forwarded unchanged to ``relu``.

    Returns:
        The result of ``relu(x, max_value=20)``.
    """
    capped = relu(x, max_value=20)
    return capped

def ctc_lambda_func(args):
    """Compute the batched CTC cost from a packed 4-element argument sequence.

    Written to be wrapped in a ``Lambda`` layer, which passes all of its
    inputs as one sequence.

    Args:
        args: Sequence of exactly four items, in this order:
            ``(labels, y_pred, input_length, label_length)``.

    Returns:
        Whatever ``K.ctc_batch_cost`` returns for those four arguments.
    """
    y_true, y_hat, in_len, lab_len = args
    return K.ctc_batch_cost(
        y_true=y_true,
        y_pred=y_hat,
        input_length=in_len,
        label_length=lab_len,
    )

def ctc(y_true, y_pred):
    """Loss-shaped passthrough: ignore ``y_true`` and hand back ``y_pred`` as-is.

    NOTE(review): presumably meant as the compile-time loss for a model whose
    output already is the CTC cost (see the 'ctc' Lambda layer); the call
    site is not visible here, so confirm.

    Args:
        y_true: Unused.
        y_pred: Value to return.

    Returns:
        ``y_pred``, unchanged.
    """
    passthrough = y_pred
    return passthrough



In [4]:
def get_speech_model():
    """Build the acoustic model as a Keras ``Sequential`` stack.

    Layer order: BatchNorm -> 3 x Conv1D -> BatchNorm -> 7 x bidirectional
    SimpleRNN -> BatchNorm -> two time-distributed dense layers, the last
    one a 29-unit softmax.

    Returns:
        The un-compiled ``Sequential`` model; its input shape per example is
        ``(None, 128)``.
    """
    model = Sequential()

    # Batch normalize the input
    model.add(BatchNormalization(axis=-1, input_shape=(None, 128), name='BN_1'))

    # 1D Convs: three kernel-size-5 layers with 512 filters; only the last
    # one uses a stride of 2.
    for idx, stride in enumerate((1, 1, 2), start=1):
        model.add(Conv1D(512, 5, strides=stride, activation=clipped_relu,
                         name='Conv1D_%d' % idx))

    # Batch Normalization
    model.add(BatchNormalization(axis=-1, name='BN_2'))

    # BiRNNs: seven stacked layers whose forward and backward outputs are
    # summed. The explicit name is attached to the wrapped SimpleRNN, so the
    # Bidirectional wrapper itself keeps its auto-generated name.
    for idx in range(1, 8):
        recurrent = SimpleRNN(1280, return_sequences=True, name='BiRNN_%d' % idx)
        model.add(Bidirectional(recurrent, merge_mode='sum'))

    # Batch Normalization
    model.add(BatchNormalization(axis=-1, name='BN_3'))

    # FC: dense layers applied independently at every timestep
    hidden_dense = Dense(1024, activation=clipped_relu, name='FC1')
    model.add(TimeDistributed(hidden_dense))
    output_dense = Dense(29, activation='softmax', name='y_pred')
    model.add(TimeDistributed(output_dense))
    return model

def get_trainable_speech_model():
    """Wrap the speech model so that its output is the CTC cost.

    Builds the base model with ``get_speech_model``, prints its summary as a
    side effect, and adds three extra inputs (``the_labels``,
    ``input_length``, ``label_length``) that feed a ``Lambda`` layer named
    ``'ctc'`` together with the base model's output.

    Returns:
        A ``Model`` whose inputs are, in order, the base model's input,
        ``the_labels``, ``input_length`` and ``label_length``, and whose
        single output is the ``'ctc'`` layer's result.
    """
    base_model = get_speech_model()
    acoustic_input = base_model.inputs[0]
    softmax_out = base_model.outputs[0]

    base_model.summary()

    # Extra inputs needed only to evaluate the CTC cost.
    labels = Input(name='the_labels', shape=[None], dtype='int32')
    input_length = Input(name='input_length', shape=[1], dtype='int32')
    label_length = Input(name='label_length', shape=[1], dtype='int32')

    # Argument order must match the unpacking done in ctc_lambda_func.
    ctc_args = [labels, softmax_out, input_length, label_length]
    loss_out = Lambda(ctc_lambda_func, name='ctc')(ctc_args)

    return Model(
        inputs=[acoustic_input, labels, input_length, label_length],
        outputs=loss_out,
    )

In [5]:
# Build the CTC training model (this call also prints the base model's
# summary), then print the summary of the resulting wrapped model.
model = get_trainable_speech_model()
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
BN_1 (BatchNormalization)    (None, None, 128)         512       
_________________________________________________________________
Conv1D_1 (Conv1D)            (None, None, 512)         328192    
_________________________________________________________________
Conv1D_2 (Conv1D)            (None, None, 512)         1311232   
_________________________________________________________________
Conv1D_3 (Conv1D)            (None, None, 512)         1311232   
_________________________________________________________________
BN_2 (BatchNormalization)    (None, None, 512)         2048      
_________________________________________________________________
bidirectional (Bidirectional (None, None, 1280)        4590080   
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 1280)        6

In [6]:
# Display the current working directory.
os.getcwd()

'/home/varun/Desktop/speechReco'

In [7]:
# Root of the dataset tree. Defaults to the original local checkout; set the
# SPEECH_DATA_DIR environment variable to point elsewhere without editing
# the notebook.
path = os.environ.get('SPEECH_DATA_DIR', '/home/varun/Desktop/speechReco/data')
# Make the dataset root the working directory for the following cells.
os.chdir(path)

In [8]:
# Display the working directory again to confirm the chdir took effect.
os.getcwd()

'/home/varun/Desktop/speechReco/data'

In [9]:
import librosa   #for audio processing
import IPython.display as ipd
import matplotlib.pyplot as plt
import soundfile as sf
import numpy as np
from scipy.io import wavfile #for audio processing
import warnings
# Silence every warning category from this point on.
warnings.filterwarnings("ignore")

Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


In [10]:
def graph_spectrogram(wav_file):
    """Compute a spectrogram of an audio file, drawing it on the current axes.

    The audio is loaded with ``get_wav_info``. For 2-D audio data only the
    first column (``data[:, 0]``) is used.

    Args:
        wav_file: Path of the audio file to read.

    Returns:
        The spectrogram array, i.e. the first element of the tuple returned
        by ``plt.specgram``.

    Raises:
        ValueError: If the loaded audio array is neither 1-D nor 2-D.
    """
    _rate, data = get_wav_info(wav_file)
    nfft = 200      # Length of each window segment
    fs = 8000       # Sampling frequency handed to specgram
    noverlap = 120  # Overlap between windows

    # NOTE(review): the file's own sample rate (`_rate`) is read but not
    # used; `fs` is fixed at 8000 regardless of the input. Confirm that this
    # is intended before relying on the frequency/time axes or the scaling.

    if data.ndim == 1:
        signal = data
    elif data.ndim == 2:
        signal = data[:, 0]
    else:
        # Previously this case fell through and failed on an unbound `pxx`.
        raise ValueError(
            "expected 1-D or 2-D audio data, got %d dimensions" % data.ndim
        )

    pxx, _freqs, _bins, _im = plt.specgram(signal, NFFT=nfft, Fs=fs, noverlap=noverlap)
    return pxx

# Load a wav file
def get_wav_info(wav_file):
    """Read an audio file with ``soundfile`` and return ``(rate, data)``.

    ``sf.read`` yields ``(data, rate)``; this helper swaps the order.

    Args:
        wav_file: Path (or other input accepted by ``sf.read``) of the file.

    Returns:
        Tuple ``(rate, data)``.
    """
    audio, sample_rate = sf.read(wav_file)
    return sample_rate, audio


In [None]:
# Accumulators filled in parallel, one entry per loaded line:
#   labels     - the text after the first space on the line
#   file_names - the token before the first space on the line
#   samples    - the spectrogram computed from "<token>.flac"
labels = []
file_names = []
samples = []

# Walk the two-level directory tree under `path` (path/<direc>/<subdir>/),
# read every .txt file found there, and for each line of the form
# "<id> <text>" compute the spectrogram of "<id>.flac" in the same folder.
for direc in os.listdir(path):
    for subdir in os.listdir(path +"/" + direc):
        # Only the .txt files of this folder are read directly.
        file = [ f for f in os.listdir(path + '/' + direc + "/" + subdir) if f.endswith('.txt')]
        for f in file:
            with open(path + "/" + direc + "/" + subdir  +'/' +f ,'r') as txtfile:
                data = txtfile.read()
                # The whole file is lower-cased before splitting, so both the
                # id token and the text end up lower-case.
                sentence = data.lower().split('\n')
                for sent in sentence:
                    # Split once on the first space: [id, text].
                    sent = sent.split(" ",1)
                    if len(sent) == 2:
                        file_names.append(sent[0])
                        sample = graph_spectrogram(path + "/" + direc + "/" + subdir  +'/'+ sent[0] +'.flac')
                        samples.append(sample)
                        labels.append(sent[1])
                        
                    # NOTE(review): this check is only reached for lines that
                    # did NOT split into two parts (e.g. an empty line), and
                    # `break` leaves only this innermost per-line loop. As
                    # written it does not cap the number of samples loaded -
                    # confirm what limit, if any, was intended.
                    elif len(samples) > 10:
                        break
                    else:
                        continue
                        