<a href="https://colab.research.google.com/github/usef-kh/SpeechRecognition/blob/master/CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load Kaggle API Token
from google.colab import files
!pip install -q kaggle > /dev/null
uploaded = files.upload()
!mkdir ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

# Load Data
!kaggle competitions download -c tensorflow-speech-recognition-challenge > /dev/null

# # Unzip Data
# !apt-get install p7zip-full > /dev/null
!p7zip -d train.7z


Saving kaggle.json to kaggle.json
100% 1.04G/1.04G [00:17<00:00, 62.4MB/s]
100% 2.46G/2.46G [00:52<00:00, 50.2MB/s]
100% 501k/501k [00:00<00:00, 70.4MB/s]
100% 50.0/50.0 [00:00<00:00, 48.7kB/s]

7-Zip (a) [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan         1 file, 1121103842 bytes (1070 MiB)

Extracting archive: train.7z
--
Path = train.7z
Type = 7z
Physical Size = 1121103842
Headers Size = 389133
Method = Delta LZMA2:24
Solid = +
Blocks = 2

  0%      0% 38 - train/audio/_background_noise_/doing_the_dishes.wav                                                               0% 40 - train/audio/_background_noise_/exercise_bike.wav

<!-- ## Useful Imports and Functions -->

In [None]:
import librosa
import librosa.display
from scipy import signal
import numpy as np
from keras import backend as K
from sklearn.model_selection import train_test_split
from scipy.io import wavfile
import time
from IPython.display import Audio
from os import walk
from matplotlib import pyplot as plt
import csv
import keras
import random
import collections
import IPython.display as ipd

Using TensorFlow backend.


### Loading Data Helper Functions

In [None]:
def fixSize(data, nsamples=16000):
    """Force a 1-D signal to exactly `nsamples` samples.

    Signals no longer than `nsamples` are left-padded with zeros and keep
    their original dtype. Longer signals are cropped to a window of
    `nsamples` samples centred on the sample with the largest absolute
    amplitude; where that window runs past either end of the signal it is
    filled by repeating the first / last sample. The cropped result is
    returned as float16.

    Args:
        data: 1-D array-like of audio samples.
        nsamples: target length in samples.

    Returns:
        numpy array of length `nsamples`.
    """
    if len(data) <= nsamples:
        return np.pad(data, (nsamples - len(data), 0), "constant")

    samples = np.asarray(data)

    # Widen before taking abs(): on int16 input abs(-32768) wraps back to
    # -32768, which would hide the loudest sample from argmax.
    peak = int(np.argmax(np.abs(samples.astype(np.float64))))
    start = peak - nsamples // 2

    # Clamping the window indices to the valid range repeats the edge samples
    # wherever the window sticks out of the signal.
    window = np.clip(np.arange(start, start + nsamples), 0, len(samples) - 1)
    return samples[window].astype(np.float16)

In [None]:
def generateFiles(train_audio_path='/content/train/audio/'):
    """Index the training audio directory by sub-directory name.

    Args:
        train_audio_path: root directory that holds one sub-directory per
            label plus '_background_noise_'. A trailing '/' is optional.

    Returns:
        dict mapping each sub-directory path (relative to the root) to the
        list of file names it contains. 'README.md' is dropped from the
        '_background_noise_' entry.

    Raises:
        KeyError: if the root cannot be walked or has no
            '_background_noise_' sub-directory.
    """
    root = train_audio_path if train_audio_path.endswith('/') else train_audio_path + '/'

    # Load all filenames into a dictionary so we can call on them easily.
    # Keys are the directory paths with the root prefix stripped.
    files = {}
    for (dirpath, dirnames, filenames) in walk(root):
        files[dirpath[len(root):]] = filenames

    # The root itself is keyed by the empty string and is not a label
    files.pop('')

    noise_files = files['_background_noise_']
    if 'README.md' in noise_files:
        noise_files.remove('README.md')

    return files

In [None]:
import librosa
import scipy.signal as sps
import random


def manipulatePitch(data, pitch_factor):
    """Pitch-shift a 16 kHz signal with librosa.

    Args:
        data: numpy array of samples; converted to float32 before shifting.
        pitch_factor: forwarded to librosa as `n_steps`.

    Returns:
        Whatever `librosa.effects.pitch_shift` returns for the input.
    """
    # sr / n_steps are passed by keyword: recent librosa releases reject them
    # positionally, while older releases accept the keyword form as well.
    return librosa.effects.pitch_shift(data.astype(np.float32), sr=16000, n_steps=pitch_factor)

def stretch(data, rate=1):
    """Time-stretch a signal with librosa, then restore the fixed length.

    Args:
        data: numpy array of samples; converted to float32 before stretching.
        rate: stretch factor forwarded to `librosa.effects.time_stretch`.

    Returns:
        The stretched signal after passing through `fixSize` with its
        default target length.
    """
    # rate is passed by keyword: recent librosa releases reject it
    # positionally, while older releases accept the keyword form as well.
    data = librosa.effects.time_stretch(data.astype(np.float32), rate=rate)
    return fixSize(data)

In [None]:
def get_wav(file_name, nsamples=16000):
    """Read a WAV file and return its samples at a fixed length.

    Args:
        file_name: path of the WAV file.
        nsamples: target length in samples, forwarded to `fixSize`.

    Returns:
        The sample array produced by `fixSize`.
    """
    # wavfile.read returns (sample_rate, data); only the samples are used
    wav = wavfile.read(file_name)[1]
    # Forward nsamples: it used to be ignored, so the default was always used
    return fixSize(wav, nsamples)

    # if wav.size < nsamples:
    #     audio = np.pad(wav, (nsamples - wav.size, 0), mode='constant')
    # else:
    #     audio = wav[0:nsamples]
    # return audio

def get_noise(filename, nsamples=16000, stepSize = 2000):
    """Cut a WAV recording into overlapping fixed-length windows.

    Window `i` starts at sample `i * stepSize` and is `nsamples` long.
    `(len(wav) - nsamples) // stepSize` windows are produced, so every window
    lies completely inside the recording and no padding is ever needed.
    A recording shorter than `nsamples` yields an empty list.

    Args:
        filename: path of the WAV file.
        nsamples: window length in samples.
        stepSize: hop between consecutive window starts, in samples.

    Returns:
        list of numpy arrays, each with `nsamples` samples.
    """
    # wavfile.read returns (sample_rate, data); only the samples are used
    wav = wavfile.read(filename)[1]

    n_windows = (len(wav) - nsamples) // stepSize
    return [wav[i * stepSize: i * stepSize + nsamples] for i in range(n_windows)]

In [None]:
def loadData(files, withNoise = True):
    """Build the sample and label arrays from the file index.

    Three groups of samples are produced:
      * the ten command words, each clip optionally followed by a copy with
        background noise mixed in (about half of the clips),
      * every remaining word directory, labelled 'unknown',
      * all background-noise windows, labelled 'silence'.

    NOTE: `files` is modified in place. The '_background_noise_' entry and the
    ten command-word entries are popped, so on return it holds only the
    directories that were labelled 'unknown'.

    Args:
        files: dict of directory name -> list of file names, including a
            '_background_noise_' entry.
        withNoise: when False the noise-mixing augmentation is skipped.
            The 'silence' samples are added either way.

    Returns:
        (X, Y): float16 array of samples and an array of label strings.
    """
    train_audio_path = '/content/train/audio/'
    target_labels = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']

    # One list of fixed-length windows per background-noise recording
    noiseChunks = []
    for filename in files['_background_noise_']:
        noise = get_noise(train_audio_path + '_background_noise_' + '/' + filename)
        noiseChunks.append(noise)

    files.pop('_background_noise_')

    print("Loading and augmenting all classes")
    xtrain, ytrain = [], []
    for label in target_labels:

        for i, filename in enumerate(files[label]):
            audio = get_wav(train_audio_path + label + '/' + filename)
            xtrain.append(audio)
            ytrain.append(label)

            if withNoise and noiseChunks and random.random() > 0.5:
                # Data augmentation through adding noise: cycle through the
                # noise recordings, pick a random window and mix it in at a
                # random gain below 0.1.
                chunk = noiseChunks[i % len(noiseChunks)]
                idx = random.randint(0, len(chunk) - 1)
                noisy = audio + chunk[idx]*random.random()/10

                xtrain.append(noisy)
                ytrain.append(label)

        files.pop(label)

    print("Loading 'unknown' class")
    # Everything still left in `files` is a word outside the ten targets
    for label, filenames in files.items():
        for filename in filenames:
            audio = get_wav(train_audio_path + label + '/' + filename)
            xtrain.append(audio)
            ytrain.append('unknown')

    print("Adding 'silence' class")
    for chunk in noiseChunks:
        xtrain.extend(chunk)
        ytrain.extend(['silence']*len(chunk))

    return np.array(xtrain, dtype=np.float16), np.array(ytrain)



    # rand_noise = np.array([0]*16000)
                # for chunk in noiseChunks:
                #     idx = random.randint(0, len(chunk) - 1)
                #     noise_sample = chunk[idx]

                # rand_noise += noise_sample


### Preprocessing

In [None]:
def onehot(A, mapping=None):
  """One-hot encode a sequence of label strings.

  Args:
    A: iterable of label strings.
    mapping: optional dict of label -> one-hot list. When omitted, a mapping
      over the twelve classes is built in a fixed order, so a given label
      always receives the same index.

  Returns:
    (encoded, mapping, maptolable): array with one one-hot row per entry of
    A, the label -> one-hot dict that was used, and its inverse
    (index -> label).

  Raises:
    KeyError: if A contains a label missing from the mapping.
  """
  # A list, not a set: set iteration order for strings varies between
  # interpreter runs, which made the label indices non-reproducible.
  labels = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence', 'unknown']

  if mapping is None:
    mapping = {}
    for i, label in enumerate(labels):
      temp = [0] * len(labels)
      temp[i] = 1
      mapping[label] = temp

  # Derive the inverse from the mapping itself so it is also defined when the
  # caller supplies a mapping (previously that path raised UnboundLocalError).
  maptolable = {int(np.argmax(vector)): label for label, vector in mapping.items()}

  res = [mapping[label] for label in A]

  return np.array(res), mapping, maptolable

### Other Functions

In [None]:
def f1(y_true, y_pred):
  """Batch-wise F1 score for use as a Keras metric.

  Both tensors are clipped to [0, 1] and rounded, and the positive counts
  are summed over every element of the batch (all samples and all classes
  pooled together). Precision and recall are computed from those pooled
  counts and combined as their harmonic mean. K.epsilon() is added to each
  denominator to avoid division by zero.
  """
  eps = K.epsilon()

  # Pooled counts over the whole batch
  true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
  predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

  # Precision: how many selected items are relevant
  precision = true_positives / (predicted_positives + eps)
  # Recall: how many relevant items are selected
  recall = true_positives / (possible_positives + eps)

  return 2*((precision * recall)/(precision + recall + eps))

In [None]:
def swish(x, beta = 1):
    """Swish activation: x scaled by sigmoid(beta * x)."""
    gate = K.sigmoid(beta * x)
    return x * gate

# Getting the Custom object and updating them 
from keras.utils.generic_utils import get_custom_objects 
from keras.layers import Activation 
  
# Register the function itself (any key may be used in place of 'swish').
# Wrapping it in an Activation layer instance makes layers built with
# activation='swish' hold a layer object, which has no __name__ and therefore
# cannot be serialized when the model is saved.
get_custom_objects().update({'swish': swish})

In [None]:
def performance_curves(history, metrics):
  """Plot training vs. validation curves and print the best validation epoch.

  For every name in `metrics`, `history.history[name]` and
  `history.history['val_' + name]` are drawn against the 1-based epoch
  number. After each plot the best validation value is printed with its
  epoch: the minimum for 'loss', the maximum for every other metric.
  """
  for metric in metrics:
    train = history.history[metric]
    val = history.history['val_' + metric]
    x_axis = range(1, len(train) + 1)

    plt.figure()
    plt.plot(x_axis, train, label="Training " + metric)
    plt.plot(x_axis, val, label="Validation " + metric)

    plt.ylabel(metric)
    plt.xlabel('Epochs')
    plt.title('Epochs vs ' + metric)
    plt.legend()
    plt.show()

    # Lower is better for the loss, higher is better for everything else
    if metric == 'loss':
      best = min(val)
      print('Minimum Validation Loss is:', str(best))
    else:
      best = max(val)
      print('Maximum', metric, 'is:', str(best))
    print('Epoch: ', str(val.index(best) + 1))

## Generating Dataset

### Loading samples

In [None]:
files = generateFiles()
X, Y = loadData(files)

debug = True
if debug:
  print(len(X), len(Y))

  # loadData pops every directory it labels itself, so what is left in
  # `files` here are the word directories that were labelled 'unknown'.
  count = 0
  for label, samples in files.items():
    n_clips = len(samples)
    print(label, '\t', n_clips)
    count = count + n_clips

  print("Number of Audio signals:", count)

  if sys.path[0] == '':


Loading and augmenting all classes
Loading 'unknown' class
Adding 'silence' class
79849 79849
dog 	 1746
sheila 	 1734
bed 	 1713
two 	 2373
marvin 	 1746
six 	 2369
one 	 2370
nine 	 2364
four 	 2372
zero 	 2376
three 	 2356
bird 	 1731
seven 	 2377
happy 	 1742
cat 	 1733
tree 	 1733
wow 	 1745
house 	 1750
eight 	 2352
five 	 2357
Number of Audio signals: 41039


In [None]:
import collections

print(len(X), len(Y))

# Tally how many samples carry each label (labels keep first-seen order)
count = collections.Counter(Y)

for label in count:
  print(label, count[label])
    

79849 79849
yes 3613
no 3580
up 3594
down 3544
left 3532
right 3580
on 3500
off 3531
stop 3562
go 3628
unknown 41039
silence 3146


In [None]:
# Add a trailing channel axis to the samples: (N, 16000, 1)
Xfinal = X.reshape(-1, 16000, 1)

# One-hot targets with an extra middle axis: (N, 1, 12)
Yonehot, mapping, maptolable = onehot(Y)
Yonehot = Yonehot.reshape(-1, 1, 12)

# Hold out 20% for validation; the fixed seed keeps the split repeatable
xtrain, xval, ytrain, yval = train_test_split(
    Xfinal,
    Yonehot,
    test_size=0.2,
    random_state=127,
)

print('Number of Dimensions:', X.ndim)
print('Dataset')
print(f'\t {Xfinal.shape} \t {Yonehot.shape}')

print('\nTraining Data')
print(f'\t {xtrain.shape} \t {ytrain.shape}')

print('\nValidation Data')
print(f'\t {xval.shape} \t {yval.shape}')

Number of Dimensions: 2
Dataset
	 (79849, 16000, 1) 	 (79849, 1, 12)

Training Data
	 (63879, 16000, 1) 	 (63879, 1, 12)

Validation Data
	 (15970, 16000, 1) 	 (15970, 1, 12)


In [None]:
def printCount(labels):
    """Print how many samples each class has, smallest class first.

    `labels` is iterated sample by sample; `y[0]` of each sample is taken as
    its one-hot row, and the position of the 1 is translated to a class name
    through the module-level `maptolable`. A blank line ends the listing.
    """
    count = collections.Counter()
    for y in labels:
        # Position of the first entry equal to 1 in the one-hot row
        idx = np.flatnonzero(y[0] == 1)[0]
        count[maptolable[idx]] += 1

    # Stable sort: classes with equal counts keep first-seen order
    for label, c in sorted(count.items(), key=lambda item: item[1]):
        print(label, '\t', c)
    print('')

# Class balance of the full dataset, then of the training and validation splits
for onehot_labels in (Yonehot, ytrain, yval):
    printCount(onehot_labels)

silence 	 3146
on 	 3500
off 	 3531
left 	 3532
down 	 3544
stop 	 3562
no 	 3580
right 	 3580
up 	 3594
yes 	 3613
go 	 3628
unknown 	 41039

silence 	 2513
off 	 2795
on 	 2814
left 	 2818
down 	 2825
right 	 2848
up 	 2862
no 	 2867
stop 	 2879
yes 	 2897
go 	 2905
unknown 	 32856

silence 	 633
stop 	 683
on 	 686
no 	 713
left 	 714
yes 	 716
down 	 719
go 	 723
right 	 732
up 	 732
off 	 736
unknown 	 8183



## Implementing CNN

### Building & Training Model

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation, Flatten, Dropout, BatchNormalization, Conv1D, MaxPooling1D
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization, Conv1D, MaxPooling1D
from tensorflow.keras.callbacks import TensorBoard
from keras.regularizers import l2

# One row per convolutional stage: (filters, kernel_size, strides, pool_size).
# pool_size None means the stage has no max-pooling layer.
conv_stages = [
    (32, 10, 4, 2),
    (128, 1, 1, None),
    (64, 10, 4, 3),
    (128, 1, 1, None),
    (128, 10, 4, 2),
    (128, 1, 1, None),
    (256, 10, 4, 2),
]

model = Sequential()
for stage_idx, (filters, kernel, stride, pool) in enumerate(conv_stages):
    # Only the very first layer declares the input shape
    extra = {'input_shape': xtrain.shape[1:]} if stage_idx == 0 else {}

    # Every stage is Conv1D -> BatchNorm -> (MaxPool) -> Dropout
    model.add(Conv1D(filters, kernel_size=kernel, strides=stride, activation='swish', **extra))
    model.add(BatchNormalization())
    if pool is not None:
        model.add(MaxPooling1D(pool))
    model.add(Dropout(0.1))

# Classification head: one output unit per entry of `mapping`
model.add(Dense(64, activation='swish'))
model.add(Dense(len(mapping), activation='softmax'))

model.summary()

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=[f1, 'accuracy'],
)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_8 (Conv1D)            (None, 3998, 32)          352       
_________________________________________________________________
batch_normalization_8 (Batch (None, 3998, 32)          128       
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 1999, 32)          0         
_________________________________________________________________
dropout_8 (Dropout)          (None, 1999, 32)          0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 1999, 128)         4224      
_________________________________________________________________
batch_normalization_9 (Batch (None, 1999, 128)         512       
_________________________________________________________________
dropout_9 (Dropout)          (None, 1999, 128)        

In [None]:
# Train for 50 epochs in shuffled mini-batches of 128, evaluating on the
# validation split after each epoch; `history` keeps the per-epoch metrics.
history = model.fit(xtrain, ytrain, batch_size=128, validation_data=(xval, yval), epochs=50, shuffle=True, verbose=1)

Train on 63879 samples, validate on 15970 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/5

### Evaluating Model

In [None]:
# Plot training vs. validation curves for every name in model.metrics_names
performance_curves(history, model.metrics_names)

In [None]:
scores = model.evaluate(xval, yval, verbose=0)

# Print every compiled metric for the validation split, scaled by 100
for idx, metric_name in enumerate(model.metrics_names):
  print(metric_name, scores[idx] * 100)

In [None]:
# Persist the trained model, then drop the in-memory copy so the evaluation
# below really exercises the reloaded one.
model.save('CNN')
del model

import pandas as pd

# Store the index -> label table next to the model: one row per class index
df = pd.DataFrame(data=maptolable, index=[0]).T
df.to_csv('label_mappings.csv')

# f1 is a custom metric, so it must be handed to the loader explicitly
model = keras.models.load_model('CNN', custom_objects={'f1':f1})

# Score the reloaded model on the complete dataset, scaled by 100
scores = model.evaluate(Xfinal, Yonehot, verbose=0)
for metric_name, evalutaion in zip(model.metrics_names, scores):
  print(metric_name, evalutaion * 100)

## Generating Test Data Predictions

In [None]:
# !p7zip -d test.7z > /dev/null

In [None]:
# mappings = {}
# maptoLabel = {}
# with open('mappings.csv', mode='r') as infile:
#     reader = csv.reader(infile)

#     for i, row in enumerate(reader):
#         temp = [0]*12
#         temp[i] = 1
#         mappings[row[0]] = temp
#         maptoLabel[i] = row[0]

# test = []
# with open('sample_submission.csv',mode='r') as infile:
#     reader = csv.reader(infile)

#     for row in reader:
#         test.append(row[0])

# test.pop(0)
# print(len(test), test[0])


In [None]:
# model = keras.models.load_model('CNN', custom_objects={'f1':f1})

# test_audio_path = '/content/test/audio'

# ypred = []
# for filename in test:
#   audio = get_wav(test_audio_path + '/' + filename)
#   audio = audio.reshape(-1, 16000, 1)
#   pred = model.predict(audio)
#   ypred.append(np.argmax(pred[0][0], axis=0))

In [None]:
# import pandas as pd
# final = []

# for pred in ypred:
#   final.append(maptoLabel[pred])

# df = pd.DataFrame(final)
# df.to_csv ('final_df.csv', index = False)

In [None]:
# Show the index -> label dictionary that onehot() returned
print(maptolable)