In [None]:
# ! pip install --user librosa

In [None]:
from pathlib import Path
from scipy.io import wavfile
import scipy.signal
import pandas as pd
from tqdm.auto import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import os
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Dropout, Conv2D, MaxPooling2D, Flatten, BatchNormalization, Activation
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l1, l2

## Mount Drive

In [None]:
def is_running_on_colab():
    """Tell whether this notebook is executing inside Google Colab.

    The check is purely import-based: if the `google.colab` package can be
    imported the function returns True, otherwise False.
    """
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    else:
        return True

# Evaluate the environment probe once; later cells branch on this flag.
ON_COLAB = is_running_on_colab()
# Bare name as the last expression so the notebook displays the flag.
ON_COLAB

True

In [None]:
# Resolve the folder that holds the pre-computed feature/label files.
# Locally it is a path relative to the notebook; on Colab Google Drive is
# mounted first and the folder inside the drive is used.
if not ON_COLAB:
    intermediate_folder = Path('..') / 'data' / 'intermediate'
else:
    from google.colab import drive

    drive.mount('/content/gdrive')
    intermediate_folder = Path('/content/gdrive/MyDrive/Colab Notebooks/Speech recognition')
    # Alternative Drive location:
    # intermediate_folder = Path('/content/gdrive/MyDrive/Temp/Speech recognition project')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Read data

In [None]:
# Load the training STFT features and swap the last two axes.
# The recorded output shape after the swap is (33566, 32, 1025)
# -- presumably (samples, time frames, frequency bins); TODO confirm.
X_train = np.transpose(
    np.load(intermediate_folder / 'train_main_1_sec_audio_stft_librosa.npy'),
    axes=(0, 2, 1),
)
X_train.shape

(33566, 32, 1025)

In [None]:
# Load the validation STFT features with the same axis swap as the training set.
# The recorded output shape after the swap is (4619, 32, 1025).
X_val = np.transpose(
    np.load(intermediate_folder / 'val_main_1_sec_audio_stft_librosa.npy'),
    axes=(0, 2, 1),
)
X_val.shape

(4619, 32, 1025)

In [None]:
# X_test = np.load(intermediate_folder / 'test_main_1_sec_audio_stft_librosa.npy').transpose(0, 2, 1)
# X_test.shape

In [None]:
# The label files are read as header-less CSVs; column 0 of each frame is
# kept as a pandas Series of string labels.
y_train_labels = pd.read_csv(
    intermediate_folder / 'train_main_1_sec_labels.csv',
    header=None,
    index_col=False,
)[0]
y_val_labels = pd.read_csv(
    intermediate_folder / 'val_main_1_sec_labels.csv',
    header=None,
    index_col=False,
)[0]
# y_test_labels = pd.read_csv(intermediate_folder / 'test_main_1_sec_labels.csv', header=None, index_col=False)[0]

# Show the number of labels per split (test split currently disabled).
y_train_labels.shape, y_val_labels.shape
# y_train_labels.shape, y_val_labels.shape, y_test_labels.shape

((33566,), (4619,))

In [None]:
# Fit the encoder on the training labels only, then map both splits to
# integer class ids with that same fitted encoder.
le = LabelEncoder().fit(y_train_labels)
y_train, y_val = (le.transform(labels) for labels in (y_train_labels, y_val_labels))
# y_test = le.transform(y_test_labels)

y_train.shape, y_val.shape
# y_train.shape, y_val.shape, y_test.shape

((33566,), (4619,))

In [None]:
# Number of training samples per label, ordered by label name.
y_train_labels.value_counts().sort_index()

down     1667
eight    1655
five     1696
four     1662
go       1647
left     1683
nine     1723
no       1630
off      1668
on       1650
one      1672
right    1687
seven    1708
six      1727
stop     1715
three    1672
two      1693
up       1591
yes      1686
zero     1734
Name: 0, dtype: int64

In [None]:
# Number of training samples per encoded class id, ordered by id; the recorded
# output shows the same counts as the per-label table, in the same order.
pd.Series(y_train).value_counts().sort_index()

0     1667
1     1655
2     1696
3     1662
4     1647
5     1683
6     1723
7     1630
8     1668
9     1650
10    1672
11    1687
12    1708
13    1727
14    1715
15    1672
16    1693
17    1591
18    1686
19    1734
dtype: int64

In [None]:
# Re-display the validation array shape before the axis sanity check that follows.
X_val.shape

(4619, 32, 1025)

In [None]:
# Sanity check of the axis layout: slicing index 0 on the last axis must give
# exactly the same values as slicing index 0 on the middle axis of the
# swapped-back array. Compared element-wise rather than via float means,
# which is both a stronger check and free of exact-float-equality pitfalls.
assert np.array_equal(X_val[:, :, 0], X_val.transpose(0, 2, 1)[:, 0, :]), \
    'X_val axis layout does not match the expected transpose'

In [None]:
# Peek at the first ten encoded labels; the recorded output is all zeros,
# i.e. the loaded data starts with a run of a single class (hence the shuffle next).
y_train[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
# Shuffle features and labels with one shared permutation so they stay aligned.
# A fixed seed makes the order reproducible under Restart & Run All.
SHUFFLE_SEED = 42
indxs_X_train = np.random.default_rng(SHUFFLE_SEED).permutation(len(X_train))
X_train = X_train[indxs_X_train]
y_train = y_train[indxs_X_train]
# Display the first ten labels to confirm the classes are now mixed.
y_train[:10]

array([ 3, 10, 15, 16,  0,  1,  7,  7,  9, 13])

## Standardize

In [None]:
# Scalar mean and standard deviation taken over every element of the training
# array (no per-axis statistics).
MEAN = np.mean(X_train)
STD = X_train.std()
MEAN, STD

(10463.064529914764, 73450.26476420577)

In [None]:
# Standardize every split with the scalar statistics computed from X_train.
# NOTE: the arrays are overwritten under the same names, so this cell is not
# idempotent -- running it twice standardizes already-standardized data.
# Re-run the loading cells first if it needs to be executed again.
X_train = (X_train - MEAN) / STD
X_val = (X_val - MEAN) / STD
# X_test = (X_test - MEAN) / STD
# Shapes are displayed to confirm standardization left them unchanged.
X_train.shape, X_val.shape
# X_train.shape, X_val.shape, X_test.shape

((33566, 32, 1025), (4619, 32, 1025))

## Functions

In [None]:
def train(model, patience_train, patience_val=None, learning_rate=.001, X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, epochs=20, batch_size=128, cont_train=False):
  """Compile (unless continuing) and fit a Keras classifier with optional early stopping.

  Prints the run configuration and the model summary, then fits the model
  on the training data while evaluating on the validation data each epoch.

  Args:
    model: Keras model to train; it is compiled/fitted in place.
    patience_train: Early-stopping patience (in epochs) on the training loss.
      A falsy value disables this callback.
    patience_val: Early-stopping patience (in epochs) on the validation loss.
      A falsy value disables this callback.
    learning_rate: Learning rate for the Adam optimizer; only used when the
      model is compiled (i.e. `cont_train` is False).
    X_train, y_train: Training features and integer class ids. The defaults
      are the notebook globals as they were when this cell was executed.
    X_val, y_val: Validation features and integer class ids (same caveat).
    epochs: Maximum number of epochs.
    batch_size: Mini-batch size passed to `model.fit`.
    cont_train: When True, skip `model.compile` and keep fitting the model
      as it is currently compiled.
  """
  print(f'{patience_train=}, {patience_val=}, {learning_rate=}, {epochs=}, {batch_size=}, {cont_train=}, {X_train.shape=}, {y_train.shape=}, {X_val.shape=}, {y_val.shape=}')
  model.summary()
  print(f'Input shape {model.input_shape}, output shape {model.output_shape}')

  callbacks = []
  if patience_train:
    callbacks.append(EarlyStopping(monitor='loss', patience=patience_train))
  if patience_val:
    # Must watch the validation loss; previously this also monitored 'loss',
    # which made patience_val just a second training-loss criterion.
    callbacks.append(EarlyStopping(monitor='val_loss', patience=patience_val))

  if not cont_train:
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
  # Pass the flat callback list (not a list nested inside another list).
  model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val), callbacks=callbacks)

## CNN

In [None]:
# Experiment: three Conv2D(32 filters, 2x2, ReLU, same padding) + 2x2 max-pool
# stages, flattened straight into a 20-way softmax.
model = Sequential()
model.add(Conv2D(32, (2, 2), activation='relu', padding='same', input_shape=(32, 1025, 1)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(32, (2, 2), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(32, (2, 2), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
# Disabled variants between Flatten and the head:
# Dropout(.2) -> Dense(32, sigmoid) -> Dropout(.5) -> Dense(128, sigmoid) -> Dropout(.2)
model.add(Dense(20, activation='softmax'))

train(model=model, patience_train=10, patience_val=None, learning_rate=.001,
      epochs=1000, batch_size=256)  # add cont_train=True to skip re-compiling

patience_train=10, patience_val=None, learning_rate=0.001, epochs=1000, batch_size=256, cont_train=False, X_train.shape=(33566, 32, 1025), y_train.shape=(33566,), X_val.shape=(4619, 32, 1025), y_val.shape=(4619,)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 32, 1025, 32)      160       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 16, 512, 32)      0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 16, 512, 32)       4128      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 8, 256, 32)       0         
 2D)                                                             
                                         

In [None]:
# Experiment: four Conv2D(32 filters, 2x2, ReLU, same padding) + 2x2 max-pool
# stages, flattened straight into a 20-way softmax.
model = Sequential()
model.add(Conv2D(32, (2, 2), activation='relu', padding='same', input_shape=(32, 1025, 1)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(32, (2, 2), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(32, (2, 2), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(32, (2, 2), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
# Disabled variants between Flatten and the head:
# Dropout(.2) -> Dense(32, sigmoid) -> Dropout(.5) -> Dense(128, sigmoid) -> Dropout(.2)
model.add(Dense(20, activation='softmax'))

train(model=model, patience_train=10, patience_val=None, learning_rate=.001,
      epochs=1000, batch_size=256)  # add cont_train=True to skip re-compiling

patience_train=10, patience_val=None, learning_rate=0.001, epochs=1000, batch_size=256, cont_train=False, X_train.shape=(33566, 32, 1025), y_train.shape=(33566,), X_val.shape=(4619, 32, 1025), y_val.shape=(4619,)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 32, 1025, 32)      160       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 16, 512, 32)      0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 16, 512, 32)       4128      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 8, 256, 32)       0         
 2D)                                                             
                                         

In [None]:
# Experiment: five Conv2D(32 filters, 2x2, ReLU, same padding) + 2x2 max-pool
# stages, flattened straight into a 20-way softmax; trained with a larger
# learning rate / batch size and shorter patience than the runs above.
model = Sequential()
model.add(Conv2D(32, (2, 2), activation='relu', padding='same', input_shape=(32, 1025, 1)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(32, (2, 2), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(32, (2, 2), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(32, (2, 2), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(32, (2, 2), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
# Disabled variants between Flatten and the head:
# Dropout(.2) -> Dense(32, sigmoid) -> Dropout(.5) -> Dense(128, sigmoid) -> Dropout(.2)
model.add(Dense(20, activation='softmax'))

train(model=model, patience_train=3, patience_val=None, learning_rate=.01,
      epochs=1000, batch_size=512)  # add cont_train=True to skip re-compiling

patience_train=3, patience_val=None, learning_rate=0.01, epochs=1000, batch_size=512, cont_train=False, X_train.shape=(33566, 32, 1025), y_train.shape=(33566,), X_val.shape=(4619, 32, 1025), y_val.shape=(4619,)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 32, 1025, 32)      160       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 16, 512, 32)      0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 16, 512, 32)       4128      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 8, 256, 32)       0         
 2D)                                                             
                                           

In [None]:
# Experiment: four Conv2D(32 filters, 2x4 kernel, ReLU, same padding) + 2x4
# max-pool stages, flattened straight into a 20-way softmax.
model = Sequential()
model.add(Conv2D(32, (2, 4), activation='relu', padding='same', input_shape=(32, 1025, 1)))
model.add(MaxPooling2D((2, 4)))
model.add(Conv2D(32, (2, 4), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 4)))
model.add(Conv2D(32, (2, 4), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 4)))
model.add(Conv2D(32, (2, 4), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 4)))
model.add(Flatten())
# Disabled variants between Flatten and the head:
# Dropout(.2) -> Dense(32, sigmoid) -> Dropout(.5) -> Dense(128, sigmoid) -> Dropout(.2)
model.add(Dense(20, activation='softmax'))

train(model=model, patience_train=3, patience_val=None, learning_rate=.01,
      epochs=1000, batch_size=512)  # add cont_train=True to skip re-compiling

patience_train=3, patience_val=None, learning_rate=0.01, epochs=1000, batch_size=512, cont_train=False, X_train.shape=(33566, 32, 1025), y_train.shape=(33566,), X_val.shape=(4619, 32, 1025), y_val.shape=(4619,)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 32, 1025, 32)      288       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 16, 256, 32)      0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 16, 256, 32)       8224      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 8, 64, 32)        0         
 2D)                                                             
                                           

In [None]:
# Experiment: three Conv2D(32 filters, 3x6 kernel, ReLU, same padding) + 3x6
# max-pool stages, flattened straight into a 20-way softmax.
model = Sequential()
model.add(Conv2D(32, (3, 6), activation='relu', padding='same', input_shape=(32, 1025, 1)))
model.add(MaxPooling2D((3, 6)))
model.add(Conv2D(32, (3, 6), activation='relu', padding='same'))
model.add(MaxPooling2D((3, 6)))
model.add(Conv2D(32, (3, 6), activation='relu', padding='same'))
model.add(MaxPooling2D((3, 6)))
model.add(Flatten())
# Disabled variants between Flatten and the head:
# Dropout(.2) -> Dense(32, sigmoid) -> Dropout(.5) -> Dense(128, sigmoid) -> Dropout(.2)
model.add(Dense(20, activation='softmax'))

train(model=model, patience_train=3, patience_val=None, learning_rate=.01,
      epochs=1000, batch_size=512)  # add cont_train=True to skip re-compiling

patience_train=3, patience_val=None, learning_rate=0.01, epochs=1000, batch_size=512, cont_train=False, X_train.shape=(33566, 32, 1025), y_train.shape=(33566,), X_val.shape=(4619, 32, 1025), y_val.shape=(4619,)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 32, 1025, 32)      608       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 10, 170, 32)      0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 10, 170, 32)       18464     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 3, 28, 32)        0         
 2D)                                                             
                                           

In [None]:
# Experiment: three 3x6 conv + 3x6 max-pool stages with the filter count
# doubling per stage (32 -> 64 -> 128), flattened straight into a 20-way softmax.
model = Sequential()
model.add(Conv2D(32, (3, 6), activation='relu', padding='same', input_shape=(32, 1025, 1)))
model.add(MaxPooling2D((3, 6)))
model.add(Conv2D(64, (3, 6), activation='relu', padding='same'))
model.add(MaxPooling2D((3, 6)))
model.add(Conv2D(128, (3, 6), activation='relu', padding='same'))
model.add(MaxPooling2D((3, 6)))
model.add(Flatten())
# Disabled variants between Flatten and the head:
# Dropout(.2) -> Dense(128, relu) -> Dropout(.5)
model.add(Dense(20, activation='softmax'))

train(model=model, patience_train=3, patience_val=None, learning_rate=.01,
      epochs=1000, batch_size=512)  # add cont_train=True to skip re-compiling

patience_train=3, patience_val=None, learning_rate=0.01, epochs=1000, batch_size=512, cont_train=False, X_train.shape=(33566, 32, 1025), y_train.shape=(33566,), X_val.shape=(4619, 32, 1025), y_val.shape=(4619,)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 32, 1025, 32)      608       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 10, 170, 32)      0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 10, 170, 64)       36928     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 3, 28, 64)        0         
 2D)                                                             
                                           

In [None]:
# Experiment: three 3x6 conv + 3x6 max-pool stages (32 -> 64 -> 128 filters)
# followed by a 128-unit ReLU hidden layer before the 20-way softmax.
model = Sequential()
model.add(Conv2D(32, (3, 6), activation='relu', padding='same', input_shape=(32, 1025, 1)))
model.add(MaxPooling2D((3, 6)))
model.add(Conv2D(64, (3, 6), activation='relu', padding='same'))
model.add(MaxPooling2D((3, 6)))
model.add(Conv2D(128, (3, 6), activation='relu', padding='same'))
model.add(MaxPooling2D((3, 6)))
model.add(Flatten())
# Disabled variant: Dropout(.2) before the hidden layer.
model.add(Dense(128, activation='relu'))
# Disabled variant: Dropout(.5) after the hidden layer.
model.add(Dense(20, activation='softmax'))

train(model=model, patience_train=3, patience_val=None, learning_rate=.01,
      epochs=1000, batch_size=512)  # add cont_train=True to skip re-compiling

patience_train=3, patience_val=None, learning_rate=0.01, epochs=1000, batch_size=512, cont_train=False, X_train.shape=(33566, 32, 1025), y_train.shape=(33566,), X_val.shape=(4619, 32, 1025), y_val.shape=(4619,)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 32, 1025, 32)      608       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 10, 170, 32)      0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 10, 170, 64)       36928     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 3, 28, 64)        0         
 2D)                                                             
                                           

In [None]:
# Experiment: four 2x4 conv + 2x4 max-pool stages with the filter count
# doubling per stage (32 -> 64 -> 128 -> 256), flattened straight into a
# 20-way softmax.
model = Sequential()
model.add(Conv2D(32, (2, 4), activation='relu', padding='same', input_shape=(32, 1025, 1)))
model.add(MaxPooling2D((2, 4)))
model.add(Conv2D(64, (2, 4), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 4)))
model.add(Conv2D(128, (2, 4), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 4)))
model.add(Conv2D(256, (2, 4), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 4)))
model.add(Flatten())
# Disabled variants between Flatten and the head:
# Dropout(.2) -> Dense(32, sigmoid) -> Dropout(.5) -> Dense(128, sigmoid) -> Dropout(.2)
model.add(Dense(20, activation='softmax'))

train(model=model, patience_train=3, patience_val=None, learning_rate=.01,
      epochs=1000, batch_size=512)  # add cont_train=True to skip re-compiling

patience_train=3, patience_val=None, learning_rate=0.01, epochs=1000, batch_size=512, cont_train=False, X_train.shape=(33566, 32, 1025), y_train.shape=(33566,), X_val.shape=(4619, 32, 1025), y_val.shape=(4619,)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 32, 1025, 32)      288       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 16, 256, 32)      0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 16, 256, 64)       16448     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 8, 64, 64)        0         
 2D)                                                             
                                           

In [None]:
# Experiment: four 2x4 conv + 2x4 max-pool stages with 32 filters in the first
# stage and 64 in the remaining three, flattened straight into a 20-way softmax.
model = Sequential()
model.add(Conv2D(32, (2, 4), activation='relu', padding='same', input_shape=(32, 1025, 1)))
model.add(MaxPooling2D((2, 4)))
model.add(Conv2D(64, (2, 4), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 4)))
model.add(Conv2D(64, (2, 4), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 4)))
model.add(Conv2D(64, (2, 4), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 4)))
model.add(Flatten())
# Disabled variants between Flatten and the head:
# Dropout(.2) -> Dense(32, sigmoid) -> Dropout(.5) -> Dense(128, sigmoid) -> Dropout(.2)
model.add(Dense(20, activation='softmax'))

train(model=model, patience_train=3, patience_val=None, learning_rate=.01,
      epochs=1000, batch_size=512)  # add cont_train=True to skip re-compiling

patience_train=3, patience_val=None, learning_rate=0.01, epochs=1000, batch_size=512, cont_train=False, X_train.shape=(33566, 32, 1025), y_train.shape=(33566,), X_val.shape=(4619, 32, 1025), y_val.shape=(4619,)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 32, 1025, 32)      288       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 16, 256, 32)      0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 16, 256, 64)       16448     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 8, 64, 64)        0         
 2D)                                                             
                                           

In [None]:
# Experiment: four 2x4 conv + 2x4 max-pool stages with 32 -> 64 -> 128 -> 128
# filters, flattened straight into a 20-way softmax.
model = Sequential()
model.add(Conv2D(32, (2, 4), activation='relu', padding='same', input_shape=(32, 1025, 1)))
model.add(MaxPooling2D((2, 4)))
model.add(Conv2D(64, (2, 4), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 4)))
model.add(Conv2D(128, (2, 4), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 4)))
model.add(Conv2D(128, (2, 4), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 4)))
model.add(Flatten())
# Disabled variants between Flatten and the head:
# Dropout(.2) -> Dense(32, sigmoid) -> Dropout(.5) -> Dense(128, sigmoid) -> Dropout(.2)
model.add(Dense(20, activation='softmax'))

train(model=model, patience_train=3, patience_val=None, learning_rate=.01,
      epochs=1000, batch_size=512)  # add cont_train=True to skip re-compiling

In [None]:
# Experiment: three Conv2D(32 filters, 2x2, ReLU, same padding) + 2x2 max-pool
# stages, with Dropout(.3) applied to the flattened features before the
# 20-way softmax.
model = Sequential()
model.add(Conv2D(32, (2, 2), activation='relu', padding='same', input_shape=(32, 1025, 1)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(32, (2, 2), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(32, (2, 2), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dropout(.3))
# Disabled variants before the head: Dense(32, relu) -> Dropout(.5)
model.add(Dense(20, activation='softmax'))

train(model=model, patience_train=3, patience_val=None, learning_rate=.01,
      epochs=1000, batch_size=512)  # add cont_train=True to skip re-compiling

patience_train=3, patience_val=None, learning_rate=0.01, epochs=1000, batch_size=512, cont_train=False, X_train.shape=(33566, 32, 1025), y_train.shape=(33566,), X_val.shape=(4619, 32, 1025), y_val.shape=(4619,)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 32, 1025, 32)      160       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 16, 512, 32)      0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 16, 512, 32)       4128      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 8, 256, 32)       0         
 2D)                                                             
                                           

## End