In [2]:
# ! pip install --user librosa

In [3]:
from pathlib import Path
from scipy.io import wavfile
import scipy.signal
import pandas as pd
from tqdm.auto import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import os
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Dropout, Conv2D, MaxPooling2D, Flatten, BatchNormalization
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l1, l2

## Mount Drive

In [4]:
def is_running_on_colab():
    """Report whether this notebook is executing inside Google Colab.

    Detection works by attempting to import ``google.colab``; a successful
    import is treated as "running on Colab".

    Returns:
        bool: True if ``google.colab`` can be imported, otherwise False.
    """
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    else:
        return True


# Evaluated once so that later cells can branch on the environment.
ON_COLAB = is_running_on_colab()
ON_COLAB

True

In [5]:
# Choose the folder that the later cells read the feature/label files from.
# Local runs use the repo-relative data directory; on Colab, Google Drive is
# mounted first and a folder inside the mounted drive is used instead.
# (A previously used Drive location was
#  '/content/gdrive/MyDrive/Colab Notebooks/Speech recognition'.)
if not ON_COLAB:
    intermediate_folder = Path('..') / 'data' / 'intermediate'
else:
    from google.colab import drive

    drive.mount('/content/gdrive')
    intermediate_folder = Path('/content/gdrive/MyDrive/Temp/Speech recognition project')

Mounted at /content/gdrive


## Read data

In [6]:
# Load the cached training MFCC array and swap its last two axes.
# The recorded output shape after the swap is (33566, 32, 20).
# NOTE(review): presumably (samples, frames, coefficients) -- confirm against
# the notebook that produced the .npy file.
train_mfcc_path = intermediate_folder / 'train_main_1_sec_audio_mfcc.npy'
X_train = np.transpose(np.load(train_mfcc_path), (0, 2, 1))
X_train.shape

(33566, 32, 20)

In [7]:
# Load the cached validation MFCC array with the same axis swap that is
# applied to the training array, so both splits share one layout.
val_mfcc_path = intermediate_folder / 'val_main_1_sec_audio_mfcc.npy'
X_val = np.transpose(np.load(val_mfcc_path), (0, 2, 1))
X_val.shape

(4619, 32, 20)

In [8]:
# X_test = np.load(intermediate_folder / 'test_main_1_sec_audio_stft_scipy.npy').transpose(0, 2, 1)
# X_test.shape

In [9]:
# Read the label files: header-less CSVs, of which only column 0 is kept.
# Loading of the test split is currently disabled in this notebook.
train_labels_path = intermediate_folder / 'train_main_1_sec_labels.csv'
val_labels_path = intermediate_folder / 'val_main_1_sec_labels.csv'

y_train_labels = pd.read_csv(train_labels_path, header=None, index_col=False)[0]
y_val_labels = pd.read_csv(val_labels_path, header=None, index_col=False)[0]

y_train_labels.shape, y_val_labels.shape

((33566,), (4619,))

In [10]:
# Map the raw labels to integer class ids. The encoder is fitted on the
# training labels only and then applied to both splits.
# (The test split is currently not encoded in this notebook.)
le = LabelEncoder().fit(y_train_labels)

y_train, y_val = (le.transform(labels) for labels in (y_train_labels, y_val_labels))
y_train.shape, y_val.shape

((33566,), (4619,))

In [11]:
# Number of training samples per encoded class id, ordered by class id.
class_counts = pd.Series(y_train).value_counts()
class_counts.sort_index()

0     1667
1     1655
2     1696
3     1662
4     1647
5     1683
6     1723
7     1630
8     1668
9     1650
10    1672
11    1687
12    1708
13    1727
14    1715
15    1672
16    1693
17    1591
18    1686
19    1734
dtype: int64

In [12]:
# Re-display the validation array's shape before the axis sanity check below.
np.shape(X_val)

(4619, 32, 20)

In [13]:
# Sanity check on axis handling: selecting index 0 on the last axis must yield
# exactly the same elements as swapping the last two axes and selecting index 0
# on the middle axis. The two views are compared element-wise rather than via
# their means: equal means do not imply equal elements, and this form does not
# depend on `==` between two floating-point reductions.
assert np.array_equal(X_val[:, :, 0], X_val.transpose(0, 2, 1)[:, 0, :]), \
    'axis swap does not select the same elements'

## Standardize

In [14]:
# Scalar mean and standard deviation taken over every element of the training
# array (no per-axis statistics); used below to standardize both splits.
MEAN = np.mean(X_train)
STD = X_train.std()
MEAN, STD

(33.065225269555995, 142.73227249991956)

In [15]:
# Standardize both splits with the training-set statistics.
# NOTE: this rebinds X_train / X_val to their scaled versions, so the cell is
# not idempotent -- re-running it (or the MEAN/STD cell after it) rescales
# already-scaled data. Restart and run top-to-bottom instead.
# (The test split is currently not processed in this notebook.)
X_train, X_val = ((features - MEAN) / STD for features in (X_train, X_val))
X_train.shape, X_val.shape

((33566, 32, 20), (4619, 32, 20))

## Functions

In [16]:
def train(model, patience_train, patience_val=None, learning_rate=.001, X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, epochs=20, batch_size=128, cont_train=False):
  """Compile (unless continuing) and fit a Keras model with optional early stopping.

  Prints the run configuration and the model summary, then calls ``model.fit``
  with ``(X_val, y_val)`` as validation data.

  Args:
    model: Keras model to train; it is modified in place.
    patience_train: if truthy, stop after this many epochs without improvement
      of the training loss (``loss``).
    patience_val: if truthy, stop after this many epochs without improvement
      of the validation loss (``val_loss``).
    learning_rate: learning rate for the Adam optimizer; only used when the
      model is (re)compiled, i.e. when ``cont_train`` is False.
    X_train, y_train, X_val, y_val: training / validation data. The defaults
      are bound to the notebook globals as they exist when this cell is
      executed, so re-run this cell after changing those arrays.
    epochs: maximum number of epochs.
    batch_size: mini-batch size passed to ``model.fit``.
    cont_train: if True, skip ``model.compile`` so that training continues
      with the model's existing optimizer state.

  Returns:
    None.
  """
  print(f'{patience_train=}, {patience_val=}, {learning_rate=}, {epochs=}, {batch_size=}, {cont_train=}, {X_train.shape=}, {y_train.shape=}, {X_val.shape=}, {y_val.shape=}')
  model.summary()
  print(f'Input shape {model.input_shape}, output shape {model.output_shape}')

  callbacks = []
  if patience_train:
    callbacks.append(EarlyStopping(monitor='loss', patience=patience_train))
  if patience_val:
    # The validation patience has to watch the validation loss; monitoring
    # 'loss' here would merely duplicate the training-loss callback above.
    callbacks.append(EarlyStopping(monitor='val_loss', patience=patience_val))

  if not cont_train:
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
  # Pass the callbacks as a flat list (not a list nested inside another list).
  model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val), callbacks=callbacks)

## CNN

In [16]:
# Experiment: three conv(2x2, relu, same) + max-pool(2x2) stages with 32/32/32
# filters, flattened into a 20-way softmax. Adam learning rate 0.01,
# early stopping on training loss (patience 10).
model = Sequential()
model.add(Conv2D(32, (2, 2), activation='relu', padding='same', input_shape=(32, 20, 1)))
model.add(MaxPooling2D((2, 2)))
for n_filters in (32, 32):
    model.add(Conv2D(n_filters, (2, 2), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(20, activation='softmax'))

train(model=model, patience_train=10, patience_val=None,
      learning_rate=.01, epochs=1000, batch_size=256)

patience_train=10, patience_val=None, learning_rate=0.01, epochs=1000, batch_size=256, cont_train=False, X_train.shape=(33566, 32, 20), y_train.shape=(33566,), X_val.shape=(4619, 32, 20), y_val.shape=(4619,)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 32, 20, 32)        160       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 16, 10, 32)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 16, 10, 32)        4128      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 8, 5, 32)         0         
 2D)                                                             
                                              

In [17]:
# Experiment: same three-stage conv/pool layout, but with 16/16/16 filters.
# Adam learning rate 0.01, early stopping on training loss (patience 10).
model = Sequential()
model.add(Conv2D(16, (2, 2), activation='relu', padding='same', input_shape=(32, 20, 1)))
model.add(MaxPooling2D((2, 2)))
for n_filters in (16, 16):
    model.add(Conv2D(n_filters, (2, 2), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(20, activation='softmax'))

train(model=model, patience_train=10, patience_val=None,
      learning_rate=.01, epochs=1000, batch_size=256)

patience_train=10, patience_val=None, learning_rate=0.01, epochs=1000, batch_size=256, cont_train=False, X_train.shape=(33566, 32, 20), y_train.shape=(33566,), X_val.shape=(4619, 32, 20), y_val.shape=(4619,)
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_3 (Conv2D)           (None, 32, 20, 16)        80        
                                                                 
 max_pooling2d_3 (MaxPooling  (None, 16, 10, 16)       0         
 2D)                                                             
                                                                 
 conv2d_4 (Conv2D)           (None, 16, 10, 16)        1040      
                                                                 
 max_pooling2d_4 (MaxPooling  (None, 8, 5, 16)         0         
 2D)                                                             
                                            

In [18]:
# Experiment: three-stage conv/pool layout with 16/32/32 filters.
# Adam learning rate 0.01, early stopping on training loss (patience 10).
model = Sequential()
model.add(Conv2D(16, (2, 2), activation='relu', padding='same', input_shape=(32, 20, 1)))
model.add(MaxPooling2D((2, 2)))
for n_filters in (32, 32):
    model.add(Conv2D(n_filters, (2, 2), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(20, activation='softmax'))

train(model=model, patience_train=10, patience_val=None,
      learning_rate=.01, epochs=1000, batch_size=256)

patience_train=10, patience_val=None, learning_rate=0.01, epochs=1000, batch_size=256, cont_train=False, X_train.shape=(33566, 32, 20), y_train.shape=(33566,), X_val.shape=(4619, 32, 20), y_val.shape=(4619,)
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_6 (Conv2D)           (None, 32, 20, 16)        80        
                                                                 
 max_pooling2d_6 (MaxPooling  (None, 16, 10, 16)       0         
 2D)                                                             
                                                                 
 conv2d_7 (Conv2D)           (None, 16, 10, 32)        2080      
                                                                 
 max_pooling2d_7 (MaxPooling  (None, 8, 5, 32)         0         
 2D)                                                             
                                            

In [19]:
# Experiment: three-stage conv/pool layout with 32/16/32 filters.
# Adam learning rate 0.01, early stopping on training loss (patience 10).
model = Sequential()
model.add(Conv2D(32, (2, 2), activation='relu', padding='same', input_shape=(32, 20, 1)))
model.add(MaxPooling2D((2, 2)))
for n_filters in (16, 32):
    model.add(Conv2D(n_filters, (2, 2), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(20, activation='softmax'))

train(model=model, patience_train=10, patience_val=None,
      learning_rate=.01, epochs=1000, batch_size=256)

patience_train=10, patience_val=None, learning_rate=0.01, epochs=1000, batch_size=256, cont_train=False, X_train.shape=(33566, 32, 20), y_train.shape=(33566,), X_val.shape=(4619, 32, 20), y_val.shape=(4619,)
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_9 (Conv2D)           (None, 32, 20, 32)        160       
                                                                 
 max_pooling2d_9 (MaxPooling  (None, 16, 10, 32)       0         
 2D)                                                             
                                                                 
 conv2d_10 (Conv2D)          (None, 16, 10, 16)        2064      
                                                                 
 max_pooling2d_10 (MaxPoolin  (None, 8, 5, 16)         0         
 g2D)                                                            
                                            

In [20]:
# Experiment: three-stage conv/pool layout with 32/32/16 filters.
# Adam learning rate 0.01, early stopping on training loss (patience 10).
model = Sequential()
model.add(Conv2D(32, (2, 2), activation='relu', padding='same', input_shape=(32, 20, 1)))
model.add(MaxPooling2D((2, 2)))
for n_filters in (32, 16):
    model.add(Conv2D(n_filters, (2, 2), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(20, activation='softmax'))

train(model=model, patience_train=10, patience_val=None,
      learning_rate=.01, epochs=1000, batch_size=256)

patience_train=10, patience_val=None, learning_rate=0.01, epochs=1000, batch_size=256, cont_train=False, X_train.shape=(33566, 32, 20), y_train.shape=(33566,), X_val.shape=(4619, 32, 20), y_val.shape=(4619,)
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_12 (Conv2D)          (None, 32, 20, 32)        160       
                                                                 
 max_pooling2d_12 (MaxPoolin  (None, 16, 10, 32)       0         
 g2D)                                                            
                                                                 
 conv2d_13 (Conv2D)          (None, 16, 10, 32)        4128      
                                                                 
 max_pooling2d_13 (MaxPoolin  (None, 8, 5, 32)         0         
 g2D)                                                            
                                            

In [22]:
# Experiment: 32/32/32 conv/pool stages plus Dropout(0.1) in front of the
# 20-way softmax. Adam learning rate 0.001, early stopping on training loss
# (patience 10).
model = Sequential()
model.add(Conv2D(32, (2, 2), activation='relu', padding='same', input_shape=(32, 20, 1)))
model.add(MaxPooling2D((2, 2)))
for n_filters in (32, 32):
    model.add(Conv2D(n_filters, (2, 2), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dropout(.1))
model.add(Dense(20, activation='softmax'))

train(model=model, patience_train=10, patience_val=None,
      learning_rate=.001, epochs=1000, batch_size=256)

patience_train=10, patience_val=None, learning_rate=0.001, epochs=1000, batch_size=256, cont_train=False, X_train.shape=(33566, 32, 20), y_train.shape=(33566,), X_val.shape=(4619, 32, 20), y_val.shape=(4619,)
Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_18 (Conv2D)          (None, 32, 20, 32)        160       
                                                                 
 max_pooling2d_18 (MaxPoolin  (None, 16, 10, 32)       0         
 g2D)                                                            
                                                                 
 conv2d_19 (Conv2D)          (None, 16, 10, 32)        4128      
                                                                 
 max_pooling2d_19 (MaxPoolin  (None, 8, 5, 32)         0         
 g2D)                                                            
                                           

In [23]:
# Experiment: 32/32/32 conv/pool stages with Dropout raised to 0.15 in front
# of the 20-way softmax. Adam learning rate 0.001, early stopping on training
# loss (patience 10).
model = Sequential()
model.add(Conv2D(32, (2, 2), activation='relu', padding='same', input_shape=(32, 20, 1)))
model.add(MaxPooling2D((2, 2)))
for n_filters in (32, 32):
    model.add(Conv2D(n_filters, (2, 2), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dropout(.15))
model.add(Dense(20, activation='softmax'))

train(model=model, patience_train=10, patience_val=None,
      learning_rate=.001, epochs=1000, batch_size=256)

patience_train=10, patience_val=None, learning_rate=0.001, epochs=1000, batch_size=256, cont_train=False, X_train.shape=(33566, 32, 20), y_train.shape=(33566,), X_val.shape=(4619, 32, 20), y_val.shape=(4619,)
Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_21 (Conv2D)          (None, 32, 20, 32)        160       
                                                                 
 max_pooling2d_21 (MaxPoolin  (None, 16, 10, 32)       0         
 g2D)                                                            
                                                                 
 conv2d_22 (Conv2D)          (None, 16, 10, 32)        4128      
                                                                 
 max_pooling2d_22 (MaxPoolin  (None, 8, 5, 32)         0         
 g2D)                                                            
                                           

In [24]:
# Experiment: 32/32/32 conv/pool stages with Dropout(0.15), trained with a
# much smaller Adam learning rate (1e-5). Besides patience_train=10 this run
# also passes patience_val=5 to request a second early-stopping callback.
model = Sequential()
model.add(Conv2D(32, (2, 2), activation='relu', padding='same', input_shape=(32, 20, 1)))
model.add(MaxPooling2D((2, 2)))
for n_filters in (32, 32):
    model.add(Conv2D(n_filters, (2, 2), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dropout(.15))
model.add(Dense(20, activation='softmax'))

train(model=model, patience_train=10, patience_val=5,
      learning_rate=.00001, epochs=1000, batch_size=256)

patience_train=10, patience_val=5, learning_rate=1e-05, epochs=1000, batch_size=256, cont_train=False, X_train.shape=(33566, 32, 20), y_train.shape=(33566,), X_val.shape=(4619, 32, 20), y_val.shape=(4619,)
Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_24 (Conv2D)          (None, 32, 20, 32)        160       
                                                                 
 max_pooling2d_24 (MaxPoolin  (None, 16, 10, 32)       0         
 g2D)                                                            
                                                                 
 conv2d_25 (Conv2D)          (None, 16, 10, 32)        4128      
                                                                 
 max_pooling2d_25 (MaxPoolin  (None, 8, 5, 32)         0         
 g2D)                                                            
                                              

In [17]:
# Experiment: 32/32/32 conv/pool stages with Dropout(0.15), Adam learning
# rate 1e-4, patience_train=10 and patience_val=5.
model = Sequential()
model.add(Conv2D(32, (2, 2), activation='relu', padding='same', input_shape=(32, 20, 1)))
model.add(MaxPooling2D((2, 2)))
for n_filters in (32, 32):
    model.add(Conv2D(n_filters, (2, 2), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dropout(.15))
model.add(Dense(20, activation='softmax'))

train(model=model, patience_train=10, patience_val=5,
      learning_rate=.0001, epochs=1000, batch_size=256)

patience_train=10, patience_val=5, learning_rate=0.0001, epochs=1000, batch_size=256, cont_train=False, X_train.shape=(33566, 32, 20), y_train.shape=(33566,), X_val.shape=(4619, 32, 20), y_val.shape=(4619,)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 32, 20, 32)        160       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 16, 10, 32)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 16, 10, 32)        4128      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 8, 5, 32)         0         
 2D)                                                             
                                               

In [18]:
# Experiment: 32/32/32 conv/pool stages with Dropout(0.15), Adam learning
# rate 1e-4, and both patience settings relaxed to 15 epochs.
model = Sequential()
model.add(Conv2D(32, (2, 2), activation='relu', padding='same', input_shape=(32, 20, 1)))
model.add(MaxPooling2D((2, 2)))
for n_filters in (32, 32):
    model.add(Conv2D(n_filters, (2, 2), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dropout(.15))
model.add(Dense(20, activation='softmax'))

train(model=model, patience_train=15, patience_val=15,
      learning_rate=.0001, epochs=1000, batch_size=256)

patience_train=15, patience_val=15, learning_rate=0.0001, epochs=1000, batch_size=256, cont_train=False, X_train.shape=(33566, 32, 20), y_train.shape=(33566,), X_val.shape=(4619, 32, 20), y_val.shape=(4619,)
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_3 (Conv2D)           (None, 32, 20, 32)        160       
                                                                 
 max_pooling2d_3 (MaxPooling  (None, 16, 10, 32)       0         
 2D)                                                             
                                                                 
 conv2d_4 (Conv2D)           (None, 16, 10, 32)        4128      
                                                                 
 max_pooling2d_4 (MaxPooling  (None, 8, 5, 32)         0         
 2D)                                                             
                                            

## End