In [None]:
from pathlib import Path
from scipy.io import wavfile
import scipy.signal
import pandas as pd
from tqdm.auto import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import os
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

## Mount Drive

In [None]:
def is_running_on_colab():
    """Return True when the ``google.colab`` package can be imported, else False."""
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    else:
        return True


# Evaluate the probe once; the bare name on the last line shows it as cell output.
ON_COLAB = is_running_on_colab()
ON_COLAB

True

In [None]:
# Choose the folder that the intermediate files are read from.
if not ON_COLAB:
    # Outside Colab: relative path ../data/intermediate.
    intermediate_folder = Path('..', 'data', 'intermediate')
else:
    # On Colab: mount Google Drive first, then point at the project folder in it.
    from google.colab import drive

    drive.mount('/content/gdrive')
    intermediate_folder = Path('/content/gdrive/MyDrive/Temp/Speech recognition project')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Read files

In [None]:
# Load the training audio array from the intermediate folder and show its shape.
train_audio_file = intermediate_folder / 'train_main_1_sec_audio.npy'
X_train = np.load(train_audio_file)
X_train.shape

(33566, 16000)

In [None]:
# Load the validation audio array from the intermediate folder and show its shape.
val_audio_file = intermediate_folder / 'val_main_1_sec_audio.npy'
X_val = np.load(val_audio_file)
X_val.shape

(4619, 16000)

In [None]:
# X_test = np.load(intermediate_folder / 'test_main_1_sec_audio.npy')
# X_test.shape

In [None]:
def _read_label_column(csv_name):
    """Read a header-less CSV from ``intermediate_folder`` and return its column 0."""
    frame = pd.read_csv(intermediate_folder / csv_name, header=None, index_col=False)
    return frame[0]


y_train_labels = _read_label_column('train_main_1_sec_labels.csv')
y_val_labels = _read_label_column('val_main_1_sec_labels.csv')
# y_test_labels = _read_label_column('test_main_1_sec_labels.csv')
y_train_labels.shape, y_val_labels.shape
# y_train_labels.shape, y_val_labels.shape, y_test_labels.shape

((33566,), (4619,))

In [None]:
# Fit the label -> integer mapping on the training labels only, then apply the
# same mapping to both splits.
le = LabelEncoder().fit(y_train_labels)

y_train, y_val = (le.transform(labels) for labels in (y_train_labels, y_val_labels))
# y_test = le.transform(y_test_labels)
y_train.shape, y_val.shape
# y_train.shape, y_val.shape, y_test.shape

((33566,), (4619,))

In [None]:
# Number of training samples per encoded class, listed in class-id order.
train_class_counts = pd.Series(y_train).value_counts()
train_class_counts.sort_index()

0     1667
1     1655
2     1696
3     1662
4     1647
5     1683
6     1723
7     1630
8     1668
9     1650
10    1672
11    1687
12    1708
13    1727
14    1715
15    1672
16    1693
17    1591
18    1686
19    1734
dtype: int64

## RNN - 50

In [None]:
# One 50-unit LSTM reading a (16000, 1) sequence, followed by a 20-way softmax.
lstm = Sequential()
lstm.add(LSTM(50, input_shape=(16000, 1)))
lstm.add(Dense(20, activation='softmax'))
lstm.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 50)                10400     
                                                                 
 dense (Dense)               (None, 20)                1020      
                                                                 
Total params: 11,420
Trainable params: 11,420
Non-trainable params: 0
_________________________________________________________________


In [None]:
lstm.input_shape, lstm.output_shape

((None, 16000, 1), (None, 20))

In [None]:
# Stop training once val_loss has gone 5 epochs without improving.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
)

In [None]:
learning_rate = 0.001
adam = Adam(learning_rate=learning_rate)

# Targets are integer class ids (LabelEncoder output), hence the sparse loss.
lstm.compile(
    optimizer=adam,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# learning_rate = 0.001
# Add a trailing feature axis so the inputs match the model's (16000, 1) input shape.
train_inputs = X_train[:, :, None]
val_inputs = X_val[:, :, None]
lstm.fit(
    train_inputs,
    y_train,
    batch_size=128,
    epochs=20,
    validation_data=(val_inputs, y_val),
    callbacks=[early_stopping],
);

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# learning_rate = 0.005
# No visible cell changes the optimizer's rate from the value it was compiled
# with, so on a fresh Restart & Run All this run would not use the rate it is
# labelled with. Set it explicitly here, and put the compiled value back
# afterwards (also when the run is interrupted) so later cells are unaffected.
lstm.optimizer.learning_rate = 0.005
try:
    lstm.fit(X_train[:, :, None], y_train, batch_size=128, epochs=10,
             validation_data=(X_val[:, :, None], y_val), callbacks=[early_stopping])
finally:
    lstm.optimizer.learning_rate = learning_rate

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


In [None]:
# learning_rate = 0.001
# Continue training the same model; `[..., None]` appends the feature axis.
fit_kwargs = dict(batch_size=128, epochs=20, callbacks=[early_stopping])
lstm.fit(X_train[..., None], y_train, validation_data=(X_val[..., None], y_val), **fit_kwargs);

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20


## RNN - Bidirectional LSTM 70 + Fully connected 100

In [None]:
# Bidirectional LSTM (70 units per direction) -> 100-unit sigmoid layer -> 20-way softmax.
bilstm_layer = Bidirectional(LSTM(70), input_shape=(16000, 1))
hidden_layer = Dense(100, activation='sigmoid')
output_layer = Dense(20, activation='softmax')

lstm = Sequential([bilstm_layer, hidden_layer, output_layer])
lstm.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, 140)              40320     
 l)                                                              
                                                                 
 dense (Dense)               (None, 100)               14100     
                                                                 
 dense_1 (Dense)             (None, 20)                2020      
                                                                 
Total params: 56,440
Trainable params: 56,440
Non-trainable params: 0
_________________________________________________________________


In [None]:
lstm.input_shape, lstm.output_shape

((None, 16000, 1), (None, 20))

In [None]:
# Two stopping rules: one on the training loss (patience 4), one on the
# validation loss (patience 10); both ignore improvements smaller than 0.001.
stop_on_loss = EarlyStopping(monitor='loss', patience=4, min_delta=.001)
stop_on_val_loss = EarlyStopping(monitor='val_loss', patience=10, min_delta=.001)
callbacks = [stop_on_loss, stop_on_val_loss]

In [None]:
learning_rate = 0.001

# Same training setup as a dict: Adam, sparse cross-entropy on integer class
# ids, accuracy as the reported metric.
compile_options = {
    'optimizer': Adam(learning_rate=learning_rate),
    'loss': 'sparse_categorical_crossentropy',
    'metrics': ['accuracy'],
}
lstm.compile(**compile_options)

In [None]:
# learning_rate = 0.001
# Pass the `callbacks` list built for this model (early stopping on both loss
# and val_loss). `[early_stopping]` would instead reuse the single callback
# that was created for the first model and ignore that list.
lstm.fit(X_train[:, :, None], y_train, batch_size=128, epochs=20,
         validation_data=(X_val[:, :, None], y_val), callbacks=callbacks);

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20

KeyboardInterrupt: ignored

## RNN - 2 bidirectional LSTM + Fully connected 100

In [None]:
# Two stacked bidirectional LSTMs (32 units per direction each), then a 100-unit
# sigmoid layer and a 20-way softmax. The first recurrent layer returns its full
# sequence so the second one receives a 3-D input. Dropout lines stay disabled.
lstm = Sequential()
lstm.add(Bidirectional(LSTM(32, return_sequences=True), input_shape=(16000, 1)))
# lstm.add(Dropout(.5))
lstm.add(Bidirectional(LSTM(32)))
# lstm.add(Dropout(.5))
lstm.add(Dense(100, activation='sigmoid'))
# lstm.add(Dropout(.5))
lstm.add(Dense(20, activation='softmax'))
lstm.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, 16000, 64)        8704      
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               24832     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 100)               6500      
                                                                 
 dense_1 (Dense)             (None, 20)                2020      
                                                                 
Total params: 42,056
Trainable params: 42,056
Non-trainable params: 0
_________________________________________________________________


In [None]:
lstm.input_shape, lstm.output_shape

((None, 16000, 1), (None, 20))

In [None]:
# Rebinds `early_stopping` to a training-loss rule and adds a separate
# validation-loss rule; both ignore improvements smaller than 0.001.
early_stopping = EarlyStopping(
    monitor='loss',
    patience=4,
    min_delta=.001,
)
early_stopping_val = EarlyStopping(
    monitor='val_loss',
    patience=10,
    min_delta=.001,
)
callbacks = [early_stopping, early_stopping_val]

In [None]:
learning_rate = 0.001
optimizer = Adam(learning_rate=learning_rate)

# Sparse cross-entropy because the targets are integer class ids.
lstm.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# learning_rate = 0.001
# Pass the full `callbacks` list. `[early_stopping]` alone only watches the
# training loss and leaves the val_loss rule (`early_stopping_val`) unused.
lstm.fit(X_train[:, :, None], y_train, batch_size=64, epochs=20,
         validation_data=(X_val[:, :, None], y_val),
         callbacks=callbacks);

Epoch 1/20
 23/525 [>.............................] - ETA: 12:55:24 - loss: 3.0537 - accuracy: 0.0510

KeyboardInterrupt: ignored

## Report

In [None]:
# Predict on the validation set (inputs get a trailing feature axis), take the
# highest-scoring class per sample, and print per-class precision/recall/F1.
y_pred_lstm = lstm.predict(np.expand_dims(X_val, axis=-1))
y_pred_classes = np.argmax(y_pred_lstm, axis=1)
print(classification_report(y_val, y_pred_classes))

              precision    recall  f1-score   support

           0       0.01      0.00      0.01       247
           1       0.10      0.03      0.04       226
           2       0.07      0.05      0.06       224
           3       0.00      0.00      0.00       261
           4       0.08      0.05      0.06       231
           5       0.05      0.03      0.03       233
           6       0.00      0.00      0.00       215
           7       0.07      0.01      0.02       235
           8       0.00      0.00      0.00       232
           9       0.04      0.09      0.06       234
          10       0.00      0.00      0.00       209
          11       0.03      0.03      0.03       235
          12       0.06      0.12      0.08       240
          13       0.07      0.16      0.10       241
          14       0.02      0.00      0.01       225
          15       0.05      0.01      0.02       225
          16       0.05      0.14      0.07       214
          17       0.06    