In [None]:
import importlib

if importlib.util.find_spec('librosa') is None:
  ! pip install --user librosa
else:
  print('librosa already installed')

if importlib.util.find_spec('gradio') is None:
  ! pip install gradio
else:
  print('gradio already installed')

Collecting gradio
  Downloading gradio-3.37.0-py3-none-any.whl (19.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m85.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.1.0-py3-none-any.whl (14 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.100.0-py3-none-any.whl (65 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.7/65.7 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.1.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client>=0.2.10 (from gradio)
  Downloading gradio_client-0.2.10-py3-none-any.whl (288 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.0/289.0 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx (from gradio)
  Downloading httpx-0.24.1-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from pathlib import Path
# from scipy.io import wavfile
# import scipy.signal
import pandas as pd
from tqdm.auto import tqdm
# import seaborn as sns
# import matplotlib.pyplot as plt
# from collections import Counter
import numpy as np
# import os
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score, classification_report
# from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Dropout, Conv2D, MaxPooling2D, Flatten, BatchNormalization, Reshape
from tensorflow.keras.models import Sequential
# from sklearn.preprocessing import LabelEncoder
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.callbacks import EarlyStopping
# from tensorflow.keras.regularizers import l1, l2
# from transformers import AutoProcessor, TFWav2Vec2Model
# from tensorflow.keras import mixed_precision
from sklearn.metrics import classification_report
import json
import librosa
import gradio as gr

## Mount Drive

In [None]:
def is_running_on_colab():
    """Report whether the notebook is executing inside Google Colab.

    Returns:
        bool: True when the ``google.colab`` package can be imported,
        False when that import raises ``ImportError``.
    """
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    else:
        return True

ON_COLAB = is_running_on_colab()
ON_COLAB

True

In [None]:
# Choose the folder that holds the model checkpoint, the test data and the label map.
if not ON_COLAB:
  # Local run: resolve the data folder relative to the notebook's working directory.
  intermediate_folder = Path('..', 'data', 'intermediate')
else:
  # Colab run: mount Google Drive and read everything from the project folder there.
  from google.colab import drive
  drive.mount('/content/gdrive')
  # Alternative location used previously: '/content/gdrive/MyDrive/Temp/Speech recognition project'
  intermediate_folder = Path('/content/gdrive/MyDrive/Colab Notebooks/Speech recognition')

Mounted at /content/gdrive


## Define model and load weights

In [None]:
# Small CNN over the MFCC "image" of a clip: three identical
# Conv2D(32, 2x2, relu, same) + MaxPooling2D(2x2) stages, then a
# dropout-regularised softmax classifier over the 20 output classes.
cnn_layers = []
for stage in range(3):
    # Only the very first layer declares the input shape (32, 20, 1).
    first_layer_kwargs = {'input_shape': (32, 20, 1)} if stage == 0 else {}
    cnn_layers.append(Conv2D(32, (2, 2), activation='relu', padding='same', **first_layer_kwargs))
    cnn_layers.append(MaxPooling2D((2, 2)))
cnn_layers.extend([Flatten(),
                   Dropout(.175),
                   Dense(20, activation='softmax')])

model = Sequential(cnn_layers)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 32, 20, 32)        160       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 16, 10, 32)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 16, 10, 32)        4128      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 8, 5, 32)         0         
 2D)                                                             
                                                                 
 conv2d_2 (Conv2D)           (None, 8, 5, 32)          4128      
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 4, 2, 32)         0

In [None]:
# Load the weights stored in the `best_model_mfcc_cnn.ckp` checkpoint (inside
# `intermediate_folder`) into the model defined above.
model.load_weights(intermediate_folder / 'best_model_mfcc_cnn.ckp')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7bb5180a5e70>

## Read data

In [None]:
# Test clips as raw waveforms; the recorded output shows 4689 clips of 16000 samples each.
X_test = np.load(intermediate_folder / 'test_main_1_sec_audio.npy')
# The array is kept in its stored dtype here; `predict_with_expected` casts each
# clip to float right before MFCC extraction.
X_test.shape

(4689, 16000)

In [None]:
# The label file has no header row; its first column holds one label per test clip.
test_labels_frame = pd.read_csv(
    intermediate_folder / 'test_main_1_sec_labels.csv',
    header=None,
    index_col=False,
)
y_test_labels = test_labels_frame[0]
y_test_labels.shape

(4689,)

In [None]:
# Mapping from spoken word to integer class id, stored as JSON next to the data.
labels_path = intermediate_folder / 'labels.json'
labels_dict = json.loads(labels_path.read_text())
labels_dict

{'down': 0,
 'eight': 1,
 'five': 2,
 'four': 3,
 'go': 4,
 'left': 5,
 'nine': 6,
 'no': 7,
 'off': 8,
 'on': 9,
 'one': 10,
 'right': 11,
 'seven': 12,
 'six': 13,
 'stop': 14,
 'three': 15,
 'two': 16,
 'up': 17,
 'yes': 18,
 'zero': 19}

In [None]:
# Encode the string test labels as integer class ids via `labels_dict`.
# `y_test_labels` is deliberately NOT deleted afterwards: removing it made this
# cell fail with a NameError when re-run without re-reading the CSV first.
y_test = np.array([labels_dict[lbl] for lbl in y_test_labels])

## Prediction functions

In [None]:
# Normalisation constants: the prediction functions feed (mfcc - MEAN) / STD to the model.
# NOTE(review): presumably the mean/std of the training-set MFCC features — confirm
# against the training notebook before changing them.
MEAN = 33.06522526955586
STD = 142.73227249991973
MEAN, STD

(33.06522526955586, 142.73227249991973)

In [None]:
# Sample rate (Hz) passed to librosa for MFCC extraction; `predict` resamples
# audio recorded at any other rate to this value.
SAMPLE_RATE = 16000

In [None]:
# Inverse lookup (class id -> word) used to turn model output indices back into words.
labels_dict_reversed = dict(zip(labels_dict.values(), labels_dict.keys()))
labels_dict_reversed

{0: 'down',
 1: 'eight',
 2: 'five',
 3: 'four',
 4: 'go',
 5: 'left',
 6: 'nine',
 7: 'no',
 8: 'off',
 9: 'on',
 10: 'one',
 11: 'right',
 12: 'seven',
 13: 'six',
 14: 'stop',
 15: 'three',
 16: 'two',
 17: 'up',
 18: 'yes',
 19: 'zero'}

In [None]:
def predict_with_expected(data, expected):
  """Classify one raw clip and print the predicted class next to the expected one.

  Args:
    data: 1-D array with the audio samples of a single clip (cast to float here).
    expected: class id the clip is labelled with.

  Prints a single line with the arg-max class, the expected class and whether
  they match. Nothing is returned.
  """
  mfcc = librosa.feature.mfcc(y=data.astype(float), sr=SAMPLE_RATE)
  # Swap the two MFCC axes, then add a leading batch axis and a trailing channel axis.
  batch = mfcc.T[np.newaxis, :, :, np.newaxis]
  batch = (batch - MEAN) / STD
  predicted = model.predict(batch, verbose=0).argmax(axis=1)[0]
  print(f'Largest class: {predicted}, expected: {expected}, same: {predicted == expected}')

In [None]:
# Spot-check the model on ten randomly drawn test clips (indices may repeat).
sample_indices = np.random.randint(len(X_test), size=10)
for sample_idx in sample_indices:
  predict_with_expected(X_test[sample_idx], y_test[sample_idx])

Largest class: 2, expected: 2, same: True
Largest class: 17, expected: 17, same: True
Largest class: 2, expected: 9, same: False
Largest class: 5, expected: 5, same: True
Largest class: 5, expected: 5, same: True
Largest class: 16, expected: 16, same: True
Largest class: 1, expected: 1, same: True
Largest class: 13, expected: 13, same: True
Largest class: 4, expected: 4, same: True
Largest class: 0, expected: 0, same: True


In [None]:
def softmax(x):
    """Return the softmax of the scores in ``x``, normalised along axis 0.

    The global maximum of ``x`` is subtracted before exponentiating so that
    large scores do not overflow; this shift does not change the result.
    """
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    normaliser = exponentials.sum(axis=0)
    return exponentials / normaliser

In [None]:
def _one_second_mono(audio, sample_rate):
  """Return `audio` as a mono float array of exactly `sample_rate` samples (one second)."""
  audio = np.asarray(audio)
  if audio.ndim > 1:
    # Multi-channel clips arrive as (samples, channels); average the channels
    # so truncation/padding below operate on the time axis only.
    audio = audio.mean(axis=1)
  if len(audio) > sample_rate:
    audio = audio[:sample_rate]
  else:
    audio = np.pad(audio, (0, sample_rate - len(audio)))
  return audio.astype(float)


def predict(data_mic, data_upload):
  """Classify the spoken word in a recorded or uploaded clip (Gradio callback).

  Args:
    data_mic: `(sample_rate, samples)` tuple from the microphone input, or None.
    data_upload: `(sample_rate, samples)` tuple from the upload input, or None.
      It is only used when `data_mic` is None.

  Returns:
    A `(word, top3)` tuple: the most probable word and a DataFrame with the
    columns `certainty` (percent, rounded to 2 decimals) and `word` for the
    three most probable words. Returns `("", None)` when neither input is set.
  """
  data = data_mic if data_mic is not None else data_upload
  if data is None:
    return "", None
  sample_rate, audio = data

  # Cut or zero-pad to one second at the clip's own rate, then bring it to the
  # rate the MFCC features are computed at.
  audio = _one_second_mono(audio, sample_rate)
  if sample_rate != SAMPLE_RATE:
    audio = librosa.resample(y=audio, orig_sr=sample_rate, target_sr=SAMPLE_RATE)

  # Same preprocessing as `predict_with_expected`: MFCCs with the two axes
  # swapped, plus a leading batch axis and a trailing channel axis.
  mfcc = librosa.feature.mfcc(y=audio, sr=SAMPLE_RATE)
  data_transformed = np.expand_dims(np.expand_dims(mfcc.transpose(1, 0), axis=-1), axis=0)
  data_transformed = (data_transformed - MEAN) / STD
  y_pred = model.predict(data_transformed, verbose=0)

  # The model's last layer is already a softmax, so `y_pred` holds
  # probabilities. Applying softmax a second time flattened them (a fully
  # confident prediction was reported as 12.5% vs 4.61% for every other word).
  bar_data = pd.DataFrame({'certainty': (y_pred[0] * 100).round(2),
                           'word': [labels_dict_reversed[ind] for ind in range(len(labels_dict_reversed))]}
                           ).sort_values(by='certainty', ascending=False)

  return labels_dict_reversed[y_pred.argmax(axis=1)[0]], bar_data.iloc[:3]

## Gradio

In [None]:
# HTML shown under the title: usage hint plus the 20 words the model knows.
DEMO_DESCRIPTION = (
    "<h3>Record or upload a file with one of the following words. When recording, try to record around 1 second audio.</h3>"
    "<table><tr>"
    "<td valign='top'><h3>Numbers:</h3><p>zero, one, two, three, four, five, six, seven, eight, nine</p></td>"
    "<td valign='top'><h3>Additional words:</h3><p>yes, no, down, up, left, right, go, stop, on, off</p></td>"
    "</tr></table>"
)

# Two audio inputs (microphone and file upload) feed `predict`; its outputs are
# the most probable word and a bar plot of the top certainties. `live=True`
# re-runs the prediction whenever an input changes.
demo = gr.Interface(fn=predict,
                    inputs=[gr.components.Audio(source="microphone", autoplay=True, show_label=False),
                            gr.components.Audio(source="upload", autoplay=True, show_label=False)],
                    outputs=[gr.components.Textbox(label='Result: Most probable word:', show_label=True),
                             gr.components.BarPlot(x='word', y='certainty', vertical=False, title='Result: Certainty per word', y_lim=[0, 100], width=500, show_label=False)],
                    live=True,
                    allow_flagging='never',
                    # The clips are audio, not text; also fixes the "sniplets" typo.
                    title='Classification of 1 second audio snippets',
                    description=DEMO_DESCRIPTION)

# debug=True keeps the cell running so errors and logs from `predict` stay visible.
demo.launch(debug=True)



Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

Before sample_rate=16000, len(audio)=16000
After sample_rate=16000, len(audio)=16000
Shape on input to model data_transformed.shape=(1, 32, 20, 1)
y_pred.shape=(1, 20)
    certainty   word
12      12.50  seven
0        4.61   down
1        4.61  eight
18       4.61    yes
17       4.61     up
16       4.61    two
15       4.61  three
14       4.61   stop
13       4.61    six
11       4.61  right
10       4.61    one
9        4.61     on
8        4.61    off
7        4.61     no
6        4.61   nine
5        4.61   left
4        4.61     go
3        4.61   four
2        4.61   five
19       4.61   zero
