### Common Voice
##### 500 hours of speech recordings, with speaker demographics

*  1. The purpose of this notebook is to explore the demographics data and filter audio files
*  2. Convert MP3 audio files to WAV

##### Author - James, Aalok, Reshma


The corpus is split into several parts for your convenience. 
The subsets with “valid” in their name are audio clips that have had at least 2 people listen to them, and the majority of those listeners say the audio matches the text. 
The subsets with “invalid” in their name are clips that have had at least 2 listeners, and the majority say the audio does not match the clip.
All other clips, i.e. those with fewer than 2 votes, or those that have equal valid and invalid votes, have “other” in their name.

* The “valid” and “other” subsets are further divided into 3 groups:
* dev - for development and experimentation
* train - for use in speech recognition training
* test - for testing word error rate

Each row of a csv file represents a single audio clip, and contains the following information:

* filename - relative path of the audio file
* text - supposed transcription of the audio
* up_votes - number of people who said audio matches the text
* down_votes - number of people who said audio does not match text
* age - age of the speaker, if the speaker reported it
* gender - gender of the speaker, if the speaker reported it
* accent - accent of the speaker, if the speaker reported it


* Age details
    * teens: '< 19'
    * twenties: '19 - 29'
    * thirties: '30 - 39'
    * fourties: '40 - 49'
    * fifties: '50 - 59'
    * sixties: '60 - 69'
    * seventies: '70 - 79'
    * eighties: '80 - 89'
    * nineties: '> 89'
    
    
* Gender details
    * male
    * female
    * other


* Accent details
    * us: 'United States English'
    * australia: 'Australian English'
    * england: 'England English'
    * canada: 'Canadian English'
    * philippines: 'Filipino'
    * hongkong: 'Hong Kong English'
    * indian: 'India and South Asia (India, Pakistan, Sri Lanka)'
    * ireland: 'Irish English'
    * malaysia: 'Malaysian English'
    * newzealand: 'New Zealand English'
    * scotland: 'Scottish English'
    * singapore: 'Singaporean English'
    * southatlandtic: 'South Atlantic (Falkland Islands, Saint Helena)'
    * african: 'Southern African (South Africa, Zimbabwe, Namibia)'
    * wales: 'Welsh English'
    * bermuda: 'West Indies and Bermuda (Bahamas, Bermuda, Jamaica, Trinidad)'

In [1]:
import playsound
import ipywidgets as widgets
from ipywidgets import HBox, VBox
import matplotlib.pyplot as plt
from IPython.display import display
import tensorflow.keras
import collections
import matplotlib.pyplot as plt
from matplotlib import gridspec
%matplotlib inline
import seaborn as sns
import keras

ModuleNotFoundError: No module named 'playsound'

In [None]:
#util functions
import logging
import os
from collections import defaultdict
from pathlib import Path
from scipy.ndimage.morphology import binary_dilation
from typing import Optional, Union
import webrtcvad
import struct
from scipy import stats
import seaborn as sn
import matplotlib.pyplot as plt

import librosa
import pandas as pd
import numpy as np
from tqdm import tqdm

from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
from python_speech_features import fbank
import scipy.io.wavfile as wav

import itertools

from keras.models import Sequential
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import LSTM, Dense, Dropout, Flatten,LeakyReLU, Input, SpatialDropout1D, Bidirectional
from keras.models import Sequential
from keras.optimizers import Adam, SGD
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing import sequence
from keras.utils import np_utils
import tensorflow.keras.backend as K
from sklearn.metrics import classification_report, confusion_matrix
import time
# Tag string; not referenced anywhere else in the visible cells — presumably a run/date label (TODO confirm).
date     = '1003'
np.random.seed(1337)  # for reproducibility

# from constants import SAMPLE_RATE, NUM_FBANKS
# from utils import find_files, ensures_dir

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
# This sets the granularity of the VAD. Should not need to be changed.
vad_window_length = 30  # In milliseconds
# Number of frames to average together when performing the moving average smoothing.
# The larger this value, the larger the VAD variations must be to not get smoothed out.
vad_moving_average_width = 8
# Maximum number of consecutive silent frames a segment can have.
vad_max_silence_length = 6

## Audio volume normalization
# Target level in dBFS; no function in the visible cells reads it.
audio_norm_target_dBFS = -30
# Largest positive value of a signed 16-bit integer; used to scale float audio to int16 PCM.
int16_max              = (2 ** 15) - 1

def trim_long_silences(wav, sampling_rate=16000):
    """
    Remove long stretches of silence from a waveform using WebRTC voice activity detection.

    The waveform is cut into windows of ``vad_window_length`` milliseconds and every window
    is classified as speech / non-speech. The flags are smoothed with a moving average over
    ``vad_moving_average_width`` windows, and the voiced regions are then dilated with a
    kernel of ``vad_max_silence_length + 1`` windows so that short pauses are kept.

    :param wav: the raw waveform as a numpy array of floats (it is scaled by ``int16_max``,
                so samples are expected to lie in [-1, 1])
    :param sampling_rate: sampling rate of ``wav`` in Hz; defaults to 16000, the rate the
                          rest of this notebook assumes
    :return: the same waveform with silences trimmed away (length <= original wav length)
    """
    # Number of samples in one VAD window (module-level window length is in milliseconds)
    samples_per_window = (vad_window_length * sampling_rate) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]
    if len(wav) == 0:
        # Shorter than a single window: nothing to classify, nothing to trim
        return wav

    # Convert the float waveform to 16-bit mono PCM. ``tobytes`` yields the same
    # native-endian bytes as struct.pack("%dh", ...) without unpacking every sample
    # into a Python argument.
    pcm_wave = np.round(wav * int16_max).astype(np.int16).tobytes()

    # Perform voice activation detection, one flag per window (2 bytes per sample)
    vad = webrtcvad.Vad(mode=3)
    voice_flags = np.array([
        vad.is_speech(pcm_wave[window_start * 2:(window_start + samples_per_window) * 2],
                      sample_rate=sampling_rate)
        for window_start in range(0, len(wav), samples_per_window)
    ])

    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width

    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    # ``np.bool`` was removed in NumPy 1.24; the builtin ``bool`` is the supported spelling
    audio_mask = np.round(audio_mask).astype(bool)

    # Dilate the voiced regions, then expand the per-window mask to a per-sample mask
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)

    return wav[audio_mask]

def normalize_frames(m, Scale=True):
    '''
    Centre the values of m along axis 0, optionally scaling to unit variance
    :param m (numpy array): values to normalize; statistics are taken along axis 0
    :param Scale (bool): when True also divide by the standard deviation along axis 0
                         (plus 2e-12 to avoid division by zero)
    :return (numpy array): the normalized array
    '''
    centered = m - np.mean(m, axis=0)
    if not Scale:
        return centered
    return centered / (np.std(m, axis=0) + 2e-12)

def get_wav(filename, sr=16000):
    '''
    Load a single audio file from disk as a mono float waveform
    :param filename (str): path of the audio file to load
    :param sr (int): sampling rate to resample to; defaults to 16000 because the VAD and
                     feature extraction in this notebook are hard-wired to 16 kHz
                     (librosa's own default would be 22050 Hz)
    :return (numpy array): the waveform samples
    '''
    wav, _ = librosa.load(filename, sr=sr)
    return wav

def _kaldi_features(wav, dummies_, n_filt, n_mfcc=13, samplerate=16000):
    '''
    Build the per-frame feature matrix shared by the 32- and 64-filter variants
    :param wav (numpy array): trimmed waveform
    :param dummies_ (iterable): one-hot demographic flags appended to every frame
    :param n_filt (int): number of filter-bank channels
    :param n_mfcc (int): number of cepstral coefficients
    :param samplerate (int): sampling rate of wav in Hz
    :return (numpy array): one row per frame with columns
                           [mfcc, filter_banks, delta_1, delta_2, dummies]
    :raises ValueError: if wav is empty (previously this surfaced as an UnboundLocalError)
    '''
    if len(wav) == 0:
        raise ValueError("cannot extract features from an empty waveform")

    mfcc_                   = mfcc(wav, samplerate=samplerate, winlen=0.025, winstep=0.01, numcep=n_mfcc)
    filter_banks, _energies = fbank(wav, samplerate=samplerate, nfilt=n_filt)
    # Convert filter-bank energies to dB, flooring at 1e-5 to keep the log finite
    filter_banks            = 20 * np.log10(np.maximum(filter_banks, 1e-5))
    delta_1                 = delta(filter_banks, N=1)
    delta_2                 = delta(delta_1, N=1)

    filter_banks = normalize_frames(filter_banks, Scale=True)
    delta_1      = normalize_frames(delta_1, Scale=True)
    delta_2      = normalize_frames(delta_2, Scale=True)
    # Repeat the demographic flags once per frame so they can be stacked column-wise
    dummies      = np.tile(np.asarray(list(dummies_)), (len(mfcc_), 1))
    return np.hstack([mfcc_, filter_banks, delta_1, delta_2, dummies])

def get_kaldi_features_pred32(wav, dummies_):
    '''
    Get frame-level features (13 MFCCs + 32 log filter banks + their first and second deltas)
    :param wav (numpy array): trimmed waveform
    :param dummies_ (iterable): one-hot demographic flags appended to every frame
    :return (numpy array): array of (mfcc, filter_banks, delta_1, delta_2, dummies), one row per frame
    :raises ValueError: if wav is empty
    '''
    return _kaldi_features(wav, dummies_, n_filt=32)

def get_kaldi_features_pred64(wav, dummies_):
    '''
    Get frame-level features (13 MFCCs + 64 log filter banks + their first and second deltas)
    :param wav (numpy array): trimmed waveform
    :param dummies_ (iterable): one-hot demographic flags appended to every frame
    :return (numpy array): array of (mfcc, filter_banks, delta_1, delta_2, dummies), one row per frame
    :raises ValueError: if wav is empty
    '''
    return _kaldi_features(wav, dummies_, n_filt=64)

def convo_f1_score(y_true, y_pred): #taken from old keras source code
    '''
    F1 score computed with Keras backend ops, usable as a Keras metric
    :param y_true: tensor of ground-truth labels
    :param y_pred: tensor of predictions
    :return: tensor holding 2 * precision * recall / (precision + recall), epsilon-stabilised
    '''
    eps = K.epsilon()

    def rounded_total(tensor):
        # Clip to [0, 1], round to hard 0/1 decisions and count the ones
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits            = rounded_total(y_true * y_pred)
    actual_positive = rounded_total(y_true)
    called_positive = rounded_total(y_pred)

    precision = hits / (called_positive + eps)
    recall    = hits / (actual_positive + eps)
    return 2*(precision*recall)/(precision+recall+eps)

In [None]:
# --- Input widgets for the demo UI ---

# File picker restricted to a single .wav file
upload_audio = widgets.FileUpload(
    description='Select a file',
    accept='.wav',
    multiple=False,
)

# Speaker gender; the selected value is used as a key of the one-hot demographic dict
drop_gender = widgets.Dropdown(
    description='Gender:',
    options=['male', 'female'],
    value='female',
    disabled=False,
)

# Speaker age bracket; the selected value is used as a key of the one-hot demographic dict
drop_age = widgets.Dropdown(
    description='Age:',
    options=['eighties', 'fifties','fourties', 'seventies', 'sixties', 'teens', 'thirties', 'twenties'],
    value='teens',
    disabled=False,
)

# Action buttons; their click handlers are attached in a later cell
button_predict = widgets.Button(description='Predict')
button_play = widgets.Button(description='Play Audio')

######################################## Reshma CNN1D #######################################################
# NOTE: machine-specific location of the trained models; pass ``model_path`` to the
# functions below to run the demo on another machine.
_MODEL_DIR = 'C:/Users/wanyi/Desktop/Uchicago/Deep Learning/Group Project/Fixed_Window/Reshma_final_code/Demo/models/'

def _count_frame_predictions(model, features, class_names, progress_message=None):
    '''
    Run a model on every frame and tally the predicted classes
    :param model: loaded Keras model exposing ``predict``
    :param features (numpy array): 2-D frame-level feature matrix (frames x features)
    :param class_names (list): label for each output index of the model
    :param progress_message (str): optional message printed once prediction has finished
    :return: (DataFrame with columns 'Accents' and 'Counts', most frequently predicted label)
    :raises ValueError: if the model produced no frame predictions
    '''
    n_frames, n_features = features.shape
    # The models take a trailing channel axis of size 1
    model_input   = np.reshape(features, (n_frames, n_features, 1))
    frame_classes = np.argmax(model.predict(model_input), axis=1)
    if progress_message is not None:
        print(progress_message)

    votes = collections.Counter(frame_classes)
    if not votes:
        raise ValueError("the model produced no frame predictions; is the audio empty?")
    results    = {class_names[k]: v for k, v in votes.items()}
    results_df = pd.DataFrame(list(results.items()), columns=['Accents', 'Counts'])
    # max() returns the first label with the highest count, matching a stable descending sort
    top_accent = max(results.items(), key=lambda item: item[1])[0]
    return results_df, top_accent

######################################## Reshma CNN1D #######################################################
def model32(test_trim, dummy_dict, model_path=_MODEL_DIR + 'nn_conv1d_32.h5'):
    '''
    Predict one of five accents per frame with the 32-filter Conv1D model
    :param test_trim (numpy array): silence-trimmed waveform
    :param dummy_dict (dict): one-hot demographic flags; its values are appended to every frame
    :param model_path (str): location of the saved .h5 model
    :return: (DataFrame of per-accent frame counts, most frequently predicted accent)
    '''
    features = get_kaldi_features_pred32(test_trim, dummy_dict.values())
    model    = tensorflow.keras.models.load_model(model_path, custom_objects={'convo_f1_score': convo_f1_score})
    return _count_frame_predictions(model, features,
                                    ['US', 'Australia', 'England', 'Indian', 'Canada'],
                                    "I am working hard to get the prediction!!!")

######################################## Reshma BLSTM #######################################################
def model32_blstm(test_trim, dummy_dict, model_path=_MODEL_DIR + 'nn_blstm.h5'):
    '''
    Predict one of five accents per frame with the bidirectional LSTM model
    :param test_trim (numpy array): silence-trimmed waveform
    :param dummy_dict (dict): one-hot demographic flags; its values are appended to every frame
    :param model_path (str): location of the saved .h5 model
    :return: (DataFrame of per-accent frame counts, most frequently predicted accent)
    '''
    features = get_kaldi_features_pred32(test_trim, dummy_dict.values())
    model    = tensorflow.keras.models.load_model(model_path, custom_objects={'convo_f1_score': convo_f1_score})
    return _count_frame_predictions(model, features,
                                    ['US', 'Australia', 'England', 'Indian', 'Canada'])


########################################## Aalok ########################################################
def model32_alk(test_trim, dummy_dict, model_path=_MODEL_DIR + 'aalok_32.h5'):
    '''
    Predict US vs. Non-US per frame with the 32-filter binary model
    :param test_trim (numpy array): silence-trimmed waveform
    :param dummy_dict (dict): one-hot demographic flags; its values are appended to every frame
    :param model_path (str): location of the saved .h5 model
    :return: (DataFrame of per-class frame counts, most frequently predicted class)
    '''
    features = get_kaldi_features_pred32(test_trim, dummy_dict.values())
    # NOTE(review): this model is loaded through standalone ``keras`` while the others use
    # ``tensorflow.keras`` — kept as in the original; confirm whether that is intentional.
    model    = keras.models.load_model(model_path, custom_objects={'convo_f1_score': convo_f1_score})
    return _count_frame_predictions(model, features,
                                    ['US', "Non-US"],
                                    "Yes, I am still running!!!")

########################################### James #############################################

def model64(test_trim, dummy_dict, model_path=_MODEL_DIR + 'nn_conv1d_64.h5'):
    '''
    Predict one of four accent regions per frame with the 64-filter Conv1D model
    :param test_trim (numpy array): silence-trimmed waveform
    :param dummy_dict (dict): one-hot demographic flags; its values are appended to every frame
    :param model_path (str): location of the saved .h5 model
    :return: (DataFrame of per-region frame counts, most frequently predicted region)
    '''
    features = get_kaldi_features_pred64(test_trim, dummy_dict.values())
    model    = tensorflow.keras.models.load_model(model_path, custom_objects={'convo_f1_score': convo_f1_score})
    return _count_frame_predictions(model, features,
                                    ['North America', 'Europe', 'Indian', 'Oceania'],
                                    "Almost there, be patient!!!")
    
def my_pred(b=None, examples_dir='C:/Users/wanyi/Desktop/Uchicago/Deep Learning/Group Project/Fixed_Window/Reshma_final_code/Demo/Examples/'):
    '''
    Run all four accent models on the selected audio file and plot their frame-level votes
    :param b: unused; lets the function be called directly or from a button callback
    :param examples_dir (str): directory holding the audio files; the uploaded file is looked
                               up here by name (the uploaded bytes themselves are not read)
    :return: None; shows a figure with one bar chart per model. Prints a hint and returns
             early when no file has been selected.
    '''
    uploads = upload_audio.value
    if not uploads:
        print("Please select a .wav file before predicting.")
        return
    # ipywidgets 7 exposes a dict keyed by file name, ipywidgets 8 a tuple of dicts with a 'name' key
    first_upload = next(iter(uploads))
    audio_file   = first_upload if isinstance(first_upload, str) else first_upload['name']

    # One-hot demographic flags; key order defines the column order appended to every frame
    dummy_dict = {'female':0, 'male':0, 'eighties':0, 'fifties':0,'fourties':0, 'seventies':0, 'sixties':0, 'teens':0, 'thirties':0, 'twenties':0}
    dummy_dict[drop_gender.value] = 1
    dummy_dict[drop_age.value]    = 1

    test_wav  = get_wav(os.path.join(examples_dir, audio_file)) #get wav file
    test_trim = trim_long_silences(test_wav) # trim silences

    results_32, accent_32             = model32(test_trim, dummy_dict)
    results_64, accent_64             = model64(test_trim, dummy_dict)
    results_32_alk, accent_32_alk     = model32_alk(test_trim, dummy_dict)
    results_32_blstm, accent_32_blstm = model32_blstm(test_trim, dummy_dict)

    # (results, title prefix, winning label, bar colour); None keeps matplotlib's default colour
    panels = [
        (results_32,       'Top5Accent CNN Model is:',   accent_32,       None),
        (results_32_blstm, 'Top5Accent BLSTM Model is:', accent_32_blstm, "C3"),
        (results_32_alk,   'US/Non-US Model CNN is:',    accent_32_alk,   'C1'),
        (results_64,       'Continent Model CNN is:',    accent_64,       'C2'),
    ]
    fig, axs = plt.subplots(1, 4, figsize=(18,4))
    for ax, (results, title, accent, color) in zip(axs, panels):
        ax.bar(results["Accents"].values, results["Counts"].values, color=color)
        ax.set_title(title + accent)

    plt.show()


In [None]:
# Display the current value of the upload widget (handy for checking what was selected)
upload_audio.value

In [None]:
@button_predict.on_click
def pred_button_clicked(b):
    '''Click handler of the "Predict" button: run the prediction pipeline.'''
    my_pred()
        
def listen_to_speech(b=None, examples_dir='C:/Users/wanyi/Desktop/Uchicago/Deep Learning/Group Project/Fixed_Window/Reshma_final_code/Demo/Examples/'):
    '''
    Play the selected audio file through the default audio device
    :param b: unused; lets the function be called directly or from a button callback
    :param examples_dir (str): directory holding the audio files; the uploaded file is looked
                               up here by name
    :return: None; prints a hint and returns early when no file has been selected
    '''
    uploads = upload_audio.value
    if not uploads:
        print("Please select a .wav file before playing.")
        return
    # ipywidgets 7 exposes a dict keyed by file name, ipywidgets 8 a tuple of dicts with a 'name' key
    first_upload = next(iter(uploads))
    audio_file   = first_upload if isinstance(first_upload, str) else first_upload['name']
    # Second argument True makes playsound block until playback has finished
    playsound.playsound(os.path.join(examples_dir, audio_file), True)

@button_play.on_click
def play_my_audio(b):
    '''Click handler of the "Play Audio" button: play the selected file.'''
    listen_to_speech()
    
# Single row of controls: pick a file, play it, choose demographics, predict
tab1 = HBox([
    upload_audio,
    button_play,
    drop_gender,
    drop_age,
    button_predict,
])

In [None]:
# Only the "Load and Predict" tab exists; the second tab (tab2) is not built in this notebook.
tab = widgets.Tab(children=[tab1])#, tab2
tab.set_title(0, 'Load and Predict')
# Titling index 1 without a second child raises IndexError on ipywidgets 8, so the
# 'Record and Predict' title is only set once tab2 is added to ``children``:
# tab.set_title(1, 'Record and Predict')
VBox(children=[tab])