# Lost in Translation
## Classifying Speech Language using Deep Learning

### Part 4 - TensorFlow

#### Import libraries

In [40]:
import os
import numpy as np
import pandas as pd

import IPython.display as ipd
import librosa
import librosa.display
from scipy.io import wavfile

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Convolution2D, Conv2D, MaxPooling2D, GlobalAveragePooling2D, Flatten

#### Load data

In [3]:
# Load the pre-computed feature table (one feature vector and one language label per row).
# NOTE(review): read_pickle can execute arbitrary code while unpickling; only load
# df_all.pkl if it was produced by a trusted step of this project — confirm its origin.
df = pd.read_pickle('df_all.pkl')

In [4]:
# Show the loaded frame: a 'features' column of numeric vectors and a 'label' column of language names.
df

Unnamed: 0,features,label
0,"[-328.1553, 33.20999, -14.629965, 25.129225, -...",chinese
1,"[-317.4969, 33.81309, -23.309122, 31.999235, -...",chinese
2,"[-336.15854, 35.530407, -25.493, 18.644785, -2...",chinese
3,"[-333.79565, 31.953947, -13.206911, 22.866724,...",chinese
4,"[-354.22464, 45.220486, -17.17353, 27.299313, ...",chinese
...,...,...
64288,"[-351.8527, 107.06704, -3.2553566, 47.231853, ...",spanish
64289,"[-367.3681, 112.8195, -10.63875, 28.11285, 0.3...",spanish
64290,"[-359.3411, 113.6945, -2.7332597, 25.764277, 2...",spanish
64291,"[-327.3389, 131.14326, -6.5025754, 38.206024, ...",spanish


#### Get feature matrix

In [5]:
# Map each language name to an integer class id for use as the training target.
encoder = LabelEncoder()
y = encoder.fit_transform(df['label'])
# classes_ is ordered by class id, so labels[i] is the language name for id i.
labels = encoder.classes_

In [6]:
# Stack the per-row feature vectors into one 2-D numeric array that Keras can consume.
X = np.array(list(df['features']))

#### Train/test split

In [10]:
# Fixed seed so the 90/10 shuffle split is the same on every run.
seed = 99
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=seed,
)

#### Build a classifier on CNN-style inputs (currently a single softmax layer, not yet a CNN)

In [28]:
# Lay each feature vector out as a 10x4 single-channel "image" (rows, cols, channels).
X_train = np.reshape(X_train, (X_train.shape[0], 10, 4, 1))
X_test = np.reshape(X_test, (X_test.shape[0], 10, 4, 1))

In [33]:
# Sanity-check the split: one shape per line, inputs first, then targets.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(57863, 10, 4, 1)
(6430, 10, 4, 1)
(57863,)
(6430,)


In [36]:
# Peek at the integer-encoded training targets produced by the LabelEncoder.
y_train

array([8, 8, 8, ..., 8, 4, 4])

In [34]:
# Baseline classifier: flatten each 10x4x1 feature "image" and feed it to a single
# softmax layer (multinomial logistic regression). No convolutional layers are used.

# Seed TensorFlow's global RNG so weight initialisation is repeatable across runs.
tf.random.set_seed(seed)

# One output unit per language known to the LabelEncoder, instead of a hard-coded count.
num_classes = len(labels)

model = Sequential()
# input_shape must match the reshaped data, including the trailing channel axis;
# the previous (10, 4) disagreed with the (N, 10, 4, 1) arrays passed to fit().
model.add(Flatten(input_shape=(10, 4, 1)))
model.add(Dense(num_classes, activation='softmax'))
# Targets are integer class ids (not one-hot vectors), hence the sparse loss.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f3ad1458350>

In [35]:
# Score the held-out split; with metrics=['accuracy'] evaluate yields (loss, accuracy).
test_loss, test_acc = model.evaluate(x=X_test, y=y_test, verbose=1)
print('\nTest accuracy {:5.2f}%'.format(test_acc * 100))


Test accuracy 99.88%


In [37]:
# Saving the trained model is currently commented out; uncomment the line below
# to write it to ../saved_model/20200602/ (this overwrites any copy already there).
# model.save('../saved_model/20200602/') 

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: ../saved_model/20200602/assets


# Prediction test using a personal recording

In [38]:
import os
import numpy as np
import pandas as pd
import IPython.display as ipd
import librosa
import librosa.display
from scipy.io import wavfile

def extract_features(file_name):
    """Summarise an audio file as a 40-element mean-MFCC vector.

    The file is loaded with librosa, 40 MFCC coefficients are computed per
    frame, and the coefficients are averaged over all frames.

    Parameters
    ----------
    file_name : str
        Path to the audio file to load.

    Returns
    -------
    numpy.ndarray or None
        1-D array with one mean value per MFCC coefficient, or None if the
        file could not be loaded or processed (the error is printed).
    """
    try:
        audio, sample_rate = librosa.load(file_name)
        mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        # mfcc is (n_mfcc, frames); averaging the transpose over axis 0
        # collapses the frame axis and leaves one value per coefficient.
        mfcc_mean = np.mean(mfcc.T, axis=0)
    except Exception as e:
        # Best-effort by design: report the reason and let the caller decide
        # what to do with a missing result.
        print("Error encountered while parsing file: ", file_name, "-", repr(e))
        return None

    return mfcc_mean

In [57]:
# Path to the recording used for the manual prediction check.
sample_path = '../test/test2.wav'
# Render an inline audio player so the clip can be listened to before classifying it.
ipd.Audio(sample_path)

In [58]:
# Reuse the path defined for the audio preview so both cells refer to the same file.
test_case = extract_features(sample_path)
# extract_features returns None when the file cannot be parsed; fail with a clear
# message here rather than with an AttributeError on .reshape below.
if test_case is None:
    raise ValueError("Could not extract features from " + sample_path)
# Single-sample batch in the same (batch, 10, 4, 1) layout as the training data.
test_case = test_case.reshape(1, 10, 4, 1)

In [59]:
# Take the class id with the highest predicted probability and map it back
# to its language name via the LabelEncoder's class list.
class_probabilities = model.predict(test_case)
predicted = np.argmax(class_probabilities)
print(labels[predicted])

finnish
