Filterbank (fbank) energies as features

In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import os
import wave
from scipy import signal
from scipy.io import wavfile
import IPython.display as ipd
import librosa
import librosa.display

from python_speech_features import fbank
from audio2numpy import open_audio
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Load the speaker metadata: keep the first 8 columns and only the rows whose
# recording is not flagged as missing.
data = pd.read_csv("archive/speakers_all.csv", index_col='speakerid')
data = data.iloc[:, :8]
data = data[data['file_missing?'] == False]

INPUT_DIR = 'archive/recordings/recordings/'
OUTPUT_DIR = 'results/'
parent_list = os.listdir(INPUT_DIR)
print(len(parent_list))

# Recordings that the metadata lists but that are absent from INPUT_DIR.
missing_files = set(data['filename'] + '.mp3') - set(parent_list)
print(missing_files)

# Drop every speaker whose recording is not on disk, using the set computed
# above rather than hard-coded file names.
data = data[~(data['filename'] + '.mp3').isin(missing_files)]

# 4-class subset: one frame per native language, stacked in a fixed order.
English = data[data['native_language'] == 'english']
French = data[data['native_language'] == 'french']
Spanish = data[data['native_language'] == 'spanish']
Arabic = data[data['native_language'] == 'arabic']
# DataFrame.append was removed in pandas 2.0; pd.concat yields the same
# row order and keeps the original index.
dataSub = pd.concat([English, French, Spanish, Arabic])

def get_fbank(INPUT_DIR, filenames=None, n_filt=20, n_frames=3990, duration=20):
    """Compute a fixed-size filterbank feature array for a list of recordings.

    Parameters
    ----------
    INPUT_DIR : str
        Directory that contains the audio files.
    filenames : iterable of str, optional
        File names (including extension) to process. When omitted, the names
        are taken from the module-level ``dataSub['filename']`` with ``.mp3``
        appended, which is what the original implementation always did.
    n_filt : int, default 20
        Number of filters requested from ``fbank`` (rows of each feature map).
    n_frames : int, default 3990
        Number of columns each feature map is padded or truncated to.
    duration : float, default 20
        Maximum number of seconds loaded from each file.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(filenames), n_filt, n_frames)``.
    """
    if filenames is None:
        filenames = dataSub['filename'] + '.mp3'
    filenames = list(filenames)

    fbank_feat = np.zeros((len(filenames), n_filt, n_frames))
    for i, file in enumerate(filenames):
        f_name = os.path.join(INPUT_DIR, file)
        # sr=None keeps the file's native sampling rate.
        y, sr = librosa.load(f_name, sr=None, duration=duration)
        # NOTE(review): the second argument of `fbank` is the sample rate, yet
        # half the native rate is passed here. It is left unchanged because the
        # default n_frames appears to be sized for this setting -- confirm intent.
        # [0] selects the first value returned by fbank; it is transposed so
        # the filter axis comes first.
        fb = fbank(y, sr/2, nfft=1103, nfilt=n_filt)[0].T

        width = fb.shape[1]
        if width < n_frames:
            # Zero-pad on the left so the computed frames sit at the end.
            fb = np.pad(fb, ((0, 0), (n_frames - width, 0)), 'constant')
        elif width > n_frames:
            # Keep only the first n_frames columns.
            fb = fb[:, :n_frames]

        fbank_feat[i, :, :] = fb

    return fbank_feat

# np.save does not create missing directories; make sure the target exists
# before running the expensive feature extraction.
os.makedirs("Results", exist_ok=True)
fbank_feat = get_fbank(INPUT_DIR)
np.save("Results/fbank_feat4.npy", fbank_feat)


2138
{'sinhalese1.mp3', 'nicaragua.mp3'}


  return f(*args, **kwargs)


In [7]:
# Reload the saved filterbank features from disk and report their shape.
X = np.load("Results/fbank_feat4.npy")
print(X.shape)

(906, 20, 3990)


In [8]:
# PCA
from sklearn.decomposition import PCA

# Output buffer sized from X itself so the cell keeps working when the number
# of recordings or the feature dimensions change.
proj = np.zeros(shape=X.shape)

# Each recording's feature map is fitted with its own PCA, projected, and
# mapped back to the original space.
# NOTE(review): no n_components is given, so every component is kept and the
# round trip should reproduce the input up to numerical error -- confirm
# whether a reduced number of components was intended.
for i in range(X.shape[0]):
    X1 = X[i, :, :]
    pca = PCA(whiten=True)
    X_pca = pca.fit_transform(X1)
    proj[i, :, :] = pca.inverse_transform(X_pca)

X = proj
print(np.shape(X))

(906, 20, 3990)


# 4-class classification

In [9]:
# Integer-encode the native-language labels, then expand them to one-hot rows.
y_accent = dataSub['native_language']

encoder1 = LabelEncoder()
y_accent_ = encoder1.fit_transform(y_accent)
y_accent_ = to_categorical(np.array(y_accent_), dtype='float32')
print(y_accent_.shape)  # 4 categories.

(906, 4)


In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the recordings for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_accent_train, y_accent_test = train_test_split(
    X, y_accent_, test_size=0.2, random_state=10)

# Per-position statistics are estimated on the training split only and then
# applied to both splits.
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)
# A position that is constant across the training set has std == 0; dividing
# by it would produce NaN/inf. Use 1 there so those positions become 0.
std = np.where(std == 0, 1.0, std)

X_train = (X_train - mean)/std
X_test = (X_test - mean)/std
# Append a trailing axis of length 1.
X_train = np.expand_dims(X_train, axis=3)
X_test = np.expand_dims(X_test, axis=3)

# Optional validation split (disabled):
#X_train, X_val, y_accent_train, y_accent_val = train_test_split(X_train, y_accent_train, test_size=0.15, random_state=10)
#X_val = (X_val - mean)/std
#X_val= np.expand_dims(X_val, axis=3)

print ('Train set:', X_train.shape,  y_accent_train.shape)
print ('Test set:', X_test.shape,  y_accent_test.shape)
#print ('validation set:', X_val.shape,  y_accent_val.shape)


Train set: (724, 20, 3990, 1) (724, 4)
Test set: (182, 20, 3990, 1) (182, 4)


In [12]:
# Flatten every sample to a single feature vector. The sizes are derived from
# the arrays themselves instead of hard-coded sample/feature counts, so the
# cell still works if the split or the feature dimensions change.
newX_train = X_train.reshape(X_train.shape[0], -1)
newX_test = X_test.reshape(X_test.shape[0], -1)

# Convert the one-hot label rows back to integer class indices.
y_label_train = y_accent_train.argmax(axis=1)
y_label_test =  y_accent_test.argmax(axis=1)

print ('Train set:', newX_train.shape,  y_label_train.shape)
print ('Test set:', newX_test.shape,  y_label_test.shape)

Train set: (724, 79800) (724,)
Test set: (182, 79800) (182,)


In [13]:
# Standardise the flattened features; the scaler is fitted on the training
# set only and then applied to both splits.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(newX_train)
newX_train, newX_test = scaler.transform(newX_train), scaler.transform(newX_test)

# Fit a k-nearest-neighbours classifier with its default settings and
# predict the test labels.
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier().fit(newX_train, y_label_train)
y_pred = classifier.predict(newX_test)

# Report the classifier's score on the held-out test set.
acc = classifier.score(newX_test, y_label_test)
print(acc)

0.6593406593406593
