In [1]:
# importing packages
import numpy as np
import pandas as pd
from sklearn.externals import joblib
import os

from sklearn.preprocessing import LabelEncoder

from sklearn.mixture import GaussianMixture

In [2]:
# Load the feature frames stored under ./features_test
# (one HDF file per feature variant, read in the order listed).
df_mfcc, df_mfcc_delta, df_mfcc_delta_delta = (
    pd.read_hdf(hdf_path)
    for hdf_path in (
        "./features_test/mfcc/timit.hdf",
        "./features_test/mfcc_delta/timit.hdf",
        "./features_test/mfcc_delta_delta/timit.hdf",
    )
)


In [3]:
# Preview the first five rows of df_mfcc
df_mfcc.head(5)

Unnamed: 0,features,labels
0,"[8.379582508942779, -28.719687003716288, -25.6...",sil
1,"[8.48626909310625, -26.586026014005547, -20.91...",sil
2,"[8.657363111189092, -26.98079577238136, -21.21...",sil
3,"[8.709860073269825, -27.500228508437047, -21.5...",sil
4,"[8.732931832586347, -29.179084999183683, -23.5...",sil


In [4]:
# Number of coefficients in the feature vector of the row labelled 0
len(df_mfcc['features'].loc[0])

13

In [5]:
def preprocess(df):
    """Return a new frame in which ``labels`` is replaced by ``labels_encoded``.

    A fresh ``LabelEncoder`` is fitted on ``df['labels']`` and the resulting
    integer codes are stored in a ``labels_encoded`` column appended after the
    remaining columns.  The string ``labels`` column is dropped from the
    result.  The frame passed in is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``labels`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``labels`` and with ``labels_encoded`` added.

    Notes
    -----
    The encoder is fitted on this frame alone and is not returned, so the
    codes of two frames are only comparable when both frames contain the
    same set of label strings.
    """
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(df['labels'])

    # drop() and assign() both return new frames, so the caller's frame
    # keeps its original columns.
    return df.drop(columns=['labels']).assign(labels_encoded=encoded)

In [6]:
def remove_coef(df, var):
    """Return a copy of ``df`` with selected feature columns deleted.

    The ``features`` column (one equal-length sequence per row) is stacked
    into a 2-D array, the columns selected by ``var`` are deleted, and the
    shortened rows are stored in the ``features`` column of the returned
    frame.  The frame passed in is left unmodified, so calling the function
    twice on the same input yields the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``features`` column.
    var : int
        ``0`` deletes column 0; ``1`` deletes columns 0 and 14; any other
        value deletes columns 0, 14 and 27.

    Returns
    -------
    pandas.DataFrame
        New frame, identical to ``df`` except for the shortened ``features``.
    """
    # NOTE(review): np.delete interprets every index against the original
    # (untrimmed) vector.  If each 13-coefficient group starts with its energy
    # term, those terms would sit at 0, 13 and 26 rather than 0, 14 and 27.
    # Confirm against the feature layout and the training code before changing
    # anything here: the saved models must see the same columns.
    if var == 0:
        drop_columns = [0]
    elif var == 1:
        drop_columns = [0, 14]
    else:
        drop_columns = [0, 14, 27]

    # list of per-row sequences -> 2-D array so columns can be deleted at once
    stacked = np.array(df["features"].tolist())
    trimmed = np.delete(stacked, drop_columns, axis=1)

    # assign() works on a copy, leaving the caller's frame untouched
    return df.assign(features=trimmed.tolist())

In [7]:
# Frames that still contain the energy coefficients
df_mfcc_pre, df_mfcc_delta_pre, df_mfcc_delta_delta_pre = (
    preprocess(frame)
    for frame in (df_mfcc, df_mfcc_delta, df_mfcc_delta_delta)
)

# Separate copies are handed to remove_coef, keeping the frames above intact
df1, df2, df3 = (
    frame.copy()
    for frame in (df_mfcc_pre, df_mfcc_delta_pre, df_mfcc_delta_delta_pre)
)

# Frames without the energy coefficients; the second argument tells
# remove_coef which set of columns to delete (0 means simple mfcc)
df_mfcc_pre_wo = remove_coef(df1, 0)
df_mfcc_delta_pre_wo = remove_coef(df2, 1)
df_mfcc_delta_delta_pre_wo = remove_coef(df3, 2)


In [8]:
#len(df_mfcc_pre['features'][0])

In [9]:
# Sorted array of the distinct encoded labels found in df_mfcc_pre
unique_labels = np.unique(df_mfcc_pre['labels_encoded'].values)
unique_labels

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39], dtype=int64)

In [10]:
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score

In [11]:
# This function evaluates only the 64-component models of every feature
# variant; the sweep over 2 to 256 components for mfcc is meant to be handled
# by a separate function.

# df_num -> (feature name as used in the model directory, "with"/"without")
_TEST_VARIANTS = {
    0: ("mfcc", "with"),
    1: ("mfcc_delta", "with"),
    2: ("mfcc_delta_delta", "with"),
    3: ("mfcc", "without"),
    4: ("mfcc_delta", "without"),
    5: ("mfcc_delta_delta", "without"),
}


def test_models(df_test, df_num, labels=None):
    """Score a test frame with one saved model per label and report accuracy.

    For every label the model stored at
    ``./models/<feature>_<with|without>_64/<label>.pkl`` is loaded and its
    ``score_samples`` is evaluated on all rows of ``df_test``.  Each row is
    assigned the label whose model gave the highest score, and the fraction
    of rows whose assigned label equals ``labels_encoded`` is printed and
    returned.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Frame with a ``features`` column (equal-length sequences) and a
        ``labels_encoded`` column.
    df_num : int
        Selects the feature variant: 0-2 are mfcc / mfcc-delta /
        mfcc-delta-delta with the coefficient, 3-5 the same without it.
        Any other value is treated like 5.
    labels : sequence of int, optional
        Labels to load models for.  Defaults to the notebook-level
        ``unique_labels``.

    Returns
    -------
    float
        Accuracy as computed by ``accuracy_score``.

    Notes
    -----
    The decision uses the raw per-model scores only; no class priors are
    added, so this is a maximum-score rule rather than a prior-weighted one.
    ``joblib.load`` unpickles the model files, so only trusted files should
    be placed in the model directories.
    """
    feature, energy = _TEST_VARIANTS.get(df_num, _TEST_VARIANTS[5])
    description = feature.replace("_", "-") + " " + energy + " co-efficient"
    print("Testing started for " + description)

    labels = np.asarray(unique_labels if labels is None else labels)

    # Convert once here instead of letting every model re-convert the list.
    X_test = np.array(df_test['features'].tolist())
    y_test = df_test['labels_encoded'].tolist()

    model_dir = "./models/" + feature + "_" + energy + "_64/"

    # One row of per-sample scores for each label, in the order of `labels`.
    scores = []
    for label in labels:
        print("testing started for label", label)
        model = joblib.load(model_dir + str(label) + ".pkl")
        scores.append(model.score_samples(X_test))

    # argmax yields the row position of the best model; map it back through
    # `labels` so the prediction is a label value even when the labels are
    # not exactly 0..K-1.
    best_rows = np.asarray(scores).argmax(axis=0)
    predicted = labels[best_rows]

    accuracy = accuracy_score(y_test, predicted)
    print("Accuracy for " + description, accuracy)
    return accuracy

In [14]:
# Evaluate every frame; its position in the tuple is the df_num passed to
# test_models (0-2 with the energy coefficients, 3-5 without them)
for df_num, df in enumerate((
    df_mfcc_pre,
    df_mfcc_delta_pre,
    df_mfcc_delta_delta_pre,
    df_mfcc_pre_wo,
    df_mfcc_delta_pre_wo,
    df_mfcc_delta_delta_pre_wo,
)):
    test_models(df, df_num)

Testing started for mfcc with co-efficient
testing started for label 0
testing started for label 1
testing started for label 2
testing started for label 3
testing started for label 4
testing started for label 5
testing started for label 6
testing started for label 7
testing started for label 8
testing started for label 9
testing started for label 10
testing started for label 11
testing started for label 12
testing started for label 13
testing started for label 14
testing started for label 15
testing started for label 16
testing started for label 17
testing started for label 18
testing started for label 19
testing started for label 20
testing started for label 21
testing started for label 22
testing started for label 23
testing started for label 24
testing started for label 25
testing started for label 26
testing started for label 27
testing started for label 28
testing started for label 29
testing started for label 30
testing started for label 31
testing started for label 32
testing st