In [32]:
# importing packages
import numpy as np
import pandas as pd
from sklearn.externals import joblib
import os

from sklearn.preprocessing import LabelEncoder

from sklearn.mixture import GaussianMixture

In [33]:
# Load the three training feature tables stored under ./features_train.
df_mfcc, df_mfcc_delta, df_mfcc_delta_delta = map(
    pd.read_hdf,
    (
        "./features_train/mfcc/timit.hdf",
        "./features_train/mfcc_delta/timit.hdf",
        "./features_train/mfcc_delta_delta/timit.hdf",
    ),
)


In [34]:
# Preview the first five rows of the mfcc training frame.
df_mfcc.head(n=5)

Unnamed: 0,features,labels
0,"[8.08968863199349, -21.117180342947794, -23.47...",sil
1,"[7.683770167702839, -21.623345723164935, -22.4...",sil
2,"[6.932491296499403, -19.687886551214262, -14.1...",sil
3,"[6.699583709386875, -23.57692698373293, -7.297...",sil
4,"[6.7738633678071825, -30.036342939606634, -6.8...",sil


In [35]:
# Number of coefficients in the feature vector of the row labelled 0.
len(df_mfcc.loc[0, 'features'])

13

In [36]:
def preprocess(df):
    """Replace the string ``labels`` column with an integer ``labels_encoded`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``labels`` column (phoneme names). Any other columns are
        carried over unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``labels`` and with ``labels_encoded`` appended as
        the last column. The input frame is left untouched, so the cell that
        calls this function can be re-run safely.

    Notes
    -----
    A fresh ``LabelEncoder`` is fitted on every call, so the integer assigned
    to a phoneme depends on the set of labels present in ``df``.
    """
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(df['labels'])

    # drop() and assign() both return new frames, so the caller's frame is
    # never given an extra column as a side effect.
    return df.drop(columns=['labels']).assign(labels_encoded=encoded)

In [37]:
def remove_coef(df,var):
    """Return a copy of ``df`` whose feature vectors have the energy coefficients removed.

    The feature vector is treated as a run of equal-width blocks, and the
    first coefficient of every block is dropped:

    * ``var == 0``  -> 1 block
    * ``var == 1``  -> 2 blocks
    * anything else -> 3 blocks

    The block width is derived from the data (vector length / number of
    blocks), so with 13 coefficients per block the removed columns are
    0, 13 and 26. The previous hard-coded indices (0, 14, 27) removed the
    second coefficient of the later blocks instead of their first.

    NOTE(review): this assumes the blocks are concatenated back to back and
    that each block starts with its energy coefficient -- confirm against the
    feature-extraction code. Any code that scores models trained on these
    frames must drop the same columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``features`` column holding equal-length sequences.
    var : int
        Variant selector, see above.

    Returns
    -------
    pandas.DataFrame
        New frame with the reduced ``features`` column; ``df`` is not modified.

    Raises
    ------
    ValueError
        If the feature vectors do not all have the same length, or that length
        is not a multiple of the number of blocks.
    """
    if var == 0:
        n_blocks = 1
    elif var == 1:
        n_blocks = 2
    else:
        n_blocks = 3

    # list of per-row vectors -> 2-D array so whole columns can be deleted
    features = np.array(df["features"].tolist())

    if features.ndim != 2:
        if len(df) == 0:
            # nothing to strip from an empty frame
            return df.copy()
        raise ValueError("all feature vectors must have the same length")

    n_features = features.shape[1]
    if n_features % n_blocks != 0:
        raise ValueError(
            "feature length %d is not a multiple of %d blocks" % (n_features, n_blocks)
        )

    block_width = n_features // n_blocks
    # np.delete interprets every index against the original array, so the
    # block starts can be listed directly without compensating for shifts
    energy_columns = [block * block_width for block in range(n_blocks)]
    features = np.delete(features, energy_columns, axis=1)

    # assign() returns a new frame instead of overwriting the caller's column
    return df.assign(features=features.tolist())

In [38]:
# Frames that still contain the energy coefficients.
df_mfcc_pre, df_mfcc_delta_pre, df_mfcc_delta_delta_pre = map(
    preprocess, (df_mfcc, df_mfcc_delta, df_mfcc_delta_delta)
)

# Work on copies so the frames above keep their energy coefficients.
df1, df2, df3 = (
    frame.copy()
    for frame in (df_mfcc_pre, df_mfcc_delta_pre, df_mfcc_delta_delta_pre)
)

# Frames without energy coefficients. The second argument of remove_coef
# selects the variant: 0 = plain mfcc, 1 = mfcc_delta, 2 = mfcc_delta_delta.
df_mfcc_pre_wo, df_mfcc_delta_pre_wo, df_mfcc_delta_delta_pre_wo = (
    remove_coef(frame, variant)
    for variant, frame in enumerate((df1, df2, df3))
)


In [39]:
#len(df_mfcc_pre['features'][0])

In [40]:
# Sorted array of the distinct encoded phoneme labels.
unique_labels = np.sort(df_mfcc_pre['labels_encoded'].unique())
unique_labels

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39], dtype=int64)

In [41]:
# Six feature sets are trained: mfcc, mfcc_delta and mfcc_delta_delta,
# each with and without the energy coefficients.
#
# Every feature set uses 64 mixture components, except mfcc without energy
# coefficients, which is trained for each component count from 2 to 256.
#
# The function takes a dataframe, its number (0-5) and a list of component
# counts; the dataframe number selects the directory the models are stored in.


def train_store_models(df , df_num , comp_list, models_root="./models", random_state=None):
    """Fit one diagonal-covariance GMM per label and component count, and pickle each one.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``features`` column (one feature vector per row) and a
        ``labels_encoded`` column.
    df_num : int
        Selects the feature-set name used in the log line and in the output
        directory: 0/1/2 = mfcc / mfcc_delta / mfcc_delta_delta "with",
        3/4 = mfcc / mfcc_delta "without"; any other value is treated as
        mfcc_delta_delta "without".
    comp_list : iterable of int
        Component counts; one model per label is trained for each entry.
    models_root : str, optional
        Directory under which the per-feature-set folders are created.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``GaussianMixture``. Pass an int to make the trained
        models reproducible; ``None`` keeps the previous unseeded behaviour.

    Side effects
    ------------
    Creates ``<models_root>/<feature set>_<n_components>/`` if missing and
    writes ``<label>.pkl`` into it for every label found in ``df``.
    """
    # (log line, directory prefix) for each known dataframe number
    feature_sets = {
        0: ("Training started for mfcc with co-efficient", "mfcc_with_"),
        1: ("Training started for mfcc-delta with co-efficient", "mfcc_delta_with_"),
        2: ("Training started for mfcc-delta-delta with co-efficient", "mfcc_delta_delta_with_"),
        3: ("Training started for mfcc without co-efficient", "mfcc_without_"),
        4: ("Training started for mfcc-delta without co-efficient", "mfcc_delta_without_"),
    }
    fallback = ("Training started for mfcc-delta-delta without co-efficient",
                "mfcc_delta_delta_without_")
    banner, dir_prefix = feature_sets.get(df_num, fallback)
    print(banner)

    # Labels are read from the frame being trained on rather than from a
    # notebook-level variable, so the function does not depend on another
    # cell having been run and never tries to fit a label absent from df.
    labels = sorted(df['labels_encoded'].unique())

    # The per-label training data does not depend on the component count,
    # so it is extracted once instead of once per (count, label) pair.
    features_by_label = {
        label: df.loc[df['labels_encoded'] == label, 'features'].tolist()
        for label in labels
    }

    for n_components in comp_list:
        print("training started for n_components" , n_components)

        # the target directory only depends on the component count
        directory = models_root + "/" + dir_prefix + str(n_components)
        os.makedirs(directory, exist_ok=True)

        # one model per phoneme label
        for label in labels:
            print("training started for label" , label)

            model = GaussianMixture(
                n_components=n_components,
                covariance_type='diag',
                random_state=random_state,
            )
            model.fit(features_by_label[label])

            joblib.dump(model, directory + "/" + str(label) + ".pkl")


In [42]:
comp_list1 = [64]
comp_list2 = [2, 4, 8, 16, 32, 64, 128, 256]

# The position of a frame in this tuple is the df_num handed to train_store_models.
training_frames = (
    df_mfcc_pre,
    df_mfcc_delta_pre,
    df_mfcc_delta_delta_pre,
    df_mfcc_pre_wo,
    df_mfcc_delta_pre_wo,
    df_mfcc_delta_delta_pre_wo,
)

for df_num, df in enumerate(training_frames):
    # Only mfcc without energy coefficients (df_num 3) gets the full component sweep.
    comp_list = comp_list2 if df_num == 3 else comp_list1
    train_store_models(df, df_num, comp_list)

Training started for mfcc with co-efficient
training started for n_components 64
training started for label 0
training started for label 1
training started for label 2
training started for label 3
training started for label 4
training started for label 5
training started for label 6
training started for label 7
training started for label 8
training started for label 9
training started for label 10
training started for label 11
training started for label 12
training started for label 13
training started for label 14
training started for label 15
training started for label 16
training started for label 17
training started for label 18
training started for label 19
training started for label 20
training started for label 21
training started for label 22
training started for label 23
training started for label 24
training started for label 25
training started for label 26
training started for label 27
training started for label 28
training started for label 29
training started for label 30

training started for label 21
training started for label 22
training started for label 23
training started for label 24
training started for label 25
training started for label 26
training started for label 27
training started for label 28
training started for label 29
training started for label 30
training started for label 31
training started for label 32
training started for label 33
training started for label 34
training started for label 35
training started for label 36
training started for label 37
training started for label 38
training started for label 39
training started for n_components 32
training started for label 0
training started for label 1
training started for label 2
training started for label 3
training started for label 4
training started for label 5
training started for label 6
training started for label 7
training started for label 8
training started for label 9
training started for label 10
training started for label 11
training started for label 12
training star