# Predict Data

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.layers.recurrent import GRU
from keras.models import Sequential, load_model
from keras.regularizers import l2
from keras.layers import Dense, BatchNormalization

Using TensorFlow backend.


### Load the CSV

In [2]:
# Load the frame-level table; consecutive rows that share a `token` id
# belong to the same speech token.
data = pd.read_csv('datasets/mt.csv', sep=',')
print("Columns:", data.columns)
print("Initially", len(data), "rows")

# Give rows without a vowel type the explicit label 'none' so the column
# contains no NaN before it is label-encoded.
data['vowel_type'] = data['vowel_type'].fillna('none')
data.head()



Columns: Index(['speaker', 'word', 'token', 'pre_segment', 'fol_segment', 'vowelR',
       'vowel_type', 'note', 'time', 'norm_time', 'c1', 'c2', 'c3', 'c4', 'c5',
       'c6', 'c7', 'c8', 'c9', 'c10', 'c11', 'c12'],
      dtype='object')
Initially 2083105 rows


Unnamed: 0,speaker,word,token,pre_segment,fol_segment,vowelR,vowel_type,note,time,norm_time,...,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12
0,100_2016_9_5_10_ngTMg,weather,100_2016_9_5_10_ngTMg_recording10_1,DH,#,ER0,none,linking,2.1715,0.0101,...,-1022.492295,-674.64585,-374.559604,-33.608998,43.539283,-100.527786,-93.679401,-444.218747,-525.123463,-660.706331
1,100_2016_9_5_10_ngTMg,weather,100_2016_9_5_10_ngTMg_recording10_1,DH,#,ER0,none,linking,2.173,0.0202,...,-1058.318886,-618.081041,-392.756742,-73.85194,9.402799,-70.048259,-76.092667,-485.080975,-514.300174,-690.81372
2,100_2016_9_5_10_ngTMg,weather,100_2016_9_5_10_ngTMg_recording10_1,DH,#,ER0,none,linking,2.1745,0.0303,...,-1111.359326,-607.571292,-404.454693,-32.069785,63.086838,-111.803538,-75.985507,-442.519672,-538.917883,-665.585262
3,100_2016_9_5_10_ngTMg,weather,100_2016_9_5_10_ngTMg_recording10_1,DH,#,ER0,none,linking,2.1761,0.0404,...,-986.829893,-567.632984,-426.516657,-14.907034,51.907415,-89.423728,-64.152095,-500.297254,-479.800648,-628.009562
4,100_2016_9_5_10_ngTMg,weather,100_2016_9_5_10_ngTMg_recording10_1,DH,#,ER0,none,linking,2.1776,0.0505,...,-1063.674469,-641.366517,-365.899597,-17.289025,26.717821,-107.038116,-52.297277,-479.613893,-519.625957,-662.952884


### Encode Categorical Variables

In [3]:
# Turn the categorical columns into integer codes.
# The class vocabularies are restored from .npy files instead of being
# re-fitted on this data (presumably they were saved when the model was
# trained, so codes stay consistent -- confirm against the training notebook).
# Encoding of a speech_style column is currently disabled.
encoder = LabelEncoder()
class_files = [
    ('pre_segment', 'arrays/preseg_classes.npy'),
    ('fol_segment', 'arrays/folseg_classes.npy'),
    ('vowelR', 'arrays/vowelR_classes.npy'),
    ('vowel_type', 'arrays/vowel_type_classes.npy'),
]
for position, (column, classes_path) in enumerate(class_files):
    encoder.classes_ = np.load(classes_path)
    if position == 0:
        # show the preceding-segment vocabulary as a quick sanity check
        print(encoder.classes_)
    # values are stringified first, so a missing value is looked up as 'nan'
    data[column] = encoder.transform(data[column].astype(str))
data.head(21)

['AW1' 'AY1' 'B' 'CH' 'D' 'DH' 'ER0' 'F' 'G' 'H' 'HH' 'IY0' 'IY1' 'K' 'L'
 'M' 'N' 'NG' 'OW1' 'P' 'R' 'S' 'SH' 'T' 'UW1' 'V' 'W' 'Y' 'Z' 'nan' 'sp']


Unnamed: 0,speaker,word,token,pre_segment,fol_segment,vowelR,vowel_type,note,time,norm_time,...,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12
0,100_2016_9_5_10_ngTMg,weather,100_2016_9_5_10_ngTMg_recording10_1,5,0,7,2,linking,2.1715,0.0101,...,-1022.492295,-674.64585,-374.559604,-33.608998,43.539283,-100.527786,-93.679401,-444.218747,-525.123463,-660.706331
1,100_2016_9_5_10_ngTMg,weather,100_2016_9_5_10_ngTMg_recording10_1,5,0,7,2,linking,2.173,0.0202,...,-1058.318886,-618.081041,-392.756742,-73.85194,9.402799,-70.048259,-76.092667,-485.080975,-514.300174,-690.81372
2,100_2016_9_5_10_ngTMg,weather,100_2016_9_5_10_ngTMg_recording10_1,5,0,7,2,linking,2.1745,0.0303,...,-1111.359326,-607.571292,-404.454693,-32.069785,63.086838,-111.803538,-75.985507,-442.519672,-538.917883,-665.585262
3,100_2016_9_5_10_ngTMg,weather,100_2016_9_5_10_ngTMg_recording10_1,5,0,7,2,linking,2.1761,0.0404,...,-986.829893,-567.632984,-426.516657,-14.907034,51.907415,-89.423728,-64.152095,-500.297254,-479.800648,-628.009562
4,100_2016_9_5_10_ngTMg,weather,100_2016_9_5_10_ngTMg_recording10_1,5,0,7,2,linking,2.1776,0.0505,...,-1063.674469,-641.366517,-365.899597,-17.289025,26.717821,-107.038116,-52.297277,-479.613893,-519.625957,-662.952884
5,100_2016_9_5_10_ngTMg,weather,100_2016_9_5_10_ngTMg_recording10_1,5,0,7,2,linking,2.1791,0.0606,...,-1065.472565,-620.11116,-423.981473,-57.3789,68.83202,-69.322706,-110.514901,-480.604908,-516.0968,-670.0008
6,100_2016_9_5_10_ngTMg,weather,100_2016_9_5_10_ngTMg_recording10_1,5,0,7,2,linking,2.1806,0.0707,...,-1065.865377,-675.983234,-369.43188,-42.17456,32.678901,-78.833412,-83.945704,-499.744672,-488.783056,-680.203198
7,100_2016_9_5_10_ngTMg,weather,100_2016_9_5_10_ngTMg_recording10_1,5,0,7,2,linking,2.1821,0.0808,...,-1085.37245,-640.074493,-407.788671,-8.526462,30.385037,-86.137435,-108.084595,-449.959978,-523.291645,-684.036196
8,100_2016_9_5_10_ngTMg,weather,100_2016_9_5_10_ngTMg_recording10_1,5,0,7,2,linking,2.1836,0.0909,...,-959.284014,-617.434322,-421.071931,0.850639,33.118351,-141.286203,-84.906223,-459.382941,-520.984107,-621.123415
9,100_2016_9_5_10_ngTMg,weather,100_2016_9_5_10_ngTMg_recording10_1,5,0,7,2,linking,2.1852,0.101,...,-994.788822,-679.964733,-417.837474,23.585294,37.165498,-129.098794,-79.556766,-449.219367,-526.555203,-675.400315


### Process Time Series

In [4]:
# Number of time steps in every token window (passed as `window_size` when
# the windows are built).
token_length = 100
# Features per time step: c1..c12 plus pre_segment, fol_segment, vowelR and
# vowel_type. It would be 17 with the currently disabled speech-style feature.
num_features = 16 # 17

def windows(data):
    """Yield ``(start, end)`` row ranges of consecutive rows sharing a token.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'token'`` column. Rows of one token are expected to be
        adjacent; a token id that re-appears later starts a new window.

    Yields
    ------
    (int, int)
        Half-open positional range ``[start, end)`` suitable for
        ``data.iloc[start:end]``, in row order.

    Notes
    -----
    An empty frame yields nothing. Boundaries are found with a single
    vectorised comparison of the token column against itself shifted by one
    row, instead of one ``.iloc`` lookup per row. A missing (NaN) token never
    compares equal to its neighbour, so each such row forms its own
    one-row window.
    """
    tokens = data['token'].to_numpy()
    n_rows = len(tokens)
    if n_rows == 0:
        return

    # positions (1..n_rows-1) at which the token differs from the previous row
    changes = np.flatnonzero(tokens[1:] != tokens[:-1]) + 1
    bounds = np.concatenate(([0], changes, [n_rows]))
    for start, end in zip(bounds[:-1], bounds[1:]):
        # plain Python ints, as callers use them for slicing and arithmetic
        yield int(start), int(end)
        
def get_speech_tokens_without_labels(data, window_size=21, num_features=3):
    """Build one fixed-length feature window per token.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``token``, ``c1`` .. ``c12`` and the already
        integer-encoded ``pre_segment``, ``fol_segment``, ``vowelR`` and
        ``vowel_type`` columns.
    window_size : int
        Number of time steps per window.
    num_features : int
        Expected size of the last axis. This function always produces 16
        features (12 ``c*`` columns + 4 categorical columns), so any other
        value is an error as soon as at least one window is produced.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_windows, window_size, 16)``; when no window
        is produced, an empty array of shape ``(0, window_size, num_features)``.

    Raises
    ------
    ValueError
        If windows were produced and ``num_features`` is not 16.

    Notes
    -----
    * Tokens shorter than ``window_size`` are left-padded with zeros. The
      categorical columns are padded with 0 as well, which is also a valid
      category code.
    * Tokens longer than ``window_size`` are skipped, so the result can hold
      fewer windows than there are tokens; callers that pair the output with
      token ids must account for that.
    """
    mfcc_columns = ['c%d' % i for i in range(1, 13)]
    categorical_columns = ['pre_segment', 'fol_segment', 'vowelR', 'vowel_type']

    if len(data) == 0:
        return np.empty((0, window_size, num_features))

    # Convert every column once up front rather than re-slicing the frame
    # sixteen times per token.
    mfcc = data[mfcc_columns].astype(float).to_numpy()
    categorical = np.asarray(data[categorical_columns]).astype(int)
    features = np.hstack([mfcc, categorical])
    total_features = features.shape[1]

    # Collect windows in a list and stack once at the end; growing the array
    # with vstack inside the loop copies all previous windows on every token.
    token_blocks = []
    for start, end in windows(data):
        length = end - start
        if not 0 < length <= window_size:
            continue  # over-long (or empty) token: not representable
        block = np.zeros((window_size, total_features))
        block[window_size - length:] = features[start:end]
        token_blocks.append(block)

    if not token_blocks:
        return np.empty((0, window_size, num_features))
    if total_features != num_features:
        raise ValueError(
            "num_features=%d does not match the %d features built per time step"
            % (num_features, total_features))
    return np.stack(token_blocks)

### Perform Preprocessing and Reshape the Data

In [5]:
# Cut the frame into one zero-padded window per token.
segments = get_speech_tokens_without_labels(
    data, window_size=token_length, num_features=num_features)
print("Segment shape:", segments.shape)

# Make the (tokens, time steps, features) layout explicit for the model input.
X = segments.reshape((segments.shape[0], token_length, num_features))
print("X shape:", X.shape)

Segment shape: (21141, 100, 16)
X shape: (21141, 100, 16)


### Load the Model

In [6]:
# Not referenced by the model code shown in this notebook, which ends in a
# single sigmoid unit.
num_classes = 2
# Read as module-level globals by create_model() for the network input shape;
# they must match the last two axes of X.
token_length = 100
num_features = 16

def regularization_constructor(alpha):
    """Return an L2 regularizer of strength ``alpha``.

    A falsy ``alpha`` (``None`` or ``0``) means "no regularization" and
    yields ``None``.
    """
    return l2(alpha) if alpha else None

def create_model(dropout=0, recurrent_dropout=0, recurrent_num_hidden=50, num_hidden=200,
                 gru_activation='tanh', recurrent_activation='tanh', activation='relu',
                 rec_l2=0.01, ker_l2=None, bias_l2=None, act_l2=0.01,
                 dense_ker_l2=0.01, dense_bias_l2=None, dense_act_l2=None,
                 verbose=False):
    """Build and compile the GRU binary classifier.

    Architecture: BatchNormalization -> GRU (last state only) -> Dense hidden
    layer -> single sigmoid output unit. The input shape is taken from the
    module-level ``token_length`` and ``num_features``.

    Parameters
    ----------
    dropout, recurrent_dropout : float
        Dropout rates applied to the GRU inputs / recurrent state.
    recurrent_num_hidden : int
        Number of GRU units.
    num_hidden : int
        Width of the dense hidden layer.
    gru_activation, recurrent_activation, activation : str
        Activations of the GRU, its recurrent step, and the dense layer.
    rec_l2, ker_l2, bias_l2, act_l2 : float or None
        L2 strengths for the GRU recurrent / kernel / bias / activity
        regularizers; a falsy value disables that regularizer.
    dense_ker_l2, dense_bias_l2, dense_act_l2 : float or None
        Same, for the dense hidden layer.
    verbose : bool
        When true, print the layer summary.

    Returns
    -------
    keras.models.Sequential
        Model compiled with RMSprop, binary cross-entropy and accuracy.
    """
    rec_l2 = regularization_constructor(rec_l2)
    ker_l2 = regularization_constructor(ker_l2)
    bias_l2 = regularization_constructor(bias_l2)
    act_l2 = regularization_constructor(act_l2)
    dense_ker_l2 = regularization_constructor(dense_ker_l2)
    dense_bias_l2 = regularization_constructor(dense_bias_l2)
    dense_act_l2 = regularization_constructor(dense_act_l2)

    model = Sequential()
    model.add(BatchNormalization(input_shape=(token_length, num_features,)))
    # NOTE(review): input_shape is already fixed by the first layer; the
    # argument here looks redundant but is kept unchanged.
    model.add(GRU(units=recurrent_num_hidden, return_sequences=False, recurrent_dropout=recurrent_dropout,
                        dropout=dropout, activation=gru_activation, recurrent_activation=recurrent_activation,
                        recurrent_regularizer=rec_l2,
                        kernel_regularizer=ker_l2,
                        bias_regularizer=bias_l2,
                        activity_regularizer=act_l2,
                        input_shape=(token_length, num_features,)))
    model.add(Dense(num_hidden, activation=activation,
                   kernel_regularizer=dense_ker_l2,
                   bias_regularizer=dense_bias_l2,
                   activity_regularizer=dense_act_l2))
    model.add(Dense(1, activation='sigmoid'))
    if verbose:
        # summary() prints the table itself and returns None; wrapping it in
        # print() would emit a stray "None" line.
        model.summary()
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [7]:
# Rebuild the network with create_model()'s default hyperparameters, then
# restore the weights from disk.
# NOTE(review): the defaults must describe the architecture these weights
# were saved from -- confirm against the training run.
model = create_model()
model.load_weights("results/weights.best.mfcc.hdf5")

In [8]:
# Output of the final sigmoid unit for every window in X.
y_prob = model.predict(X)
# Hard labels: round to the nearest integer (np.round rounds exact halves to
# the nearest even value, so 0.5 becomes 0).
y_pred = np.round(y_prob)

### Write Prediction to CSV

In [9]:
# One output row per token id, in order of first appearance.
# The 'token' column of `data` is not modified by the encoding step, so the
# frame already in memory is reused instead of parsing the whole CSV again.
# NOTE: this lines up with the predictions only if each token's rows are
# contiguous and no token was skipped when the windows were built.
data_to_write = pd.DataFrame({'token': data['token'].unique()})

In [10]:
# The three lengths must agree before the predictions are attached as columns.
for label, rows in (("Loaded rows:", data_to_write),
                    ("Predicted rows:", y_pred),
                    ("Probability rows:", y_prob)):
    print(label, len(rows))

Loaded rows: 21141
Predicted rows: 21141
Probability rows: 21141


In [11]:
# Attach the hard label and the raw probability to each token row.
# Rows are matched purely by position, so the order of y_pred / y_prob must
# be the order of the token ids in data_to_write.
data_to_write['rhoticity_prediction'] = y_pred
data_to_write['rhoticity_probability'] = y_prob

In [12]:
# Persist token id, predicted label and probability; the positional index is
# not written.
data_to_write.to_csv('results/mt_prediction.csv', index=False)