In [1]:
import re
import os
import random
import numpy as np
import gc

In [2]:
train_file = open("/home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/train.txt","r")
train = train_file.read()
train = train.split('\n')
train = train[:-1]
print("Examples in test dataset:", len(train))
x_train = [data_example.split(' ')[0] for data_example in train]
y_train = [data_example.split(' ')[1] for data_example in train]

Examples in test dataset: 83194


In [3]:
x_train_copy = x_train.copy()
y_train_copy = y_train.copy()

In [4]:
x_train = x_train_copy[:83000]
y_train = y_train_copy[:83000]
x_test = x_train_copy[83000:]
y_test = y_train_copy[83000:]

In [5]:
x_train[0]

'LEMIEUX'

In [6]:
def get_phonetic_dict(x, y):
    phonetic_dict = {}
    for i in range(len(x)):
        if x[i] not in phonetic_dict:
            phonetic_dict[x[i]] = []
        phonetic_dict[x[i]].append(y[i])
    return phonetic_dict

In [7]:
phonetic_dict = get_phonetic_dict(x_train, y_train)
example_count = np.sum([len(prons) for _, prons in phonetic_dict.items()])

In [8]:
print("\n".join([k+' --> '+phonetic_dict[k][0] for k in random.sample(list(phonetic_dict.keys()), 10)]))
print('\nAfter cleaning, the dictionary contains %s words and %s pronunciations (%s are alternate pronunciations).' % 
      (len(phonetic_dict), example_count, (example_count-len(phonetic_dict))))

KOEHN --> K_OW_N
BEAVIS --> B_IY_V_AH_S
MOTHERSHEAD --> M_AH_DH_ER_Z_HH_EH_D
HEATEDLY --> HH_IY_T_IH_D_L_IY
VICTORIOUS --> V_IH_K_T_AO_R_IY_AH_S
BENJAMIN'S --> B_EH_N_JH_AH_M_AH_N_Z
CABE --> K_EY_B
SILVIUS --> S_IH_L_V_IY_IH_S
WRINKLING --> R_IH_NG_K_AH_L_IH_NG
JEANCOURT --> JH_IY_N_K_AO_R_T

After cleaning, the dictionary contains 83000 words and 83000 pronunciations (0 are alternate pronunciations).


In [9]:
import string

START_PHONE_SYM = 's'
END_PHONE_SYM = 'e'


def char_list():
    allowed_symbols = ['']
    for word in x_train:
        for char in word:
            if char not in allowed_symbols:
                allowed_symbols.append(char)
    return allowed_symbols


def phone_list():
    phone_list = [START_PHONE_SYM, END_PHONE_SYM]
    for transcription in y_train:
        for phone in transcription.split('_'):
            if phone not in phone_list:
                phone_list.append(phone)
    return [''] + phone_list


def id_mappings_from_list(str_list):
    str_to_id = {s: i for i, s in enumerate(str_list)} 
    id_to_str = {i: s for i, s in enumerate(str_list)}
    return str_to_id, id_to_str


# Create character to ID mappings
char_to_id, id_to_char = id_mappings_from_list(char_list())

# Load phonetic symbols and create ID mappings
phone_to_id, id_to_phone = id_mappings_from_list(phone_list())

# Example:
print('Char to id mapping: \n', char_to_id)
print('Phone to id mapping: \n', phone_to_id)

Char to id mapping: 
 {'': 0, 'L': 1, 'E': 2, 'M': 3, 'I': 4, 'U': 5, 'X': 6, 'N': 7, 'D': 8, 'G': 9, 'S': 10, 'T': 11, 'R': 12, 'P': 13, 'K': 14, 'C': 15, 'O': 16, 'F': 17, 'A': 18, 'B': 19, 'H': 20, 'V': 21, 'Y': 22, 'W': 23, 'J': 24, "'": 25, 'Q': 26, 'Z': 27, '-': 28}
Phone to id mapping: 
 {'': 0, 's': 1, 'e': 2, 'L': 3, 'AH': 4, 'M': 5, 'Y': 6, 'UW': 7, 'AY': 8, 'N': 9, 'D': 10, 'IH': 11, 'NG': 12, 'S': 13, 'T': 14, 'R': 15, 'P': 16, 'K': 17, 'EH': 18, 'AA': 19, 'F': 20, 'ER': 21, 'EY': 22, 'AE': 23, 'Z': 24, 'G': 25, 'B': 26, 'SH': 27, 'V': 28, 'OW': 29, 'AO': 30, 'IY': 31, 'W': 32, 'HH': 33, 'JH': 34, 'CH': 35, 'TH': 36, 'AW': 37, 'OY': 38, 'UH': 39, 'ZH': 40, 'DH': 41}


In [10]:
CHAR_TOKEN_COUNT = len(char_to_id)
PHONE_TOKEN_COUNT = len(phone_to_id)


def char_to_1_hot(char):
    char_id = char_to_id[char]
    hot_vec = np.zeros((CHAR_TOKEN_COUNT))
    hot_vec[char_id] = 1.
    return hot_vec


def phone_to_1_hot(phone):
    phone_id = phone_to_id[phone]
    hot_vec = np.zeros((PHONE_TOKEN_COUNT))
    hot_vec[phone_id] = 1.
    return hot_vec

# Example:
print('"A" is represented by:\n', char_to_1_hot('A'), '\n-----')
print('"AH" is represented by:\n', phone_to_1_hot('AH'))
MAX_CHAR_SEQ_LEN = max([len(word) for word, _ in phonetic_dict.items()])
MAX_PHONE_SEQ_LEN = max([max([len(pron.split('_')) for pron in pronuns]) 
                         for _, pronuns in phonetic_dict.items()]
                       ) + 2  # + 2 to account for the start & end tokens we need to add

"A" is represented by:
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0.] 
-----
"AH" is represented by:
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [11]:
def dataset_to_1_hot_tensors():
    char_seqs = []
    phone_seqs = []
    
    for word, pronuns in phonetic_dict.items():
        word_matrix = np.zeros((MAX_CHAR_SEQ_LEN, CHAR_TOKEN_COUNT))
        for t, char in enumerate(word):
            word_matrix[t, :] = char_to_1_hot(char)
        for pronun in pronuns:
            pronun_matrix = np.zeros((MAX_PHONE_SEQ_LEN, PHONE_TOKEN_COUNT))
            phones = [START_PHONE_SYM] + pronun.split('_') + [END_PHONE_SYM]
            for t, phone in enumerate(phones):
                pronun_matrix[t,:] = phone_to_1_hot(phone)
                
            char_seqs.append(word_matrix)
            phone_seqs.append(pronun_matrix)
    
    return np.array(char_seqs), np.array(phone_seqs)
            

char_seq_matrix, phone_seq_matrix = dataset_to_1_hot_tensors()        
print('Word Matrix Shape: ', char_seq_matrix.shape)
print('Pronunciation Matrix Shape: ', phone_seq_matrix.shape)

Word Matrix Shape:  (83000, 34, 29)
Pronunciation Matrix Shape:  (83000, 34, 42)


In [12]:
def dataset_for_embeddings():
    char_seqs = []
    phone_seqs = []
    
    for word,pronuns in phonetic_dict.items():
        word_matrix = np.zeros((MAX_CHAR_SEQ_LEN))
        for t,char in enumerate(word):
            word_matrix[t] = char_to_id[char]
        for pronun in pronuns:
            pronun_matrix = np.zeros((MAX_PHONE_SEQ_LEN))
            phones = [START_PHONE_SYM] + pronun.split('_') + [END_PHONE_SYM]
            for t, phone in enumerate(phones):
                pronun_matrix[t] = phone_to_id[phone]
                
            char_seqs.append(word_matrix)
            phone_seqs.append(pronun_matrix)
    
    return np.array(char_seqs), np.array(phone_seqs)

            
char_emb_matrix, phone_emb_matrix = dataset_for_embeddings()        

print('Embedding Word Matrix Shape: ', char_emb_matrix.shape)
print('Embedding Phoneme Matrix Shape: ', phone_emb_matrix.shape)

Embedding Word Matrix Shape:  (83000, 34)
Embedding Phoneme Matrix Shape:  (83000, 34)


In [13]:
phone_seq_matrix_decoder_output = np.pad(phone_seq_matrix,((0,0),(0,1),(0,0)), mode='constant')[:,1:,:]

In [14]:
from sklearn.model_selection import train_test_split
TEST_SIZE = 0

(char_input_train, char_input_test, 
 phone_input_train, phone_input_test, 
 phone_output_train, phone_output_test) = train_test_split(
    char_seq_matrix, phone_seq_matrix, phone_seq_matrix_decoder_output, 
    test_size=TEST_SIZE, random_state=42)

(emb_char_input_train, emb_char_input_test, 
 emb_phone_input_train, emb_phone_input_test) = train_test_split(
    char_emb_matrix, phone_emb_matrix, test_size=TEST_SIZE, random_state=42)

#### MODEL

In [15]:
from keras.models import Model
from keras.activations import softmax
from keras.layers import Input, LSTM, Dense, Dropout, Embedding,Activation, Bidirectional, Concatenate, Permute, Dot, Multiply, Reshape, RepeatVector, Lambda, Flatten
def attention_model(hidden_nodes = 256, emb_size = 256):
    # Attention Mechanism Layers
    attn_repeat = RepeatVector(MAX_CHAR_SEQ_LEN)
    attn_concat = Concatenate(axis=-1)
    attn_dense1 = Dense(128, activation="tanh")
    attn_dense2 = Dense(1, activation="relu")
    attn_softmax = Lambda(lambda x: softmax(x,axis=1))
    attn_dot = Dot(axes = 1)
    
    def get_context(encoder_outputs, h_prev):
        h_prev = attn_repeat(h_prev)
        concat = attn_concat([encoder_outputs, h_prev])
        e = attn_dense1(concat)
        e = attn_dense2(e)
        attention_weights = attn_softmax(e)
        context = attn_dot([attention_weights, encoder_outputs])
        return context
    
    # Shared Components - Encoder
    char_inputs = Input(shape=(None,))
    char_embedding_layer = Embedding(CHAR_TOKEN_COUNT, emb_size, input_length=MAX_CHAR_SEQ_LEN)
    encoder = Bidirectional(LSTM(hidden_nodes, return_sequences=True, recurrent_dropout=0.2))
    
    # Shared Components - Decoder
    decoder = LSTM(hidden_nodes, return_state=True, recurrent_dropout=0.2)
    phone_embedding_layer = Embedding(PHONE_TOKEN_COUNT, emb_size)
    embedding_reshaper = Reshape((1,emb_size,))
    context_phone_concat = Concatenate(axis=-1)
    context_phone_dense = Dense(hidden_nodes*3, activation="relu")
    output_layer = Dense(PHONE_TOKEN_COUNT, activation='softmax')
    
    # Training Model - Encoder
    char_embeddings = char_embedding_layer(char_inputs)
    char_embeddings = Activation('relu')(char_embeddings)
    char_embeddings = Dropout(0.5)(char_embeddings)
    encoder_outputs = encoder(char_embeddings)
    
    # Training Model - Attention Decoder
    h0 = Input(shape=(hidden_nodes,))
    c0 = Input(shape=(hidden_nodes,))
    h = h0 # hidden state
    c = c0 # cell state
    
    phone_inputs = []
    phone_outputs = []
    
    for t in range(MAX_PHONE_SEQ_LEN):
        phone_input = Input(shape=(None,))
        phone_embeddings = phone_embedding_layer(phone_input)
        phone_embeddings = Dropout(0.5)(phone_embeddings)
        phone_embeddings = embedding_reshaper(phone_embeddings)
        
        context = get_context(encoder_outputs, h)
        phone_and_context = context_phone_concat([context, phone_embeddings])
        phone_and_context = context_phone_dense(phone_and_context)
        
        decoder_output, h, c = decoder(phone_and_context, initial_state = [h, c])
        decoder_output = Dropout(0.5)(decoder_output)
        phone_output = output_layer(decoder_output)
        
        phone_inputs.append(phone_input)
        phone_outputs.append(phone_output)
    
    training_model = Model(inputs=[char_inputs, h0, c0] + phone_inputs, outputs=phone_outputs)
    
   # Testing Model - Encoder
    testing_encoder_model = Model(char_inputs, encoder_outputs)

    # Testing Model - Decoder
    test_prev_phone_input = Input(shape=(None,))
    test_phone_embeddings = phone_embedding_layer(test_prev_phone_input)
    test_phone_embeddings = embedding_reshaper(test_phone_embeddings)
    
    test_h = Input(shape=(hidden_nodes,), name='test_h')
    test_c = Input(shape=(hidden_nodes,), name='test_c')
    
    test_encoding_input = Input(shape=(MAX_CHAR_SEQ_LEN, hidden_nodes*2,))
    test_context = get_context(test_encoding_input, test_h)
    test_phone_and_context = Concatenate(axis=-1)([test_context, test_phone_embeddings])
    test_phone_and_context = context_phone_dense(test_phone_and_context)
        
    test_seq, out_h, out_c = decoder(test_phone_and_context, initial_state = [test_h, test_c])
    test_out = output_layer(test_seq)
    
    testing_decoder_model = Model([test_prev_phone_input, test_h, test_c, test_encoding_input], [test_out,out_h,out_c])
    
    return training_model, testing_encoder_model, testing_decoder_model


Using TensorFlow backend.


In [16]:
gc.collect()

0

In [17]:
from keras.callbacks import ModelCheckpoint, EarlyStopping

h0 = np.zeros((emb_char_input_train.shape[0], 256))
c0 = np.zeros((emb_char_input_train.shape[0], 256))
inputs = list(emb_phone_input_train.swapaxes(0,1))
outputs = list(phone_output_train.swapaxes(0,1))

def train_attention(model, weights_path, validation_size=0.1, epochs=5):    
    
    
    callbacks = []
    if validation_size > 0:
        checkpointer = ModelCheckpoint(filepath=weights_path, verbose=1, save_best_only=True)
        stopper = EarlyStopping(monitor='val_loss',patience=3)
        callbacks = [checkpointer, stopper]

    model.compile(optimizer='adam', loss='categorical_crossentropy')
    model.fit([emb_char_input_train, h0, c0] + inputs, outputs,
              batch_size=256,
              epochs=epochs,
              validation_split=validation_size,
              callbacks=callbacks)
    
    if validation_size == 0:
        model.save_weights(weights_path)

In [18]:
gc.collect()

0

#### tran_loss  = 1.3416, val_loss = 1.2324, train_acc = 0.704, test_acc = 0.685, kaggle acc = 67.2

#### train_loss = 1.3298   val_loss = 1.656   train_acc = 0.724  test_acc = 0.680    kaggle_acc = 67.9

#### train_loss = 1.1843, val_loss = 1.1958, train_acc = 0.734, test_acc = 0.696, kaggle_acc = 68.579

#### train_loss = 1.0159, val_loss = 1.13572, train_acc = 0.77, test_acc = 0.685, kaggle_acc = 0.70237

In [22]:
ATTENTION_MODEL_WEIGHTS = "/home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/attention_model_weights.hdf5"
attn_training_model, attn_testing_encoder_model, attn_testing_decoder_model = attention_model()
attn_training_model.load_weights(ATTENTION_MODEL_WEIGHTS)

train_attention(attn_training_model, ATTENTION_MODEL_WEIGHTS)

Train on 74700 samples, validate on 8300 samples
Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.85295, saving model to /home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/attention_model_weights.hdf5


  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They 

Epoch 2/5

Epoch 00002: val_loss did not improve from 0.85295
Epoch 3/5

Epoch 00003: val_loss did not improve from 0.85295
Epoch 4/5

Epoch 00004: val_loss did not improve from 0.85295


In [19]:
def id_vec_to_word(emb_char_seq):
    word = ''
    for char_id in emb_char_seq[0]:
        char = id_to_char[char_id]
        word += char
    return word.strip()

In [20]:
def predict_attention(input_char_seq, encoder, decoder):
    encoder_outputs = encoder.predict(input_char_seq) 

    output_phone_seq = np.array([[phone_to_id[START_PHONE_SYM]]])
    
    h = np.zeros((emb_char_input_train.shape[0], 256))
    c = np.zeros((emb_char_input_train.shape[0], 256))
    
    end_found = False 
    pronunciation = '' 
    while not end_found:
        decoder_output, h, c = decoder.predict([output_phone_seq, h, c, encoder_outputs])
        
        # Predict the phoneme with the highest probability
        predicted_phone_idx = np.argmax(decoder_output[0,:])
        predicted_phone = id_to_phone[predicted_phone_idx]
        
        pronunciation += predicted_phone + '_'
        
        if predicted_phone == END_PHONE_SYM or len(pronunciation.split('_')) > MAX_PHONE_SEQ_LEN: 
            end_found = True
        
        # Setup inputs for next time step
        output_phone_seq = np.array([[predicted_phone_idx]])
        
    return pronunciation.strip('_')

In [21]:
def get_accuracy(x_data, y_data):
    char_seqs = []
    
    for word in x_data:
        word_matrix = np.zeros((MAX_CHAR_SEQ_LEN))
        for t, char in enumerate(word):
            word_matrix[t] = char_to_id[char]
        char_seqs.append(word_matrix)

    char_seq_matrix_test = np.array(char_seqs)
    
    y_predicted = []
    for i in range(len(char_seq_matrix_test)):
        example_char_seq = char_seq_matrix_test[i:i+1]
        predicted_pronun = predict_attention(example_char_seq, attn_testing_encoder_model, attn_testing_decoder_model)
        predicted_pronun = predicted_pronun[:-2] #strip _e symbol
        
        if i % 100 == 0:
            print(i, '/', len(char_seq_matrix_test))
            print("Word:", x_data[i])
            print("Transcription:", y_data[i])
            print("Prediction:", predicted_pronun)
            
        y_predicted.append(predicted_pronun)
    correct_num = 0
    for i in range(len(x_data)):
        if y_data[i] == y_predicted[i]:
            correct_num += 1
    
    print("Prediction finished!!!")
    return correct_num/len(x_data)

In [23]:
ATTENTION_MODEL_WEIGHTS = "/home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/attention_model_weights_70237.hdf5"
attn_training_model, attn_testing_encoder_model, attn_testing_decoder_model = attention_model()
attn_training_model.load_weights(ATTENTION_MODEL_WEIGHTS)

In [24]:
print("Accuracy on train:", get_accuracy(x_train[:500], y_train[:500]))

0 / 500
Word: LEMIEUX
Transcription: L_AH_M_Y_UW
Prediction: L_IY_M_IY_OW
100 / 500
Word: DISSENT
Transcription: D_IH_S_EH_N_T
Prediction: D_IH_S_EH_N_T
200 / 500
Word: ROACHE
Transcription: R_OW_CH
Prediction: R_OW_CH
300 / 500
Word: MERCHANTS
Transcription: M_ER_CH_AH_N_T_S
Prediction: M_ER_CH_AH_N_T_S
400 / 500
Word: ABBASI
Transcription: AA_B_AA_S_IY
Prediction: AE_B_AA_S_IY
Prediction finished!!!
Accuracy on train: 0.77


In [25]:
x_test = x_train_copy[83000:]
y_test = y_train_copy[83000:]
print("Accuracy on test:", get_accuracy(x_test, y_test))

0 / 194
Word: SAMANTHA
Transcription: S_AH_M_AE_N_TH_AH
Prediction: S_AH_M_AE_N_TH_AH
100 / 194
Word: DISDAINED
Transcription: D_IH_S_D_EY_N_D
Prediction: D_IH_S_D_EY_N_D
Prediction finished!!!
Accuracy on test: 0.6855670103092784


In [28]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt


def plot_embeddings(embeddings, symbols, perplexity):
    embeddings_in_2D = TSNE(n_components=2,perplexity=perplexity).fit_transform(embeddings)
    embeddings_in_2D[:,0] = embeddings_in_2D[:,0] / np.max(np.abs(embeddings_in_2D[:,0]))
    embeddings_in_2D[:,1] = embeddings_in_2D[:,1] / np.max(np.abs(embeddings_in_2D[:,1]))

    fig, ax = plt.subplots()
    fig.set_size_inches(6,6)
    ax.scatter(embeddings_in_2D[:,0], embeddings_in_2D[:,1],c='w')

    for i, letter in enumerate(symbols):
        ax.annotate(letter, (embeddings_in_2D[i,0],embeddings_in_2D[i,1]), fontsize=12, fontweight='bold')
        
        
char_embedding = emb_training_model.layers[2].get_weights()[0]
plot_embeddings(char_embedding, char_to_id.keys(), 5)

phone_embedding = emb_training_model.layers[3].get_weights()[0]
plot_embeddings(phone_embedding, phone_to_id.keys(), 18)

NameError: name 'emb_training_model' is not defined

#### Test the model:

In [30]:
gc.collect()

1268

In [26]:
import pandas as pd

In [27]:
test_data = pd.read_csv("/home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/test.csv")
x_test = list(test_data['Word'])

In [28]:
char_seqs = []
    
for word in x_test:
    word_matrix = np.zeros((MAX_CHAR_SEQ_LEN))
    for t, char in enumerate(word):
        word_matrix[t] = char_to_id[char]
    char_seqs.append(word_matrix)

char_seq_matrix_test = np.array(char_seqs)

In [29]:
print(len(char_seq_matrix_test))
print(char_seq_matrix_test.shape)

41597
(41597, 34)


In [36]:
def predict_beamsearch(input_char_seq, encoder, decoder, k=3):
    a = encoder.predict(input_char_seq) 
    
    s = np.zeros((emb_char_input_train.shape[0], 256))
    c = np.zeros((emb_char_input_train.shape[0], 256))
    
    all_seqs = []
    all_seq_scores = []
    
    live_seqs = [[phone_to_id[START_PHONE_SYM]]]
    live_scores = [0]
    live_states = [[s,c]]

    while len(live_seqs) > 0: 
        new_live_seqs = [] 
        new_live_scores = [] 
        new_live_states = []
        
        for sidx,seq in enumerate(live_seqs):
            target_seq = np.array([[seq[-1]]])
            output_token_probs, s, c = decoder.predict([target_seq] + live_states[sidx] + [a])
            
            best_token_indicies = output_token_probs[0,:].argsort()[-k:]

            for token_index in best_token_indicies:
                new_seq = seq + [token_index]
                prob = output_token_probs[0,:][token_index]
                new_seq_score = live_scores[sidx] - np.log(prob)
                if id_to_phone[token_index] == END_PHONE_SYM or len(new_seq) > MAX_PHONE_SEQ_LEN:
                    all_seqs.append(new_seq) 
                    all_seq_scores.append(new_seq_score) 
                    continue
                new_live_seqs.append(new_seq)
                new_live_scores.append(new_seq_score)
                new_live_states.append([s, c])
                
        while len(new_live_scores) > k:
            worst_seq_score_idx = np.array(new_live_scores).argsort()[-1] 
            del new_live_seqs[worst_seq_score_idx]
            del new_live_scores[worst_seq_score_idx]
            del new_live_states[worst_seq_score_idx]
            
        live_seqs = new_live_seqs
        live_scores = new_live_scores
        live_states = new_live_states
        
    best_idx = np.argmin(all_seq_scores)

    pronunciation = ''
    for i in all_seqs[best_idx]:
        pronunciation += id_to_phone[i] + '_'
    return pronunciation[2:-3]

In [37]:
y_test = []
for i in range(len(char_seq_matrix_test)):
    example_char_seq = char_seq_matrix_test[i:i+1]
    predicted_pronun = predict_beamsearch(example_char_seq, attn_testing_encoder_model, attn_testing_decoder_model)
        
    if i % 100 == 0:
        print(i, '/', len(char_seq_matrix_test))
            
    y_test.append(predicted_pronun)

0 / 41597
100 / 41597
200 / 41597
300 / 41597
400 / 41597
500 / 41597
600 / 41597
700 / 41597
800 / 41597
900 / 41597
1000 / 41597
1100 / 41597
1200 / 41597
1300 / 41597
1400 / 41597
1500 / 41597
1600 / 41597
1700 / 41597
1800 / 41597
1900 / 41597
2000 / 41597
2100 / 41597
2200 / 41597
2300 / 41597
2400 / 41597
2500 / 41597
2600 / 41597
2700 / 41597
2800 / 41597
2900 / 41597
3000 / 41597
3100 / 41597


KeyboardInterrupt: 

In [35]:
print(x_test[:10])
print(y_test[:10])

['PITCHED', 'DISSOLVERS', 'SCRAWNY', 'BONENFANT', 'EXCEEDS', 'BARTNICKI', 'BUTE', 'CAPITULATE', 'STEAM', 'INVESTCORP']
['P_IH_CH_T', 'D_IH_S_AA_L_V_ER_Z', 'S_K_R_AO_N_IY', 'B_OW_N_AH_N_F_AH_N_T', 'IH_K_S_IY_D_Z', 'B_AA_R_T_N_IH_T_S_K_IY', 'B_Y_UW_T', 'K_AH_P_IH_CH_AH_L_EY_T', 'S_T_IY_M', 'IH_N_V_EH_S_T_K_AO_R_P']


In [40]:
submission = pd.read_csv("/home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/test.csv")
submission['Word'] = y_test

In [41]:
print(submission.head())

   Id                  Word
0   1             P_IH_CH_T
1   2    D_IH_S_AA_L_V_ER_Z
2   3         S_K_R_AO_N_IY
3   4  B_OW_N_AH_N_F_AH_N_T
4   5         IH_K_S_IY_D_Z


In [36]:
print(submission.tail())

          Id                         Word
41592  41593  IH_N_AA_K_Y_AH_L_EY_SH_AH_N
41593  41594                    AH_N_T_UW
41594  41595                S_K_OW_G_IH_N
41595  41596                HH_EH_SH_AH_N
41596  41597           T_ER_N_AO_F_S_K_IY


In [43]:
submission.to_csv("/home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/submission_baseline_model2_attention.csv")

In [75]:
perfect_acc, avg_bleu_score = evaluate(
    char_input_test, testing_encoder_model, testing_decoder_model, one_hot_matrix_to_word, predict_baseline)
print_results('Baseline Model',perfect_acc, avg_bleu_score)

Baseline Model
--------------------
Perfect Accuracy: 0.0%
Bleu Score: 0.3248
