In [1]:
import re
import os
import random
import numpy as np
import gc

In [2]:
train_file = open("/home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/train.txt","r")
train = train_file.read()
train = train.split('\n')
train = train[:-1]
print("Examples in test dataset:", len(train))
x_train = [data_example.split(' ')[0] for data_example in train]
y_train = [data_example.split(' ')[1] for data_example in train]

Examples in test dataset: 83194


In [3]:
x_train_copy = x_train.copy()
y_train_copy = y_train.copy()

In [4]:
x_train = x_train_copy[:80000]
y_train = y_train_copy[:80000]
x_test = x_train_copy[80000:]
y_test = y_train_copy[80000:]

In [5]:
def get_phonetic_dict(x, y):
    phonetic_dict = {}
    for i in range(len(x)):
        if x[i] not in phonetic_dict:
            phonetic_dict[x[i]] = []
        phonetic_dict[x[i]].append(y[i])
    return phonetic_dict

In [6]:
phonetic_dict = get_phonetic_dict(x_train, y_train)
example_count = np.sum([len(prons) for _, prons in phonetic_dict.items()])

In [7]:
print("\n".join([k+' --> '+phonetic_dict[k][0] for k in random.sample(list(phonetic_dict.keys()), 10)]))
print('\nAfter cleaning, the dictionary contains %s words and %s pronunciations (%s are alternate pronunciations).' % 
      (len(phonetic_dict), example_count, (example_count-len(phonetic_dict))))

PERFECTED --> P_ER_F_EH_K_T_AH_D
MORRISSETTE --> M_AO_R_IH_S_EH_T
GILLON --> G_IH_L_AH_N
GEORGIO --> JH_AO_R_JH_IY_OW
SIDEWISE --> S_AY_D_W_AY_Z
PETWAY --> P_EH_T_W_EY
DISPUTES --> D_IH_S_P_Y_UW_T_S
PALKA --> P_AE_L_K_AH
DONLIN --> D_AA_N_L_IH_N
ZANCA --> Z_AE_NG_K_AH

After cleaning, the dictionary contains 80000 words and 80000 pronunciations (0 are alternate pronunciations).


In [8]:
import string

START_PHONE_SYM = 's'
END_PHONE_SYM = 'e'


def char_list():
    allowed_symbols = []
    for word in x_train:
        for char in word:
            if char not in allowed_symbols:
                allowed_symbols.append(char)
    return allowed_symbols


def phone_list():
    phone_list = [START_PHONE_SYM, END_PHONE_SYM]
    for transcription in y_train:
        for phone in transcription.split('_'):
            if phone not in phone_list:
                phone_list.append(phone)
    return [''] + phone_list


def id_mappings_from_list(str_list):
    str_to_id = {s: i for i, s in enumerate(str_list)} 
    id_to_str = {i: s for i, s in enumerate(str_list)}
    return str_to_id, id_to_str


# Create character to ID mappings
char_to_id, id_to_char = id_mappings_from_list(char_list())

# Load phonetic symbols and create ID mappings
phone_to_id, id_to_phone = id_mappings_from_list(phone_list())

# Example:
print('Char to id mapping: \n', char_to_id)
print('Phone to id mapping: \n', phone_to_id)

Char to id mapping: 
 {'L': 0, 'E': 1, 'M': 2, 'I': 3, 'U': 4, 'X': 5, 'N': 6, 'D': 7, 'G': 8, 'S': 9, 'T': 10, 'R': 11, 'P': 12, 'K': 13, 'C': 14, 'O': 15, 'F': 16, 'A': 17, 'B': 18, 'H': 19, 'V': 20, 'Y': 21, 'W': 22, 'J': 23, "'": 24, 'Q': 25, 'Z': 26, '-': 27}
Phone to id mapping: 
 {'': 0, 's': 1, 'e': 2, 'L': 3, 'AH': 4, 'M': 5, 'Y': 6, 'UW': 7, 'AY': 8, 'N': 9, 'D': 10, 'IH': 11, 'NG': 12, 'S': 13, 'T': 14, 'R': 15, 'P': 16, 'K': 17, 'EH': 18, 'AA': 19, 'F': 20, 'ER': 21, 'EY': 22, 'AE': 23, 'Z': 24, 'G': 25, 'B': 26, 'SH': 27, 'V': 28, 'OW': 29, 'AO': 30, 'IY': 31, 'W': 32, 'HH': 33, 'JH': 34, 'CH': 35, 'TH': 36, 'AW': 37, 'OY': 38, 'UH': 39, 'ZH': 40, 'DH': 41}


In [9]:
CHAR_TOKEN_COUNT = len(char_to_id)
PHONE_TOKEN_COUNT = len(phone_to_id)


def char_to_1_hot(char):
    char_id = char_to_id[char]
    hot_vec = np.zeros((CHAR_TOKEN_COUNT))
    hot_vec[char_id] = 1.
    return hot_vec


def phone_to_1_hot(phone):
    phone_id = phone_to_id[phone]
    hot_vec = np.zeros((PHONE_TOKEN_COUNT))
    hot_vec[phone_id] = 1.
    return hot_vec

# Example:
print('"A" is represented by:\n', char_to_1_hot('A'), '\n-----')
print('"AH" is represented by:\n', phone_to_1_hot('AH'))

"A" is represented by:
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.] 
-----
"AH" is represented by:
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [10]:
gc.collect()

0

In [11]:
MAX_CHAR_SEQ_LEN = max([len(word) for word, _ in phonetic_dict.items()])
MAX_PHONE_SEQ_LEN = max([max([len(pron.split('_')) for pron in pronuns]) 
                         for _, pronuns in phonetic_dict.items()]
                       ) + 2  # + 2 to account for the start & end tokens we need to add


def dataset_to_1_hot_tensors():
    char_seqs = []
    phone_seqs = []
    
    for word, pronuns in phonetic_dict.items():
        word_matrix = np.zeros((MAX_CHAR_SEQ_LEN, CHAR_TOKEN_COUNT))
        for t, char in enumerate(word):
            word_matrix[t, :] = char_to_1_hot(char)
        for pronun in pronuns:
            pronun_matrix = np.zeros((MAX_PHONE_SEQ_LEN, PHONE_TOKEN_COUNT))
            phones = [START_PHONE_SYM] + pronun.split('_') + [END_PHONE_SYM]
            for t, phone in enumerate(phones):
                pronun_matrix[t,:] = phone_to_1_hot(phone)
                
            char_seqs.append(word_matrix)
            phone_seqs.append(pronun_matrix)
    
    return np.array(char_seqs), np.array(phone_seqs)
            

char_seq_matrix, phone_seq_matrix = dataset_to_1_hot_tensors()        
print('Word Matrix Shape: ', char_seq_matrix.shape)
print('Pronunciation Matrix Shape: ', phone_seq_matrix.shape)

Word Matrix Shape:  (80000, 34, 28)
Pronunciation Matrix Shape:  (80000, 34, 42)


In [12]:
gc.collect()

0

### Model

In [13]:
phone_seq_matrix_decoder_output = np.pad(phone_seq_matrix,((0,0),(0,1),(0,0)), mode='constant')[:,1:,:]

In [14]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense

def baseline_model(hidden_nodes = 256):
    
    # Shared Components - Encoder
    char_inputs = Input(shape=(None, CHAR_TOKEN_COUNT))
    encoder = LSTM(hidden_nodes, return_state=True)
    
    # Shared Components - Decoder
    phone_inputs = Input(shape=(None, PHONE_TOKEN_COUNT))
    decoder = LSTM(hidden_nodes, return_sequences=True, return_state=True)
    decoder_dense = Dense(PHONE_TOKEN_COUNT, activation='softmax')
    
    # Training Model
    _, state_h, state_c = encoder(char_inputs) # notice encoder outputs are ignored
    encoder_states = [state_h, state_c]
    decoder_outputs, _, _ = decoder(phone_inputs, initial_state=encoder_states)
    phone_prediction = decoder_dense(decoder_outputs)

    training_model = Model([char_inputs, phone_inputs], phone_prediction)
    
    # Testing Model - Encoder
    testing_encoder_model = Model(char_inputs, encoder_states)
    
    # Testing Model - Decoder
    decoder_state_input_h = Input(shape=(hidden_nodes,))
    decoder_state_input_c = Input(shape=(hidden_nodes,))
    decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, decoder_state_h, decoder_state_c = decoder(phone_inputs, initial_state=decoder_state_inputs)
    decoder_states = [decoder_state_h, decoder_state_c]
    phone_prediction = decoder_dense(decoder_outputs)
    
    testing_decoder_model = Model([phone_inputs] + decoder_state_inputs, [phone_prediction] + decoder_states)
    
    return training_model, testing_encoder_model, testing_decoder_model

Using TensorFlow backend.


In [15]:
gc.collect()

0

In [16]:
from sklearn.model_selection import train_test_split

TEST_SIZE = 0
    
(char_input_train, char_input_test, 
 phone_input_train, phone_input_test, 
 phone_output_train, phone_output_test) = train_test_split(
    char_seq_matrix, phone_seq_matrix, phone_seq_matrix_decoder_output, 
    test_size=TEST_SIZE, random_state=42)

TEST_EXAMPLE_COUNT = char_input_test.shape[0]

In [17]:
from keras.callbacks import ModelCheckpoint, EarlyStopping

def train(model, weights_path, encoder_input, decoder_input, decoder_output):
    checkpointer = ModelCheckpoint(filepath=weights_path, verbose=1, save_best_only=True)
    stopper = EarlyStopping(monitor='val_loss',patience=3)
    
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    model.fit([encoder_input, decoder_input], decoder_output,
          batch_size=256,
          epochs=50,
          validation_split=0.2, # Keras will automatically create a validation set for us
          callbacks=[checkpointer, stopper])

In [36]:
BASELINE_MODEL_WEIGHTS = "/home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/baseline_model_weights.hdf5"
training_model, testing_encoder_model, testing_decoder_model = baseline_model()

train(training_model, BASELINE_MODEL_WEIGHTS, char_input_train, phone_input_train, phone_output_train)

Train on 48000 samples, validate on 12000 samples
Epoch 1/50

KeyboardInterrupt: 

In [18]:
def predict_baseline(input_char_seq, encoder, decoder):
    state_vectors = encoder.predict(input_char_seq) 
    
    prev_phone = np.zeros((1, 1, PHONE_TOKEN_COUNT))
    prev_phone[0, 0, phone_to_id[START_PHONE_SYM]] = 1.
    
    end_found = False 
    pronunciation = '' 
    while not end_found:
        decoder_output, h, c = decoder.predict([prev_phone] + state_vectors)
        
        # Predict the phoneme with the highest probability
        predicted_phone_idx = np.argmax(decoder_output[0, -1, :])
        predicted_phone = id_to_phone[predicted_phone_idx]
        
        pronunciation += predicted_phone + '_'
        
        if predicted_phone == END_PHONE_SYM or len(pronunciation.split('_')) > MAX_PHONE_SEQ_LEN: 
            end_found = True
        
        # Setup inputs for next time step
        prev_phone = np.zeros((1, 1, PHONE_TOKEN_COUNT))
        prev_phone[0, 0, predicted_phone_idx] = 1.
        state_vectors = [h, c]
        
    return pronunciation.strip()

In [19]:
# Helper method for converting vector representations back into words
def one_hot_matrix_to_word(char_seq):
    word = ''
    for char_vec in char_seq[0]:
        if np.count_nonzero(char_vec) == 0:
            break
        hot_bit_idx = np.argmax(char_vec)
        char = id_to_char[hot_bit_idx]
        word += char
    return word


# Some words have multiple correct pronunciations
# If a prediction matches any correct pronunciation, consider it correct.
def is_correct(word,test_pronunciation):
    correct_pronuns = phonetic_dict[word]
    for correct_pronun in correct_pronuns:
        if test_pronunciation == correct_pronun:
            return True
    return False


def sample_baseline_predictions(sample_count, word_decoder):
    TEST_EXAMPLE_COUNT = char_input_test.shape[0]
    
    sample_indices = random.sample(range(TEST_EXAMPLE_COUNT), sample_count)
    counter = 0
    iter_num = 0
    for example_idx in sample_indices:
        example_char_seq = char_input_test[example_idx:example_idx+1]
        predicted_pronun = predict_baseline(example_char_seq, testing_encoder_model, testing_decoder_model)
        example_word = word_decoder(example_char_seq)
        pred_is_correct = is_correct(example_word, predicted_pronun)
        
        if predicted_pronun[:-3] == phonetic_dict[example_word][0]:
            counter += 1
        '''
        print('✅ ' if predicted_pronun[:-3] == phonetic_dict[example_word][0] else '❌ ')
        print("Word:", example_word)
        print("Transcription:", phonetic_dict[example_word][0])
        print("Prediction:", predicted_pronun[:-3])
        print()
        '''
        iter_num += 1
        if iter_num % 100 == 0:
            print(iter_num, "/", sample_count)
    print("Accuracy = ", counter/sample_count)


In [20]:
def get_accuracy(x_data, y_data):
    char_seqs = []
    
    for word in x_data:
        word_matrix = np.zeros((MAX_CHAR_SEQ_LEN, CHAR_TOKEN_COUNT))
        for t, char in enumerate(word):
            word_matrix[t, :] = char_to_1_hot(char)
        char_seqs.append(word_matrix)

    char_seq_matrix_test = np.array(char_seqs)
    
    y_predicted = []
    for i in range(len(char_seq_matrix_test)):
        if i % 1000 == 0:
            print(i, '/', len(char_seq_matrix_test))
        example_char_seq = char_seq_matrix_test[i:i+1]
        predicted_pronun = predict_baseline(example_char_seq, testing_encoder_model, testing_decoder_model)
        predicted_pronun = predicted_pronun[:-3]
        y_predicted.append(predicted_pronun)
        
    correct_num = 0
    for i in range(len(x_data)):
        if y_data[i] == y_predicted[i]:
            correct_num += 1
    
    print("Prediction finished!!!")
    return correct_num/len(x_data)

In [21]:
training_model, testing_encoder_model, testing_decoder_model = baseline_model()
BASELINE_MODEL_WEIGHTS = "/home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/baseline_model_weights.hdf5"
training_model.load_weights(BASELINE_MODEL_WEIGHTS)  # also loads weights for testing models

In [22]:
print("Accuracy on train:", get_accuracy(x_train[:10000], y_train[:10000]))

0 / 10000
1000 / 10000
2000 / 10000
3000 / 10000
4000 / 10000
5000 / 10000
6000 / 10000
7000 / 10000
8000 / 10000
9000 / 10000
Prediction finished!!!
Accuray on train: 0.6247


In [23]:
print("Accuracy on test:", get_accuracy(x_test, y_test))

0 / 3194
1000 / 3194
2000 / 3194
3000 / 3194
Prediction finished!!!
Accuracy on test: 0.4542892924232937


#### Let's count accuracy on unseen data:

In [22]:
training_model, testing_encoder_model, testing_decoder_model = baseline_model()

In [24]:
BASELINE_MODEL_WEIGHTS = "/home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/baseline_model_weights.hdf5"
training_model.load_weights(BASELINE_MODEL_WEIGHTS)  # also loads weights for testing models

In [25]:
x_test = x_train_copy[60000:]
y_test = y_train_copy[60000:]

In [26]:
char_seqs = []
    
for word in x_test:
    word_matrix = np.zeros((MAX_CHAR_SEQ_LEN, CHAR_TOKEN_COUNT))
    for t, char in enumerate(word):
        word_matrix[t, :] = char_to_1_hot(char)
    char_seqs.append(word_matrix)

char_seq_matrix_test = np.array(char_seqs)

In [27]:
y_predicted = []
for i in range(len(char_seq_matrix_test)):
    if i % 1000 == 0:
        print(i, '/', len(char_seq_matrix_test))
    example_char_seq = char_seq_matrix_test[i:i+1]
    predicted_pronun = predict_baseline(example_char_seq, testing_encoder_model, testing_decoder_model)
    predicted_pronun = predicted_pronun[:-3]
    y_predicted.append(predicted_pronun)

0 / 23194
1000 / 23194
2000 / 23194
3000 / 23194
4000 / 23194
5000 / 23194
6000 / 23194
7000 / 23194
8000 / 23194
9000 / 23194
10000 / 23194
11000 / 23194
12000 / 23194
13000 / 23194
14000 / 23194
15000 / 23194
16000 / 23194
17000 / 23194
18000 / 23194
19000 / 23194
20000 / 23194
21000 / 23194
22000 / 23194
23000 / 23194


In [34]:
correct_num = 0
for i in range(len(x_test)):
    if y_test[i] == y_predicted[i]:
        correct_num += 1

print("Accuracy on test = ", correct_num/len(x_test))

Accuracy on test =  0.45235836854358885


#### Test the model:

In [25]:
gc.collect()

183

In [33]:
training_model.load_weights(BASELINE_MODEL_WEIGHTS)  # also loads weights for testing models
sample_baseline_predictions(len(char_input_test), one_hot_matrix_to_word)

100 / 10000
200 / 10000
300 / 10000
400 / 10000
500 / 10000
600 / 10000
700 / 10000
800 / 10000
900 / 10000
1000 / 10000
1100 / 10000
1200 / 10000
1300 / 10000
1400 / 10000
1500 / 10000
1600 / 10000
1700 / 10000
1800 / 10000
1900 / 10000
2000 / 10000
2100 / 10000
2200 / 10000
2300 / 10000
2400 / 10000
2500 / 10000
2600 / 10000
2700 / 10000
2800 / 10000
2900 / 10000
3000 / 10000
3100 / 10000
3200 / 10000
3300 / 10000
3400 / 10000
3500 / 10000
3600 / 10000
3700 / 10000
3800 / 10000
3900 / 10000
4000 / 10000
4100 / 10000
4200 / 10000
4300 / 10000
4400 / 10000
4500 / 10000
4600 / 10000
4700 / 10000
4800 / 10000
4900 / 10000
5000 / 10000
5100 / 10000
5200 / 10000
5300 / 10000
5400 / 10000
5500 / 10000
5600 / 10000
5700 / 10000
5800 / 10000
5900 / 10000
6000 / 10000
6100 / 10000
6200 / 10000
6300 / 10000
6400 / 10000
6500 / 10000
6600 / 10000
6700 / 10000
6800 / 10000
6900 / 10000
7000 / 10000
7100 / 10000
7200 / 10000
7300 / 10000
7400 / 10000
7500 / 10000
7600 / 10000
7700 / 10000
7800 / 1

In [35]:
import pandas as pd

In [36]:
test_data = pd.read_csv("/home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/test.csv")
x_test = list(test_data['Word'])

Unnamed: 0,Id,Word
0,1,PITCHED
1,2,DISSOLVERS
2,3,SCRAWNY
3,4,BONENFANT
4,5,EXCEEDS


In [38]:
char_seqs = []
    
for word in x_test:
    word_matrix = np.zeros((MAX_CHAR_SEQ_LEN, CHAR_TOKEN_COUNT))
    for t, char in enumerate(word):
        word_matrix[t, :] = char_to_1_hot(char)
    char_seqs.append(word_matrix)

char_seq_matrix_test = np.array(char_seqs)

In [39]:
print(len(char_seq_matrix_test))
print(char_seq_matrix_test.shape)

41597
(41597, 34, 28)


In [41]:
y_test = []
for i in range(len(char_seq_matrix_test)):
    if i % 1000 == 0:
        print(i, '/', len(char_seq_matrix_test))
    example_char_seq = char_seq_matrix_test[i:i+1]
    predicted_pronun = predict_baseline(example_char_seq, testing_encoder_model, testing_decoder_model)
    predicted_pronun = predicted_pronun[:-3]
    y_test.append(predicted_pronun)

0 / 41597
1000 / 41597
2000 / 41597
3000 / 41597
4000 / 41597
5000 / 41597
6000 / 41597
7000 / 41597
8000 / 41597
9000 / 41597
10000 / 41597
11000 / 41597
12000 / 41597
13000 / 41597
14000 / 41597
15000 / 41597
16000 / 41597
17000 / 41597
18000 / 41597
19000 / 41597
20000 / 41597
21000 / 41597
22000 / 41597
23000 / 41597
24000 / 41597
25000 / 41597
26000 / 41597
27000 / 41597
28000 / 41597
29000 / 41597
30000 / 41597
31000 / 41597
32000 / 41597
33000 / 41597
34000 / 41597
35000 / 41597
36000 / 41597
37000 / 41597
38000 / 41597
39000 / 41597
40000 / 41597
41000 / 41597


In [42]:
print(x_test[:10])
print(y_test[:10])

['PITCHED', 'DISSOLVERS', 'SCRAWNY', 'BONENFANT', 'EXCEEDS', 'BARTNICKI', 'BUTE', 'CAPITULATE', 'STEAM', 'INVESTCORP']
['P_IH_CH_T', 'D_IH_S_AA_L_V_ER_Z', 'S_K_R_AO_N_IY', 'B_AA_N_AH_N_F_AE_N_T', 'IH_K_S_EH_S_T_IH_D', 'B_AA_R_T_N_IH_K_S', 'B_Y_UW_T', 'K_AE_P_IH_T_UW_L_EY_T', 'S_T_IY_M', 'IH_N_V_EH_S_T_P_R_AE_K_T']


In [43]:
submission = pd.read_csv("/home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/test.csv")
submission['Word'] = y_test

In [44]:
print(submission.head())

   Id                  Word
0   1             P_IH_CH_T
1   2    D_IH_S_AA_L_V_ER_Z
2   3         S_K_R_AO_N_IY
3   4  B_AA_N_AH_N_F_AE_N_T
4   5    IH_K_S_EH_S_T_IH_D


In [45]:
submission.to_csv("/home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/submission_baseline_model2.csv")

In [74]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
    
def bleu_score(word,test_pronunciation):
    references = [pronun.split('_') for pronun in phonetic_dict[word]]
    smooth = SmoothingFunction().method1
    return sentence_bleu(references, test_pronunciation.split('_'), smoothing_function=smooth)


def evaluate(test_examples, encoder, decoder, word_decoder, predictor):
    perfect_predictions = 0
    bleu_scores = []
    
    for example_idx in range(TEST_EXAMPLE_COUNT):
        example_char_seq = test_examples[example_idx:example_idx+1]
        predicted_pronun = predictor(example_char_seq, encoder, decoder)
        example_word = word_decoder(example_char_seq)
        
        perfect_predictions += is_correct(example_word,predicted_pronun)

        bleu = bleu_score(example_word,predicted_pronun)
        bleu_scores.append(bleu)
        
    perfect_acc = perfect_predictions / TEST_EXAMPLE_COUNT
    avg_bleu_score = np.mean(bleu_scores)
    
    return perfect_acc, avg_bleu_score


def print_results(model_name, perfect_acc, avg_bleu_score):
    print(model_name)
    print('-'*20)
    print('Perfect Accuracy: %s%%' % round(perfect_acc*100, 1))
    print('Bleu Score: %s' % round(avg_bleu_score, 4))


In [75]:
perfect_acc, avg_bleu_score = evaluate(
    char_input_test, testing_encoder_model, testing_decoder_model, one_hot_matrix_to_word, predict_baseline)
print_results('Baseline Model',perfect_acc, avg_bleu_score)

Baseline Model
--------------------
Perfect Accuracy: 0.0%
Bleu Score: 0.3248
