In [1]:
import re
import os
import random
import numpy as np

# To make sure our kernel runs all the way through and gets saved,
# the dictionary is trimmed to a 5,000-word random sample when this flag is
# set (see load_clean_phonetic_dictionary). Training itself still runs.
IS_KAGGLE = True

# Directory holding the CMU dictionary files. Both file paths are derived from
# it, and it can be redirected with the CMU_DATA_DIR environment variable so
# the notebook is not tied to a single machine's home directory.
CMU_DATA_DIR = os.environ.get(
    "CMU_DATA_DIR",
    "/home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/model_example",
)

CMU_DICT_PATH = os.path.join(CMU_DATA_DIR, "cmudict-0.7b")

CMU_SYMBOLS_PATH = os.path.join(CMU_DATA_DIR, "cmudict.symbols")

# Skip words with numbers or symbols: anything other than A-Z, hyphen,
# apostrophe or period makes a word illegal.
ILLEGAL_CHAR_REGEX = "[^A-Z-'.]"

# Only 3 words are longer than 20 chars
# Setting a limit now simplifies training our model later
MAX_DICT_WORD_LEN = 20
MIN_DICT_WORD_LEN = 2


def load_clean_phonetic_dictionary():
    """Parse the CMU pronouncing dictionary into ``{word: [pronunciation, ...]}``.

    Reads the file at ``CMU_DICT_PATH``, ignoring ``;;;`` comment lines and
    blank lines, and drops every word rejected by ``should_skip`` below.
    Alternate pronunciations (``WORD(1)``) are stored under the base word.

    Returns:
        dict mapping each kept word to the list of its pronunciation strings.
        When ``IS_KAGGLE`` is true only a random sample of at most 5,000
        words is returned.

    Raises:
        ValueError: if a non-blank, non-comment line has no double-space
            separator between the word and its pronunciation.
    """

    def is_alternate_pho_spelling(word):
        # Alternate entries end in "(d)" with a single digit.
        # No word has > 9 alternate pronunciations so this is safe.
        # The length check keeps word[-3] from raising on very short tokens
        # and guarantees a non-empty base word once the suffix is removed.
        return (len(word) > 3
                and word[-1] == ')'
                and word[-3] == '('
                and word[-2].isdigit())

    def should_skip(word):
        if not word or not word[0].isalpha():  # skip symbols
            return True
        if word[-1] == '.':  # skip abbreviations
            return True
        if re.search(ILLEGAL_CHAR_REGEX, word):
            return True
        if len(word) > MAX_DICT_WORD_LEN:
            return True
        if len(word) < MIN_DICT_WORD_LEN:
            return True
        return False

    phonetic_dict = {}
    with open(CMU_DICT_PATH, encoding="ISO-8859-1") as cmu_dict:
        for line_number, line in enumerate(cmu_dict, start=1):

            # Skip commented lines
            if line.startswith(';;;'):
                continue

            entry = line.strip()
            if not entry:  # tolerate blank lines instead of failing to unpack
                continue

            word, separator, phonetic = entry.partition('  ')
            if not separator:
                raise ValueError(
                    'Malformed dictionary line %d: %r' % (line_number, entry))

            # Alternate pronunciations are formatted: "WORD(#)  F AH0 N EH1 T IH0 K"
            # We don't want the "(#)" considered as part of the word.
            # Slice off exactly the three-character suffix rather than cutting
            # at the first '(' so an earlier parenthesis cannot empty the word.
            if is_alternate_pho_spelling(word):
                word = word[:-3]

            if should_skip(word):
                continue

            phonetic_dict.setdefault(word, []).append(phonetic)

    if IS_KAGGLE:  # limit dataset to 5,000 words
        # random.sample raises if asked for more items than exist, so cap the
        # request at the number of words that survived cleaning.
        sample_size = min(5000, len(phonetic_dict))
        phonetic_dict = {key: phonetic_dict[key]
                         for key in random.sample(list(phonetic_dict.keys()), sample_size)}
    return phonetic_dict

phonetic_dict = load_clean_phonetic_dictionary()
# Total number of (word, pronunciation) pairs across the whole dictionary
example_count = np.sum([len(pronunciations) for pronunciations in phonetic_dict.values()])

In [2]:
# Show ten randomly chosen words with their first listed pronunciation
preview_words = random.sample(list(phonetic_dict), 10)
preview_lines = [word + ' --> ' + phonetic_dict[word][0] for word in preview_words]
print("\n".join(preview_lines))

alternate_count = example_count - len(phonetic_dict)
print('\nAfter cleaning, the dictionary contains %s words and %s pronunciations (%s are alternate pronunciations).' %
      (len(phonetic_dict), example_count, alternate_count))

BECKONS --> B EH1 K AH0 N Z
RAZO --> R AA1 Z OW0
STENY --> S T EH1 N IY0
MOBLEY --> M OW1 B L IY0
FOISTED --> F OY1 S T IH0 D
FREQUENCIES --> F R IY1 K W AH0 N S IY0 Z
AMERICARE --> AH0 M EH1 R IH0 K EH2 R
SOLERI --> S OW0 L EH1 R IY0
KICKOFF --> K IH1 K AO2 F
TANGLE --> T AE1 NG G AH0 L

After cleaning, the dictionary contains 5000 words and 5304 pronunciations (304 are alternate pronunciations).


In [3]:
# Pronunciation lists of the first ten dictionary entries
first_ten_pronunciations = list(phonetic_dict.values())[:10]
print(first_ten_pronunciations)

[['F ER1 Z'], ['K AA1 N T R AE0 S T S', 'K AH0 N T R AE1 S T S'], ['M AY1 D IH0 NG ER0'], ['N AA2 N P R AH0 D AH1 K T IH0 V'], ['D R EH1 S AH0 N'], ['P AE1 N Z'], ['D AH1 G AW2 T'], ['L IH1 N L IY0'], ['IH2 M P AA1 V R IH0 SH'], ['HH AA1 P M AH0 N']]


In [33]:
import string

# Marker placed in front of every pronunciation before it is one-hot encoded;
# it is also the first input fed to the decoder at prediction time.
START_PHONE_SYM = '\t'
# Marker placed after every pronunciation; greedy decoding stops once the
# decoder predicts it.
END_PHONE_SYM = '\n'


def char_list():
    """Return the character vocabulary in id order.

    The list starts with the empty string, then the punctuation marks that
    are allowed inside words, then the uppercase letters A-Z.
    """
    vocabulary = ['', ".", "-", "'"]
    vocabulary.extend(string.ascii_uppercase)
    return vocabulary


def phone_list():
    """Return the phone vocabulary in id order.

    The list starts with the empty string, then ``START_PHONE_SYM`` and
    ``END_PHONE_SYM``, then one entry per non-blank line of the file at
    ``CMU_SYMBOLS_PATH`` in file order.
    """
    phones = ['', START_PHONE_SYM, END_PHONE_SYM]
    with open(CMU_SYMBOLS_PATH) as file:
        for line in file:
            symbol = line.strip()
            # A blank line would otherwise add a second '' entry, which
            # inflates the vocabulary and changes the id that '' maps to.
            if symbol:
                phones.append(symbol)
    return phones


def id_mappings_from_list(str_list):
    """Build forward and reverse lookups between strings and their positions.

    Returns:
        ``(str_to_id, id_to_str)`` where the id of a string is its index in
        ``str_list``. If a string occurs more than once, ``str_to_id`` keeps
        the last index.
    """
    str_to_id = {}
    for position, token in enumerate(str_list):
        str_to_id[token] = position
    id_to_str = dict(enumerate(str_list))
    return str_to_id, id_to_str


# Character vocabulary and its lookups in both directions
char_vocabulary = char_list()
char_to_id, id_to_char = id_mappings_from_list(char_vocabulary)

# Phonetic symbols are loaded from disk, then mapped the same way
phone_vocabulary = phone_list()
phone_to_id, id_to_phone = id_mappings_from_list(phone_vocabulary)

# Example:
print('Char to id mapping: \n', char_to_id)
print('Phone to id mapping: \n', phone_to_id)

Char to id mapping: 
 {'': 0, '.': 1, '-': 2, "'": 3, 'A': 4, 'B': 5, 'C': 6, 'D': 7, 'E': 8, 'F': 9, 'G': 10, 'H': 11, 'I': 12, 'J': 13, 'K': 14, 'L': 15, 'M': 16, 'N': 17, 'O': 18, 'P': 19, 'Q': 20, 'R': 21, 'S': 22, 'T': 23, 'U': 24, 'V': 25, 'W': 26, 'X': 27, 'Y': 28, 'Z': 29}
Phone to id mapping: 
 {'': 0, '\t': 1, '\n': 2, 'AA': 3, 'AA0': 4, 'AA1': 5, 'AA2': 6, 'AE': 7, 'AE0': 8, 'AE1': 9, 'AE2': 10, 'AH': 11, 'AH0': 12, 'AH1': 13, 'AH2': 14, 'AO': 15, 'AO0': 16, 'AO1': 17, 'AO2': 18, 'AW': 19, 'AW0': 20, 'AW1': 21, 'AW2': 22, 'AY': 23, 'AY0': 24, 'AY1': 25, 'AY2': 26, 'B': 27, 'CH': 28, 'D': 29, 'DH': 30, 'EH': 31, 'EH0': 32, 'EH1': 33, 'EH2': 34, 'ER': 35, 'ER0': 36, 'ER1': 37, 'ER2': 38, 'EY': 39, 'EY0': 40, 'EY1': 41, 'EY2': 42, 'F': 43, 'G': 44, 'HH': 45, 'IH': 46, 'IH0': 47, 'IH1': 48, 'IH2': 49, 'IY': 50, 'IY0': 51, 'IY1': 52, 'IY2': 53, 'JH': 54, 'K': 55, 'L': 56, 'M': 57, 'N': 58, 'NG': 59, 'OW': 60, 'OW0': 61, 'OW1': 62, 'OW2': 63, 'OY': 64, 'OY0': 65, 'OY1': 66, 'OY2':

In [5]:
# Width of the one-hot vectors: one slot per distinct character / phone token
CHAR_TOKEN_COUNT, PHONE_TOKEN_COUNT = len(char_to_id), len(phone_to_id)


def char_to_1_hot(char):
    """Return a vector of length ``CHAR_TOKEN_COUNT`` with a 1 at ``char``'s id.

    Raises:
        KeyError: if ``char`` is not a key of ``char_to_id``.
    """
    position = char_to_id[char]
    one_hot = np.zeros(CHAR_TOKEN_COUNT)
    one_hot[position] = 1.
    return one_hot


def phone_to_1_hot(phone):
    """Return a vector of length ``PHONE_TOKEN_COUNT`` with a 1 at ``phone``'s id.

    Raises:
        KeyError: if ``phone`` is not a key of ``phone_to_id``.
    """
    position = phone_to_id[phone]
    one_hot = np.zeros(PHONE_TOKEN_COUNT)
    one_hot[position] = 1.
    return one_hot

# Example:
letter_a_vector = char_to_1_hot('A')
print('"A" is represented by:\n', letter_a_vector, '\n-----')
phone_ah0_vector = phone_to_1_hot('AH0')
print('"AH0" is represented by:\n', phone_ah0_vector)

"A" is represented by:
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.] 
-----
"AH0" is represented by:
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [6]:
# Length, in characters, of the longest word in the dictionary
MAX_CHAR_SEQ_LEN = max(len(word) for word in phonetic_dict)

# Length, in phones, of the longest pronunciation,
# + 2 to account for the start & end tokens we need to add
MAX_PHONE_SEQ_LEN = 2 + max(
    max(len(pron.split()) for pron in pronuns)
    for pronuns in phonetic_dict.values()
)


def dataset_to_1_hot_tensors():
    """One-hot encode every (word, pronunciation) pair in ``phonetic_dict``.

    Each word becomes a ``(MAX_CHAR_SEQ_LEN, CHAR_TOKEN_COUNT)`` matrix and
    each pronunciation, wrapped in the start and end symbols, becomes a
    ``(MAX_PHONE_SEQ_LEN, PHONE_TOKEN_COUNT)`` matrix. Rows past the end of a
    sequence stay all-zero. A word with several pronunciations contributes
    one example per pronunciation.

    Returns:
        ``(char_tensor, phone_tensor)``: two arrays stacked along a leading
        example axis, with matching example order.
    """
    word_examples = []
    pronun_examples = []

    for word, pronuns in phonetic_dict.items():
        char_grid = np.zeros((MAX_CHAR_SEQ_LEN, CHAR_TOKEN_COUNT))
        for step, char in enumerate(word):
            char_grid[step, char_to_id[char]] = 1.

        for pronun in pronuns:
            phone_grid = np.zeros((MAX_PHONE_SEQ_LEN, PHONE_TOKEN_COUNT))
            framed_phones = [START_PHONE_SYM] + pronun.split() + [END_PHONE_SYM]
            for step, phone in enumerate(framed_phones):
                phone_grid[step, phone_to_id[phone]] = 1.

            # The word matrix is repeated for each of its pronunciations
            word_examples.append(char_grid)
            pronun_examples.append(phone_grid)

    return np.array(word_examples), np.array(pronun_examples)
            

char_seq_matrix, phone_seq_matrix = dataset_to_1_hot_tensors()
for shape_label, tensor in (('Word Matrix Shape: ', char_seq_matrix),
                            ('Pronunciation Matrix Shape: ', phone_seq_matrix)):
    print(shape_label, tensor.shape)

Word Matrix Shape:  (5304, 18, 30)
Pronunciation Matrix Shape:  (5304, 19, 87)


### Model

In [7]:
# Decoder targets are the decoder inputs shifted one time step earlier:
# row t of the target holds row t+1 of the input, and the last row is zeros.
phone_seq_matrix_decoder_output = np.zeros_like(phone_seq_matrix)
phone_seq_matrix_decoder_output[:, :-1, :] = phone_seq_matrix[:, 1:, :]

In [8]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense

def baseline_model(hidden_nodes = 256):
    """Build an LSTM encoder-decoder and the three models that share its layers.

    Args:
        hidden_nodes: number of units in both the encoder and decoder LSTM.

    Returns:
        ``(training_model, testing_encoder_model, testing_decoder_model)``:
        - training_model maps ``[char_inputs, phone_inputs]`` to per-step
          softmax predictions over the phone tokens;
        - testing_encoder_model maps ``char_inputs`` to the encoder's final
          ``[h, c]`` states;
        - testing_decoder_model maps ``[phone_inputs, h, c]`` to
          ``[phone_prediction, h, c]`` so decoding can proceed step by step.
        All three are built from the same layer objects, so they share weights.
    """
    
    # Shared Components - Encoder
    char_inputs = Input(shape=(None, CHAR_TOKEN_COUNT))
    encoder = LSTM(hidden_nodes, return_state=True)
    
    # Shared Components - Decoder
    phone_inputs = Input(shape=(None, PHONE_TOKEN_COUNT))
    decoder = LSTM(hidden_nodes, return_sequences=True, return_state=True)
    decoder_dense = Dense(PHONE_TOKEN_COUNT, activation='softmax')
    
    # Training Model
    # The decoder starts from the encoder's final states
    _, state_h, state_c = encoder(char_inputs) # notice encoder outputs are ignored
    encoder_states = [state_h, state_c]
    decoder_outputs, _, _ = decoder(phone_inputs, initial_state=encoder_states)
    phone_prediction = decoder_dense(decoder_outputs)

    training_model = Model([char_inputs, phone_inputs], phone_prediction)
    
    # Testing Model - Encoder
    testing_encoder_model = Model(char_inputs, encoder_states)
    
    # Testing Model - Decoder
    # States are explicit inputs and outputs here so the caller can feed the
    # states from one step back in as the initial states of the next.
    decoder_state_input_h = Input(shape=(hidden_nodes,))
    decoder_state_input_c = Input(shape=(hidden_nodes,))
    decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, decoder_state_h, decoder_state_c = decoder(phone_inputs, initial_state=decoder_state_inputs)
    decoder_states = [decoder_state_h, decoder_state_c]
    phone_prediction = decoder_dense(decoder_outputs)
    
    testing_decoder_model = Model([phone_inputs] + decoder_state_inputs, [phone_prediction] + decoder_states)
    
    return training_model, testing_encoder_model, testing_decoder_model

Using TensorFlow backend.


In [9]:
from sklearn.model_selection import train_test_split

TEST_SIZE = 0.2

# Split the three aligned tensors in one call so that their rows stay matched
split_parts = train_test_split(
    char_seq_matrix,
    phone_seq_matrix,
    phone_seq_matrix_decoder_output,
    test_size=TEST_SIZE,
    random_state=42,
)
char_input_train, char_input_test = split_parts[0], split_parts[1]
phone_input_train, phone_input_test = split_parts[2], split_parts[3]
phone_output_train, phone_output_test = split_parts[4], split_parts[5]

TEST_EXAMPLE_COUNT = char_input_test.shape[0]

In [22]:
from keras.callbacks import ModelCheckpoint, EarlyStopping

def train(model, weights_path, encoder_input, decoder_input, decoder_output):
    """Compile ``model`` and fit it, checkpointing the best weights.

    Args:
        model: model taking ``[encoder_input, decoder_input]`` as inputs.
        weights_path: file the checkpoint callback writes to; with
            ``save_best_only`` it is only overwritten on improvement.
        encoder_input: encoder-side training tensor.
        decoder_input: decoder-side training tensor.
        decoder_output: training targets for the decoder.
    """
    model.compile(optimizer='adam', loss='categorical_crossentropy')

    training_callbacks = [
        ModelCheckpoint(filepath=weights_path, verbose=1, save_best_only=True),
        # Stop once validation loss has not improved for 3 epochs
        EarlyStopping(monitor='val_loss', patience=3),
    ]

    model.fit(
        [encoder_input, decoder_input],
        decoder_output,
        batch_size=256,
        epochs=100,
        validation_split=0.2,  # Keras will automatically create a validation set for us
        callbacks=training_callbacks,
    )

In [23]:
# Where the best baseline weights are checkpointed during training
BASELINE_MODEL_WEIGHTS = (
    "/home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/"
    "Kaggle_phonetics/baseline_model_weights.hdf5"
)
baseline_models = baseline_model()
training_model, testing_encoder_model, testing_decoder_model = baseline_models

train(
    model=training_model,
    weights_path=BASELINE_MODEL_WEIGHTS,
    encoder_input=char_input_train,
    decoder_input=phone_input_train,
    decoder_output=phone_output_train,
)

Train on 3394 samples, validate on 849 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 1.34253, saving model to /home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/baseline_model_weights.hdf5


  '. They will not be included '


Epoch 2/100

Epoch 00002: val_loss improved from 1.34253 to 1.27796, saving model to /home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/baseline_model_weights.hdf5
Epoch 3/100

Epoch 00003: val_loss improved from 1.27796 to 1.25871, saving model to /home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/baseline_model_weights.hdf5
Epoch 4/100

Epoch 00004: val_loss improved from 1.25871 to 1.24639, saving model to /home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/baseline_model_weights.hdf5
Epoch 5/100

Epoch 00005: val_loss improved from 1.24639 to 1.23543, saving model to /home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/baseline_model_weights.hdf5
Epoch 6/100

Epoch 00006: val_loss improved from 1.23543 to 1.22191, saving model to /home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/baseline_model_weights.hdf5
Epoch 7/100

Epoch 00007: val_loss impro


Epoch 00031: val_loss improved from 0.77213 to 0.74667, saving model to /home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/baseline_model_weights.hdf5
Epoch 32/100

Epoch 00032: val_loss improved from 0.74667 to 0.74046, saving model to /home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/baseline_model_weights.hdf5
Epoch 33/100

Epoch 00033: val_loss improved from 0.74046 to 0.70885, saving model to /home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/baseline_model_weights.hdf5
Epoch 34/100

Epoch 00034: val_loss improved from 0.70885 to 0.69620, saving model to /home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/baseline_model_weights.hdf5
Epoch 35/100

Epoch 00035: val_loss improved from 0.69620 to 0.67943, saving model to /home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/baseline_model_weights.hdf5
Epoch 36/100

Epoch 00036: val_loss improved fro


Epoch 00062: val_loss did not improve from 0.48174
Epoch 63/100

Epoch 00063: val_loss did not improve from 0.48174
Epoch 64/100

Epoch 00064: val_loss improved from 0.48174 to 0.47993, saving model to /home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/baseline_model_weights.hdf5
Epoch 65/100

Epoch 00065: val_loss did not improve from 0.47993
Epoch 66/100

Epoch 00066: val_loss did not improve from 0.47993
Epoch 67/100

Epoch 00067: val_loss improved from 0.47993 to 0.47547, saving model to /home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/baseline_model_weights.hdf5
Epoch 68/100

Epoch 00068: val_loss did not improve from 0.47547
Epoch 69/100

Epoch 00069: val_loss improved from 0.47547 to 0.47492, saving model to /home/pavel/MyDocs/MachineLearning/Yandex_ML_project/lecture4/Kaggle_phonetics/baseline_model_weights.hdf5
Epoch 70/100

Epoch 00070: val_loss did not improve from 0.47492
Epoch 71/100

Epoch 00071: val_loss did not 

In [27]:
def predict_baseline(input_char_seq, encoder, decoder):
    """Greedily decode a pronunciation for one encoded word.

    Args:
        input_char_seq: one-hot character tensor passed unchanged to
            ``encoder.predict``  # assumes a batch of one word -- TODO confirm
        encoder: model whose ``predict`` returns the initial ``[h, c]`` states.
        decoder: model whose ``predict`` takes ``[phone] + states`` and
            returns ``(phone_probabilities, h, c)``.

    Returns:
        The predicted phones joined by single spaces, with surrounding
        whitespace stripped. Decoding stops at ``END_PHONE_SYM`` or after
        ``MAX_PHONE_SEQ_LEN + 1`` steps, whichever comes first.
    """
    state_vectors = encoder.predict(input_char_seq)

    # Decoding always starts from the start-of-sequence marker
    prev_phone = np.zeros((1, 1, PHONE_TOKEN_COUNT))
    prev_phone[0, 0, phone_to_id[START_PHONE_SYM]] = 1.

    predicted_phones = []
    # Bound the loop by the number of decoder steps. Counting the
    # whitespace-split tokens of the output string would not advance on
    # whitespace-only tokens ('' or the start marker), so a decoder stuck on
    # one of those would never terminate.
    for _ in range(MAX_PHONE_SEQ_LEN + 1):
        decoder_output, h, c = decoder.predict([prev_phone] + state_vectors)

        # Predict the phoneme with the highest probability
        predicted_phone_idx = np.argmax(decoder_output[0, -1, :])
        predicted_phone = id_to_phone[predicted_phone_idx]

        if predicted_phone == END_PHONE_SYM:
            break
        predicted_phones.append(predicted_phone)

        # Setup inputs for next time step
        prev_phone = np.zeros((1, 1, PHONE_TOKEN_COUNT))
        prev_phone[0, 0, predicted_phone_idx] = 1.
        state_vectors = [h, c]

    return ' '.join(predicted_phones).strip()

In [28]:
# Helper for turning a one-hot encoded word back into its string form
def one_hot_matrix_to_word(char_seq):
    """Decode the first sequence in ``char_seq`` into a word.

    Reads the rows of ``char_seq[0]`` in order, mapping the position of each
    row's largest value through ``id_to_char``, and stops at the first row
    that is entirely zero.
    """
    letters = []
    for one_hot_row in char_seq[0]:
        if not np.any(one_hot_row):
            break
        letters.append(id_to_char[np.argmax(one_hot_row)])
    return ''.join(letters)


# A word may have several accepted pronunciations;
# matching any one of them counts as a correct prediction.
def is_correct(word,test_pronunciation):
    """Return True if ``test_pronunciation`` equals any entry of ``phonetic_dict[word]``."""
    accepted_pronuns = phonetic_dict[word]
    return any(test_pronunciation == accepted for accepted in accepted_pronuns)


def sample_baseline_predictions(sample_count, word_decoder):
    """Print baseline predictions for ``sample_count`` random test examples.

    Each line shows a check or cross mark, the word recovered by
    ``word_decoder``, and the pronunciation predicted with the notebook-level
    ``testing_encoder_model`` / ``testing_decoder_model``.
    """
    for test_idx in random.sample(range(TEST_EXAMPLE_COUNT), sample_count):
        # Slice rather than index so the leading example axis is kept
        char_seq = char_input_test[test_idx:test_idx+1]
        pronun = predict_baseline(char_seq, testing_encoder_model, testing_decoder_model)
        word = word_decoder(char_seq)
        if is_correct(word, pronun):
            verdict = '✅ '
        else:
            verdict = '❌ '
        print(verdict, word,'-->', pronun)

In [29]:
# The testing models share layers with the training model,
# so loading weights here also loads weights for testing models
training_model.load_weights(BASELINE_MODEL_WEIGHTS)
sample_baseline_predictions(sample_count=10, word_decoder=one_hot_matrix_to_word)

❌  MAXTOR --> K AA1 M P T R AH0 S
❌  BURBLED --> B ER1 L D
❌  ABOUT --> OW1 B AH0 T
❌  HARIS --> HH EH1 R S
❌  BARNETT --> B AA1 R T AH0 N
❌  WERNET --> W ER1 N AH0 T
❌  YELLEN --> IY1 L IH0 N
❌  TRANSGRESSIONS --> T R AE0 S N AA1 JH EH1 R Z
✅  CHEZ --> CH EH1 Z
❌  CUONG --> K OW1 N IH0 NG


In [30]:
def syllable_count(phonetic_sp):
    """Count the phones in a space-separated pronunciation that end in a digit."""
    return sum(1 for phone in phonetic_sp.split() if phone[-1].isdigit())

# Examples:
for ex_word in list(phonetic_dict)[:3]:
    first_pronunciation = phonetic_dict[ex_word][0]
    print(ex_word, '--', syllable_count(first_pronunciation), 'syllables')

FURR'S -- 1 syllables
CONTRASTS -- 2 syllables
MEIDINGER -- 3 syllables


In [31]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def is_syllable_count_correct(word, test_pronunciation):
    """Return True if the prediction's syllable count matches any accepted pronunciation of ``word``."""
    accepted_pronuns = phonetic_dict[word]
    predicted_syllables = syllable_count(test_pronunciation)
    return any(syllable_count(accepted) == predicted_syllables
               for accepted in accepted_pronuns)
    
    
def bleu_score(word,test_pronunciation):
    """Return the smoothed sentence-level BLEU of a predicted pronunciation.

    Every accepted pronunciation of ``word`` is used as a reference, with
    each phone treated as one token.
    """
    reference_phones = [accepted.split() for accepted in phonetic_dict[word]]
    predicted_phones = test_pronunciation.split()
    smoother = SmoothingFunction().method1
    return sentence_bleu(reference_phones, predicted_phones, smoothing_function=smoother)


def evaluate(test_examples, encoder, decoder, word_decoder, predictor):
    """Score ``predictor`` on every example in ``test_examples``.

    Args:
        test_examples: encoded words; sliced one example at a time with the
            leading axis kept.
        encoder: passed through to ``predictor``.
        decoder: passed through to ``predictor``.
        word_decoder: callable turning one encoded example back into its word.
        predictor: callable ``(example, encoder, decoder) -> pronunciation``.

    Returns:
        ``(syllable_acc, perfect_acc, avg_bleu_score)``: the fraction of
        predictions with a correct syllable count, the fraction that exactly
        match an accepted pronunciation, and the mean BLEU score.

    Raises:
        ZeroDivisionError: if ``test_examples`` is empty.
    """
    # Size the loop and the averages by the data actually passed in, not by
    # the notebook-level TEST_EXAMPLE_COUNT, so any example set is scored
    # correctly.
    example_count = len(test_examples)

    correct_syllable_counts = 0
    perfect_predictions = 0
    bleu_scores = []

    for example_idx in range(example_count):
        example_char_seq = test_examples[example_idx:example_idx+1]
        predicted_pronun = predictor(example_char_seq, encoder, decoder)
        example_word = word_decoder(example_char_seq)

        perfect_predictions += is_correct(example_word, predicted_pronun)
        correct_syllable_counts += is_syllable_count_correct(example_word, predicted_pronun)
        bleu_scores.append(bleu_score(example_word, predicted_pronun))

    syllable_acc = correct_syllable_counts / example_count
    perfect_acc = perfect_predictions / example_count
    avg_bleu_score = np.mean(bleu_scores)

    return syllable_acc, perfect_acc, avg_bleu_score


def print_results(model_name, syllable_acc, perfect_acc, avg_bleu_score):
    """Print a short report of the three evaluation metrics for ``model_name``.

    The accuracies are shown as percentages rounded to one decimal place and
    the BLEU score rounded to four decimal places.
    """
    report_lines = [
        str(model_name),
        '-'*20,
        'Syllable Accuracy: %s%%' % round(syllable_acc*100, 1),
        'Perfect Accuracy: %s%%' % round(perfect_acc*100, 1),
        'Bleu Score: %s' % round(avg_bleu_score, 4),
    ]
    print('\n'.join(report_lines))


In [32]:
# Score the baseline on the whole held-out test set and report the metrics
baseline_metrics = evaluate(
    test_examples=char_input_test,
    encoder=testing_encoder_model,
    decoder=testing_decoder_model,
    word_decoder=one_hot_matrix_to_word,
    predictor=predict_baseline,
)
syllable_acc, perfect_acc, avg_bleu_score = baseline_metrics
print_results('Baseline Model', syllable_acc, perfect_acc, avg_bleu_score)

Baseline Model
--------------------
Syllable Accuracy: 67.3%
Perfect Accuracy: 11.0%
Bleu Score: 0.2567
