In [1]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.1.2-cp36-cp36m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
     |████████████████████████████████| 24.1 MB 4.4 MB/s            
Collecting smart-open>=1.8.1
  Downloading smart_open-5.2.1-py3-none-any.whl (58 kB)
     |████████████████████████████████| 58 kB 13.2 MB/s            
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.1.2 smart-open-5.2.1


In [2]:
import os
import pickle
import glob

import pandas as pd
import numpy as np
import tensorflow as tf

from gensim.models import Word2Vec, FastText

from utils import (create_dependency_parser_model, process_single_sentence, data_generator, LAS, UAS, fit_label_tokenizer, load_triplets)

# NOTE: despite the name, this is a checkpoint *file path template*, not a directory.
# It is passed as `filepath` to tf.keras.callbacks.ModelCheckpoint in the training cell, and the
# training cell parses the epoch number back out of the resulting "weights.<epoch>-<loss>.hdf5" names.
CHECKPOINT_DIR = "saved_models/weights.{epoch:02d}-{loss:.4f}.hdf5"

In [3]:
# Collect every .conllu file under the treebank directory.
# sorted(): glob returns matches in arbitrary (filesystem-dependent) order; sorting makes the
# order in which files are fed to the data generator reproducible across machines and runs.
files = sorted(glob.glob("UD/refined-UD-Turkish/**//*.conllu", recursive = True))

# Split by substring match on the path.
train_files = [file for file in files if "train" in file]
dev_files = [file for file in files if "dev" in file]
test_files = [file for file in files if "test" in file]
# Alternative split, kept for reference (a bare string, so it does not execute).
"""
train_files = [file for file in files if "tr_imst-ud-test" not in file]
test_files = [file for file in files if "tr_imst-ud-test" in file]
"""

SENTENCE_MAX_LEN = 40
TAG_MAX_LEN = 15

word_oov_token = '<OOV>'
word_embedding_model = 'Word2Vec_medium.model'

# One-off code that produced tokenizer_label.pickle, kept for reference (a bare string, so it
# does not execute).
"""
# Fit tokenizer_label once only
tokenizer_label = fit_label_tokenizer(files)

# saving tokenizer_label
with open('tokenizer_label.pickle', 'wb') as handle:
    pickle.dump(tokenizer_label, handle, protocol=pickle.HIGHEST_PROTOCOL)
"""

# SECURITY NOTE: pickle.load can execute arbitrary code contained in the file. Only load
# tokenizer pickles that were produced by this project.
with open('tokenizer_label.pickle', 'rb') as handle:
    tokenizer_label = pickle.load(handle)

with open('tokenizer_tag.pickle', 'rb') as handle: # This is transferred from StemmerAnalyzer
    tokenizer_tag = pickle.load(handle)

sentence_max_len = SENTENCE_MAX_LEN
tag_max_len = TAG_MAX_LEN
# Length of the combined arc/label vector: sentence_max_len slots plus one slot per entry of
# tokenizer_label.word_index, plus one extra slot.
arc_label_vector_len = sentence_max_len + len(tokenizer_label.word_index) + 1

#### Pre-Trained Word Embeddings

In [4]:
# Word2Vec
# Load the pre-trained embeddings and build (a) a word -> token-id dict and (b) the matching
# embedding matrix, with token id 0 reserved for the OOV token.
ft_model = Word2Vec.load(word_embedding_model)
word_embedding_vocab_size = len(ft_model.wv.key_to_index)
word_embedding_vector_size = ft_model.vector_size

embedding_vectors_ = ft_model.wv.vectors
embedding_dict_ = ft_model.wv.key_to_index

embedding_dict = {word_oov_token: 0}
# One extra row for the OOV token; row 0 stays all-zero.
word_embedding_matrix = np.zeros((word_embedding_vocab_size + 1, word_embedding_vector_size))
for k,v in embedding_dict_.items():
    # gensim index v is shifted by one to make room for the OOV token at id 0, so the vector of
    # word k must be stored in row v + 1, the same id the tokenizer assigns to it.
    # (The previous `matrix[v] = vectors[v-1]` wrote the last word's vector into the OOV row
    # when v == 0 and never filled the last row, leaving the last vocabulary word all-zero.)
    embedding_dict[k] = v + 1
    word_embedding_matrix[v + 1] = embedding_vectors_[v]
# NOTE(review): checkpoints saved while the matrix was misaligned may contain the old embedding
# rows -- confirm whether the embedding layer is part of the saved weights before resuming.

# Tokenizer whose vocabulary is exactly the embedding vocabulary (plus OOV).
tokenizer_word = tf.keras.preprocessing.text.Tokenizer(filters = None, lower = False, oov_token = word_oov_token)
tokenizer_word.word_index = embedding_dict
tokenizer_word.index_word = {value:key for key, value in embedding_dict.items()}

# Disabled: code that would persist tokenizer_word (a bare string, so it does not execute).
"""
# saving tokenizer_label
with open('tokenizer_word.pickle', 'wb') as handle:
    pickle.dump(tokenizer_word, handle, protocol=pickle.HIGHEST_PROTOCOL)
"""
print('NOT saving tokenizer_word until it is finalized')

NOT saving tokenizer_word until it is finalized


#### Preparing Test Data

In [5]:
"""
test_data = load_triplets(test_files)
batch_of_words = []
batch_of_tags = []
batch_of_arc_label_vectors = []

batch_of_left_context_words = []
batch_of_left_context_tags = []
batch_of_left_context_arc_label_vectors = []

batch_of_right_context_words = []
batch_of_right_context_tags = []
for sentence in test_data:
    # I won't use sentences longer than sentence_max_len for training
    # because first token can depend on last token and truncating and breaking
    # the tree structure can hurt training.
    # Also there's no way to output arc greater than sentence_max_len
    # so I will skip them
    if len(sentence) > sentence_max_len:
        continue
    else:
        (batch_of_words_, batch_of_tags_, batch_of_arc_label_vectors_, batch_of_left_context_words_,
        batch_of_left_context_tags_, batch_of_left_context_arc_label_vectors_, batch_of_right_context_words_,
        batch_of_right_context_tags_) = process_single_sentence(sentence, sentence_max_len, 
                                                                tag_max_len, arc_label_vector_len, 
                                                                tokenizer_word, tokenizer_tag, 
                                                                tokenizer_label)
        batch_of_words += batch_of_words_
        batch_of_tags += batch_of_tags_
        batch_of_arc_label_vectors += batch_of_arc_label_vectors_

        batch_of_left_context_words += batch_of_left_context_words_
        batch_of_left_context_tags += batch_of_left_context_tags_
        batch_of_left_context_arc_label_vectors += batch_of_left_context_arc_label_vectors_

        batch_of_right_context_words += batch_of_right_context_words_
        batch_of_right_context_tags += batch_of_right_context_tags_

batch_of_words = np.array(batch_of_words)
batch_of_tags = np.array(batch_of_tags)
batch_of_arc_label_vectors = np.array(batch_of_arc_label_vectors)

batch_of_left_context_words = np.array(batch_of_left_context_words)
batch_of_left_context_tags = np.array(batch_of_left_context_tags)
batch_of_left_context_arc_label_vectors = np.array(batch_of_left_context_arc_label_vectors)

batch_of_right_context_words = np.array(batch_of_right_context_words)
batch_of_right_context_tags = np.array(batch_of_right_context_tags)

X_test = (batch_of_words, batch_of_tags, batch_of_left_context_words, batch_of_left_context_tags,
        batch_of_left_context_arc_label_vectors, batch_of_right_context_words, batch_of_right_context_tags)
y_test = batch_of_arc_label_vectors
"""

"""
# saving exhausts diskspace
X_test_y_test = (X_test, y_test)
with open('X_test_y_test.pickle', 'wb') as f:
    pickle.dump(X_test_y_test, f)

with open('X_test_y_test.pickle', 'rb') as handle:
    (X_test, y_test) = pickle.load(handle)
"""

"\n# saving exhausts diskspace\nX_test_y_test = (X_test, y_test)\nwith open('X_test_y_test.pickle', 'wb') as f:\n    pickle.dump(X_test_y_test, f)\n\nwith open('X_test_y_test.pickle', 'rb') as handle:\n    (X_test, y_test) = pickle.load(handle)\n"

In [6]:
# Model hyperparameters passed to create_dependency_parser_model.
num_rnn_stacks = 2
dropout = 0.2

# RNN unit counts: the tag value equals the word-embedding vector size; the left-context,
# left-context arc-label and right-context values are all that size times rnn_units_multiplier.
rnn_units_multiplier = 2
tag_num_rnn_units = word_embedding_vector_size
lc_num_rnn_units = lc_arc_label_num_rnn_units = rc_num_rnn_units = (
    tag_num_rnn_units * rnn_units_multiplier
)

In [10]:
# Build the parser network; arguments are positional, in the order the helper expects.
model = create_dependency_parser_model(
    # word-embedding configuration
    word_embedding_vocab_size,
    word_embedding_vector_size,
    word_embedding_matrix,
    # input / output sizes
    sentence_max_len,
    tag_max_len,
    arc_label_vector_len,
    # recurrent-layer configuration
    num_rnn_stacks,
    tag_num_rnn_units,
    lc_num_rnn_units,
    lc_arc_label_num_rnn_units,
    rc_num_rnn_units,
    dropout,
)

In [None]:
batch_size = 128
epochs = 16 # determined by monitoring val_loss during development phase
verbose = 1
patience = 3

base_lr = 0.001
decay_rate = 0.95
def scheduler(epoch, _):
    """Return the learning rate for `epoch`.

    The first two epochs (epoch < 2) use `base_lr`; afterwards the rate is
    `base_lr * decay_rate ** epoch`. The second argument (the current learning rate
    supplied by the LearningRateScheduler callback) is ignored.
    """
    if epoch < 2:
        return base_lr
    else:
        lr = base_lr * (decay_rate ** epoch)
        return lr

lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)
# Not passed to fit() below: this final run trains on train + dev + test without validation
# data, so there is no 'val_loss' for the callback to monitor.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss', patience = patience, verbose = verbose)
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath = CHECKPOINT_DIR, 
                                                save_freq = 'epoch', save_weights_only = True, verbose = 1)

model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = [LAS, UAS], run_eagerly = True)

# Continuing from checkpoint
# Derive the directory from the checkpoint template so the two cannot drift apart, and create it
# up front: os.listdir would raise FileNotFoundError on the very first run otherwise.
checkpoint_dir = os.path.dirname(CHECKPOINT_DIR)
os.makedirs(checkpoint_dir, exist_ok = True)

def _checkpoint_epoch(filename):
    """Extract the epoch number from a 'weights.<epoch>-<loss>.hdf5' checkpoint file name."""
    return int(filename.split('.')[1].split('-')[0])

weight_files = [file for file in os.listdir(checkpoint_dir) if file.endswith(".hdf5")]
if len(weight_files) > 0:
    # Resume from the checkpoint with the highest epoch number.
    weight_to_load = max(weight_files, key = _checkpoint_epoch)
    last_epoch_was = _checkpoint_epoch(weight_to_load)
    print('Loading checkpoint of last epoch:', weight_to_load)
    model.load_weights(os.path.join(checkpoint_dir, weight_to_load))
else:
    last_epoch_was = 0

# Number of words in each split; their sum divided by batch_size sizes one epoch.
NUM_TRAIN_WORDS = 609_125
NUM_DEV_WORDS = 67_377
NUM_TEST_WORDS = 106_825
steps_per_epoch = (NUM_TRAIN_WORDS + NUM_DEV_WORDS + NUM_TEST_WORDS) // batch_size

# batch_size is given to the generator only: Keras documents that `batch_size` must not be
# specified in fit() when the input is a generator, since the generator produces the batches.
train_batches = data_generator(train_files + dev_files + test_files, tokenizer_word, tokenizer_tag,
                               tokenizer_label, sentence_max_len, tag_max_len, arc_label_vector_len,
                               batch_size)
model.fit(train_batches, epochs = epochs, verbose = verbose, steps_per_epoch = steps_per_epoch,
          initial_epoch = last_epoch_was, callbacks = [checkpoint, lr_scheduler]) #validation_data = (X_test, y_test), 

Epoch 1/16
 969/6119 [===>..........................] - ETA: 1:13:40 - loss: 0.0559 - LAS: 0.3367 - UAS: 0.4371