In [1]:
import os
import pickle
import glob

import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn import metrics

from melik_utils import (create_model, convert_data_to_sentence_form, 
                         fit_tokenizer_char, fit_tokenizer_tag, process_data,
                         data_generator)
from yildiz_data_utils import sentence_generator, capitalize
from yildiz_analyzer import TurkishStemSuffixCandidateGenerator

# Despite its name this is a checkpoint *file path template*: it contains the
# {epoch} and {val_loss} placeholders and is passed as `filepath` to
# ModelCheckpoint further down.
CHECKPOINT_DIR = "saved_models/weights.{epoch:02d}-{val_loss:.4f}.hdf5"

# glob returns matches in file-system-dependent order; sort so that the order
# of the train / test files is the same on every machine and every run.
# NOTE(review): `recursive = True` has no effect without a "**" component in
# the pattern, so only the top level of each directory is searched.
train_files = sorted(glob.glob("data/train/*.txt", recursive = True))
test_files = sorted(glob.glob("data/test/*.txt", recursive = True))
all_files = train_files + test_files

#### Config

In [2]:
# Maximum number of candidate analyses kept per token (0.99 quantile).
num_max_analysis = 10

# Maximum stem and tag sequence lengths (each the 0.99 quantile).
stem_max_len = 10
tag_max_len = 15

# Out-of-vocabulary placeholder for the character and tag tokenizers
# (not referenced elsewhere in this notebook; presumably used when fitting).
tokenizer_char_oov = tokenizer_tag_oov = '<OOV>'

# Left / right surface context window.
# The original note gives 42 as the 0.95 quantile of sentence length,
# so 40 sits slightly below that quantile.
sentence_max_len = 40
surface_token_max_len = 15  # 0.99 quantile

# Data preparation switches.
exclude_unambigious = False
shuffle = True

#### Data Preprocessing

In [3]:
# Collect every sentence yielded by sentence_generator for the test files.
# (An earlier variant used convert_data_to_sentence_form(test_files[0]) instead.)
test = [sentence
        for path in test_files
        for sentence in sentence_generator(path)]

# The TrMorph2006 test file is kept separately so it can be evaluated on its own.
test_2006 = list(sentence_generator('data/trmorph2006_test.txt'))

# Number of sentences in each test collection.
print(len(test))
print(len(test_2006))

2090
42


In [4]:
def _load_or_fit_tokenizer(pickle_path, fit_fn):
    """Return the tokenizer cached at `pickle_path`, fitting it first if needed.

    If `pickle_path` exists, the pickled tokenizer is loaded and returned.
    Otherwise `fit_fn(all_files)` is called, the result is pickled to
    `pickle_path`, and the freshly fitted tokenizer is returned. This keeps the
    notebook runnable on a fresh checkout, where the pickle files do not exist
    yet, without re-fitting on every run.
    """
    if os.path.exists(pickle_path):
        # NOTE: pickle.load can execute arbitrary code; only load tokenizer
        # pickles that were produced by this project.
        with open(pickle_path, 'rb') as handle:
            return pickle.load(handle)

    tokenizer = fit_fn(all_files)
    with open(pickle_path, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return tokenizer


# Character-level and tag-level tokenizers, fitted on train + test files.
tokenizer_char = _load_or_fit_tokenizer('tokenizer_char.pickle', fit_tokenizer_char)
tokenizer_tag = _load_or_fit_tokenizer('tokenizer_tag.pickle', fit_tokenizer_tag)


In [5]:
# Only the two test collections are materialised as arrays here; the training
# data is streamed through data_generator in the fitting cell instead.
# Positional arguments shared by every process_data call, in call order.
process_data_args = (tokenizer_char, tokenizer_tag, stem_max_len, tag_max_len,
                     surface_token_max_len, sentence_max_len, num_max_analysis,
                     exclude_unambigious)

# NOTE(review): shuffling is enabled and no random seed is set in this
# notebook, so the row order presumably differs between runs -- confirm.
X_test, y_test = process_data(test, *process_data_args, shuffle = True)
X_test_2006, y_test_2006 = process_data(test_2006, *process_data_args, shuffle = True)

#### Model

In [9]:
# Vocabulary sizes are derived from the fitted tokenizers; the +1 presumably
# reserves index 0 for padding (TODO confirm against melik_utils).
# NOTE(review): the original comments gave "94 + 1 = 95" for characters and
# "177 + 1 = 128" for tags; the latter is arithmetically inconsistent, so
# check len(tokenizer_tag.word_index) for the real value.
char_vocab_size = len(tokenizer_char.word_index) + 1
tag_vocab_size = len(tokenizer_tag.word_index) + 1

# Embedding and recurrent layer sizes for the stem (character) and tag inputs.
char_embed_size = 32
tag_embed_size = 32
stem_num_rnn_units = 128
tag_num_rnn_units = 128

# Remaining architecture options forwarded to create_model.
embed_join_type = 'add'
dropout = 0.2
num_rnn_stacks = 1

model = create_model(
    num_max_analysis, stem_max_len, char_vocab_size, char_embed_size, stem_num_rnn_units,
    tag_max_len, tag_vocab_size, tag_embed_size, tag_num_rnn_units,
    sentence_max_len, surface_token_max_len, embed_join_type, dropout,
    num_rnn_stacks,
)
# count_params() reports the model's total parameter count.
print(f'Number of trainable parameters: {model.count_params():,}')

Number of trainable parameters: 2,330,638


In [7]:
# Training hyper-parameters.
epochs = 100
batch_size = 128
verbose = 1
patience = 3  # epochs without val_loss improvement before early stopping

# Number of valid training tokens, used to derive steps_per_epoch:
#   837_518   is the number of valid tokens in the OnurGungor data
#   1_299_050 is for trmorph2006, trmorph2016, trmorph2018
num_train_tokens = 1_299_050

loss = tf.keras.losses.CategoricalCrossentropy()
early_stopping = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss', patience = patience, verbose = verbose)
optimizer = tf.keras.optimizers.Adam()
# Weights are written after every epoch; the file name embeds epoch and val_loss.
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath = CHECKPOINT_DIR,
                                                save_freq = 'epoch', save_weights_only = True, verbose = 1)

model.compile(optimizer = optimizer, loss = loss, metrics = ['accuracy'])

# Continuing from checkpoint.
# The directory is created up front so a fresh checkout does not fail in
# os.listdir before the first checkpoint has been written.
checkpoint_dir = os.path.dirname(CHECKPOINT_DIR)
os.makedirs(checkpoint_dir, exist_ok = True)
files = [file for file in os.listdir(checkpoint_dir) if ".hdf5" in file]
if len(files) > 0:
    # File names look like "weights.<epoch>-<val_loss>.hdf5"; pull out <epoch>.
    checkpoint_epochs = pd.Series(files).str.split('.').str[1].str.split('-').str[0].astype(int)
    last_epoch_was = int(checkpoint_epochs.max())

    weight_to_load = files[checkpoint_epochs.idxmax()]
    print('Loading checkpoint of last epoch:', weight_to_load)
    model.load_weights(os.path.join(checkpoint_dir, weight_to_load))
else:
    last_epoch_was = 0

# Fitting.
# NOTE(review): validation_data is the *test* split, so early stopping (and any
# checkpoint later chosen by val_loss) is tuned on test data and the reported
# test metrics are optimistic. A separate dev split would avoid this.
model.fit(data_generator(train_files, batch_size, tokenizer_char, tokenizer_tag, stem_max_len, tag_max_len, surface_token_max_len,
                sentence_max_len, num_max_analysis, exclude_unambigious, shuffle), epochs = epochs, batch_size = batch_size, verbose = verbose,
          callbacks = [checkpoint, early_stopping], shuffle = False, validation_data = (X_test, y_test),
          steps_per_epoch = (num_train_tokens // batch_size), initial_epoch = last_epoch_was)

Epoch 1/100

Epoch 00001: saving model to saved_models\weights.01-0.2385.hdf5
Epoch 2/100

Epoch 00002: saving model to saved_models\weights.02-0.2065.hdf5
Epoch 3/100

Epoch 00003: saving model to saved_models\weights.03-0.1872.hdf5
Epoch 4/100

Epoch 00004: saving model to saved_models\weights.04-0.1929.hdf5
Epoch 5/100

Epoch 00005: saving model to saved_models\weights.05-0.1881.hdf5
Epoch 6/100

Epoch 00006: saving model to saved_models\weights.06-0.1832.hdf5
Epoch 7/100

Epoch 00007: saving model to saved_models\weights.07-0.1835.hdf5
Epoch 8/100

Epoch 00008: saving model to saved_models\weights.08-0.1819.hdf5
Epoch 9/100

Epoch 00009: saving model to saved_models\weights.09-0.1936.hdf5
Epoch 10/100

Epoch 00010: saving model to saved_models\weights.10-0.1842.hdf5
Epoch 11/100

Epoch 00011: saving model to saved_models\weights.11-0.1879.hdf5
Epoch 00011: early stopping


<tensorflow.python.keras.callbacks.History at 0xf2c7c568c8>

In [15]:
# Load the checkpoint with the lowest validation loss.
# File names follow CHECKPOINT_DIR: "weights.<epoch>-<val_loss>.hdf5".
checkpoint_dir = os.path.dirname(CHECKPOINT_DIR)
files = [file for file in os.listdir(checkpoint_dir) if ".hdf5" in file]

# Parse the complete val_loss as a float. Comparing only the digits after the
# decimal point would rank e.g. 1.0500 below 0.2385 and pick the wrong file.
# Names that do not match the pattern become NaN and are skipped by idxmin.
val_losses = (pd.Series(files, dtype = object)
              .str.extract(r'-(\d+(?:\.\d+)?)\.hdf5$', expand = False)
              .astype(float))
if val_losses.isna().all():
    raise FileNotFoundError('No checkpoint with a parsable val_loss found in ' + checkpoint_dir)

idx_of_min_weight = val_losses.idxmin()
weight_to_load = files[idx_of_min_weight]
print('Loading checkpoint:', weight_to_load)
model.load_weights(os.path.join(checkpoint_dir, weight_to_load))

Loading checkpoint: weights.08-0.1751.hdf5


#### TrMorph2006 test dataset

In [130]:
# For every token, count the candidate rows of the first input array whose
# entries do not sum to zero (all-zero rows are presumably padding -- TODO confirm).
ambiguity_levels = np.array([
    (candidates.sum(axis = 1) != 0).sum()
    for candidates in X_test_2006[0]
])

# Mask of tokens whose candidate count is anything other than exactly one.
ambigious_indices = ambiguity_levels != 1

- 2x128 GRU results, as [loss, accuracy]:
    - [0.20262235403060913, 0.9410681128501892]
    - [0.12822052836418152, 0.9628770351409912]

In [131]:
# Evaluation on the ambiguous tokens only.
# Shen et al. report 91.03 for this setting.
# The model is re-compiled with string identifiers before evaluating.
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
X_test_2006_ambiguous = [features[ambigious_indices] for features in X_test_2006]
y_test_2006_ambiguous = y_test_2006[ambigious_indices]
model.evaluate(X_test_2006_ambiguous, y_test_2006_ambiguous)



[0.17724522948265076, 0.9429097771644592]

In [132]:
# Evaluation on every token of the TrMorph2006 test set.
# Shen et al. report 96.41 for this setting.
model.evaluate(x = X_test_2006, y = y_test_2006)



[0.11200806498527527, 0.9640371203422546]

#### TrMorph2006 and TrMorph2018 test dataset

- 2x128 GRU results, as [loss, accuracy]:
    - [0.2547953724861145, 0.924221932888031]
    - [0.19014544785022736, 0.9436220526695251]

In [133]:
# For every token, count the candidate rows of the first input array whose
# entries do not sum to zero (all-zero rows are presumably padding -- TODO confirm).
ambiguity_levels = np.array([
    (candidates.sum(axis = 1) != 0).sum()
    for candidates in X_test[0]
])

# Mask of tokens whose candidate count is anything other than exactly one.
ambigious_indices = ambiguity_levels != 1

In [134]:
# Evaluation on the ambiguous tokens only.
X_test_ambiguous = [features[ambigious_indices] for features in X_test]
y_test_ambiguous = y_test[ambigious_indices]
model.evaluate(X_test_ambiguous, y_test_ambiguous)



[0.24949999153614044, 0.9258257150650024]

In [21]:
# Evaluation on every token of the combined test set.
# NOTE(review): the original comment here read "This is 96.41 on Shen et al.",
# the same figure quoted for the TrMorph2006-only evaluation; presumably a
# copy-paste -- confirm whether a comparable number exists for this test set.
model.evaluate(x = X_test, y = y_test)



[0.17537835240364075, 0.9432119131088257]