In [2]:
import os
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"

import pandas as pd
import numpy as np
from scipy import stats

from sklearn import metrics
import tensorflow as tf

from nltk.tokenize import WordPunctTokenizer

DATA_PATH = "../data/"
CHECKPOINT_DIR = "saved_models/weights.{epoch:02d}-{val_loss:.2f}.hdf5"

#### Data Processing

In [3]:
train = pd.read_csv(DATA_PATH + "df_train_big.csv", index_col = 0).drop('file', axis = 1)
dev = pd.read_csv(DATA_PATH + "df_dev.csv", index_col = 0).drop('file', axis = 1)
test = pd.read_csv(DATA_PATH + "df_test.csv", index_col = 0).drop('file', axis = 1)

In [4]:
idx = np.random.randint(0, train.shape[0], 1)[0]
print('Text:', train.loc[idx, 'text'])
print('Label:', train.loc[idx, 'label'])

Text: Uzlaştırıcı politikasını cumhurbaşkanlığı döneminde de sürdürmesine karşın savaş sonunda Fransa ' da gerilimlerin önü alınamadı .
Label: O O O O O O O O O LOC O O O O O O


In [5]:
X_train_ = train.loc[:, 'text']
y_train_ = train.loc[:, 'label']

X_dev_ = dev.loc[:, 'text']
y_dev_ = dev.loc[:, 'label']

X_test_ = test.loc[:, 'text']
y_test_ = test.loc[:, 'label']

#### Char level processing

In [6]:
def char_level_preprocess_string(string_X, string_y):
    X_ = []
    y_ = []
    tokens = string_X.split()
    labels = string_y.split()
    for token, label in zip(tokens, labels):
        chars_of_token = list(token) + [' ']
        labels_of_chars_of_token = len(chars_of_token) * [label]

        X_ += chars_of_token
        y_ += labels_of_chars_of_token

    # Remove last whitespace, it is product of code
    del X_[-1]
    del y_[-1]
    
    return X_, y_

def char_level_preprocess_df(df_X, df_y):
    X = []
    y = []
    for idx in range(df_X.shape[0]):
        X_ = []
        y_ = []
        tokens = df_X.loc[idx]
        labels = df_y.loc[idx]
        
        X_, y_ = char_level_preprocess_string(tokens, labels)

        X.append(X_)
        y.append(y_)
        
    return X, y

In [7]:
X_train, y_train = char_level_preprocess_df(X_train_, y_train_)
X_dev, y_dev = char_level_preprocess_df(X_dev_, y_dev_)
X_test, y_test = char_level_preprocess_df(X_test_, y_test_)

In [8]:
vocab_size = 140
seq_len = 256
oov_token = '<OOV>'
padding_strat = 'post'

In [9]:
tokenizer_X = tf.keras.preprocessing.text.Tokenizer(num_words = vocab_size, oov_token = oov_token, 
                                                    filters = '', lower = False, char_level= True)
tokenizer_y = tf.keras.preprocessing.text.Tokenizer(filters = '', lower = False)

tokenizer_X.fit_on_texts(X_train)
tokenizer_y.fit_on_texts(y_train)

X_train = tokenizer_X.texts_to_sequences(X_train)
X_dev = tokenizer_X.texts_to_sequences(X_dev)
X_test = tokenizer_X.texts_to_sequences(X_test)

y_train = tokenizer_y.texts_to_sequences(y_train)
y_dev = tokenizer_y.texts_to_sequences(y_dev)
y_test = tokenizer_y.texts_to_sequences(y_test)

X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen = seq_len, padding = padding_strat)
X_dev = tf.keras.preprocessing.sequence.pad_sequences(X_dev, maxlen = seq_len, padding = padding_strat)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen = seq_len, padding = padding_strat)

y_train = tf.keras.preprocessing.sequence.pad_sequences(y_train, maxlen = seq_len, padding = padding_strat)
y_dev = tf.keras.preprocessing.sequence.pad_sequences(y_dev, maxlen = seq_len, padding = padding_strat)
y_test = tf.keras.preprocessing.sequence.pad_sequences(y_test, maxlen = seq_len, padding = padding_strat)

#### RNN Model

In [10]:
embed_size = 32
rnn_dim = 128
num_rnn_stacks = 5
mlp_dim = 32
num_classes = len(tokenizer_y.index_word) + 1
dropout = 0.3

In [11]:
class CustomNonPaddingTokenLoss(tf.keras.losses.Loss):
    def __init__(self, name="custom_ner_loss"):
        super().__init__(name=name)

    def call(self, y_true, y_pred):
        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
        )
        loss = loss_fn(y_true, y_pred)
        mask = tf.cast((y_true > 0), dtype=tf.float32)
        loss = loss * mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(input_dim = vocab_size, output_dim = embed_size, input_length=seq_len)) #mask_zero=True, input_dim = vocab_size + 1

for _ in range(num_rnn_stacks):
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(rnn_dim, return_sequences = True)))
    model.add(tf.keras.layers.Dropout(dropout))

model.add(tf.keras.layers.Dense(mlp_dim, activation = 'relu'))
model.add(tf.keras.layers.Dropout(dropout))
model.add(tf.keras.layers.Dense(num_classes, activation = 'softmax'))

In [12]:
loss = CustomNonPaddingTokenLoss()
model.compile(optimizer='adam', loss = loss)

In [None]:
%%time
epochs = 100
batch_size = 256
verbose = 1
patience = 5

early_stopping = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss', patience = patience, verbose = verbose, restore_best_weights= True)
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath = CHECKPOINT_DIR, save_freq = 'epoch', save_weights_only = True, verbose = 1)

history = model.fit(X_train, y_train, batch_size, epochs, verbose, callbacks = [model_checkpoint_callback, early_stopping], validation_data = (X_dev, y_dev))

Epoch 1/100

Epoch 00001: saving model to saved_models\weights.01-0.31.hdf5
Epoch 2/100

Epoch 00002: saving model to saved_models\weights.02-0.25.hdf5
Epoch 3/100

Epoch 00003: saving model to saved_models\weights.03-0.23.hdf5
Epoch 4/100

Epoch 00004: saving model to saved_models\weights.04-0.22.hdf5
Epoch 5/100

Epoch 00005: saving model to saved_models\weights.05-0.22.hdf5
Epoch 6/100

Epoch 00006: saving model to saved_models\weights.06-0.21.hdf5
Epoch 7/100

Epoch 00007: saving model to saved_models\weights.07-0.20.hdf5
Epoch 8/100

Epoch 00008: saving model to saved_models\weights.08-0.20.hdf5
Epoch 9/100

Epoch 00009: saving model to saved_models\weights.09-0.20.hdf5
Epoch 10/100

Epoch 00010: saving model to saved_models\weights.10-0.20.hdf5
Epoch 11/100

Epoch 00011: saving model to saved_models\weights.11-0.19.hdf5
Epoch 12/100

Epoch 00012: saving model to saved_models\weights.12-0.20.hdf5
Epoch 13/100

Dropout 0.3: min_val_loss: 0.1579

Dropout 0.2: min_val_loss: 0.1581

In [16]:
saved_weights = pd.Series(os.listdir('saved_models'))
weight_to_load = saved_weights[saved_weights.str.contains('hdf5')].values[0]
model.load_weights('saved_models/' + weight_to_load)

In [17]:
# This will turn into Inference Pipeline
def predict_char_level(word_punct_tokenized):
    """
    Input:
    word_punct_tokenized List(str): List of tokens (WordPunct level)
    i.e: ["İstanbul", "'", "da", "yaşıyorum", "."]
    """
    white_space_joined_word_punct_tokens = " ".join(word_punct_tokenized)
    sequences = tokenizer_X.texts_to_sequences([white_space_joined_word_punct_tokens])
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen = seq_len, padding = padding_strat)
    raw_pred = model.predict([padded])
    arg_max_pred = tf.math.argmax(raw_pred, axis = 2).numpy().reshape(-1)
    
    return arg_max_pred #tokenizer_y.sequences_to_texts([arg_max_pred])

def charner_decoder(word_punct_tokenized, arg_max_pred):
    """
    Input:
    word_punct_tokenized: List(str) : List of tokens (WordPunct level)
    i.e: ["İstanbul", "'", "da", "yaşıyorum", "."]
    
    arg_max_pred: List(int) : argmax(axis = -1) of model output
    
    Output:
    decoded_entities: List(str) : List of entities, one entity per token
    """
    
    lens = [0] + [len(token) + 1 for token in word_punct_tokenized]
    cumsum_of_lens = np.cumsum(lens)
    
    decoded_entities = []
    for idx in range(len(cumsum_of_lens) - 1):
        lower_bound = cumsum_of_lens[idx]
        upper_bound = cumsum_of_lens[idx + 1]

        island = arg_max_pred[lower_bound:upper_bound]
        mode_value = stats.mode(island).mode[0]
        detokenized_pred = tokenizer_y.sequences_to_texts([[mode_value]])[0]
        decoded_entities.append(detokenized_pred)
        
    return decoded_entities

def pipeline(text):
    word_punct_tokenized = WordPunctTokenizer().tokenize(text)

    # if len chars (including whitespaces) > sequence length, split it recursively
    len_text = len(list(" ".join(word_punct_tokenized)))
    if len_text > seq_len:
        
        num_tokens = len(word_punct_tokenized)
        first_half_tokens, first_half_entities = pipeline(" ".join(word_punct_tokenized[:num_tokens // 2]))
        second_half_tokens, second_half_entities = pipeline(" ".join(word_punct_tokenized[(num_tokens // 2):]))

        word_punct_tokenized = first_half_tokens + second_half_tokens
        decoded_entities = first_half_entities + second_half_entities

    else:
        charlevel_pred = predict_char_level(word_punct_tokenized)
        decoded_entities = charner_decoder(word_punct_tokenized, charlevel_pred)
        
    return word_punct_tokenized, decoded_entities

In [18]:
%%time
y_test = []
y_hat = []
for idx in test.index.values:
    text = test.loc[idx, 'text']
    #len_text = len(list(text))
    #if len_text <= seq_len:
    label = test.loc[idx, 'label'].split()

    _, pred = pipeline(text)

    y_test += label
    y_hat += pred
    
assert len(y_test) == len(y_hat)

Wall time: 40min 32s


In [19]:
def calculate_metrics(y_test, y_hat):
    accuracy = metrics.accuracy_score(y_test, y_hat)
    f1_macro = metrics.f1_score(y_test, y_hat, average = 'macro')
    f1_micro = metrics.f1_score(y_test, y_hat, average = 'micro')
    
    df_results = pd.DataFrame([accuracy, f1_macro, f1_micro], index = ['accuracy', 'f1_macro', 'f1_micro'], columns = ['value']).T
    return df_results

#### 0.3 dropout result

In [20]:
calculate_metrics(y_test, y_hat)

Unnamed: 0,accuracy,f1_macro,f1_micro
value,0.963643,0.911957,0.963643


#### 0.2 dropout result

In [45]:
calculate_metrics(y_test, y_hat)

Unnamed: 0,accuracy,f1_macro,f1_micro
value,0.962474,0.908618,0.962474


#### SavasYildirim BERTNER

In [60]:
df_wikiann_test = pd.read_csv(DATA_PATH + "df_test.csv", index_col = 0)
df_wikiann_test = df_wikiann_test.loc[df_wikiann_test['file'] == 'wikiann-test.txt'].reset_index(drop = True).copy()

In [63]:
%%time
y_test = []
y_hat = []
for idx in df_wikiann_test.index.values:
    text = df_wikiann_test.loc[idx, 'text']
    #len_text = len(list(text))
    #if len_text <= seq_len:
    label = df_wikiann_test.loc[idx, 'label'].split()

    _, pred = pipeline(text)

    y_test += label
    y_hat += pred
    
assert len(y_test) == len(y_hat)

Wall time: 14min 22s


In [64]:
calculate_metrics(y_test, y_hat)

Unnamed: 0,accuracy,f1_macro,f1_micro
value,0.949852,0.90326,0.949852


**TODO:**
- Dropout rate arttırarak daha düşük bir val_loss elde etmeye çalış.
- TWNERTC ile ve olmadan da train edip val_loss'a bak.


**DONE:**

- BOYUTU 256'dan büyük olanları ortadan 2'ye bölüp 2 kez yap. Bunu recursive br şekilde yapıp birleştirebilirim
- Literatürden modellerle kıyaslamaya çalış
- Biraz daha büyük bir model train etsem sonuç nasıl değişir?
- CustomLoss ile değil de default catloss ile eğitsem sonuç nasıl değişir?
- mask_zero ile eğitsem sonuç nasıl değişir?
- Word-Level train etsem sonuç nasıl değişir?
- RNN size 128 units yapıp 5 layer ile train et
- savasyildirim'in metrik'leri ile kıyasla
- Aynı architecture ile WORDNer train edip metric'lere bak: CharNER is slightly better than WORDNer

In [53]:
text = "Meryem Beşer ile birlikte önce Bursa'ya oradan da İzmir'e gittik, haftasonunu Foça'da geçirdik."
tokens, entities = pipeline(text)
[(t,e) for t,e in zip(tokens, entities)]

[('Meryem', 'PER'),
 ('Beşer', 'PER'),
 ('ile', 'O'),
 ('birlikte', 'O'),
 ('önce', 'O'),
 ('Bursa', 'LOC'),
 ("'", 'O'),
 ('ya', 'O'),
 ('oradan', 'O'),
 ('da', 'O'),
 ('İzmir', 'LOC'),
 ("'", 'O'),
 ('e', 'O'),
 ('gittik', 'O'),
 (',', 'O'),
 ('haftasonunu', 'O'),
 ('Foça', 'LOC'),
 ("'", 'O'),
 ('da', 'O'),
 ('geçirdik', 'O'),
 ('.', 'O')]

In [49]:
text = "Cumhurbaşkanı Ahmet Necdet Sezer'in açıklamalarına göre Ankara'daki TBMM 3 gün daha tatil kalacak"
tokens, entities = pipeline(text)
[(t,e) for t,e in zip(tokens, entities)]

[('Cumhurbaşkanı', 'O'),
 ('Ahmet', 'PER'),
 ('Necdet', 'PER'),
 ('Sezer', 'PER'),
 ("'", 'O'),
 ('in', 'O'),
 ('açıklamalarına', 'O'),
 ('göre', 'O'),
 ('Ankara', 'LOC'),
 ("'", 'O'),
 ('daki', 'O'),
 ('TBMM', 'ORG'),
 ('3', 'O'),
 ('gün', 'O'),
 ('daha', 'O'),
 ('tatil', 'O'),
 ('kalacak', 'O')]

In [48]:
text = "Ben Melikşah, 28 yaşındayım, İstanbul'da ikamet ediyorum ve VNGRS AI Takımı'nda çalışıyorum"
tokens, entities = pipeline(text)
[(t,e) for t,e in zip(tokens, entities)]

[('Ben', 'O'),
 ('Melikşah', 'PER'),
 (',', 'O'),
 ('28', 'O'),
 ('yaşındayım', 'O'),
 (',', 'O'),
 ('İstanbul', 'LOC'),
 ("'", 'O'),
 ('da', 'O'),
 ('ikamet', 'O'),
 ('ediyorum', 'O'),
 ('ve', 'O'),
 ('VNGRS', 'ORG'),
 ('AI', 'ORG'),
 ('Takımı', 'ORG'),
 ("'", 'O'),
 ('nda', 'O'),
 ('çalışıyorum', 'O')]