In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Libraries

In [None]:
!pip install --quiet seqeval

In [None]:
import os
# Set before the `import tensorflow` further down in this cell.
# NOTE(review): "3" is presumably the quietest setting for TensorFlow's native
# log output — confirm against the TensorFlow documentation.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, losses, metrics
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.model_selection import train_test_split
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

#from keras_contrib.layers import CRF
#from keras_contrib.losses import crf_loss
#from keras_contrib.metrics import crf_ac

import warnings
# Hide FutureWarning and DeprecationWarning messages from here on so library
# notices do not drown out the cell outputs.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Directory (on the Kaggle input mount) that holds the train/test/dev CSV files
# read in the next cell.
file_path = "/kaggle/input/ner-datasetturkish/NER Dataset(Turkish)/"

In [None]:
# Read the three dataset splits, in the order train, test, dev.
train_df, test_df, dev_df = (
    pd.read_csv(os.path.join(file_path, csv_name))
    for csv_name in ("train.csv", "test.csv", "dev.csv")
)

In [None]:
# Stack the three splits into a single token-level frame with a fresh row index.
# NOTE(review): SentenceGetter later groups this combined frame by `sent_id`;
# if each CSV numbers its sentences independently, equal ids from different
# splits would be merged into one sentence — confirm the ids are globally unique.
df = pd.concat(objs=(train_df, test_df, dev_df), ignore_index=True)
df.head()

In [None]:
# Vocabulary: every distinct value of the "data" column.
# The set is sorted because the iteration order of a set of strings changes
# between interpreter sessions (hash randomisation), which would assign
# different ids to the same word on every kernel restart. Sorting by string
# form keeps the comparison valid even if a non-string value (e.g. NaN) is present.
words = sorted(set(df["data"].values), key=str)
n_words = len(words)
print(n_words)

In [None]:
# Tag inventory: every distinct value of the "entities" column.
# Sorted for the same reason as the vocabulary: raw set order is not stable
# across interpreter sessions, so tag ids (and therefore the meaning of each
# output unit of the model) would otherwise change between runs.
tags = sorted(set(df["entities"].values), key=str)
n_tags = len(tags)
print(n_tags)

# Preprocess

In [None]:
class SentenceGetter(object):
    """Group a token-per-row frame into sentences of ``(word, tag)`` pairs.

    Rows are grouped by the ``sent_id`` column; inside each group the ``data``
    and ``entities`` columns are zipped together in row order.

    Attributes:
        data: the frame passed to the constructor.
        grouped: Series indexed by ``sent_id`` whose values are the sentences.
        sentences: the same sentences as a plain list, in group order.
        n_sent: 1-based position of the sentence ``get_next`` will return next.
        empty: set to ``True`` once ``get_next`` has run past the last sentence.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(group):
            # One sentence: [(word, tag), ...] in the order the rows appear.
            return list(zip(group["data"].values.tolist(),
                            group["entities"].values.tolist()))

        # Select the two payload columns explicitly so the grouping column is
        # not passed to `agg_func` (newer pandas warns when it is).
        self.grouped = self.data.groupby("sent_id")[["data", "entities"]].apply(agg_func)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the next sentence, or ``None`` when all have been returned.

        Sentences are served by position rather than by ``sent_id`` label, so
        the first call always yields the first sentence regardless of whether
        the ids start at 0, at 1, or are not contiguous.
        """
        position = self.n_sent - 1
        if position >= len(self.sentences):
            self.empty = True
            return None
        self.n_sent += 1
        return self.sentences[position]

In [None]:
# Build the sentence grouping over the combined frame and print the sentence
# returned by the first get_next() call as a quick sanity check.
getter = SentenceGetter(df)
sent = getter.get_next()
print(sent)

In [None]:
# All sentences, each a list of (word, tag) pairs.
sentences = getter.sentences

In [None]:
# Token count of the longest sentence; every sequence is padded to this length.
maxlen = max(map(len, sentences))
maxlen

In [None]:
# Word ids start at 1, so id 0 is never assigned to a word.
word2idx = {word: idx for idx, word in enumerate(words, start=1)}
# A tag's id is its position in `tags`; idx2tag is the inverse lookup.
tag2idx = {tag: idx for idx, tag in enumerate(tags)}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

In [None]:
# Encode every sentence as the list of its word ids.
X = [[word2idx[word] for word, _tag in sentence] for sentence in sentences]
# Right-pad to `maxlen` with 0. word2idx hands out ids starting at 1, so 0 is the
# only id that cannot be mistaken for a real word (the previous pad value,
# n_words-1, is the id of an actual vocabulary entry), and it is the value
# pred_random_sentence tests for when skipping padding.
X = pad_sequences(maxlen=maxlen, sequences=X, padding="post", value=0)

In [None]:
# Encode every sentence as the list of its tag ids.
y = [[tag2idx[tag] for _word, tag in sentence] for sentence in sentences]
# Right-pad to `maxlen`; padded positions receive the id of the "O" tag.
y = pad_sequences(sequences=y, maxlen=maxlen, padding="post", value=tag2idx["O"])
# One-hot encode each padded sequence (one array per sentence).
y = [to_categorical(tag_ids, num_classes=n_tags) for tag_ids in y]

In [None]:
# Hold out 10% of the sentences for testing; random_state is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Model

In [None]:
# Sequence tagger: embedding -> BiLSTM -> LSTM -> per-token softmax over the tags.
input_layer = layers.Input(shape=(maxlen,))
# word2idx assigns ids 1..n_words, so the embedding table needs n_words + 1 rows
# (ids 0..n_words); with input_dim=n_words the highest word id is out of range.
# `input_length` is omitted: the Input layer already fixes the sequence length
# and the argument is deprecated in current Keras.
embedding_layer = layers.Embedding(input_dim=n_words + 1, output_dim=300)(input_layer)
bilstm_layer = layers.Bidirectional(layers.LSTM(units=300,
                                                return_sequences=True,
                                                dropout=0.5,
                                                recurrent_dropout=0.5,
                                                kernel_initializer="he_normal"))(embedding_layer)
# 600 units matches the width of the concatenated forward/backward BiLSTM output.
lstm_layer = layers.LSTM(units=600,
                         return_sequences=True,
                         dropout=0.5,
                         recurrent_dropout=0.5,
                         kernel_initializer="he_normal")(bilstm_layer)
# Same dense softmax applied at every time step: one tag distribution per token.
time_layer = layers.TimeDistributed(layers.Dense(n_tags,
                                                 activation="softmax"))(lstm_layer)

#crf_layer = CRF(n_tags)
#output_layer = crf_layer(time_layer)

model = models.Model(input_layer, time_layer)

In [None]:
# Draw the layer graph, including tensor shapes and layer names.
tf.keras.utils.plot_model(model, show_shapes=True, show_layer_names=True)

In [None]:
# Targets are one-hot per token (built with to_categorical), hence categorical cross-entropy.
# NOTE(review): padded positions are labelled "O" and nothing masks them, so they
# count towards the reported loss and accuracy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 5 epochs in batches of 256, holding out 10% of the training
# sentences for validation. y_train is a list of per-sentence arrays, so it is
# stacked into a single array first.
history = model.fit(
    x=X_train,
    y=np.array(y_train),
    validation_split=0.1,
    epochs=5,
    batch_size=256,
)

# Results

In [None]:
# Metrics recorded by fit, as a frame for quick inspection.
history_df = pd.DataFrame(history.history)
history_df.head()

In [None]:
# Training vs. validation loss per epoch.
fig, ax = plt.subplots()
ax.plot(history.history["loss"])
ax.plot(history.history["val_loss"])
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.legend(["train", "valid"])
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
fig, ax = plt.subplots()
ax.plot(history.history["accuracy"])
ax.plot(history.history["val_accuracy"])
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy")
ax.legend(["train", "valid"])
plt.show()

In [None]:
# Score the held-out test split. evaluate returns the loss followed by the
# metric passed to compile (accuracy).
test_loss, test_acc = model.evaluate(X_test, np.array(y_test), verbose=0)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_acc)

# Test

In [None]:
# Softmax outputs for every position of every test sentence.
model_predictions = model.predict(X_test, verbose=0)

In [None]:
# Reduce the class axis to tag ids with one argmax per array, then map ids to tag
# strings (any "PAD" substring in a tag is rewritten to "O").
pred_labels = [
    [idx2tag[tag_id].replace("PAD", "O") for tag_id in row]
    for row in np.argmax(model_predictions, axis=-1)
]
test_labels = [
    [idx2tag[tag_id].replace("PAD", "O") for tag_id in np.argmax(one_hot, axis=-1)]
    for one_hot in y_test
]

In [None]:
# seqeval's classification report over the true and predicted tag sequences.
print(classification_report(test_labels, pred_labels))

In [None]:
def pred_random_sentence(X_test, y_test):
    """Print true vs. predicted tags for one randomly chosen test sentence.

    Args:
        X_test: padded word-id sequences, one row per sentence.
        y_test: one-hot tag sequences aligned with ``X_test``.

    Raises:
        ValueError: if ``X_test`` is empty (raised by ``np.random.randint``).

    Uses the notebook-level ``model``, ``words`` and ``tags``. Positions whose
    word id is 0 are skipped; word2idx never assigns id 0 to a word.
    """
    # Draw from the full range [0, len): starting at 1 would make the first
    # sentence unreachable and fail outright on a single-sentence test set.
    i = np.random.randint(len(X_test))
    model_prediction = model.predict(np.array([X_test[i]]), verbose=0)
    model_prediction = np.argmax(model_prediction, axis=-1)
    true_label = np.argmax(y_test[i], axis=-1)
    print("{:20} | {:10} | {}".format("Kelime", "Gerçek", "Tahmin"))
    print("-" * 40)
    for word, true, pred in zip(X_test[i], true_label, model_prediction[0]):
        if word != 0:
            # Word ids are 1-based, hence the -1 when indexing `words`.
            print("{:20} | {:10} | {}".format(words[word-1], tags[true], tags[pred]))

In [None]:
# Show one randomly picked test sentence with its true and predicted tags.
pred_random_sentence(X_test, y_test)