In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Required Libraries

In [None]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import random
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter

import tqdm
import nltk
import re
import io
import string
import emoji
import gensim

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import layers, models, optimizers, losses, regularizers, metrics, initializers, constraints
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

from IPython.display import Markdown

def bold(string):
    """Show ``string`` in the notebook output as bold Markdown.

    Args:
        string: The text to emphasise; it is wrapped in ``**`` markers.
    """
    # NOTE(review): `display` is not imported next to `Markdown`; this presumably
    # relies on the global that IPython/Jupyter provides — confirm when run elsewhere.
    markup = "".join(("**", string, "**"))
    display(Markdown(markup))

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Load Data

In [None]:
# Read only the two columns used in this notebook: "author" and "text".
df = pd.read_csv("/kaggle/input/turkish-news-article/TurkishNewsArticles.csv", usecols=["author", "text"])

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
def df_stats(data):
    """Print a quick overview of a DataFrame.

    Sections, in order: shape, column dtypes, missing values per column,
    number of duplicated rows, and the memory usage reported by ``data.info``.

    Args:
        data: The pandas DataFrame to summarise.

    Returns:
        None. Everything is written to the notebook output.
    """
    def banner(title):
        # Section title padded with '#' to 50 characters and rendered in bold.
        bold(" {} ".format(title).center(50, "#"))

    banner("SHAPE")
    print(f"ROWS: {data.shape[0]}")
    print(f"COLS: {data.shape[1]}")

    banner("TYPES")
    print(data.dtypes)

    banner("MISSING VALUES")
    print(data.isnull().sum())

    banner("DUPLICATED VALUES")
    duplicate_count = data.duplicated().sum()
    print(f"NUMBER OF DUPLICATED VALUES: {duplicate_count}")
    #bold(" DESCRIBE ".center(50, "#"))
    #print(data.describe().T)

    banner("MEMORY USAGE")
    # `info` writes its report into the buffer; the text ends with a newline, so
    # the second-to-last element of the split is the final report line, from
    # which the value after the colon is taken.
    with io.StringIO() as report:
        data.info(buf=report)
        report_lines = report.getvalue().split("\n")
    memory_text = report_lines[-2].split(":")[1].strip()
    print(f"Memory Usage: {memory_text}")

In [None]:
# Summarise the raw frame (shape, dtypes, missing values, duplicates, memory) before cleaning.
df_stats(df)

In [None]:
# Remove rows with missing values and exact duplicates, then shuffle the rows.
# The shuffle is seeded: without a seed the row order changes on every run, so
# the later train/test split (itself seeded with 42) would still differ from
# run to run.
df = (
    df.dropna()
      .drop_duplicates()
      .sample(frac=1, random_state=42)
      .reset_index(drop=True)
)

# EDA

In [None]:
# Pie chart of article share for the 15 authors with the most articles.
df["author"].value_counts().head(15).plot(kind="pie", autopct="%.1f%%")

In [None]:
# Keep only the articles whose author has at least 20 articles in the data set.
author_counts = df['author'].value_counts()
authors_to_keep = author_counts.loc[author_counts.ge(20)].index
df = df.loc[df['author'].isin(authors_to_keep)].reset_index(drop=True)

In [None]:
# Count the articles per author once and reuse the counter for both views.
author_counter = Counter(df["author"])
tags = author_counter.keys()
tags_len = author_counter.values()
tag_df = pd.DataFrame(list(author_counter.items()), columns=["Class", "Count"])

# Bar chart with one bar per remaining author.
ax = tag_df.plot(x="Class", y="Count", kind="bar", legend=False, grid=False, figsize=(30, 10), cmap='viridis')
ax.set_title("Class / Count", fontsize=18)
ax.set_xlabel("Class", fontsize=10)
ax.set_ylabel("Count", fontsize=15)
plt.show()

In [None]:
# Load the stop-word list: every whitespace-separated token on every line that
# does not start with '#' (after stripping leading whitespace) is a stop word.
smart_words = "/kaggle/input/turkish-news-article/stop-words.txt"
smart_stoplist = []
# The context manager closes the file even if reading fails; the encoding is
# explicit so that Turkish characters do not depend on the platform default.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open(smart_words, "r", encoding="utf-8") as stop_file:
    for line in stop_file:
        if not line.lstrip().startswith("#"):
            smart_stoplist.extend(line.split())

In [None]:
# Lower-case every article.
# NOTE(review): str.lower() is not Turkish-aware ("I" becomes "i", not "ı") —
# confirm this is acceptable for the downstream vocabulary.
df["text"] = df["text"].str.lower()
#df["Text Cleaned"] = df["Text Cleaned"].apply(lambda x: re.sub('@[\w_]+', '', x))
#df["Text Cleaned"] = df["Text Cleaned"].apply(lambda x: re.sub('#[\w_]+', '', x))
#df['Text Cleaned'] = df['Text Cleaned'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
#df['Text Cleaned'] = df['Text Cleaned'].apply(lambda x: x.translate(x.maketrans('', '', string.punctuation)))
#df['Text Cleaned'] = df['Text Cleaned'].apply(lambda x: x.replace('"', '').replace("’", '').replace("'", '').replace("”", ''))
#df['Text Cleaned'] = df['Text Cleaned'].apply(lambda x: re.sub('\S*@\S*\s?', '', x))
#df['Text Cleaned'] = df['Text Cleaned'].apply(lambda x: emoji.replace_emoji(x))
#df['Text Cleaned'] = df['Text Cleaned'].apply(lambda x: re.sub('<.*?>', '', x))
#df['Text Cleaned'] = df['Text Cleaned'].apply(lambda x: re.sub('[0-9]+', '', x))
#df['Text Cleaned'] = df['Text Cleaned'].apply(lambda text: ' '.join([word for word in text.split() if word.lower() not in smart_stoplist]))
# Turn line breaks into spaces. Deleting them outright would glue the last word
# of one line to the first word of the next ("foo\nbar" -> "foobar") and
# create tokens that never occurred in the article.
df["text"] = df["text"].str.replace("\n", " ", regex=False)

# N-Grams

In [None]:
def count_ngrams(corpus, ngram, n):
    """Return the ``n`` most frequent n-grams of a corpus.

    Args:
        corpus: Iterable of documents (strings) handed to ``CountVectorizer``.
        ngram: Size of the n-grams to count (1 = unigrams, 2 = bigrams, ...).
        n: Maximum number of entries to return.

    Returns:
        A list of ``(ngram_text, count)`` tuples sorted by count, highest first.
    """
    vectorizer = CountVectorizer(ngram_range=(ngram, ngram))
    # Summing the document-term matrix over its rows gives one total per vocabulary column.
    totals = vectorizer.fit_transform(corpus).sum(axis=0)
    pairs = [(term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()]
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs[:n]

In [None]:
def plot_ngrams(ngram_df, ngram_name):
    """Draw a bar chart of n-gram frequencies.

    Args:
        ngram_df: Table with a "Text" column (bar labels) and a "Count" column (bar heights).
        ngram_name: Used both as the x-axis label and as the chart title.
    """
    _, ax = plt.subplots(figsize=(12, 6))
    ax.bar(ngram_df["Text"], ngram_df["Count"])
    ax.set_xlabel(ngram_name)
    ax.set_ylabel("Count")
    ax.set_title(ngram_name)
    # Rotate the tick labels so that long n-grams do not overlap.
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# The 30 most frequent single words, as a two-column (Text, Count) table.
unigrams = count_ngrams(df["text"], 1, 30)
top_unigram = pd.DataFrame(unigrams, columns=['Text', "Count"])
top_unigram.head()

In [None]:
# Bar chart of the unigram counts computed above.
plot_ngrams(top_unigram, "Unigrams")

In [None]:
# The 30 most frequent two-word sequences, as a two-column (Text, Count) table.
bigrams = count_ngrams(df["text"], 2, 30)
top_bigram = pd.DataFrame(bigrams, columns=['Text', "Count"])
top_bigram.head()

In [None]:
# Bar chart of the bigram counts computed above.
plot_ngrams(top_bigram, "Bigrams")

In [None]:
# The 30 most frequent three-word sequences, as a two-column (Text, Count) table.
trigrams = count_ngrams(df["text"], 3, 30)
top_trigram = pd.DataFrame(trigrams, columns=['Text', "Count"])
top_trigram.head()

In [None]:
# Bar chart of the trigram counts computed above.
plot_ngrams(top_trigram, "Trigrams")

# Label Encoding and Text Vectorization

In [None]:
# Replace the author names with integer class ids; `le` keeps the mapping
# so the ids can be translated back to names later.
le = LabelEncoder()
df["author"] = le.fit_transform(df["author"])

In [None]:
# Short aliases for the label column and the text column.
authors = df["author"]
text = df["text"]

In [None]:
# Fit a word-level tokenizer on all articles.
max_features = 20000  # indices at or above this value are ignored when sequences are built
texts = text.tolist()
# `oov_token` must be a string: it is the placeholder word the Tokenizer adds to
# `word_index` for out-of-vocabulary words. Passing the boolean True put a
# non-string key (True) into the word index. "<OOV>" cannot collide with a real
# token because '<' and '>' are among the filtered characters.
tokenizer = Tokenizer(
    num_words=max_features,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    oov_token="<OOV>",
)
tokenizer.fit_on_texts(texts)

In [None]:
# Split every article into sentences. The articles are Turkish, so the Turkish
# Punkt model is requested explicitly instead of NLTK's default English model.
params = [sent_tokenize(article, language="turkish") for article in texts]

max_sentence_len = 15   # words kept per sentence
max_sentence_num = 35   # sentences kept per article
embed_size = 100        # dimensionality of the word vectors used by the model

# One (max_sentence_num x max_sentence_len) grid of word indices per article.
# Cells that are never written stay 0, which therefore acts as padding.
data = np.zeros((len(texts), max_sentence_num, max_sentence_len), dtype='int32')

word_index = tokenizer.word_index
for i, sentences in enumerate(params):
    for j, sent in enumerate(sentences[:max_sentence_num]):
        # Keep only words the tokenizer knows and whose index is below
        # max_features; all other words are dropped, not mapped to a placeholder.
        word_indices = []
        for word in text_to_word_sequence(sent):
            idx = word_index.get(word)
            if idx is not None and idx < max_features:
                word_indices.append(idx)
        # Truncate long sentences; short ones leave the tail of the row at 0.
        word_indices = word_indices[:max_sentence_len]
        data[i, j, :len(word_indices)] = word_indices

In [None]:
# One-hot encode the integer author ids for use with categorical cross-entropy.
labels = to_categorical(authors)

In [None]:
# Hold out 20% of the articles for testing; the split itself is seeded.
# NOTE(review): the split is not stratified by author — consider stratify=authors; confirm.
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

In [None]:
# Report the shape of each split, one line per array.
named_splits = (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
)
for split_name, split_array in named_splits:
    print(f"{split_name} shape:", split_array.shape)

In [None]:
# Load the pre-trained GloVe vectors into a {word: vector} dictionary.
# Each line of the file holds a word followed by its space-separated coefficients.
# NOTE(review): these are Twitter GloVe vectors while the corpus is Turkish news;
# vocabulary coverage may be low — confirm.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse.
with open("/kaggle/input/glove-global-vectors-for-word-representation/glove.twitter.27B.100d.txt", "r", encoding="utf-8") as f:
    for line in tqdm.tqdm(f):
        values = line.rstrip("\n").split(" ")
        word = values[0]
        # float32 halves the memory of the dictionary compared with the default float64.
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

In [None]:
# Number of rows needed in the embedding table: one per entry of the word index,
# plus one for index 0, which the word index never assigns.
input_dim = len(tokenizer.word_index) + 1

In [None]:
# Build the embedding table: row i holds the pre-trained vector of the word with
# index i. Words without a pre-trained vector keep an all-zero row.
# The width comes from `embed_size` (not a literal 100) so that it always agrees
# with the output dimension passed to the Embedding layer.
embedding_matrix = np.zeros((input_dim, embed_size))
for word, i in tqdm.tqdm(tokenizer.word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Model

In [None]:
class AttentionWithContext(layers.Layer):
    """Attention pooling over the middle axis of a 3-D tensor with a learned context vector.

    For an input ``x`` of shape ``(batch, steps, features)`` the layer computes::

        uit = tanh(x . W + b)                       # (batch, steps, features)
        ait = uit . u                               # (batch, steps)
        a   = exp(ait) / (sum_steps exp(ait) + eps) # attention weights
        out = sum_steps (a * x)                     # (batch, features)

    Args:
        W_regularizer: Optional regularizer for ``W`` (anything ``regularizers.get`` accepts).
        u_regularizer: Optional regularizer for the context vector ``u``.
        b_regularizer: Optional regularizer for the bias ``b``.
        W_constraint: Optional constraint for ``W`` (anything ``constraints.get`` accepts).
        u_constraint: Optional constraint for ``u``.
        b_constraint: Optional constraint for ``b``.
        bias: Whether to add the bias ``b`` before the ``tanh``.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(AttentionWithContext, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create ``W`` (features x features), optional ``b`` (features,) and ``u`` (features x 1)."""
        # The layer only works on (batch, steps, features) inputs.
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[-1], input_shape[-1]),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[-1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight(shape=(input_shape[-1], 1),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1.

        Args:
            x: Input tensor of rank 3.
            mask: Optional mask; when given, the un-normalised weights are
                multiplied by it before normalisation.
        """
        uit = K.dot(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        # Project onto the context vector and drop the trailing size-1 axis.
        ait = K.squeeze(K.dot(uit, self.u), -1)
        a = K.exp(ait)
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # Normalise over axis 1; epsilon guards against division by zero.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """The middle axis is summed away: ``(batch, features)``."""
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so a saved model can re-create the layer."""
        config = super(AttentionWithContext, self).get_config()
        config.update({
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'u_regularizer': regularizers.serialize(self.u_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'u_constraint': constraints.serialize(self.u_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        })
        return config

In [None]:
# Word-level encoder: turns one sentence (max_sentence_len word indices) into a
# single vector via frozen pre-trained embeddings -> BiLSTM -> Dense -> attention.
# The input holds integer word indices, so it is declared as int32 rather than float32.
word_input = layers.Input(shape=(max_sentence_len,), dtype='int32')
x = layers.Embedding(input_dim,  # same vocabulary size the embedding matrix was built with
                     embed_size,
                     weights=[embedding_matrix],
                     input_length=max_sentence_len,
                     trainable=False)(word_input)
x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, kernel_regularizer="l2"))(x)
x = layers.TimeDistributed(layers.Dense(256, kernel_regularizer="l2"))(x)
x = AttentionWithContext()(x)
word_encoder = models.Model(word_input, x)

# Sentence-level model: the word encoder is applied to each of the
# max_sentence_num sentences, and the resulting sentence vectors go through
# BiLSTM -> Dense -> attention -> dropout -> softmax over the authors.
sentence_input = layers.Input(shape=(max_sentence_num, max_sentence_len), dtype='int32')
x = layers.TimeDistributed(word_encoder)(sentence_input)
x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, kernel_regularizer="l2"))(x)
x = layers.TimeDistributed(layers.Dense(256, kernel_regularizer="l2"))(x)
x = layers.Dropout(0.5)(AttentionWithContext()(x))
output_layer = layers.Dense(authors.nunique(), activation='softmax')(x)

model = models.Model(sentence_input, output_layer)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
# Draw the model graph, expanding the nested word encoder and showing layer names and shapes.
tf.keras.utils.plot_model(model, expand_nested=True, show_layer_names=True, show_shapes=True)

# Train

In [None]:
#early_stopping = EarlyStopping(monitor='val_accuracy', mode='max', patience=5)
#lr_scheduler = ReduceLROnPlateau(monitor='val_accuracy', mode='max', patience=5, factor=0.5)

In [None]:
# Train for a fixed 100 epochs, using 10% of the training data for validation.
# NOTE(review): no callbacks are passed, so training never stops early and the
# weights evaluated afterwards are those of the final epoch — confirm this is intended.
history = model.fit(X_train, y_train, validation_split=0.1, epochs=100, batch_size=64)

In [None]:
# Loss and accuracy on the held-out test split.
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {test_loss:.5f}")
print(f"Test Accuracy: {test_acc:.5f}")

# Results

In [None]:
# Per-epoch training metrics as a table (one row per epoch).
history_df = pd.DataFrame(history.history)
history_df.head()

In [None]:
# Training versus validation loss per epoch.
fig, ax = plt.subplots()
for metric_name in ("loss", "val_loss"):
    ax.plot(history.history[metric_name])
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.legend(["train", "valid"])
ax.set_title("Loss Curve")
plt.show()

In [None]:
# Training versus validation accuracy per epoch.
fig, ax = plt.subplots()
for metric_name in ("accuracy", "val_accuracy"):
    ax.plot(history.history[metric_name])
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy")
ax.legend(["train", "valid"])
ax.set_title("Accuracy Curve")
plt.show()