In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#For EDA
import matplotlib.pyplot as plt
import seaborn as sns

# Packages for general use throughout the notebook.
import random
import warnings
import time
%matplotlib inline
from sklearn.model_selection import train_test_split

# to see columns properly
pd.set_option('display.max_colwidth', None)

# for build our model
import tensorflow as tf
from tensorflow.keras.layers import Add, GlobalAvgPool1D, MaxPool1D, Activation, BatchNormalization, Embedding, LSTM, Dense, Bidirectional, Input, SpatialDropout1D, Dropout, Conv1D
from tensorflow.keras import Model
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.activations import relu

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

#!pip install datasets
from datasets import load_dataset

# Input data files are available in the read-only "../input/" directory
import os
# Print the full path of every file found below the Kaggle input mount.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Import data
### Our and Coltekin's dataset


In [None]:
# Raw train / dev / test splits of our own dataset.
# NOTE(review): the "normalized dataset" cell further down assigns train, dev and
# test again, so on a top-to-bottom run these three frames are replaced before use.
train = pd.read_csv("../input/offensive-main/main/train.csv")
dev = pd.read_csv("../input/offensive-main/main/dev.csv")
test = pd.read_csv("../input/offensive-main/main/test.csv")

### Fix Headers

Retrieve the original dataset and fix the headers to make it more comprehensible.

In [None]:
# Fetch the original corpus with `load_dataset` and give its columns the same
# names ('text', 'label') that the rest of the notebook reads from our own CSVs.
_header_map = {'tweet': 'text', 'subtask_a': 'label'}

original_dataset = load_dataset("offenseval2020_tr")
# Only the first 28000 training rows are kept.
org_train = pd.DataFrame(original_dataset['train'])[:28000].rename(columns=_header_map)
org_test = pd.DataFrame(original_dataset['test']).rename(columns=_header_map)

### Our and Coltekin's normalized dataset

In [None]:
# Normalized versions of our train / dev / test splits.
# These assignments rebind the train, dev and test names set by the raw-data cell
# above; everything below this cell therefore works on the normalized text.
train = pd.read_csv("../input/last-normalized-data/normalized/normalized_train.csv")
dev = pd.read_csv("../input/last-normalized-data/normalized/normalized_dev.csv")
test = pd.read_csv("../input/last-normalized-data/normalized/normalized_test.csv")

In [None]:
# Normalized versions of Coltekin's train / test splits.
# These rebind org_train and org_test, replacing the frames built from
# `load_dataset` in the "Fix Headers" cell above.
# NOTE(review): presumably these CSVs already carry 'text' and 'label' columns,
# since later cells read both — confirm.
org_train = pd.read_csv("../input/last-normalized-data/normalized_coltekin/normalized_coltekin_train.csv")
org_test = pd.read_csv("../input/last-normalized-data/normalized_coltekin/normalized_coltekin_test.csv")

# Cleaning Functions

We provide several cleaning functions:
* Removing URLs
* Removing HTML Tags
* Removing Usernames
* Removing Emojis
* fix_i -- addresses a problem observed in the last dataset: it contained many "i̇" characters (an "i" followed by a combining dot above), which can be considered noise in Turkish text. We simply convert them to a plain "i". (Most probably the problem is caused by the text retrieval tool.)

In [None]:
import re
def remove_URL(text):
    """Return `text` with every `http://`, `https://` or `www.` link deleted.

    A link runs from its prefix up to the next whitespace character; nothing
    is inserted in its place.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

def remove_html(text):
    """Return `text` without HTML tags and without character entities.

    Tags are matched non-greedily between `<` and `>`; entities are the named
    (`&amp;`), decimal (`&#252;`) and hexadecimal (`&#xfc;`) forms. The entity
    pattern is lowercase-only, so e.g. `&AMP;` is left untouched.
    """
    tag_or_entity = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'
    return re.compile(tag_or_entity).sub('', text)

def remove_usernames(text):
    """Return `text` with @mentions and `scheme://...` links deleted.

    A mention is `@` followed by ASCII letters, digits or underscores. Because
    the match can start anywhere, the `@domain` part of an e-mail address is
    removed as well.
    """
    mention_or_link = r'([@][A-Za-z0-9_]+)|(\w+:\/\/\S+)'
    return re.sub(mention_or_link, r'', text)

def remove_emoji(text):
    """Replace every run of characters from the class below with one space.

    Unlike the other cleaning helpers, a match is replaced by ' ' rather than
    by the empty string, so words on either side of an emoji stay separated.

    NOTE(review): several ranges are very wide. `\\U000024C2-\\U0001F251` and
    `\\U00010000-\\U0010ffff` already contain most of the narrower ranges
    listed next to them, and between them they cover far more than emoji
    (everything from U+24C2 upwards). Confirm this is acceptable for the data.
    """
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # NOTE(review): originally labelled "chinese char"; this range looks like box-drawing / symbol blocks, not CJK — confirm
        u"\U00002702-\U000027B0"  # inside the U+2500-U+2BEF range above
        u"\U00002702-\U000027B0"  # exact duplicate of the previous line (harmless in a character class)
        u"\U000024C2-\U0001F251"  # very wide: spans everything from U+24C2 to U+1F251
        u"\U0001f926-\U0001f937"  # inside the U+10000-U+10FFFF range below
        u"\U00010000-\U0010ffff"  # every code point above U+FFFF
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # NOTE(review): originally labelled "dingbats"; U+FE0F is presumably the emoji variation selector — confirm
        u"\u3030"
                      "]+", re.UNICODE)
    # One or more consecutive matches collapse into a single space.
    return emoji_pattern.sub(r' ', text)

def fix_i(text):
    """Replace "i + combining dot above" (U+0069 U+0307) with a plain "i".

    The two-code-point sequence renders almost exactly like "i", which made
    the original pattern (the same sequence typed literally) invisible in the
    source and easy to damage through copy/paste or Unicode normalization.
    It is spelled with an explicit escape here; the regex is unchanged.

    Python's `str.lower()` turns "İ" (U+0130) into exactly this sequence, and
    the cleaning cell below calls this helper right after lower-casing.

    Parameters
    ----------
    text : str
        Text to repair.

    Returns
    -------
    str
        `text` with every "i\\u0307" replaced by "i".
    """
    return re.sub('i\u0307', 'i', text)
    


- You can investigate the impacts of preprocessing by simply removing the specific functions

In [None]:
# Applying helper functions to our sets and to the original sets.
def _clean_text_column(texts):
    """Run the full cleaning pipeline over one text column.

    The steps and their order match the previous copy-pasted version:
    URLs -> HTML -> usernames -> emoji -> lower-case -> fix_i. `fix_i` has to
    come after lower-casing because it targets the "i + combining dot"
    sequence. To study the impact of one step, remove its line here.

    Parameters
    ----------
    texts : pandas.Series
        Column of raw strings.

    Returns
    -------
    pandas.Series
        A new series with the cleaned strings.
    """
    return (
        texts
        .apply(remove_URL)
        .apply(remove_html)
        .apply(remove_usernames)
        .apply(remove_emoji)
        .str.lower()
        .apply(fix_i)
    )


# Same five frames as before, each cleaned with the same pipeline.
for _frame in (train, dev, test, org_train, org_test):
    _frame['text'] = _clean_text_column(_frame['text'])

In [None]:
# Display ten rows (slice 150:160) of org_train as a spot check of the cleaning step.
org_train[150:160]

# Visualization

In [None]:
# Displaying target distribution of Toxic.

fig, axes = plt.subplots(ncols=2, nrows=1, figsize=(12, 4), dpi=70)

# Pass the labels by keyword: recent seaborn releases take the first positional
# argument as `data`, so a positional Series is no longer read as the x variable.
sns.countplot(x=train['label'], ax=axes[0])

# value_counts() orders by frequency, so the hard-coded names below would be
# swapped whenever the positive class is the majority. sort_index() pins the
# slice order to the label order instead.
# Assumes labels are 0 (not toxic) and 1 (toxic) — TODO confirm against the CSVs.
label_counts = train['label'].value_counts().sort_index()
axes[1].pie(label_counts,
            labels=["Not Toxic", "Toxic"],
            autopct='%1.2f%%',
            shadow=True,
            explode=(0.05, 0.05))
fig.suptitle('Distribution of the Tweets', fontsize=24)
plt.show()

# Training Phase

* You will encounter 2 different models with exactly the same structure. They are used to investigate, at the same time, the differences between the results on Coltekin's dataset and on ours. (You can also download the weights using the code line we provide below the model.)

In [None]:
os.environ["WANDB_API_KEY"] = "0" ## to silence warning

# Use a TPU when one can be resolved, otherwise fall back to the default
# strategy. TPUClusterResolver() raises ValueError when no TPU is found.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    strategy = tf.distribute.get_strategy() # for CPU and single GPU

# Report the replica count for whichever strategy was selected. Previously this
# was printed only in the fallback branch, so a TPU run showed nothing.
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# hyperparameters
# max_length: token sequences are padded/truncated to this length in bert_encode,
# and it is also the model's input length.
max_length = 200
# batch_size: size of the batches produced by every tf.data pipeline below.
batch_size = 128

In [None]:
# Checkpoint identifier shared by the tokenizer and by both model builders,
# so the token ids and the encoder weights come from the same checkpoint.
model_name = "dbmdz/bert-base-turkish-128k-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
def bert_encode(data):
    """Tokenize a collection of texts into a fixed-width tensor of token ids.

    Every text is padded or truncated to `max_length` tokens. Only the
    `input_ids` are returned; the other tokenizer outputs (for example an
    attention mask) are dropped, so the model sees padding as ordinary tokens.

    Parameters
    ----------
    data : iterable of str
        Texts to encode (the notebook passes a DataFrame column).

    Returns
    -------
    tf.Tensor
        Constant tensor built from the per-text `input_ids` lists.
    """
    encoding = tokenizer.batch_encode_plus(
        data,
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    input_ids = encoding['input_ids']
    return tf.constant(input_ids)

In [None]:
def _to_dataset(encoded, labels, shuffle):
    """Wrap encoded inputs and labels in a batched tf.data pipeline.

    Parameters
    ----------
    encoded : tf.Tensor
        Token ids returned by `bert_encode`.
    labels : pandas.Series
        Target values, aligned row-for-row with `encoded`.
    shuffle : bool
        Whether to shuffle the examples (reshuffled on every pass).

    Returns
    -------
    tf.data.Dataset
        Dataset of (input_ids, label) batches of size `batch_size`.
    """
    dataset = tf.data.Dataset.from_tensor_slices((encoded, labels))
    if shuffle:
        # A buffer as large as the split gives a uniform shuffle. The previous
        # buffer of 64 was smaller than one batch (batch_size = 128), so batches
        # were made of near neighbours in file order.
        dataset = dataset.shuffle(len(labels))
    return dataset.batch(batch_size)


train_encoded = bert_encode(train.text)
dev_encoded = bert_encode(dev.text)
org_train_encoded = bert_encode(org_train.text)

org_train_dataset = _to_dataset(org_train_encoded, org_train.label, shuffle=True)
train_dataset = _to_dataset(train_encoded, train.label, shuffle=True)
# Validation metrics do not depend on example order, so dev is left unshuffled.
dev_dataset = _to_dataset(dev_encoded, dev.label, shuffle=False)

In [None]:
def model():
    """Build the uncompiled BERT -> Conv1D -> BiLSTM -> sigmoid classifier.

    The pretrained encoder named by `model_name` receives `max_length` token
    ids. The first element of its output is passed through the layers listed
    in `head`, ending in one sigmoid unit for the binary label.

    Returns
    -------
    tensorflow.keras.Model
        Model mapping `input_ids` to one probability per example.
    """
    bert_encoder = TFBertModel.from_pretrained(model_name)
    input_word_ids = Input(shape=(max_length,), dtype=tf.int32, name="input_ids")

    # Classification head, applied in list order.
    head = [
        SpatialDropout1D(0.2),
        Conv1D(32, 3, activation='relu', padding='same'),
        Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)),
        Dropout(0.1),
        Dense(1, activation='sigmoid'),
    ]

    features = bert_encoder(input_word_ids)[0]
    for layer in head:
        features = layer(features)

    return Model(input_word_ids, features)

In [None]:
# Build and compile inside the strategy scope so the model and the optimizer
# are created under the selected distribution strategy.
with strategy.scope():
    # NOTE: this rebinds the name `model` from the builder function to the model
    # instance it returns. After this line the builder is no longer reachable
    # under that name, so the cell cannot simply be re-run without re-running
    # the definition cell above first.
    model = model()
    adam_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
    model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])

    model.summary()

In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Both callbacks watch validation metrics and are shared by the two fit() calls.
# The model is compiled with metrics=['accuracy'], so the logged validation key
# is 'val_accuracy' (the same key plot_graphs reads). The previous
# monitor='val_acc' named a metric that is never logged, so early stopping
# could not trigger.
callbacks = [ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
             EarlyStopping(monitor='val_accuracy', min_delta=1e-5, patience=5)]

In [None]:
# Start train
# train_dataset and dev_dataset are already batched with .batch(batch_size).
# Keras' fit() must not receive batch_size for dataset inputs, so the argument
# is left out here.
history = model.fit(
    train_dataset,
    epochs=18,
    validation_data=dev_dataset,
    verbose=1,
    callbacks=callbacks)

In [None]:
def plot_graphs(history, string):
    """Plot one metric's training and validation curves over the epochs.

    Parameters
    ----------
    history : object with a `.history` dict
        Return value of `fit()`; must hold the keys `string` and
        `'val_' + string`.
    string : str
        Metric name, e.g. "accuracy" or "loss".
    """
    val_key = 'val_' + string
    for key in (string, val_key):
        plt.plot(history.history[key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()


plot_graphs(history, "accuracy")
plot_graphs(history, "loss")

- Save or load the model.

In [None]:
#model.save_weights(f'offensive_weights_new_train.h5')

In [None]:
#model.load_weights('../input/offensive-weights/offensive_weights.h5')

___

- Original training model (which will be used for training Coltekin's original dataset)

In [None]:
def org_model():
    """Build the uncompiled classifier that is trained on Coltekin's dataset.

    The architecture is the same as in `model()`: the pretrained encoder named
    by `model_name`, followed by the layers in `head` and one sigmoid output.

    Returns
    -------
    tensorflow.keras.Model
        Model mapping `input_ids` to one probability per example.
    """
    bert_encoder = TFBertModel.from_pretrained(model_name)
    input_word_ids = Input(shape=(max_length,), dtype=tf.int32, name="input_ids")

    # Classification head, applied in list order.
    head = [
        SpatialDropout1D(0.2),
        Conv1D(32, 3, activation='relu', padding='same'),
        Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)),
        Dropout(0.1),
        Dense(1, activation='sigmoid'),
    ]

    features = bert_encoder(input_word_ids)[0]
    for layer in head:
        features = layer(features)

    return Model(input_word_ids, features)

In [None]:
# Build and compile the second model under the same strategy and with the same
# optimizer settings as the first one.
with strategy.scope():
    # NOTE: rebinds `org_model` from the builder function to the model instance,
    # so this cell cannot be re-run without re-running the definition cell first.
    org_model = org_model()
    # A fresh optimizer instance; the name adam_optimizer is reused from the first model's cell.
    adam_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
    org_model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])

    org_model.summary()

In [None]:
# Start train
# org_train_dataset and dev_dataset are already batched with .batch(batch_size).
# Keras' fit() must not receive batch_size for dataset inputs, so the argument
# is left out here.
# Validation uses dev_dataset, i.e. the dev split of our dataset.
history2 = org_model.fit(
    org_train_dataset,
    epochs=3,
    validation_data=dev_dataset,
    verbose=1,
    callbacks=callbacks)

In [None]:
# plot_graphs is already defined in the plotting cell that follows the first
# training run. The copy that used to sit here was identical apart from its
# parameter name and only shadowed it, so it is reused instead.
plot_graphs(history2, "accuracy")
plot_graphs(history2, "loss")

___

# Our Test Set

In [None]:
# Encode our test split. The pipeline carries inputs only (no labels) because it
# is used for predict(); it is not shuffled, so predictions stay in row order.
test_encoded = bert_encode(test.text)
test_dataset = tf.data.Dataset.from_tensor_slices(test_encoded).batch(batch_size)

# Original test encoding


In [None]:
# Encode the original test split. The pipeline carries inputs only (no labels)
# and is not shuffled, so predictions stay in row order.
org_test_encoded = bert_encode(org_test.text)
org_test_dataset = tf.data.Dataset.from_tensor_slices(org_test_encoded).batch(batch_size)

In [None]:
# Class balance of each of our splits, printed in the order train, test, dev.
for _split in (train, test, dev):
    print(_split.label.value_counts())

# TEST
- We get the results of the model that was trained on our dataset.

In [None]:
# Model trained on our dataset, evaluated on our test set.
# test_dataset is already batched, so predict() is called without batch_size
# (Keras does not accept that argument for dataset inputs).
pred = model.predict(test_dataset)
# Sigmoid outputs are rounded to hard 0/1 labels and flattened to one dimension.
y_pred = tf.cast(tf.round(pred), tf.int32).numpy().flatten()

# All scores below use scikit-learn's default averaging.
print('Precision: %.4f' % precision_score(test.label, y_pred))
print('Recall: %.4f' % recall_score(test.label, y_pred))
print('Accuracy: %.4f' % accuracy_score(test.label, y_pred))
print('F1 Score: %.4f' % f1_score(test.label, y_pred))
print(classification_report(test.label, y_pred))

In [None]:
# Model trained on our dataset, evaluated on the original test set.
# org_test_dataset is already batched, so predict() is called without batch_size
# (Keras does not accept that argument for dataset inputs).
pred = model.predict(org_test_dataset)
# NOTE: this overwrites y_pred from the previous cell, so from here on y_pred
# lines up with org_test.label, not with test.label.
y_pred = tf.cast(tf.round(pred), tf.int32).numpy().flatten()

# Precision and recall use the default averaging; F1 is macro-averaged.
print('Precision: %.4f' % precision_score(org_test.label, y_pred))
print('Recall: %.4f' % recall_score(org_test.label, y_pred))
print('Accuracy: %.4f' % accuracy_score(org_test.label, y_pred))
print('F1 Score: %.4f' % f1_score(org_test.label, y_pred, average='macro'))
print(classification_report(org_test.label, y_pred))

In [None]:
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          normalize=True):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like, shape (n_classes, n_classes)
        Confusion matrix with true labels on the rows and predicted labels on
        the columns.
    classes : sequence
        Tick labels, in the same order as the rows/columns of `cm`.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    normalize : bool, default True
        When True (the behaviour this function always had), each row is
        divided by its sum so the cells show per-true-class fractions. When
        False, the values are shown as given.

    Notes
    -----
    A row that sums to zero is shown as all zeros instead of NaN.
    The function neither creates a figure nor calls `plt.show()`.
    """
    # Local import: the notebook only imports itertools in a later cell, so the
    # function previously relied on that cell having been run first.
    import itertools

    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        fmt = '.2f'
    else:
        # Integer counts print as integers; anything else keeps two decimals.
        fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=20)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90, fontsize=16)
    plt.yticks(tick_marks, classes, fontsize=16)

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label', fontsize=12)
    plt.xlabel('Predicted label', fontsize=12)

In [None]:
import itertools  # plot_confusion_matrix looks this module up when it is called

# y_pred was last overwritten with predictions for org_test_dataset, so it does
# not line up with test.label any more. Recompute the predictions of `model` on
# our own test set under a separate name before building the matrix.
test_pred = model.predict(test_dataset)
y_pred_test = tf.cast(tf.round(test_pred), tf.int32).numpy().flatten()

cnf_matrix = confusion_matrix(test.label, y_pred_test)
# confusion_matrix orders rows/columns by sorted label value, whereas
# Series.unique() returns labels in order of first appearance. Use the sorted
# union of the labels so the tick names match the matrix.
class_names = np.union1d(test.label, y_pred_test)
plt.figure(figsize=(6,6))
plot_confusion_matrix(cnf_matrix, classes=class_names, title="Confusion matrix")
plt.show()

# Original trained model performance on our test set and original test set
- We get the results of the model that was trained on Coltekin's dataset.

In [None]:
# Model trained on Coltekin's dataset, evaluated on our test set.
# test_dataset is already batched, so predict() is called without batch_size
# (Keras does not accept that argument for dataset inputs).
pred = org_model.predict(test_dataset)
# Sigmoid outputs are rounded to hard 0/1 labels and flattened to one dimension.
y_pred = tf.cast(tf.round(pred), tf.int32).numpy().flatten()

# Precision, recall and F1 are macro-averaged in this cell.
print('Precision: %.4f' % precision_score(test.label, y_pred, average='macro'))
print('Recall: %.4f' % recall_score(test.label, y_pred, average='macro'))
print('Accuracy: %.4f' % accuracy_score(test.label, y_pred))
print('F1 Score: %.4f' % f1_score(test.label, y_pred, average='macro'))
print(classification_report(test.label, y_pred))

In [None]:
# Model trained on Coltekin's dataset, evaluated on the original test set.
# org_test_dataset is already batched, so predict() is called without batch_size
# (Keras does not accept that argument for dataset inputs).
pred = org_model.predict(org_test_dataset)
# Sigmoid outputs are rounded to hard 0/1 labels and flattened to one dimension.
y_pred = tf.cast(tf.round(pred), tf.int32).numpy().flatten()

# Precision and recall use the default averaging; F1 is macro-averaged.
print('Precision: %.4f' % precision_score(org_test.label, y_pred))
print('Recall: %.4f' % recall_score(org_test.label, y_pred))
print('Accuracy: %.4f' % accuracy_score(org_test.label, y_pred))
print('F1 Score: %.4f' % f1_score(org_test.label, y_pred, average='macro'))
print(classification_report(org_test.label, y_pred))

# More about different training models

It was important to validate our model pipeline with both our dataset and Coltekin's dataset, and that is why we train 2 different models and test on both "our test set" and "Coltekin's test set".

The test results were computed with both macro averaging and the default averaging. You can simply change the `average` argument of the score functions to see the differences between the results, which we explain in the paper.

Also, the results and reasons are described in more depth and detail in the paper.