In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import AutoTokenizer, TFRobertaModel
import random

In [None]:
# Path of the cleaned tweet CSV; the data-loading cell passes this to pd.read_csv.
data = 'final_cleaned2.csv'

In [None]:
def shuffle_string(string, rng=None):
    """Return ``string`` with its whitespace-separated words in random order.

    Parameters
    ----------
    string : str
        Text to scramble. It is split on runs of whitespace, so the original
        spacing is not preserved: the words are re-joined with single spaces.
    rng : object, optional
        Anything exposing ``shuffle(list)`` that permutes the list in place,
        e.g. ``random.Random(seed)``. When omitted, the module-level
        ``random`` generator is used, exactly as before, so results are only
        reproducible if the caller seeds ``random`` beforehand.

    Returns
    -------
    str
        The same words in shuffled order ("" for empty/blank input).
    """
    words = string.split()
    shuffler = random if rng is None else rng
    shuffler.shuffle(words)  # in-place permutation
    return ' '.join(words)

In [None]:
from keras.utils import to_categorical

# `data` enters this cell as the CSV path and leaves it as the cleaned
# DataFrame. Remember the path under its own name so that re-running the cell
# does not hand a DataFrame to pd.read_csv.
if isinstance(data, str):
    csv_path = data
data = pd.read_csv(csv_path, lineterminator='\n')

# Drop rows without text. .copy() makes the filtered frame independent, so the
# column assignments below cannot trigger pandas' SettingWithCopyWarning.
data = data[data['text2'].notna()].copy()

# Binary label: 1 when the tweet's country code is in the "peaceful" list, else 0.
peaceful = ['CA', 'FI', 'GB', 'NO', 'IE', 'AU', 'SG', 'FR']
data['peacefulness'] = data['country_code'].isin(peaceful).astype(int)

# text3 = text2 with its words in random order. shuffle_string draws from the
# global `random` generator, so seed it to get the same text3 on every run.
random.seed(123)
data['text3'] = data['text2'].apply(shuffle_string)

x = data['text2'].to_numpy()    # original tweets
sx = data['text3'].to_numpy()   # word-shuffled tweets
# One-hot labels; num_classes is pinned so the width is 2 even if a class is absent.
y = to_categorical(data['peacefulness'], num_classes=2)

# A single split call partitions the original text, the shuffled text and the
# labels with the same row indices, so the two experiments see the same rows.
x_train, x_val, sx_train, sx_val, y_train, y_val = train_test_split(
    x, sx, y, random_state=123, test_size=0.2)
# Names used by the shuffled-tweet cells; the labels are identical by construction.
sy_train, sy_val = y_train, y_val

# Neural Network Models

In [None]:
# source: https://www.kaggle.com/code/tylerrosacker/bertweet-transfer-learning
# Only tokenizer-construction options are passed here. Padding, truncation and
# special-token handling are per-call options and are set on the calls below.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base",
                                          normalization=True,
                                          use_fast=False)

# build_model() declares its inputs with shape (128,), so pad/truncate to that
# length explicitly instead of relying on the tokenizer's default maximum.
train_token = tokenizer(x_train.tolist(),
                        padding="max_length",
                        truncation=True,
                        max_length=128,
                        return_tensors='tf').data

val_token = tokenizer(x_val.tolist(),
                      padding="max_length",
                      truncation=True,
                      max_length=128,
                      return_tensors='tf').data

# Keep only the tensors the tokenizer lists as model inputs, pair them with the
# one-hot labels, then shuffle over the whole training set and batch.
train_features = {name: train_token[name] for name in tokenizer.model_input_names}
train_tf_dataset = tf.data.Dataset.from_tensor_slices((train_features, y_train))
train_tf_dataset = train_tf_dataset.shuffle(len(x_train)).batch(32).prefetch(tf.data.AUTOTUNE)

# Validation data is only evaluated, never trained on, so it is batched in its
# original order (no shuffle buffer needed).
val_features = {name: val_token[name] for name in tokenizer.model_input_names}
val_tf_dataset = tf.data.Dataset.from_tensor_slices((val_features, y_val))
val_tf_dataset = val_tf_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

In [None]:
# Same tokenizer as for the unshuffled tweets. Only construction options are
# passed here; padding and truncation are chosen per call below.
tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base',
                                          normalization=True,
                                          use_fast=False)

# build_model() declares its inputs with shape (128,), so pad/truncate to that
# length explicitly instead of relying on the tokenizer's default maximum.
strain_token = tokenizer(sx_train.tolist(),
                         padding="max_length",
                         truncation=True,
                         max_length=128,
                         return_tensors='tf').data

sval_token = tokenizer(sx_val.tolist(),
                       padding="max_length",
                       truncation=True,
                       max_length=128,
                       return_tensors='tf').data

# Keep only the tensors the tokenizer lists as model inputs, pair them with the
# one-hot labels, then shuffle over the whole training set and batch.
strain_features = {name: strain_token[name] for name in tokenizer.model_input_names}
strain_tf_dataset = tf.data.Dataset.from_tensor_slices((strain_features, sy_train))
strain_tf_dataset = strain_tf_dataset.shuffle(len(sx_train)).batch(32).prefetch(tf.data.AUTOTUNE)

# Validation data is only evaluated, never trained on, so it is batched in its
# original order (no shuffle buffer needed).
sval_features = {name: sval_token[name] for name in tokenizer.model_input_names}
sval_tf_dataset = tf.data.Dataset.from_tensor_slices((sval_features, sy_val))
sval_tf_dataset = sval_tf_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
from transformers import TFAutoModelForSequenceClassification, RobertaConfig

# Two-label sequence-classification setup on top of the BERTweet checkpoint.
# NOTE(review): RobertaConfig's documented dropout settings are
# hidden_dropout_prob / attention_probs_dropout_prob / classifier_dropout;
# `dropout` looks like it is stored on the config without being used by the
# model. Confirm which knob was intended before relying on the 0.2 value.
config = RobertaConfig.from_pretrained(
    'vinai/bertweet-base',
    num_labels=2,
    dropout=0.2,
)
tf_model = TFAutoModelForSequenceClassification.from_pretrained(
    'vinai/bertweet-base',
    config=config,
    trainable=True,
)

In [None]:
# source: https://github.com/wz2536/power-of-peace-speech_CapstoneFall2021/blob/main/Classification%20Models/fine-tune-roberta.ipynb
def build_model(lr = 1e-5, base_model = None, max_len = 128):
    """Build and compile a softmax classifier on top of a BERTweet model.

    Parameters
    ----------
    lr : float
        Adam learning rate.
    base_model : optional
        Transformer sequence-classification model to wrap. When omitted, a
        fresh copy of 'vinai/bertweet-base' is loaded with the notebook-level
        ``config``. Previously every call wrapped the single module-level
        ``tf_model``, so all returned models shared one set of weights:
        training a second model continued from (and overwrote) the first
        model's fine-tuned weights. Pass ``base_model=tf_model`` to opt back
        into sharing deliberately.
    max_len : int
        Token length of the ``input_ids`` / ``attention_mask`` inputs; must
        match the length the tokenizer pads/truncates to.

    Returns
    -------
    tf.keras.Model
        Model taking ``input_ids`` and ``attention_mask`` and returning class
        probabilities, compiled with categorical cross-entropy and accuracy.
    """
    if base_model is None:
        # Function-scope import keeps this cell runnable on its own.
        from transformers import TFAutoModelForSequenceClassification
        base_model = TFAutoModelForSequenceClassification.from_pretrained(
            'vinai/bertweet-base', config = config, trainable = True)

    # Input names match the keys of the tokenizer output dictionaries.
    input_ids = tf.keras.Input(shape=(max_len,), dtype='int32', name = 'input_ids')
    attention_masks = tf.keras.Input(shape=(max_len,), dtype='int32', name = 'attention_mask')

    # Element 0 of the transformer output holds the classification logits;
    # softmax turns them into probabilities to pair with the one-hot labels.
    logits = base_model([input_ids, attention_masks])[0]
    probabilities = tf.keras.layers.Activation(activation='softmax')(logits)

    model = tf.keras.models.Model(inputs = [input_ids, attention_masks], outputs = probabilities)

    # clipnorm caps each gradient's norm at 1.
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = lr, clipnorm=1.),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model

# Unshuffled tweets: training and evaluation

In [None]:
# Build a classifier and fine-tune it on the original (unshuffled) tweets for a
# single epoch; `history` keeps the object returned by fit().
model = build_model()
history = model.fit(train_tf_dataset, epochs = 1)

In [None]:
# Score the unshuffled-tweet model on its validation dataset, using the loss and
# accuracy metric configured in build_model's compile() call.
model.evaluate(val_tf_dataset)

In [None]:
# Predicted class probabilities for the validation tweets, reduced to hard
# labels; the one-hot targets are collapsed the same way for comparison.
yhat = model.predict(val_token)
pred_labels = yhat.argmax(axis=1)
y_true = y_val.argmax(axis=1)

In [None]:
# Confusion matrix for the unshuffled-tweet model: true labels vs. predictions.
tf.math.confusion_matrix(labels=y_true, predictions=pred_labels)

# Shuffled tweets: training and evaluation

In [None]:
# Build a classifier and fine-tune it on the word-shuffled tweets for a single
# epoch; `shistory` keeps the object returned by fit().
smodel = build_model()
shistory = smodel.fit(strain_tf_dataset, epochs = 1)

In [None]:
# Score the shuffled-tweet model on its validation dataset, using the loss and
# accuracy metric configured in build_model's compile() call.
smodel.evaluate(sval_tf_dataset)

In [None]:
# Predicted class probabilities for the shuffled validation tweets, reduced to
# hard labels; the one-hot targets are collapsed the same way for comparison.
syhat = smodel.predict(sval_token)
spred_labels = syhat.argmax(axis=1)
sy_true = sy_val.argmax(axis=1)

In [None]:
# Confusion matrix for the shuffled-tweet model: true labels vs. predictions.
tf.math.confusion_matrix(labels=sy_true, predictions=spred_labels)

# LIME

In [None]:
def predict_probs(texts):
    """Return the unshuffled-tweet model's class probabilities for ``texts``.

    Parameters
    ----------
    texts : iterable
        Raw texts to classify. A list, tuple or NumPy array is accepted and
        every item is converted with ``str()``, because the tokenizer call
        below is given a plain list of strings.

    Returns
    -------
    The array produced by ``model.predict`` for the tokenised texts: one row
    per input text, holding the softmax output of the model.
    """
    texts = [str(text) for text in texts]
    # Pad/truncate to 128 tokens, the input length declared in build_model.
    text_token = tokenizer(texts,
                           padding="max_length",
                           truncation=True,
                           max_length=128,
                           return_tensors = 'tf').data
    predictions = model.predict(text_token)
    return predictions

In [None]:
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from sklearn.pipeline import make_pipeline

# Position in the list is the class index: 0 = low peace, 1 = high peace,
# matching the 0/1 `peacefulness` label.
class_names = ['low peace','high peace']
explainer = LimeTextExplainer(class_names=class_names)

# Explain the model's prediction for one validation tweet (index i).
i = 0
STR = str(x_val[i])
exp = explainer.explain_instance(STR, classifier_fn=predict_probs)

In [None]:
# Render the LIME explanation inline; text=False is passed to show_in_notebook
# (presumably to omit the highlighted source text; confirm against the lime docs).
exp.show_in_notebook(text=False)

# SHAP

In [None]:
import shap

In [None]:
def predict_probs2(texts):
    """Class probabilities from ``model`` for an iterable of texts.

    Each item is converted with ``str()`` before tokenisation, so inputs that
    are not already plain Python strings are accepted. Returns whatever
    ``model.predict`` yields for the padded, truncated token tensors.
    """
    as_strings = list(map(str, texts))
    encoded = tokenizer(as_strings,
                        padding="max_length",
                        truncation=True,
                        return_tensors = 'tf')
    return model.predict(encoded.data)

In [None]:
# Position in the list is the class index: 0 = low peace, 1 = high peace.
class_names = ['low peace','high peace']
# SHAP explainer around the probability function, with the tokenizer as masker.
explainer = shap.Explainer(
    predict_probs2,
    masker=tokenizer,
    output_names=class_names,
)

In [None]:
# SHAP values for the first five validation tweets. fixed_context=1 is passed
# through to the explainer call (NOTE(review): confirm its effect in the shap docs).
shap_values = explainer(x_val[:5].tolist(), fixed_context=1)

In [None]:
# Text plot of the SHAP values computed above (at most the first five explanations).
shap.plots.text(shap_values[:5])