In [None]:
# Hide Debug Info
# NOTE: TF_CPP_MIN_LOG_LEVEL has to be in the environment *before* TensorFlow
# is imported. TensorFlow's native logger picks the level up when it first
# logs (normally during import), so assigning it afterwards does not silence
# the start-up messages.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import pandas as pd
import numpy as np
import datetime

# Pre-Processing
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Modelling
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GRU, Conv1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Fine-Tuning and Tracking
import wandb

# Evaluation
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, matthews_corrcoef

In [None]:
# Read the pre-processed, binary-labelled dataset from disk.
df = pd.read_json(path_or_buf='../Data/Processed/processed_binary.json')
# Bare expression so the notebook renders a preview of the loaded frame.
df

In [None]:
# Start a Weights & Biases run for experiment tracking
wandb.init(project="misinformation_nlp_BERT")

# Hyper-parameters for this run; later cells read them back via wandb.config
config = {
    'model': 'RoBERTaO_LSTM',
    'num_words': 25000,
    'learning_rate': 0.0001,
    'units': 128,
    'input_len': 70,
    'dropout': 0.5,
    'batch_size': 256,
    'epochs': 3,
    'trainable': True,
    'shuffle': True,
}
wandb.config.update(config)

In [None]:
class BertEmbeddingLayer(tf.keras.layers.Layer):
    """Keras layer that wraps a pretrained ``TFRobertaModel``.

    Args:
        model_name: Checkpoint identifier handed to
            ``TFRobertaModel.from_pretrained``.
        trainable: Forwarded to ``from_pretrained`` as its ``trainable``
            keyword. The default is read from ``wandb.config.trainable`` once,
            at the moment this class definition is executed.
        **kwargs: Passed through to ``tf.keras.layers.Layer.__init__``.
    """

    def __init__(self, model_name='distilroberta-base', trainable=wandb.config.trainable, **kwargs):
        # Imported here so the class does not rely on a later notebook cell
        # having already brought TFRobertaModel into the global namespace.
        from transformers import TFRobertaModel

        super().__init__(**kwargs)
        self.bert = TFRobertaModel.from_pretrained(model_name, trainable=trainable)

    def call(self, inputs):
        """Run the wrapped model on ``inputs`` and return element 0 of its output."""
        # Using the sequence output, not the pooled output
        return self.bert(inputs)[0]

In [None]:

from tensorflow.keras.layers import Input, Dense, LSTM, Dropout
from tensorflow.keras.models import Model
from transformers import TFRobertaModel

# Define input layers
input_ids_layer = Input(shape=(wandb.config.input_len,), dtype=tf.int32, name='input_ids')
attention_mask_layer = Input(shape=(wandb.config.input_len,), dtype=tf.int32, name='attention_mask')

# BertEmbeddingLayer loads the pretrained encoder itself, so no stand-alone
# TFRobertaModel is instantiated here: a second copy would never be wired
# into the graph and would only duplicate the weights in memory.
bert_output = BertEmbeddingLayer()([input_ids_layer, attention_mask_layer])
lstm_output = LSTM(units=wandb.config.units)(bert_output)
dropout_output = Dropout(wandb.config.dropout)(lstm_output)
# Single sigmoid unit for the binary target
output_layer = Dense(1, activation='sigmoid')(dropout_output)

model = Model(inputs=[input_ids_layer, attention_mask_layer], outputs=output_layer)

model.compile(
    optimizer=Adam(learning_rate=wandb.config.learning_rate),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
from transformers import RobertaTokenizer
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Tokenizer loaded from the 'distilroberta-base' checkpoint
tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')

def tokenize_function(texts):
    """Tokenize ``texts`` with the module-level ``tokenizer``.

    Sequences are padded and truncated to ``wandb.config.input_len`` tokens
    and the encoding is requested as TensorFlow tensors.
    """
    encode_options = {
        'padding': "max_length",
        'truncation': True,
        'max_length': wandb.config.input_len,
        'return_tensors': "tf",
    }
    return tokenizer(texts, **encode_options)

tokenized_texts = tokenize_function(df['tweet'].tolist())

# Extract input_ids and attention_masks from tokenized_texts
input_ids_np = tokenized_texts['input_ids'].numpy()
attention_masks_np = tokenized_texts['attention_mask'].numpy()
targets_np = df['target_binary'].to_numpy()

# Split ids, masks and labels in ONE call so that all three arrays are
# partitioned with the same row indices. Separate calls only stay aligned for
# as long as their seed / stratify arguments are kept identical by hand.
(
    X_train_np, X_test_np,
    train_masks_np, test_masks_np,
    y_train_np, y_test_np,
) = train_test_split(
    input_ids_np, attention_masks_np, targets_np,
    test_size=0.25, random_state=14, stratify=targets_np,
)

# Convert the numpy arrays back to tensors if needed
X_train = tf.convert_to_tensor(X_train_np)
X_test = tf.convert_to_tensor(X_test_np)
y_train = tf.convert_to_tensor(y_train_np)
y_test = tf.convert_to_tensor(y_test_np)
train_masks = tf.convert_to_tensor(train_masks_np)
test_masks = tf.convert_to_tensor(test_masks_np)

In [None]:
# Early stopping watches the validation loss (patience of 2 epochs) and is
# configured to restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Wall-clock timestamps taken immediately around the fit call
start_time = datetime.datetime.now()
history = model.fit(
    {'input_ids': X_train, 'attention_mask': train_masks},
    y_train,
    batch_size=wandb.config.batch_size,
    epochs=wandb.config.epochs,
    validation_split=0.15,
    shuffle=wandb.config.shuffle,
    callbacks=[early_stopping, wandb.keras.WandbCallback()],
)
end_time = datetime.datetime.now()

# Elapsed training time in seconds
training_time = (end_time - start_time).total_seconds()

In [None]:
from sklearn.metrics import accuracy_score

# Get model predictions for both training and test sets (train first, then test)
predictions_train, predictions = (
    model.predict([ids, masks])
    for ids, masks in ((X_train, train_masks), (X_test, test_masks))
)

# Threshold the model outputs at 0.5 to obtain hard 0/1 labels
train_preds_binary, test_preds_binary = (
    (scores > 0.5).astype(int) for scores in (predictions_train, predictions)
)

train_accuracy = accuracy_score(y_true=y_train, y_pred=train_preds_binary)
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_preds_binary)

# Print out the accuracies
print(f"Training Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

In [None]:
# AUC is computed from the raw model outputs; MCC and the classification
# report use the thresholded 0/1 labels.
auc = roc_auc_score(y_true=y_test, y_score=predictions)
mcc = matthews_corrcoef(y_true=y_test, y_pred=test_preds_binary)

print("AUC:", auc)
print("MCC:", mcc)
print(" --- CLASSIFICATION REPORT --- " )
print(classification_report(y_true=y_test, y_pred=test_preds_binary))

In [None]:
# Load the two validation sets: the internal one first, then the external one.
internal_val_df, external_val_df = (
    pd.read_json(json_path)
    for json_path in (
        '../Data/Processed/processed_binary_val.json',
        '../Data/Cross_Validation/COVID_processed.json',
    )
)

In [None]:
# Tokenize the 'tweet' column of each validation frame with the same
# tokenize_function used for the training data.
i_tokenized_texts, e_tokenized_texts = (
    tokenize_function(frame['tweet'].tolist())
    for frame in (internal_val_df, external_val_df)
)

In [None]:
# Model outputs for the internal validation set, fed by input name
i_predictions = model.predict(
    {name: i_tokenized_texts[name] for name in ('input_ids', 'attention_mask')}
)

In [None]:
# Model outputs for the external validation set, fed by input name
e_predictions = model.predict(
    {name: e_tokenized_texts[name] for name in ('input_ids', 'attention_mask')}
)

In [None]:
# Threshold both sets of validation outputs at 0.5 to get hard 0/1 labels
i_predictions_binary, e_predictions_binary = (
    (scores > 0.5).astype(int) for scores in (i_predictions, e_predictions)
)

In [None]:
# Internal validation set: labels are read from the 'target_binary' column
i_val_mcc = matthews_corrcoef(internal_val_df['target_binary'], i_predictions_binary)
i_val_acc = accuracy_score(internal_val_df['target_binary'], i_predictions_binary)
print(f"(I) Validation: Matthews Correlation Coefficient: {i_val_mcc}")
print(f"(I) Validation: Accuracy: {i_val_acc}")
print("---")
# External validation set: labels are read from the 'target' column
e_val_mcc = matthews_corrcoef(external_val_df['target'], e_predictions_binary)
e_val_acc = accuracy_score(external_val_df['target'], e_predictions_binary)
# This line reports the EXTERNAL MCC, so it carries the "(E)" tag.
print(f"(E) Validation: Matthews Correlation Coefficient: {e_val_mcc}")
print(f"(E) Validation Accuracy: {e_val_acc}")

In [None]:
# Combined score: the plain sum (not the mean) of the internal and external MCCs
total_val_mcc = e_val_mcc + i_val_mcc

In [None]:
# Send the summary metrics gathered above to the active wandb run
wandb.log(
    {
        'Train Accuracy': train_accuracy,
        'Test Accuracy': test_accuracy,
        'AUC': auc,
        'MCC': mcc,
        'Training Time': training_time,
        '(Internal) Validation MCC': i_val_mcc,
        '(Internal) Validation ACC': i_val_acc,
        '(External) Validation MCC': e_val_mcc,
        '(External) Validation Accuracy': e_val_acc,
        'Total Validation MCC': total_val_mcc,
    }
)

In [None]:
# Finish the wandb run that was started with wandb.init(...) earlier in the notebook.
wandb.finish()

In [None]:
import plotly.express as px

# `predictions` holds the model outputs for the test split; plot how they are
# distributed across 50 bins.
fig = px.histogram(
    predictions,
    nbins=50,
    labels={'value': 'Prediction Confidence'},
).update_layout(title='Distribution of Prediction Confidence')
fig.show()
