In [None]:
# Hide Debug Info
# TF_CPP_MIN_LOG_LEVEL has to be in the environment before TensorFlow is first
# imported; setting it afterwards is too late to silence the native log output
# emitted while the library loads. '2' filters out INFO and WARNING messages.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import pandas as pd
import numpy as np

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer

# Modelling
import tensorflow as tf
from tensorflow.keras import preprocessing as kprocessing
from tensorflow.keras import models, layers, optimizers
from tensorflow.keras.optimizers import Adam

# Get PLLM
import transformers
from transformers import DistilBertTokenizer
from transformers import RobertaTokenizer, RobertaModel, TFRobertaModel, RobertaConfig

# Fine-Tuning and Tracking
import wandb

# Evaluation
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, matthews_corrcoef

# Visualization
import matplotlib
from matplotlib import pyplot as plt

In [None]:
# Read the processed dataset from JSON into a DataFrame; the bare `df` on the
# last line renders it as the cell output.
df = pd.read_json('../Data/Processed/processed_binary.json')
df

In [None]:
# Load the pretrained tokenizer that matches the 'distilroberta-base' checkpoint.
tkzr = RobertaTokenizer.from_pretrained('distilroberta-base')

In [None]:
# token_lengths = []
# for text in df['tweet']:
#     tokens = tkzr.encode(text, add_special_tokens=True)
#     token_lengths.append(len(tokens))

# token_lengths = np.array(token_lengths)

# percentiles = [50, 75, 90, 95, 99]
# for p in percentiles:
#     print(f"{p}th percentile of token length: {np.percentile(token_lengths, p)}")

In [None]:
# Start a Weights & Biases run and register the hyperparameters that the later
# cells read back through wandb.config.
wandb.init(project="misinformation_glove_nlp_BERT")
config = {
    'epochs': 2,
    'batch_size': 256,
    'max_len': 70,  # fixed token length every tweet is padded/truncated to
    # NOTE(review): 0.005 is much larger than the learning rates usually used
    # when fine-tuning a pretrained transformer encoder - confirm it is intended.
    'lr': 0.005,
    # NOTE(review): the checkpoint loaded elsewhere is 'distilroberta-base';
    # confirm this tracking label is the one wanted.
    'model': 'DistillBERT',
    'label': '2',
}
wandb.config.update(config)

In [None]:
# Raw tweet text is the model input; the binary target column is the label.
X, y = df['tweet'], df['target_binary']

In [None]:
# Encode every tweet as a fixed-length row of token ids: each sequence is padded
# or truncated to wandb.config.max_len.
# The text is read straight from df['tweet'] rather than from X, because this
# cell rebinds X to the integer id matrix; iterating X itself would make the
# cell fail if it were ever run a second time.
X = np.array(
    [
        tkzr(tweet, padding='max_length', max_length=wandb.config.max_len, truncation=True)['input_ids']
        for tweet in df['tweet']
    ],
    dtype='int32',
)

In [None]:
# Hold out 25% of the rows for testing, stratified on the binary target so both
# splits keep the same class balance; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=14,
    stratify=df['target_binary'],
)

In [None]:
# Sanity check: report the shape of the training id matrix.
print('Shape of training data: ', X_train.shape)

In [None]:
# Load the checkpoint's configuration with both dropout probabilities overridden
# to 0.2. Note that this rebinds `config`, which until now held the
# hyperparameter dict passed to wandb.config.update.
config = RobertaConfig.from_pretrained(
    'distilroberta-base',
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2,
)

In [None]:
# Instantiate the pretrained TensorFlow encoder using the configuration built above.
dbert = TFRobertaModel.from_pretrained('distilroberta-base', config=config)

In [None]:
# Build the classifier: token ids -> encoder -> first-token embedding -> sigmoid.
input_ids_in = layers.Input(shape=(wandb.config.max_len,), dtype='int32')

# Every input row is padded to a fixed length, so derive an attention mask that
# is 1 for real tokens and 0 for padding. Without an explicit mask the encoder
# attends to every position, padding included, which distorts the pooled
# first-token representation. Computing the mask inside the model keeps the
# model's single-input interface, so fit/predict callers stay unchanged.
pad_token_id = tkzr.pad_token_id
attention_mask = layers.Lambda(
    lambda ids: tf.cast(tf.not_equal(ids, pad_token_id), 'int32'),
    name='padding_mask',
)(input_ids_in)

# Alternative head kept for reference (adds dropout before the output layer):
# bert_output = dbert(input_ids=input_ids_in)[0][:,0,:]
# dropout_output = layers.Dropout(0.5)(bert_output)
# x = layers.Dense(1, activation='sigmoid')(dropout_output)

# [0] selects the encoder's sequence output; [:, 0, :] keeps the first token's embedding.
bert_output = dbert(input_ids=input_ids_in, attention_mask=attention_mask)[0][:, 0, :]
x = layers.Dense(1, activation='sigmoid')(bert_output)

dbert_model = models.Model(inputs=input_ids_in, outputs=x)

dbert_model.compile(
    optimizer=Adam(learning_rate=wandb.config.lr),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
import datetime as dt

# Train the model, using the held-out split for per-epoch validation and
# streaming metrics to W&B, and record how long training took in seconds.
start_time = dt.datetime.now()
history = dbert_model.fit(
    X_train,
    y_train,
    batch_size=wandb.config.batch_size,
    epochs=wandb.config.epochs,
    shuffle=True,
    validation_data=(X_test, y_test),
    callbacks=[wandb.keras.WandbCallback()],
)
end_time = dt.datetime.now()

training_time_bert = (end_time - start_time).total_seconds()

In [None]:
# Run the trained model over both splits; the sigmoid head yields one score per row.
predictions_train = dbert_model.predict(X_train)
predictions_test = dbert_model.predict(X_test)

In [None]:
# Threshold the sigmoid scores at 0.5 to get hard 0/1 labels, then score each split.
train_preds_binary = (predictions_train > 0.5).astype(int)
train_accuracy = accuracy_score(y_train, train_preds_binary)

test_preds_binary = (predictions_test > 0.5).astype(int)
test_accuracy = accuracy_score(y_test, test_preds_binary)

print(f"Training Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

In [None]:
# Test-set metrics: AUC is computed from the raw scores, MCC and the
# classification report from the thresholded 0/1 predictions.
mcc = matthews_corrcoef(y_true=y_test, y_pred=test_preds_binary)
auc = roc_auc_score(y_true=y_test, y_score=predictions_test)

print("AUC:", auc)
print("MCC:", mcc)
print(" --- CLASSIFICATION REPORT --- " )
print(classification_report(y_true=y_test, y_pred=test_preds_binary))

In [None]:
# Load the two additional evaluation sets used below: the "internal" validation
# file and the "external" COVID file.
internal_val_df = pd.read_json('../Data/Processed/processed_binary_val.json')
external_val_df = pd.read_json('../Data/Cross_Validation/COVID_processed.json')

In [None]:
# Internal validation set: tweet text and its binary target.
i_X, i_y = internal_val_df['tweet'], internal_val_df['target_binary']

In [None]:
# Reuse the tokenizer already loaded as `tkzr` instead of loading the same
# 'distilroberta-base' vocabulary a second time. The `tokenizer` name stays
# bound because a later cell refers to it.
tokenizer = tkzr

# Encode the internal validation tweets to fixed-length id rows. The text is
# read from the dataframe column rather than from i_X, because this cell rebinds
# i_X to the integer id matrix and would otherwise fail if run a second time.
i_X = np.array(
    [
        tokenizer(tweet, padding='max_length', max_length=wandb.config.max_len, truncation=True)['input_ids']
        for tweet in internal_val_df['tweet']
    ],
    dtype='int32',
)

In [None]:
# Score the internal validation set with the trained model.
i_predictions = dbert_model.predict(i_X)

In [None]:
# External validation set: tweet text and its label (stored in the 'target' column here).
e_X, e_y = external_val_df['tweet'], external_val_df['target']

In [None]:
# Encode the external validation tweets to fixed-length id rows. The text is
# read from the dataframe column rather than from e_X, because this cell rebinds
# e_X to the integer id matrix and would otherwise fail if run a second time.
e_X = np.array(
    [
        tokenizer(tweet, padding='max_length', max_length=wandb.config.max_len, truncation=True)['input_ids']
        for tweet in external_val_df['tweet']
    ],
    dtype='int32',
)

In [None]:
# Score the external validation set with the trained model.
e_predictions = dbert_model.predict(e_X)

In [None]:
# Threshold the validation scores at 0.5 to obtain hard 0/1 predictions.
i_predictions_binary = (i_predictions > 0.5).astype(int)
e_predictions_binary = (e_predictions > 0.5).astype(int)

In [None]:
# MCC and accuracy on the internal validation set. i_y is the
# internal_val_df['target_binary'] column bound in an earlier cell.
i_val_mcc = matthews_corrcoef(i_y, i_predictions_binary)
i_val_acc = accuracy_score(i_y, i_predictions_binary)

# Same two metrics on the external set. e_y is the external_val_df['target'] column.
e_val_mcc = matthews_corrcoef(e_y, e_predictions_binary)
e_val_acc = accuracy_score(e_y, e_predictions_binary)

print(f"(I) Validation: Matthews Correlation Coefficient: {i_val_mcc}")
print(f"(I) Validation: Accuracy: {i_val_acc}")
print("---")
print(f"(E) Validation: Matthews Correlation Coefficient: {e_val_mcc}")
print(f"(E) Validation Accuracy: {e_val_acc}")

In [None]:
# Single summary figure: the sum of the internal and external validation MCCs.
total_val_mcc = i_val_mcc + e_val_mcc

In [None]:
# Push the final summary metrics to the W&B run in one log call, then close the run.
final_metrics = {
    'Train Accuracy': train_accuracy,
    'Test Accuracy': test_accuracy,
    'AUC': auc,
    'MCC': mcc,
    'Training Time': training_time_bert,
    '(Internal) Validation MCC': i_val_mcc,
    '(Internal) Validation ACC': i_val_acc,
    '(External) Validation MCC': e_val_mcc,
    '(External) Validation Accuracy': e_val_acc,
    'Total Validation MCC': total_val_mcc,
}
wandb.log(final_metrics)
wandb.finish()

In [None]:
# Display the combined validation MCC as the cell output.
total_val_mcc