In [None]:
import datetime
import os

# Hide Debug Info
# This has to be in the environment *before* TensorFlow is imported: the
# native logger picks the threshold up while the library initialises, so
# setting it after the import can leave start-up messages unfiltered.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import numpy as np
import pandas as pd

# Pre-Processing
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Modelling
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GRU, Conv1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Fine-Tuning and Tracking
import wandb

# Evaluation
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, matthews_corrcoef

In [None]:
# Read the processed binary-label dataset and show it as the cell output.
processed_path = '../Data/Processed/processed_binary.json'
df = pd.read_json(processed_path)
df

In [None]:
# Initialise Weights and Bias
wandb.init(project="misinformation_nlp")

# Hyper-parameters for this run; later cells read them back via `wandb.config`.
config = {
    'model': 'BiLSTM+DENSE',
    'embedding_dim': 200,
    'num_words': 20000,
    'learning_rate': 0.0001,
    'units': 52,
    'input_len': 56,
    'trainable': False,
    'dropout': 0.5,
    'batch_size': 256,
    'epochs': 20,
    'shuffle': True,
}
wandb.config.update(config)

In [None]:
def load_glove_embeddings(glove_file):
    """Parse a GloVe-style text file into a ``{token: vector}`` dictionary.

    Every non-blank line is split on whitespace; the first field is used as
    the token and the remaining fields are converted to a ``float32`` array.
    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of raising ``IndexError``.

    Args:
        glove_file: Path of the UTF-8 encoded embeddings text file.

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` numpy array.
    """
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Nothing on this line: no token to index, so move on.
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# The file name is selected by the embedding dimension configured for the run.
glove_file = f'../Embeddings/glove.twitter.27B.{wandb.config.embedding_dim}d.txt'
embeddings_index = load_glove_embeddings(glove_file)

In [None]:
# Tweets are cast to str before being handed to the tokenizer.
texts = df['tweet'].astype(str)

# Case is preserved (lower=False); out-of-vocabulary words are mapped to the
# token literally named "<pad>".
tokenizer = Tokenizer(
    lower=False,
    oov_token="<pad>",
    num_words=wandb.config.num_words,
)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Length of the longest encoded tweet, printed for reference.
max_len = max(map(len, sequences))

print(max_len)

In [None]:
# Pad at the end ('post') so every sequence has the configured input length.
data = pad_sequences(
    sequences,
    maxlen=wandb.config.input_len,
    padding='post',
)

In [None]:
# One row per tokenizer index plus one extra row for index 0.
vocab_size = 1 + len(tokenizer.word_index)

# Rows stay all-zero for words that have no pre-trained vector.
embedding_matrix = np.zeros((vocab_size, wandb.config.embedding_dim))
for token, row in tokenizer.word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [None]:
# model = Sequential()
# model.add(Embedding(input_dim=vocab_size, output_dim=wandb.config.embedding_dim, input_length=wandb.config.input_len, trainable=wandb.config.trainable))
# model.add(LSTM(units=wandb.config.units))
# model.add(Dropout(wandb.config.dropout))
# model.add(Dense(1, activation='sigmoid'))

# optimiser = Adam(learning_rate=wandb.config.learning_rate)

In [None]:
# model = Sequential()
# model.add(Embedding(input_dim=vocab_size, output_dim=wandb.config.embedding_dim, input_length=wandb.config.input_len, trainable=wandb.config.trainable))
# model.add(Bidirectional(LSTM(units=wandb.config.units)))
# model.add(Dropout(wandb.config.dropout))
# model.add(Dense(1, activation='sigmoid'))

# optimiser = Adam(learning_rate=wandb.config.learning_rate)

In [None]:
# model = Sequential()
# model.add(Embedding(input_dim=vocab_size, output_dim=wandb.config.embedding_dim, input_length=wandb.config.input_len, trainable=wandb.config.trainable))
# model.add(GRU(units=wandb.config.units))
# model.add(Dropout(wandb.config.dropout))
# model.add(Dense(1, activation='sigmoid'))

# optimiser = Adam(learning_rate=wandb.config.learning_rate)

In [None]:
# model = Sequential()
# model.add(Embedding(input_dim=vocab_size, output_dim=wandb.config.embedding_dim, input_length=wandb.config.input_len, trainable=wandb.config.trainable))
# model.add(Bidirectional(GRU(units=wandb.config.units)))
# model.add(Dropout(wandb.config.dropout))
# model.add(Dense(1, activation='sigmoid'))

# optimiser = Adam(learning_rate=wandb.config.learning_rate)

In [None]:
# from keras.models import Sequential
# from keras.layers import Embedding, Conv1D, LSTM, Bidirectional, Dropout, Dense
# from keras.optimizers import Adam

# model = Sequential()
# model.add(Embedding(input_dim=vocab_size, output_dim=wandb.config.embedding_dim, input_length=wandb.config.input_len, trainable=wandb.config.trainable))
# model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
# model.add(LSTM(units=wandb.config.units))
# model.add(Dropout(wandb.config.dropout))
# model.add(Dense(1, activation='sigmoid'))

# optimiser = Adam(learning_rate=wandb.config.learning_rate)

In [None]:
from tensorflow.keras.layers import Layer
import tensorflow.keras.backend as K

class attention(Layer):
    """Additive attention pooling that collapses axis 1 of its input.

    A score is computed per position along axis 1, the scores are turned
    into weights with a softmax, and the input is summed along axis 1 using
    those weights.
    NOTE(review): presumably the input is (batch, timesteps, features) --
    confirm against the layer feeding this one.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the projection weight and the per-position bias."""
        positions = input_shape[1]
        feature_dim = input_shape[-1]
        self.W = self.add_weight(
            name='attention_weight',
            shape=(feature_dim, 1),
            initializer='random_normal',
            trainable=True,
        )
        # The bias has one entry per position, so axis 1 must have a known size.
        self.b = self.add_weight(
            name='attention_bias',
            shape=(positions, 1),
            initializer='zeros',
            trainable=True,
        )
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        # tanh-squashed alignment scores with the trailing size-1 axis removed.
        scores = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
        # Softmax over the remaining last axis, then restore the trailing
        # axis so the weights broadcast against x.
        weights = K.expand_dims(K.softmax(scores), axis=-1)
        return K.sum(x * weights, axis=1)

In [None]:
# BiLSTM encoder followed by a small dense classification head.
model = Sequential()
# Initialise the embedding layer from the pre-trained matrix built above.
# Without this the layer starts from random weights, and with trainable=False
# those random weights would stay frozen for the whole run.
model.add(Embedding(
    input_dim=vocab_size,
    output_dim=wandb.config.embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    input_length=wandb.config.input_len,
    trainable=wandb.config.trainable,
))
model.add(Bidirectional(LSTM(units=wandb.config.units)))
model.add(Dense(64, activation='sigmoid'))
model.add(Dropout(wandb.config.dropout))
model.add(Dense(32, activation='sigmoid'))
# Single sigmoid unit: one score in (0, 1) per sample for the binary target.
model.add(Dense(1, activation='sigmoid'))

optimiser = Adam(learning_rate=wandb.config.learning_rate)

In [None]:
# Binary cross-entropy loss, with accuracy reported during training.
model.compile(
    loss='binary_crossentropy',
    optimizer=optimiser,
    metrics=['accuracy'],
)

In [None]:
# Hold out 25% of the rows as the test split; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    df['target_binary'],
    random_state=14,
    test_size=0.25,
)

In [None]:
# Watch validation loss with a patience of 5 epochs and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Wall-clock timestamps taken immediately around the fit call.
start_time = datetime.datetime.now()
history = model.fit(
    X_train,
    y_train,
    batch_size=wandb.config.batch_size,
    epochs=wandb.config.epochs,
    validation_split=0.15,
    shuffle=wandb.config.shuffle,
    callbacks=[early_stopping, wandb.keras.WandbCallback()],
)
end_time = datetime.datetime.now()

# Training duration in seconds.
training_time = (end_time - start_time).total_seconds()

In [None]:
# Score the training split first, then the held-out test split.
predictions_train, predictions = [
    model.predict(split) for split in (X_train, X_test)
]

In [None]:
# Scores strictly above 0.5 become class 1, everything else class 0.
train_preds_binary = np.greater(predictions_train, 0.5).astype(int)
train_acc = accuracy_score(y_true=y_train, y_pred=train_preds_binary)
train_acc

In [None]:
# Scores strictly above 0.5 become class 1, everything else class 0.
test_preds_binary = np.greater(predictions, 0.5).astype(int)
test_acc = accuracy_score(y_true=y_test, y_pred=test_preds_binary)
test_acc

In [None]:
# AUC is computed from the raw scores, MCC from the thresholded labels.
auc = roc_auc_score(y_test, predictions)
mcc = matthews_corrcoef(y_test, test_preds_binary)

for label, value in (
    ("Train Accuracy:", train_acc),
    ("Test Accuracy:", test_acc),
    ("AUC:", auc),
    ("MCC:", mcc),
):
    print(label, value)
print(" --- CLASSIFICATION REPORT --- ")
print(classification_report(y_test, test_preds_binary))

In [None]:
# Two extra evaluation sets: one from the same processed source ("internal")
# and one from the separate COVID file ("external").
internal_val_df = pd.read_json(path_or_buf='../Data/Processed/processed_binary_val.json')
external_val_df = pd.read_json(path_or_buf='../Data/Cross_Validation/COVID_processed.json')

In [None]:
i_val_texts = internal_val_df['tweet'].astype(str)
e_val_texts = external_val_df['tweet'].astype(str)

# Encode with the tokenizer that was fitted on the training tweets. Fitting a
# fresh tokenizer here would assign different integer ids to the same words,
# so the model would receive indices that no longer match what it learned.
i_val_sequences = tokenizer.texts_to_sequences(i_val_texts)
e_val_sequences = tokenizer.texts_to_sequences(e_val_texts)

# Pad on the same side ('post') as the training data so the token positions
# the model sees are laid out the same way.
i_val_padded = pad_sequences(i_val_sequences, padding='post', maxlen=wandb.config.input_len)
e_val_padded = pad_sequences(e_val_sequences, padding='post', maxlen=wandb.config.input_len)

In [None]:
# Raw model scores for the internal and external validation sets.
i_val_predictions = model.predict(i_val_padded)
e_val_predictions = model.predict(e_val_padded)

# Convert probabilities to binary
i_val_predictions_binary = np.greater(i_val_predictions, 0.5).astype(int)
e_val_predictions_binary = np.greater(e_val_predictions, 0.5).astype(int)

In [None]:
# Internal validation set: labels are read from the 'target_binary' column.
i_val_mcc = matthews_corrcoef(internal_val_df['target_binary'], i_val_predictions_binary)
i_val_acc = accuracy_score(internal_val_df['target_binary'], i_val_predictions_binary)
print(f"(I) Validation: Matthews Correlation Coefficient: {i_val_mcc}")
print(f"(I) Validation: Accuracy: {i_val_acc}")
print("---")
# External validation set: labels are read from the 'target' column.
e_val_mcc = matthews_corrcoef(external_val_df['target'], e_val_predictions_binary)
e_val_acc = accuracy_score(external_val_df['target'], e_val_predictions_binary)
# The external MCC was previously printed with the "(I)" prefix, which made it
# look like a second internal result.
print(f"(E) Validation: Matthews Correlation Coefficient: {e_val_mcc}")
print(f"(E) Validation Accuracy: {e_val_acc}")

In [None]:
# Combined score: the sum of the external and internal validation MCCs.
total_val_mcc = e_val_mcc + i_val_mcc

In [None]:
# Send all summary metrics for this run to Weights & Biases in one call.
run_metrics = {
    'Train Accuracy': train_acc,
    'Test Accuracy': test_acc,
    'AUC': auc,
    'MCC': mcc,
    'Training Time': training_time,
    '(Internal) Validation MCC': i_val_mcc,
    '(Internal) Validation ACC': i_val_acc,
    '(External) Validation MCC': e_val_mcc,
    '(External) Validation Accuracy': e_val_acc,
    'Total Validation MCC': total_val_mcc,
}
wandb.log(run_metrics)

In [None]:
# Mark the Weights & Biases run opened with wandb.init() as finished.
wandb.finish()