# Identifying Misinformation in Social Media and News Sources

This notebook builds a model that detects misinformation in social media posts and news articles. We use the WatClaimCheck paper as our starting point, with slight variations. The sections below walk through each step of the process.

TODO:
- Add high level overview of how notebook is structured
- Add diagrams on how models are used

In [None]:
# Put the parent of the working directory on the import path so that modules
# stored one level up can be imported from this notebook.
import os
import sys

notebook_dir = os.getcwd()
parent_path = os.path.abspath(os.path.join(notebook_dir, os.pardir))
# Index 1 keeps the interpreter's own first entry at the front of sys.path
sys.path.insert(1, parent_path)

from helper import download_dataset, download_article

# DPR
from transformers import DPRContextEncoderTokenizer, TFDPRContextEncoder
from transformers import DPRQuestionEncoderTokenizer, TFDPRQuestionEncoder

# RoBERTa
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder

import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

# Report how many GPUs TensorFlow can see before any model is built
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

## Read and Preprocess Data

In [None]:
# Set Global Variables
# Location of the local WatClaimCheck dataset copy. It is handed to
# `download_dataset` and `download_article` in the cells that follow.
# NOTE(review): this is a relative path — presumably resolved against the
# notebook's working directory; confirm in `helper`.
DATASET_FP = "../WatClaimCheck_dataset" # CHANGE TO MATCH LOCAL

In [None]:
# Load the train / validation / test splits from the local dataset copy
train_df, valid_df, test_df = download_dataset(DATASET_FP)

# Show `.count()` for every split (for a pandas DataFrame this is the number
# of non-null entries per column — assumes the splits are DataFrames).
for split_df in (train_df, valid_df, test_df):
    print(split_df.count())

In [None]:
# Get review article content
def _read_review_article(article_ref):
    """Join the pieces returned by `download_article` with single spaces."""
    return ' '.join(download_article(DATASET_FP, article_ref))


# Attach the joined article text to every split under the same column name
for split_df in (train_df, valid_df, test_df):
    split_df['review_article_content'] = split_df['review_article'].apply(_read_review_article)


In [None]:
# Add question
def _claim_as_question(claim):
    """Embed a claim in the fixed true / false / partially-true prompt."""
    return f"Is the claim \"{claim}\" true, false, or partially true/false?"


# Every split gets the same `question` column derived from its `claim` column
for split_df in (train_df, valid_df, test_df):
    split_df['question'] = split_df['claim'].apply(_claim_as_question)

In [None]:
# Print a single training example to sanity-check the loaded columns
idx = 0


def _field(column):
    """Return the value stored in `column` for the inspected training row."""
    return train_df[column][idx]


print(_field('claim'))
print(_field('rating'), _field('original_rating'))
for column in ('premise_articles', 'reviewer_site'):
    print(_field(column))
print()
print(_field('review_article_content'))

## DPR Model

In [None]:
# Import libraries
import tensorflow as tf
from datasets import load_dataset
from transformers import TFAutoModel, AutoTokenizer

# Load the dataset and pick out its train / validation splits
dataset = load_dataset("my_dataset") # Replace with your dataset name
train_dataset, val_dataset = dataset["train"], dataset["validation"]

# Load the pretrained DPR model and its matching tokenizer from one checkpoint
model_name = "facebook/dpr-question_encoder-single-nq-base" # Replace with your model name
model = TFAutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Column names read by `encode`: model inputs first, then the target
input_features = ["question", "passages"]
output_features = ["relevance"]

# Tokenize and encode the dataset
def encode(examples):
    """Tokenize a batch of question/passage pairs for `Dataset.map`.

    Args:
        examples: A batch (``batched=True``) mapping column names to lists.
            The "question", "passages" and "relevance" columns are read.

    Returns:
        A plain dict with the tokenizer outputs plus an integer "relevance"
        column. `Dataset.map` only accepts a dict (or None) from the mapped
        function; returning a tuple raises ``TypeError``.
    """
    # NOTE(review): assumes "passages" holds one passage string per example so
    # it can be paired with the question — confirm against the dataset schema.
    # No `return_tensors`: `map` stores the returned columns itself, so plain
    # Python lists are sufficient here.
    encoded = tokenizer(
        examples["question"],
        examples["passages"],
        padding="max_length",
        truncation=True,
    )
    features = dict(encoded)
    features["relevance"] = [int(label) for label in examples["relevance"]]
    return features

# Run `encode` over both splits in batches, training split first
train_dataset, val_dataset = (
    train_dataset.map(encode, batched=True),
    val_dataset.map(encode, batched=True),
)

# Create a Keras model that wraps the DPR model
class DPRModel(tf.keras.Model):
    """Keras wrapper that scores passages against a question by dot product.

    NOTE(review): `call` reads `question_encoder` and `ctx_encoder` from the
    wrapped model — confirm the object passed to the constructor exposes both.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def call(self, inputs):
        """Return one similarity logit per (question, passage) pair.

        `inputs` must provide "input_ids" and "attention_mask". Both are
        indexed with three axes: slot 0 on the second axis is treated as the
        question and every later slot as a passage.
        """
        token_ids = inputs["input_ids"]
        token_mask = inputs["attention_mask"]

        # Slot 0 along the second axis is the question
        question_output = self.model.question_encoder(
            token_ids[:, 0, :], attention_mask=token_mask[:, 0, :]
        )
        question_embeddings = question_output.pooler_output

        # All remaining slots are candidate passages
        # NOTE(review): the einsum below needs passage embeddings of rank 3
        # (n, p, d) — confirm the encoder accepts and returns that layout.
        passage_output = self.model.ctx_encoder(
            token_ids[:, 1:, :], attention_mask=token_mask[:, 1:, :]
        )
        passage_embeddings = passage_output.pooler_output

        # Dot product between each question vector and each of its passages;
        # the scores are returned unnormalised, i.e. as logits
        return tf.einsum("nd,npd->np", question_embeddings, passage_embeddings)

# Wrap the pretrained model in the Keras scorer defined above
keras_model = DPRModel(model)

# The scorer returns logits, hence from_logits=True on the loss
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

# Compile the Keras model
keras_model.compile(
    optimizer="adam",
    loss=loss,
    metrics=[accuracy],
)

# Fit the Keras model on the dataset
# NOTE(review): `train_dataset` / `val_dataset` are passed to `fit` as-is —
# confirm they are in a form Keras accepts.
keras_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3,  # Adjust the number of epochs as needed
)

In [None]:
# Checkpoint names for the two halves of DPR: context and question encoders
dpr_context_checkpoint = 'facebook/dpr-ctx_encoder-single-nq-base'
dpr_question_checkpoint = 'facebook/dpr-question_encoder-single-nq-base'

# Context side: tokenizer first, then the TensorFlow encoder
dpr_context_encoder_tokenizer = DPRContextEncoderTokenizer.from_pretrained(dpr_context_checkpoint)
dpr_context_encoder_model = TFDPRContextEncoder.from_pretrained(dpr_context_checkpoint)

# Question side: same order
dpr_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(dpr_question_checkpoint)
dpr_question_encoder_model = TFDPRQuestionEncoder.from_pretrained(dpr_question_checkpoint)

In [None]:
# Tokenize the review articles for the DPR context encoder.
# Articles differ in length, so they have to be padded to a common length
# before they can be stacked into a single tensor, and truncated so that no
# sequence exceeds the 512-token input limit of the BERT-base encoder.
tokens = dpr_context_encoder_tokenizer(
    train_df['review_article_content'].to_list(),
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors = 'tf'
)

In [None]:
# Run the context encoder on the tokenized articles; the pooled output is the
# last expression so the notebook displays it
context_encoder_output = dpr_context_encoder_model(tokens)
context_encoder_output.pooler_output

## RoBERTa Model

In [None]:
# Maximum token length used when tokenizing claims
claim_max_seq_len = 128
# Maximum token length reserved for sentences
sentence_max_seq_len = 320
# The original name was mis-cased; it is kept as an alias so that existing
# references keep working
sentence_max_Seq_len = sentence_max_seq_len
# Number of leading rows taken from the training / validation splits
num_train_examples = 26976
num_valid_examples = 3372
# Pretrained checkpoint loaded for both tokenizer and classifier
checkpoint = 'distilroberta-base'

In [None]:
# Tokenizer and three-label sequence classifier, both loaded from `checkpoint`
roberta_tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
roberta_model = TFRobertaForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=3,
)

In [None]:
# One-hot encoder for the rating labels; dense output so Keras can consume it
encoder = OneHotEncoder(sparse_output=False)

# Tokenize the first `num_train_examples` claims to a fixed length
train_inputs = roberta_tokenizer(
    train_df['claim'][:num_train_examples].to_list(),
    max_length=claim_max_seq_len,
    truncation=True,
    padding='max_length',
    return_tensors='tf'
)

# Slice the ratings exactly like the claims so that inputs and labels always
# have the same number of rows, even when fewer examples are requested than
# the split contains.
train_labels = encoder.fit_transform(
    np.array(train_df['rating'][:num_train_examples]).reshape(-1, 1)
)

# Same treatment for the validation claims
valid_inputs = roberta_tokenizer(
    valid_df['claim'][:num_valid_examples].to_list(),
    max_length=claim_max_seq_len,
    truncation=True,
    padding='max_length',
    return_tensors='tf'
)

# Reuse the encoder fitted on the training ratings (transform, not
# fit_transform): re-fitting on validation data could yield a different column
# order or column count if the validation split lacks a class.
valid_labels = encoder.transform(
    np.array(valid_df['rating'][:num_valid_examples]).reshape(-1, 1)
)

In [None]:
hidden_size = 100
learning_rate = 1e-5

# Symbolic inputs matching the fixed-length tokenized claims
input_ids = tf.keras.layers.Input(shape=(claim_max_seq_len,), dtype=tf.int64, name='input_ids_layer')
attention_mask = tf.keras.layers.Input(shape=(claim_max_seq_len,), dtype=tf.int64, name='attention_mask_layer')

roberta_inputs = {'input_ids': input_ids,
                   'attention_mask': attention_mask}

# Fine-tune the whole transformer, not only the classification head
roberta_model.trainable = True
roberta_outputs = roberta_model(roberta_inputs)

# The sequence-classification head returns raw logits. The loss below uses
# its default from_logits=False and Precision / Recall / F1Score work on
# probabilities, so turn the logits into class probabilities and expose that
# single tensor as the model output.
class_probabilities = tf.keras.layers.Softmax(name='class_probabilities')(roberta_outputs.logits)

model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=class_probabilities)

model.compile(
    optimizer = tf.keras.optimizers.AdamW(learning_rate=learning_rate),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[
        tf.keras.metrics.CategoricalAccuracy(),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tf.keras.metrics.F1Score(average='macro')
    ]
)


In [None]:
# Print the layer-by-layer summary of the classifier built above
model.summary()

In [None]:
# Draw the model graph, annotated with tensor shapes, at 90 dpi
keras.utils.plot_model(model, show_shapes=True, dpi=90)

In [None]:
# Training configuration
batch_size = 12
epochs = 10
checkpoint_path = "training/cp.ckpt"

# Checkpoint callback that writes only the weights to `checkpoint_path`
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
)

# Feature lists follow the order of the model's inputs: ids, then mask
train_features = [train_inputs.input_ids, train_inputs.attention_mask]
valid_features = [valid_inputs.input_ids, valid_inputs.attention_mask]

model_history = model.fit(
    x=train_features,
    y=train_labels,
    validation_data=(valid_features, valid_labels),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[cp_callback],
)

In [None]:
# Plot every tracked metric for training and validation, one subplot each.
# Each entry: (history key, subplot title, y-axis label,
#              training legend label, validation legend label)
metric_panels = [
    ("loss", 'Training and Validation Loss', 'Loss',
     'Training loss', 'Validation loss'),
    ("categorical_accuracy", 'Training and Validation Categorical Accuracy', 'Accuracy',
     'Training accuracy', 'Validation accuracy'),
    ("precision", 'Training and Validation Precision', 'Precision',
     'Training precision', 'Validation precision'),
    ("recall", 'Training and Validation Recall', 'Recall',
     'Training recall', 'Validation recall'),
    ("f1_score", 'Training and Validation F1 Score', 'F1 Score',
     'Training Macro F1 Score', 'Validation Macro F1 Score'),
]

history = model_history.history
# Use a dedicated name for the x-axis values so the `epochs` hyperparameter
# (an int used by the training cell) is not overwritten with a range object.
epoch_range = range(1, len(history['loss']) + 1)

# Create subplots
fig, axes = plt.subplots(len(metric_panels), 1, figsize=(7, 15))

for ax, (key, title, y_label, train_label, valid_label) in zip(axes, metric_panels):
    # Validation series are stored under the same key with a "val_" prefix
    ax.plot(epoch_range, history[key], 'o-', label=train_label)
    ax.plot(epoch_range, history[f"val_{key}"], 'o-', label=valid_label)
    ax.set_title(title)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(y_label)
    ax.legend()

plt.tight_layout()
plt.show()