<a href="https://colab.research.google.com/github/sysung/w266-final-project/blob/master/MisInformation_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Identifying Misinformation in Social Media and New Sources

Download the dataset and unzip dataset

In [2]:
!curl -O https://cs.uwaterloo.ca/~ppoupart/fact-check/WatClaimCheck.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1107M  100 1107M    0     0  15.1M      0  0:01:12  0:01:12 --:--:-- 16.4M


In [3]:
!tar -xzf WatClaimCheck.tar.gz

Install necessary packages

In [4]:
!pip install transformers


Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m59.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m104.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m85.4 MB/s[0m eta [36m0:00:00[0m
Co

Import libraries

In [5]:
import sys
import os

# DPR
from transformers import DPRContextEncoderTokenizerFast, TFDPRContextEncoder
from transformers import DPRQuestionEncoderTokenizerFast, TFDPRQuestionEncoder
from transformers import DPRReaderTokenizerFast, TFDPRReader

# RoBERTa
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder

import json
import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


Helper Functions

In [6]:
def explode_dictionary(pd_df: pd.DataFrame, field: str) -> pd.DataFrame:
    '''Explodes a dictionary within a column as multiple columns and then drops the parent field'''
    return pd.concat([pd_df.drop(field, axis=1), pd_df[field].apply(pd.Series)], axis=1)


def clean_pd_df(pd_df: pd.DataFrame) -> pd.DataFrame:
    '''Extracts field from dataframe, casts review_date to datetime and removes id'''

    print("Extracting fields from metadata")
    pd_df = explode_dictionary(pd_df, 'metadata')

    print("Extracting fields from label")
    pd_df = explode_dictionary(pd_df, 'label')

    # Set claim_date as review_date if review_date does not exist
    pd_df['review_date'].fillna(pd_df['claim_date'], inplace=True)

    # Convert review_date to date time
    pd_df['review_date'] = pd.to_datetime(pd_df['review_date'].str.split('T', expand=True)[0])

    # Drop ID
    pd_df = pd_df.drop(columns=['id'])

    return pd_df


def download_dataset(dataset_fp: str) -> tuple:
    '''
    Reads the WatClaimCheck dataset from the filepath and returns a pandas dataframe of the train, valid, and test datasets
    Reads the WatClaimCheck dataset from the filepath and returns a pandas dataframe of the train, valid, and test datasets

    Parameters:
    dataset_fp (str): Filepath of dataset

    Returns:
    tuple: Train, Valid, Test Pandas Dataframes
    '''

    # Get full path of json files
    train_json_fp = os.path.join(dataset_fp, 'train.json')
    valid_json_fp = os.path.join(dataset_fp, 'valid.json')
    test_json_fp = os.path.join(dataset_fp, 'test.json')

    # Get pandas dataframe from json
    train_pd_df = pd.read_json(train_json_fp)
    valid_pd_df = pd.read_json(valid_json_fp)
    test_pd_df = pd.read_json(test_json_fp)

    # Get all of the data corresponding to the metadata and labels
    clean_train_pd_df = clean_pd_df(train_pd_df)
    clean_valid_pd_df = clean_pd_df(valid_pd_df)
    clean_test_pd_df = clean_pd_df(test_pd_df)

    return clean_train_pd_df, clean_valid_pd_df, clean_test_pd_df

def download_article(dataset_fp: str, article_file: str) -> dict:
    '''
    Downloads an article from the WatClaimCheck Dataset

    Parameters:
    dataset_fp   (str): Filepath of dataset
    article_file (str): Filepath of article

    Returns:
    dict: Article JSON
    '''

    # Read articles as json
    full_article_fp = open(os.path.join(dataset_fp, 'articles', article_file))
    json_data = json.load(full_article_fp)
    full_article_fp.close()

    return json_data

## Read and Preprocess Data

In [7]:
DATASET_FP = "./WatClaimCheck_dataset"

In [8]:
# Retrieve dataset
train_df, valid_df, test_df = download_dataset(DATASET_FP)
print(train_df.count())
print(valid_df.count())
print(test_df.count())

Extracting fields from metadata
Extracting fields from label
Extracting fields from metadata
Extracting fields from label
Extracting fields from metadata
Extracting fields from label
claimant            19751
claim               26976
claim_date          19190
review_date         26976
premise_articles    26976
reviewer_name       26976
reviewer_site       26976
review_url          26976
rating              26976
original_rating     26976
review_article      26976
dtype: int64
claimant            2476
claim               3372
claim_date          2407
review_date         3372
premise_articles    3372
reviewer_name       3372
reviewer_site       3372
review_url          3372
rating              3372
original_rating     3372
review_article      3372
dtype: int64
claimant            2450
claim               3373
claim_date          2384
review_date         3373
premise_articles    3373
reviewer_name       3373
reviewer_site       3373
review_url          3373
rating              3373
origi

In [None]:
# Get review article content
train_df['review_article_content'] = train_df['review_article'].apply(lambda x: ' '.join(download_article(DATASET_FP, x))[1000:-1000])
valid_df['review_article_content'] = valid_df['review_article'].apply(lambda x: ' '.join(download_article(DATASET_FP, x))[1000:-1000])
test_df['review_article_content'] = test_df['review_article'].apply(lambda x: ' '.join(download_article(DATASET_FP, x))[1000:-1000])

In [None]:
# Add question
train_df['question'] = train_df['claim'].apply(lambda x: f"Is the claim \"{x}\" true, false, or partially true/false?")
valid_df['question'] = valid_df['claim'].apply(lambda x: f"Is the claim \"{x}\" true, false, or partially true/false?")
test_df['question'] = test_df['claim'].apply(lambda x: f"Is the claim \"{x}\" true, false, or partially true/false?")

## DPR Model

In [None]:
dpr_context_encoder_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
dpr_context_encoder_model = TFDPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')

dpr_question_encoder_tokenizer = DPRQuestionEncoderTokenizerFast.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
dpr_question_encoder_model = TFDPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')

dpr_reader_tokenizer = DPRReaderTokenizerFast.from_pretrained('facebook/dpr-reader-single-nq-base')
dpr_reader_model = TFDPRReader.from_pretrained('facebook/dpr-reader-single-nq-base')

In [1]:
train_length = (len(train_df) * 0.5)
print(train_length)

NameError: ignored

In [28]:
train_length = (len(train_df) * 0.5)
dpr_context_msl = 256
dpr_question_msl = 128

print("Tokenizing Context...")
dpr_context_tokens = dpr_context_encoder_tokenizer(
    train_df['review_article_content'].to_list(),
    max_length = dpr_context_msl,
    padding = 'max_length',
    truncation = True,
    return_tensors = 'tf'
)

print("Tokenizing Questions...")
dpr_question_tokens = dpr_question_encoder_tokenizer(
    train_df['question'].to_list(),
    max_length = dpr_question_msl,
    padding = 'max_length',
    truncation = True,
    return_tensors = 'tf'
)

print("Retreiving Context Pooler Output")
context_vector = dpr_context_encoder_model(dpr_context_tokens).pooler_output

print("Retrieving Question Pooler Output")
question_vector = dpr_question_encoder_model(dpr_question_tokens).pooler_output

similarity_scores = tf.einsum("nd,npd->np", question_embeddings, passage_embeddings)
print(f"Similarity Score {similarity_scores}")

Tokenizing Context...
Tokenizing Questions...
Retreiving Context Pooler Output


ResourceExhaustedError: ignored