<a href="https://colab.research.google.com/github/sysung/w266-final-project/blob/master/roberta/Identifying_MisInformation_(BERT%2BCNN_Premise_Articles).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download packages and dataset

In [None]:
# Fetch the WatClaimCheck archive into the current working directory, then unpack it in place.
!curl -O https://cs.uwaterloo.ca/~ppoupart/fact-check/WatClaimCheck.tar.gz
!tar -xzf WatClaimCheck.tar.gz

curl: /usr/local/lib/libcurl.so.4: no version information available (required by curl)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1107M  100 1107M    0     0   112M      0  0:00:09  0:00:09 --:--:--  116M


In [None]:
!pip install transformers

[0m

In [None]:
!pip install simpletransformers

[0m

In [None]:
!pip install tensorboardX

[0m

In [None]:
!pip install condacolab

[0m

In [None]:
# Set up conda inside the Colab runtime so the `conda install` cell that follows can run.
# NOTE(review): condacolab.install() is documented to restart the kernel - confirm the
# cells above do not have to be re-run afterwards.
import condacolab
condacolab.install()

✨🍰✨ Everything looks OK!


In [None]:
!conda install -c pytorch faiss-gpu

Collecting package metadata (current_repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ done
Solving environment: / - \ | / - \ | / - \ | / - \ | done


  current version: 23.1.0
  latest version: 23.10.0

Please update conda by running

    $ conda update -n base -c conda-forge conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.10.0



# All requested packages 

# Import and set up notebook

In [None]:
# from transformers import BertTokenizerFast, TFBertModel
from transformers import DistilBertTokenizerFast, TFDistilBertModel
from simpletransformers.retrieval import RetrievalModel, RetrievalArgs
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from itertools import chain

import datetime as dt
import json
import keras
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import sys
import tensorflow as tf

# Mount Google Drive; the checkpoint and figure paths used later in this notebook
# all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# Report how many GPUs TensorFlow can see in this runtime.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Num GPUs Available:  1


In [None]:
def explode_dictionary(pd_df: pd.DataFrame, field: str) -> pd.DataFrame:
    '''
    Expands a column of dictionaries into one column per key

    Parameters:
    pd_df (pd.DataFrame): Dataframe that holds the dictionary column
    field (str): Name of the column whose dictionaries are expanded

    Returns:
    pd.DataFrame: New dataframe without `field`, with the expanded columns appended on the right
    '''

    # Each dictionary becomes one row of the expanded frame; its keys become the column names
    expanded_columns = pd_df[field].apply(pd.Series)

    # Everything except the dictionary column is carried over unchanged
    remaining_columns = pd_df.drop(columns=[field])

    return pd.concat([remaining_columns, expanded_columns], axis=1)


def clean_pd_df(pd_df: pd.DataFrame) -> pd.DataFrame:
    '''
    Flattens the metadata and label dictionaries, normalises review_date and removes id

    Parameters:
    pd_df (pd.DataFrame): Raw dataframe with `metadata` and `label` dictionary columns.
                          After flattening it is expected to contain the columns
                          `review_date`, `claim_date` and `id`.

    Returns:
    pd.DataFrame: New dataframe with the flattened fields, `review_date` as datetime and no `id` column
    '''

    print("Extracting fields from metadata")
    pd_df = explode_dictionary(pd_df, 'metadata')

    print("Extracting fields from label")
    pd_df = explode_dictionary(pd_df, 'label')

    # Set claim_date as review_date if review_date does not exist.
    # The result is assigned back to the column: calling fillna(inplace=True) on a
    # column selection is chained assignment, which warns on recent pandas and does
    # not update the frame at all under Copy-on-Write.
    pd_df['review_date'] = pd_df['review_date'].fillna(pd_df['claim_date'])

    # Convert review_date to date time, keeping only the text before the 'T' separator
    pd_df['review_date'] = pd.to_datetime(pd_df['review_date'].str.split('T', expand=True)[0])

    # Drop ID
    return pd_df.drop(columns=['id'])


def download_dataset(dataset_fp: str) -> tuple:
    '''
    Reads the WatClaimCheck dataset from the filepath and returns a pandas dataframe of the train, valid, and test datasets

    Parameters:
    dataset_fp (str): Filepath of dataset

    Returns:
    tuple: Train, Valid, Test Pandas Dataframes
    '''

    # Full paths of the three split files, always in train / valid / test order
    split_paths = [os.path.join(dataset_fp, f'{split}.json') for split in ('train', 'valid', 'test')]

    # Load every split before cleaning any of them
    raw_frames = [pd.read_json(split_path) for split_path in split_paths]

    # Flatten metadata and labels for each split
    return tuple(clean_pd_df(raw_frame) for raw_frame in raw_frames)

def download_article(dataset_fp: str, article_file: str) -> dict:
    '''
    Loads one article of the WatClaimCheck Dataset from disk

    Parameters:
    dataset_fp   (str): Filepath of dataset
    article_file (str): Name of the article file inside the dataset's `articles` folder

    Returns:
    dict: Parsed article JSON, exactly as returned by json.load.
          NOTE(review): callers in this notebook join the result with spaces, so the
          files presumably hold a list of strings rather than a dict - confirm.

    Raises:
    OSError: If the article file cannot be opened
    json.JSONDecodeError: If the file does not contain valid JSON
    '''

    article_path = os.path.join(dataset_fp, 'articles', article_file)

    # The context manager closes the handle even when parsing fails; JSON text is
    # decoded as UTF-8 explicitly instead of relying on the platform default.
    with open(article_path, encoding='utf-8') as article_handle:
        return json.load(article_handle)

# Obtain data and Preprocess

In [None]:
# Directory that holds train.json, valid.json, test.json and the articles/ folder
# read by download_dataset and download_article.
DATASET_FP = "./WatClaimCheck_dataset"

# Retrieve dataset
train_df, valid_df, test_df = download_dataset(DATASET_FP)

Extracting fields from metadata
Extracting fields from label
Extracting fields from metadata
Extracting fields from label
Extracting fields from metadata
Extracting fields from label


In [None]:
# Creates the query sentence to ask the model
train_df['query_text'] = 'Is the claim "' + train_df['claim'] + '" true, false, or partially true/false?'
valid_df['query_text'] = 'Is the claim "' + valid_df['claim'] + '" true, false, or partially true/false?'
test_df['query_text'] = 'Is the claim "' + test_df['claim'] + '" true, false, or partially true/false?'

# Creates the passages that are associated with the query
train_df['gold_passage'] = train_df['review_article'].apply(lambda x: ' '.join(download_article(DATASET_FP, x)))
valid_df['gold_passage'] = valid_df['review_article'].apply(lambda x: ' '.join(download_article(DATASET_FP, x)))
test_df['gold_passage'] = test_df['review_article'].apply(lambda x: ' '.join(download_article(DATASET_FP, x)))

# Creates additional premise passages that are used during inference
train_df['premise_articles_content'] = train_df['premise_articles'].apply(lambda x: ' '.join([' '.join(download_article(DATASET_FP, file)) for file in x.values()]))
valid_df['premise_articles_content'] = valid_df['premise_articles'].apply(lambda x: ' '.join([' '.join(download_article(DATASET_FP, file)) for file in x.values()]))
test_df['premise_articles_content']  = test_df['premise_articles'].apply(lambda x: ' '.join([' '.join(download_article(DATASET_FP, file)) for file in x.values()]))

---
# Train DPR Model

## Set model parameters

In [None]:
# Model family and the pretrained encoders used when no fine-tuned checkpoint is available
model_type = "dpr"
context_name = "facebook/dpr-ctx_encoder-single-nq-base"
query_name = "facebook/dpr-question_encoder-single-nq-base"

# Training / evaluation options handed to the retrieval model
model_args = RetrievalArgs()
model_args.hard_negatives = True
model_args.include_title = False
model_args.evaluate_during_training = True
model_args.overwrite_output_dir = True
model_args.num_train_epochs = 10

# Only request CUDA when the runtime actually exposes a GPU
is_cuda_gpu_available = len(tf.config.list_physical_devices('GPU')) > 0

# Checkpoint directory on Drive (its path matches the output directory printed by the training cell)
saved_dpr_checkpoint = "/content/drive/MyDrive/w266_final_project_checkpoints/roberta_premise_articles/dpr" + "/2023-11-29_01:18:13.902303" + "/checkpoint-33720-epoch-10"

# model_args and the GPU flag are passed explicitly in both branches so that the
# settings configured above really apply to the model that is used below.
if os.path.isdir(saved_dpr_checkpoint):
    # Code to obtain a model checkpoint
    dpr_model = RetrievalModel(
        model_type, saved_dpr_checkpoint,
        use_cuda=is_cuda_gpu_available,
        args=model_args
    )
else:
    # Without the checkpoint (e.g. a different Drive account) start from the pretrained encoders
    print(f"Checkpoint not found at {saved_dpr_checkpoint}; starting from pretrained encoders")
    dpr_model = RetrievalModel(
        model_type=model_type,
        context_encoder_name=context_name,
        query_encoder_name=query_name,
        use_cuda=is_cuda_gpu_available,
        args=model_args
    )

## Create hard negatives for DPR

In [None]:
def _mine_hard_negatives(split_df):
    '''Asks the DPR model for hard negatives (retrieve_n_docs=1), using the split's own gold passages as the passage dataset'''
    return dpr_model.build_hard_negatives(
        queries=split_df['query_text'].to_list(),
        passage_dataset=split_df['gold_passage'].to_list(),
        retrieve_n_docs=1
    )


# Mine the hard negatives for all three splits first ...
train_hard_df = _mine_hard_negatives(train_df)
valid_hard_df = _mine_hard_negatives(valid_df)
test_hard_df = _mine_hard_negatives(test_df)

# ... then attach them to the split they were built from
train_df['hard_negative'] = train_hard_df
valid_df['hard_negative'] = valid_hard_df
test_df['hard_negative'] = test_hard_df

Map:   0%|          | 0/26976 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/26976 [00:00<?, ? examples/s]

  0%|          | 0/27 [00:00<?, ?it/s]

Generating query embeddings: 0it [00:00, ?it/s]

Retrieving docs:   0%|          | 0/53 [00:00<?, ?it/s]

Generating query embeddings: 0it [00:00, ?it/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

Generating query embeddings: 0it [00:00, ?it/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

## Train model

In [None]:
# Every run writes into its own timestamped directory on Drive
string_date = str(dt.datetime.now()).replace(' ', '_')
dpr_checkpoint_dir = f"/content/drive/MyDrive/w266_final_project_checkpoints/roberta_premise_articles/dpr/{string_date}/"
print(dpr_checkpoint_dir)

# Columns passed to the retrieval model for training and evaluation
dpr_columns = ['query_text', 'gold_passage', 'hard_negative']

# Fine-tune on the train split, evaluating on the valid split during training
training_details = dpr_model.train_model(
    train_df[dpr_columns],
    eval_data=valid_df[dpr_columns],
    output_dir=dpr_checkpoint_dir,
    additional_eval_passages=valid_df['premise_articles_content'].to_list(),
    show_running_loss=True
)

# Final evaluation on the held-out test split
eval_results = dpr_model.eval_model(
    test_df[dpr_columns],
    additional_passages=test_df['premise_articles_content'].to_list()
)

/content/drive/MyDrive/w266_final_project_checkpoints/roberta_premise_articles/dpr/2023-11-29_01:18:13.902303/


Map:   0%|          | 0/26976 [00:00<?, ? examples/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/3372 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/3372 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/3372 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/3372 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/3372 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/3372 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/3372 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/3372 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/3372 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/3372 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Map:   0%|          | 0/3372 [00:00<?, ? examples/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

Map:   0%|          | 0/3373 [00:00<?, ? examples/s]

Running Evaluation:   0%|          | 0/422 [00:00<?, ?it/s]

  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()


Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
# Display the value returned by train_model for inspection.
training_details

(33720,
 {'global_step': [2000,
   3372,
   4000,
   6000,
   6744,
   8000,
   10000,
   10116,
   12000,
   13488,
   14000,
   16000,
   16860,
   18000,
   20000,
   20232,
   22000,
   23604,
   24000,
   26000,
   26976,
   28000,
   30000,
   30348,
   32000,
   33720],
  'eval_loss': [6.213545456316798,
   6.051113522307003,
   6.544676291292878,
   7.335301706152505,
   5.919817985799075,
   5.888542802412928,
   6.905429496089994,
   6.933301827918862,
   7.055865874222669,
   6.582511409622798,
   6.9505142086490075,
   6.699393703897982,
   6.488324454822246,
   6.5977166577538044,
   6.839799726320104,
   7.194737570686928,
   6.60017035922733,
   6.7660599432850335,
   7.119285951646583,
   6.970786365216942,
   6.999168344336381,
   7.136280821539215,
   6.5773033285028,
   6.934503869846534,
   6.887710414226587,
   6.9298609622281875],
  'train_loss': [1.1730856895446777,
   1.5399014949798584,
   1.3237184286117554,
   1.1451717615127563,
   0.8881859183311462,
   1.5

## Run predictions on train, valid, and test data sets

In [None]:
def _retrieve_premise_passage(split_df):
    '''Runs DPR retrieval (retrieve_n_docs=1) for every query of the split over that split's premise article texts'''
    return dpr_model.predict(
        to_predict=split_df['query_text'].to_list(),
        prediction_passages=split_df['premise_articles_content'].to_list(),
        retrieve_n_docs=1
    )


# predict returns four values per call; each is kept under a split-specific name
train_predicted_passages, train_doc_ids, train_doc_vectors, train_doc_dicts = _retrieve_premise_passage(train_df)

valid_predicted_passages, valid_doc_ids, valid_doc_vectors, valid_doc_dicts = _retrieve_premise_passage(valid_df)

test_predicted_passages, test_doc_ids, test_doc_vectors, test_doc_dicts = _retrieve_premise_passage(test_df)

Map:   0%|          | 0/26976 [00:00<?, ? examples/s]

Saving the dataset (0/6 shards):   0%|          | 0/26976 [00:00<?, ? examples/s]

  0%|          | 0/27 [00:00<?, ?it/s]

Generating query embeddings: 0it [00:00, ?it/s]

Retrieving docs:   0%|          | 0/53 [00:00<?, ?it/s]

Generating query embeddings: 0it [00:00, ?it/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

Generating query embeddings: 0it [00:00, ?it/s]

Retrieving docs:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
# Flatten the per-query passage lists into one flat list per split and store it as a column.
# NOTE(review): this presumably relies on exactly one passage per query (retrieve_n_docs=1) - confirm.
train_df['predicted_passages'] = list(chain.from_iterable(train_predicted_passages))
valid_df['predicted_passages'] = list(chain.from_iterable(valid_predicted_passages))
test_df['predicted_passages'] = list(chain.from_iterable(test_predicted_passages))

---
# Train Transformer (DistilBERT + CNN)


In [None]:
for split_df in (train_df, valid_df, test_df):
    # Missing claimants become empty strings. The result is assigned back to the column:
    # fillna(inplace=True) on a column selection is chained assignment, which warns on
    # recent pandas and leaves the frame unchanged under Copy-on-Write.
    split_df['claimant'] = split_df['claimant'].fillna('')

    # Single text fed to the classifier: claim, claimant and the retrieved evidence passage.
    # NOTE(review): "Claimant" and "Evidence" have no ": " separator, unlike "Claim: ".
    # The literals are kept unchanged because they define the model's input format.
    split_df['transformer_input'] = "Claim: " + split_df['claim'] + "\nClaimant" + split_df['claimant'] + "\nEvidence" + split_df['predicted_passages']

In [None]:
# Pretrained DistilBERT checkpoint shared by the tokenizer and the encoder.
checkpoint = 'distilbert-base-cased'
distilbert_tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
distilbert_model = TFDistilBertModel.from_pretrained(checkpoint)

# Alternative BERT encoder, kept for reference only (its import is commented out
# in the import cell of this notebook).
# checkpoint = 'bert-base-cased'
# bert_tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
# bert_model = TFBertModel.from_pretrained(checkpoint)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [None]:
# Hyperparameters of the classification head
dropout = 0.3
hidden_size = 200
cnn_filters = 128
cnn_kernel_size = 3
learning_rate = 0.00005
msl = 100  # length of both input tensors; the tokenizer cell pads/truncates to this value

# Two integer inputs of length msl: token ids and the matching attention mask
input_ids = tf.keras.layers.Input(shape=(msl,), dtype=tf.int64, name='input_ids_layer')
attention_mask = tf.keras.layers.Input(shape=(msl,), dtype=tf.int64, name='attention_mask_layer')

distilbert_inputs = dict(input_ids=input_ids, attention_mask=attention_mask)

# The encoder is fine-tuned together with the head
distilbert_model.trainable = True
distilbert_out = distilbert_model(distilbert_inputs)

# First element of the encoder output feeds the convolution
encoder_states = distilbert_out[0]

# Convolution over the encoder output, then the maximum of every filter
cnn_layer = tf.keras.layers.Conv1D(
    filters=cnn_filters,
    kernel_size=cnn_kernel_size,
    activation='relu'
)(encoder_states)

cnn_max_pooling = tf.keras.layers.GlobalMaxPooling1D()(cnn_layer)

# Dense layer with dropout in front of the 3-way softmax output
hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(cnn_max_pooling)
hidden = tf.keras.layers.Dropout(dropout)(hidden)

classification = tf.keras.layers.Dense(3, activation='softmax', name='classification_layer')(hidden)

classification_model = tf.keras.Model(
    inputs=[input_ids, attention_mask],
    outputs=[classification]
)

classification_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[
        tf.keras.metrics.CategoricalAccuracy(),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tf.keras.metrics.F1Score(average='macro')
    ]
)

classification_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 attention_mask_layer (Inpu  [(None, 100)]                0         []                            
 tLayer)                                                                                          
                                                                                                  
 input_ids_layer (InputLaye  [(None, 100)]                0         []                            
 r)                                                                                               
                                                                                                  
 tf_distil_bert_model (TFDi  multiple                     6519091   ['attention_mask_layer[0][0]',
 stilBertModel)                                           2          'input_ids_layer[0][0]'] 

In [None]:
# Set variables
batch_size = 12
epochs = 10
string_date = str(dt.datetime.now()).replace(' ', '_')
checkpoint_dir = f"/content/drive/MyDrive/w266_final_project_checkpoints/bert_premise_articles/{string_date}"
checkpoint_path = checkpoint_dir + "weights.{epoch:02d}-{val_loss:.2f}-{val_f1_score:.2f}.hdf5"
encoder = OneHotEncoder(sparse_output=False)

# Create Train and Validation inputs
train_inputs = distilbert_tokenizer.batch_encode_plus(
    train_df['transformer_input'].to_list(),
    max_length = msl,
    padding="max_length",
    truncation=True,
    return_tensors='tf'
)

train_labels = encoder.fit_transform(np.array(train_df['rating']).reshape(-1, 1))

valid_inputs = distilbert_tokenizer.batch_encode_plus(
    valid_df['transformer_input'].to_list(),
    max_length = msl,
    padding="max_length",
    truncation=True,
    return_tensors='tf'
)

valid_labels = encoder.fit_transform(np.array(valid_df['rating']).reshape(-1, 1))

# Train model
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True)

model_history = classification_model.fit(
    [train_inputs.input_ids, train_inputs.attention_mask], train_labels,
    validation_data=([valid_inputs.input_ids, valid_inputs.attention_mask], valid_labels),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[cp_callback]
)

In [None]:
history_keys = list(model_history.history.keys())

# The history is read by position rather than by name: the first five entries are the
# training loss and metrics in compile order, the next five their validation counterparts.
(train_loss, train_cat_acc, train_prec, train_recall, train_f1,
 valid_loss, valid_cat_acc, valid_prec, valid_recall, valid_f1) = (
    model_history.history[key] for key in history_keys[:10]
)

# Create subplots
fig, axes = plt.subplots(1, 5, figsize=(16, 3))

# x axis values; a separate name keeps the integer `epochs` used for training untouched
epoch_axis = range(1, len(train_loss) + 1)

# One entry per panel: title, y label, training series, validation series, legend labels
panels = [
    ('Loss', 'Loss', train_loss, valid_loss, 'Train', 'Validation'),
    ('Accuracy', 'Accuracy', train_cat_acc, valid_cat_acc, 'Train', 'Validation'),
    ('Training and Validation Precision', 'Precision', train_prec, valid_prec, 'Training precision', 'Validation precision'),
    ('Training and Validation Recall', 'Recall', train_recall, valid_recall, 'Training recall', 'Validation recall'),
    ('F1 Score', 'F1 Score', train_f1, valid_f1, 'Train', 'Validation'),
]

for ax, (title, y_label, train_values, valid_values, train_label, valid_label) in zip(axes, panels):
    ax.plot(epoch_axis, train_values, '-', label=train_label)
    ax.plot(epoch_axis, valid_values, '-', label=valid_label)
    ax.set_title(title)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(y_label)
    ax.legend()

plt.tight_layout()

string_date = str(dt.datetime.now()).replace(' ', '_')
checkpoint_dir = "/content/drive/MyDrive/w266_final_project_checkpoints/bert_premise_articles"
os.makedirs(checkpoint_dir, exist_ok=True)

# Save through the figure object BEFORE showing it: once plt.show() has rendered the
# figure, plt.savefig would write a new, empty figure.
fig.savefig(f"{checkpoint_dir}/roberta_review_article_{string_date}_history.png")

plt.show()