### File for training the RoBERTa model 

Evidence Detection (ED):

Given a claim and a piece of evidence, determine if the evidence is relevant to the claim.
You will be provided with more than 23K claim-evidence pairs as training data, and almost
6K pairs as validation data.

Author : Vansh Goenka

In [None]:
# Backend selection must be in place before TensorFlow / transformers are imported
import os
os.environ["KERAS_BACKEND"] = "tensorflow"
%env TF_USE_LEGACY_KERAS=1

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import f1_score, recall_score, precision_score, classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import tensorflow as tf
import transformers
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW

nltk.download('wordnet')
nltk.download('omw-1.4')
# Download the stop words corpus if not already downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


env: TF_USE_LEGACY_KERAS=1


# Loading the data

In [None]:
# Mount Google Drive, then read every data split from it
from google.colab import drive
drive.mount('/content/drive')

trial_data = pd.read_csv('/content/drive/My Drive/ED_trial.csv')
train_data = pd.read_csv('/content/drive/My Drive/train.csv')
eval_data = pd.read_csv('/content/drive/My Drive/eval.csv')
# The dev split gets its Evidence column cast to str right after loading
dev_data = pd.read_csv('/content/drive/My Drive/dev.csv').astype({'Evidence': str})

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Define a custom dataset

In [None]:
# Initial PyTorch data pipeline. The text pre-processing it was built around was
# discarded because it reduced accuracy, so cleaning is now strictly opt-in.
class TextDataset(Dataset):
    """Claim/evidence pairs encoded for RoBERTa sequence classification.

    Args:
        data: DataFrame with 'Claim', 'Evidence' and 'label' columns.
        tokenizer: Callable used to encode a (claim, evidence) pair. When None,
            the pretrained 'roberta-base' tokenizer is loaded.
        preprocess: Optional function applied to the claim and the evidence
            text before tokenization. None (default) leaves the text untouched.
        max_length: Length every encoded pair is padded / truncated to.
    """

    def __init__(self, data, tokenizer=None, preprocess=None, max_length=512):
        self.data = data
        if tokenizer is None:
            tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.tokenizer = tokenizer
        self.preprocess = preprocess
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def _text(self, column, idx):
        """Return the text stored in `column` at position `idx`, cleaned if requested."""
        # .iloc is positional, so lookups stay valid when the frame's index is
        # not 0..n-1 (e.g. after filtering); str() turns NaN cells into text.
        text = str(self.data[column].iloc[idx])
        if self.preprocess is not None:
            text = self.preprocess(text)
        return text

    def __getitem__(self, idx):
        """Return the encoded pair at position `idx` as a dict of 1-D tensors plus its label."""
        claim = self._text('Claim', idx)
        evidence = self._text('Evidence', idx)
        label = self.data['label'].iloc[idx]
        encoding = self.tokenizer(claim, evidence, truncation=True, padding='max_length',
                                  max_length=self.max_length, return_tensors='pt')
        return {
            # the tokenizer returns a batch of one; flatten drops that batch axis
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label)
        }


# Pre-processing the data

In [None]:
# Pre-processing the data had no positive effect on the results, hence it was removed

# Training

# Using RoBERTa as the primary model

Loading the model and the tokenizer

In [None]:
model_name = "roberta-base"

# Tokenizer matching the checkpoint
tokenizer = RobertaTokenizer.from_pretrained(model_name)

# Pretrained TensorFlow encoder; its weights stay trainable so they are
# updated during fine-tuning rather than frozen
roberta_model = transformers.TFRobertaModel.from_pretrained(model_name)
roberta_model.trainable = True

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['roberta.embeddings.position_ids', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Creating RoBERTa tokenizer for further evaluation

In [None]:
def tokenizeRoberta(claims, evidences, tokenizer, max_length=120):
    """Tokenize claim/evidence pairs into fixed-length model inputs.

    Each pair is joined into a single string with the claim first, so that
    truncation (which cuts the end of the sequence) removes the tail of the
    evidence instead of dropping the claim.

    Args:
        claims: Iterable of claim texts.
        evidences: Iterable of evidence texts, aligned with `claims`.
        tokenizer: Callable tokenizer accepting a list of strings plus the
            `padding`, `truncation`, `max_length` and `return_tensors` keywords.
        max_length: Length every sequence is padded / truncated to.

    Returns:
        dict with the tokenizer's 'input_ids' and 'attention_mask' entries.
    """
    # Concatenating each claim with its evidence into one input; the f-string
    # also converts non-string cells (e.g. NaN) to text instead of raising.
    inputs = [f"{claim}  {evidence}" for claim, evidence in zip(claims, evidences)]

    # Applying tokenization
    inputs_after_tok = tokenizer(inputs, padding='max_length', truncation=True,
                                 max_length=max_length, return_tensors='tf')

    return {
        'input_ids': inputs_after_tok['input_ids'],
        'attention_mask': inputs_after_tok['attention_mask']
    }
# Tokenize the training and development splits once and keep the results
trainRoberta_data = tokenizeRoberta(
    train_data['Claim'].to_numpy(),
    train_data['Evidence'].to_numpy(),
    tokenizer,
)
devRoberta_data = tokenizeRoberta(
    dev_data['Claim'].to_numpy(),
    dev_data['Evidence'].to_numpy(),
    tokenizer,
)

Fine-tuning with the data

In [None]:
import tensorflow as tf
import transformers
import numpy as np

max_length = 120
num_classes = 2

# One-hot encode the integer labels (0 / 1) to match the softmax output and
# the categorical cross-entropy loss used below
train_labels = tf.keras.utils.to_categorical(train_data['label'].values, num_classes=num_classes)
dev_labels = tf.keras.utils.to_categorical(dev_data['label'].values, num_classes=num_classes)

# Symbolic inputs for the tokenized sequences; the names match the keys of the
# tokenized-data dicts so Keras can route each tensor to the right input
input_word_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
input_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")

# Per-token hidden states from the RoBERTa encoder
roberta_output = roberta_model(input_word_ids, attention_mask=input_mask)
sequence_output = roberta_output.last_hidden_state

# BiLSTM over the token states; return_sequences=True keeps one output per
# token so both pooling layers below can reduce over the sequence axis
lstm_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))
bi_lstm_output = lstm_layer(sequence_output)

# Summarise the sequence with both its average and its maximum activation
avg_pooling = tf.keras.layers.GlobalAveragePooling1D()(bi_lstm_output)
max_pooling = tf.keras.layers.GlobalMaxPooling1D()(bi_lstm_output)

# Classification head: concatenated pooled features -> dropout -> softmax
concatenated_features = tf.keras.layers.concatenate([avg_pooling, max_pooling])
dropout_layer = tf.keras.layers.Dropout(0.3)(concatenated_features)
output = tf.keras.layers.Dense(units=num_classes, activation='softmax')(dropout_layer)

# Model definition
model_roberta = tf.keras.models.Model(inputs=[input_word_ids, input_mask], outputs=output)
# Setting up the hyperparameters (`learning_rate` replaces the deprecated `lr` alias)
model_roberta.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=1e-5),
    loss="categorical_crossentropy",
    metrics=["acc"],
)
# Training the model; validation must use the dev inputs together with the dev
# labels (the training inputs were previously paired with dev labels here)
model_roberta.fit(trainRoberta_data, train_labels,
                  validation_data=(devRoberta_data, dev_labels),
                  epochs=8,
                  batch_size=32,
                  verbose=1)




Initial predictions

In [None]:
# Pick the class with the highest predicted probability for every dev pair
dev_probabilities = model_roberta.predict(devRoberta_data)
predictions_roberta = list(np.argmax(dev_probabilities, axis=1))
accuracy = np.mean(predictions_roberta == dev_data['label'].values)
print("Roberta Accuracy:", accuracy)

Roberta Accuracy: 0.8822139723253459


Save the model to Google Drive

In [None]:
# Make sure Drive is mounted before writing to it
drive.mount('/content/drive')

# Persist the fine-tuned model as a single .h5 file on Drive
saved_model_path = '/content/drive/My Drive/RoBERTa_model.h5'
model_roberta.save(saved_model_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  saving_api.save_model(


In [None]:
import tensorflow as tf
import transformers

# The saved graph contains a HuggingFace layer, so Keras has to be told which
# class to use when it meets the "TFRobertaModel" name during deserialization
roberta_custom_objects = {"TFRobertaModel": transformers.TFRobertaModel}
robertaModel = tf.keras.saving.load_model(
    '/content/drive/My Drive/RoBERTa_model.h5',
    custom_objects=roberta_custom_objects,
)

Predictions with all the metrics

In [None]:
# Predicted class = index of the highest probability for each dev pair
predictions_roberta = [np.argmax(i) for i in robertaModel.predict(devRoberta_data)]
# classification_report expects (y_true, y_pred): the gold labels come first,
# otherwise precision and recall are swapped and support counts the predictions
print(classification_report(dev_data['label'], predictions_roberta))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92      4309
           1       0.79      0.78      0.78      1617

    accuracy                           0.88      5926
   macro avg       0.85      0.85      0.85      5926
weighted avg       0.88      0.88      0.88      5926

