# Step 4: Claim label Classification

# Readme
*This notebook responsible for classify the claim label base on the evidence we found on previous stages*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [1]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.layers import BatchNormalization, TextVectorization, Embedding, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Lambda, Dot, Reshape, GlobalAveragePooling1D, Flatten
from tensorflow.keras import Model, Input, optimizers, layers, metrics
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint, Callback
from tensorflow.keras import backend as K

import spacy
import string
import time
from collections import Counter
import string
import ast

Load the result from step 3

In [16]:
train = pd.read_csv("data_folder/train_retrival_result_with_transformer.csv")
evidence = pd.read_csv("data_folder/evidence.csv")
dev = pd.read_csv("data_folder/dev_retrival_result_with_transformer.csv")
test = pd.read_csv("data_folder/test_retrival_result_with_transformer.csv")

In [17]:
dev['predicted_add_evidence_transformer'] = dev['predicted_add_evidence_transformer'].apply(lambda x: ast.literal_eval(x))
train['predicted_add_evidence_transformer'] = train['predicted_add_evidence_transformer'].apply(lambda x: ast.literal_eval(x))
test['predicted_add_evidence_transformer'] = test['predicted_add_evidence_transformer'].apply(lambda x: ast.literal_eval(x))
dev['predicted_rm_evidence_transformer'] = dev['predicted_rm_evidence_transformer'].apply(lambda x: ast.literal_eval(x))
train['predicted_rm_evidence_transformer'] = train['predicted_rm_evidence_transformer'].apply(lambda x: ast.literal_eval(x))
test['predicted_rm_evidence_transformer'] = test['predicted_rm_evidence_transformer'].apply(lambda x: ast.literal_eval(x))
dev['predict_evidence_bm25'] = dev['predict_evidence_bm25'].apply(lambda x: ast.literal_eval(x))
train['predict_evidence_bm25'] = train['predict_evidence_bm25'].apply(lambda x: ast.literal_eval(x))
test['predict_evidence_bm25'] = test['predict_evidence_bm25'].apply(lambda x: ast.literal_eval(x))

Create our final predicted evidence column by kept top 4 evidences found by BM25 and adding first top evidences found by transformer if its not found by BM25

In [18]:
def combine_evidences(row):
    # Get the first element from the predicted_add_evidence_transformer list
    if row['predicted_add_evidence_transformer']:
        first_add_evidence = row['predicted_add_evidence_transformer'][0]
    else:
        first_add_evidence = None
    # Get the BM25 predicted evidences and initialize a new list if it's None
    bm25_evidences = row['predict_evidence_bm25'][:3] if row['predict_evidence_bm25'] is not None else []
    
    # Check if the first element from predicted_add_evidence_transformer is not in predict_evidence_bm25
    if first_add_evidence is not None and first_add_evidence not in bm25_evidences:
        bm25_evidences.append(first_add_evidence)
    
    return bm25_evidences

In [19]:
train['predicted_evidence'] = train.apply(combine_evidences, axis=1)
dev['predicted_evidence'] = dev.apply(combine_evidences, axis=1)
test['predicted_evidence'] = test.apply(combine_evidences, axis=1)

In [20]:
def print_retrival_result(df):
    evidence_recall_scores = []
    evidence_precision_scores = []
    evidence_fscore_scores = []
    
    # Iterate over each row in the dev DataFrame to evaluate evidence retrieval
    for index, row in df.iterrows():
        true_evidences = set(ast.literal_eval(row['evidences']))    
        predicted_evidences = set(row['predicted_evidence'])
        # Initialize counters for correct predictions
        evidence_correct = len(true_evidences & predicted_evidences)
        
        # Calculate recall, precision, and F-score
        if len(true_evidences) > 0 and len(predicted_evidences) > 0:
            evidence_recall = evidence_correct / len(true_evidences)
            evidence_precision = evidence_correct / len(predicted_evidences)
            if evidence_recall + evidence_precision > 0:
                evidence_fscore = (2 * evidence_precision * evidence_recall) / (evidence_precision + evidence_recall)
            else:
                evidence_fscore = 0.0
        else:
            evidence_recall = 0.0
            evidence_precision = 0.0
            evidence_fscore = 0.0
    
        # Store the scores
        evidence_recall_scores.append(evidence_recall)
        evidence_precision_scores.append(evidence_precision)
        evidence_fscore_scores.append(evidence_fscore)
        
    # Calculate mean scores across all instances
    mean_recall = np.mean(evidence_recall_scores)
    mean_precision = np.mean(evidence_precision_scores)
    mean_fscore = np.mean(evidence_fscore_scores)
    
    # Output the aggregate performance
    print(f"Average Evidence Recall    = {mean_recall}")
    print(f"Average Evidence Precision = {mean_precision}")
    print(f"Average Evidence F-Score   = {mean_fscore}")
print_retrival_result(dev)
print_retrival_result(train)

Average Evidence Recall    = 0.18333333333333335
Average Evidence Precision = 0.1412337662337662
Average Evidence F-Score   = 0.147979797979798
Average Evidence Recall    = 0.4887757871878393
Average Evidence Precision = 0.3319082519001086
Average Evidence F-Score   = 0.363018975233959


In [21]:
import nltk
def remove_punctuation(input_text):
    # Lowercase the input text to standardize it
    input_text = input_text.lower()
    
    # Remove punctuation using a translation table
    translator = str.maketrans('', '', string.punctuation)
    input_text = input_text.translate(translator)
    
    # Tokenize the text into words by splitting on whitespace
    tokens = input_text.split()
    
    # Join words back into one string and return
    return ' '.join(tokens)

# Apply the function to your dataframes
evidence['evidence_text'] = evidence['evidence_text'].apply(remove_punctuation)

Concat the claim text with all of its predicted evidences

The parameter include_all_ground_truth_ev indicate that whether the training process will be trained on true evidence or predicted evidence

In [22]:
def concat_text(row, include_all_ground_truth_ev = True):
    concatenated_text = "<CLS> " + row['claim_text']
    for ev_index in ast.literal_eval(row['evidences']) if include_all_ground_truth_ev else row['predicted_evidence']:
        concatenated_text += " <SEP> " + evidence.iloc[ev_index].evidence_text
    return concatenated_text

The result of the model predicted on the ground truth evidence could be reproduce if change the following parameter to true

In [23]:
train["concatenated_text"] = train.apply(lambda x: concat_text(x, False), axis=1)
dev["concatenated_text"] = dev.apply(lambda x:concat_text(x, False), axis=1)
test["concatenated_text"] = test.apply(lambda x:concat_text(x, False), axis=1)

In [24]:
# Define file paths
sequence_length = 256
print("Start processing!")
# Concatenate priority texts
priority_texts = pd.concat([train['concatenated_text'], dev['concatenated_text'], test['concatenated_text']])

# Create the TextVectorization layer
vectorize_layer = tf.keras.layers.TextVectorization(
    output_mode="int",
    output_sequence_length=sequence_length
)

# Adapt the vectorization layer on priority texts first
vectorize_layer.adapt(priority_texts)
print("Vocabulary size on all claim text:", len(vectorize_layer.get_vocabulary()))
max_features = len(vectorize_layer.get_vocabulary())
# Encode texts
train_encoded = vectorize_layer(train["concatenated_text"].to_numpy())
dev_encoded = vectorize_layer(dev["concatenated_text"].to_numpy())
test_encoded = vectorize_layer(test["concatenated_text"].to_numpy())
print("Processed and encoded data.")

Start processing!
Vocabulary size on all claim text: 12145
Processed and encoded data.


In [25]:
from sklearn.preprocessing import LabelEncoder

# Assuming train_for_multitask is your DataFrame
label_encoder = LabelEncoder()

# Convert categorical claim labels to integers
train_label = label_encoder.fit_transform(train['claim_label'])
dev_label = label_encoder.fit_transform(dev['claim_label'])

# Display unique classes and their mapping to check
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

{'DISPUTED': 0, 'NOT_ENOUGH_INFO': 1, 'REFUTES': 2, 'SUPPORTS': 3}


# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [26]:
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)  # Pass any extra arguments to the superclass
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation="relu"),
            tf.keras.layers.Dense(embed_dim),
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        config = super(TransformerBlock, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate
        })
        return config

# embedding model with lstm approach     
def create_cls_model(sequence_length, max_features, num_transformer_blocks, embedding_dim, num_heads, ff_dim, rate):
    input_layer = Input(shape=(sequence_length,), dtype="int64")
    
    # Embedding layer with positional encoding
    embedding_layer = Embedding(max_features, embedding_dim)
    x = embedding_layer(input_layer)
    # Adding positional encoding
    
    position_embedding = Embedding(input_dim=sequence_length, output_dim=embedding_dim)
    positions = tf.range(start=0, limit=sequence_length, delta=1)
    x += position_embedding(positions)
    
    # Transformer blocks
    for _ in range(num_transformer_blocks):
        x = TransformerBlock(embedding_dim, num_heads, ff_dim, rate)(x)
    x = GlobalMaxPooling1D()(x)
    x = Dropout(0.1)(x)
    # Dense layers and output
    claim_output = Dense(4, activation='softmax', name="claim_label")(x)
    
    # Compile model
    model = Model(inputs=input_layer, outputs=claim_output)
    return model

# Initialize and compile the model as before
claim_cls = create_cls_model(sequence_length=sequence_length, max_features=max_features, num_transformer_blocks = 2,
                                         embedding_dim = 1024, num_heads=4, ff_dim=128, rate=0.1)
claim_cls.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 256)]             0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 256, 1024)         12436480  
_________________________________________________________________
tf.__operators__.add_1 (TFOp (None, 256, 1024)         0         
_________________________________________________________________
transformer_block_2 (Transfo (None, 256, 1024)         17057920  
_________________________________________________________________
transformer_block_3 (Transfo (None, 256, 1024)         17057920  
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 1024)              0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 1024)              0   

In [27]:
claim_cls.compile(
    optimizer=optimizers.Adam(1e-5),
    loss={
        'claim_label': 'sparse_categorical_crossentropy'
    },
    metrics={
        'claim_label': ['accuracy']
    }
)

In [29]:
claim_cls.fit(train_encoded, train_label, epochs=10, validation_data=(dev_encoded, dev_label))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1b40737a670>

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [60]:
predictions = claim_cls.predict(test_encoded)
predicted_labels_indices = np.argmax(predictions, axis=1)

# Decode predictions back to original labels
predicted_labels = label_encoder.inverse_transform(predicted_labels_indices)

# Attach predictions to the DataFrame
test['claim_label'] = predicted_labels

You might need to change the data path of test json file

In [61]:
import json
input_json_path = '../project-data/test-claims-unlabelled.json'
with open(input_json_path, 'r') as file:
    data = json.load(file)

ordered_json_keys = list(data.keys())

In [62]:
for idx, claim_id in enumerate(ordered_json_keys):
    row = test.iloc[idx]
    evidences_formatted = [f"evidence-{ev}" for ev in row['predicted_evidence']]
    data[claim_id]['claim_label'] = row['claim_label']
    data[claim_id]['evidences'] = evidences_formatted

In [63]:
output_json_path = 'test-output.json'
with open(output_json_path, 'w') as file:
    json.dump(data, file, indent=4)