# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is anything the marker should be aware of, please mention it here.*

*If you are planning to implement your program in an object-oriented programming style, please put that code at the bottom of this ipynb file.*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [2]:
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Load the training claims. JSON is UTF-8 by specification, so the encoding is
# given explicitly instead of relying on the platform's default locale.
with open('train-claims.json', encoding='utf-8') as file:
    train_data = json.load(file)

# Collect claim texts and their labels (both in file order, so they stay aligned)
claims = [details['claim_text'] for details in train_data.values()]
labels = [details['claim_label'] for details in train_data.values()]

# Map every label to an integer index. The labels are sorted so the mapping is
# identical on every run; iterating a set of strings directly gives an order
# that can change between interpreter sessions.
unique_labels = sorted(set(labels))
label_to_index = {label: idx for idx, label in enumerate(unique_labels)}

# Convert labels to indices
label_indices = [label_to_index[label] for label in labels]

# Convert label indices to one-hot rows, one column per known class
label_seq = to_categorical(label_indices, num_classes=len(unique_labels))

# Tokenize the claims (vocabulary capped at 10000 words, unknown words mapped
# to "<OOV>") and pad/cut every sequence to exactly 50 token ids
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(claims)
sequences = tokenizer.texts_to_sequences(claims)
padded_sequences = pad_sequences(sequences, padding='post', maxlen=50)

# Hold out 10% of the examples for validation; the fixed seed keeps the split
# reproducible. (The split is performed once; a second identical call would
# only recompute the same result.)
X_train, X_val, y_train, y_val = train_test_split(
    padded_sequences, label_seq, test_size=0.1, random_state=42
)

In [12]:
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Load the training claims and the evidence passages.
# Both files are opened as UTF-8 explicitly: JSON is UTF-8 by specification and
# the texts contain non-ASCII characters, so relying on the platform's default
# locale encoding can mis-decode them or raise on some systems.
with open('train-claims.json', encoding='utf-8') as file:
    train_data = json.load(file)

with open('evidence.json', encoding='utf-8') as file:  # Ensure the path to evidence.json is correct
    evidence_data = json.load(file)

# Function to fetch evidence texts based on evidence IDs
def get_evidence_text(evidence_ids, evidence_lookup=None):
    """Join the texts of the given evidence IDs into one string.

    Args:
        evidence_ids: Iterable of evidence ID keys.
        evidence_lookup: Optional mapping from evidence ID to evidence text.
            When omitted, the module-level ``evidence_data`` mapping is used,
            which is the behavior existing one-argument callers rely on.

    Returns:
        The evidence texts separated by single spaces, in the order of
        ``evidence_ids``. An ID that is missing from the mapping contributes
        an empty string instead of raising ``KeyError``.
    """
    if evidence_lookup is None:
        evidence_lookup = evidence_data
    return ' '.join(evidence_lookup.get(eid, '') for eid in evidence_ids)

# Prepare data by combining each claim with its corresponding evidence.
# Only the claim details are needed here, so iterate over the values.
combined_texts = []
labels = []
for claim_details in train_data.values():
    # Fetch the evidence texts for the claim
    evidence_text = get_evidence_text(claim_details['evidences'])
    # Combine the claim text with the evidence texts
    combined_texts.append(claim_details['claim_text'] + " " + evidence_text)
    labels.append(claim_details['claim_label'])

# Convert labels to categorical. The labels are sorted so the label -> index
# mapping is identical on every run; iterating a set of strings directly gives
# an order that can change between interpreter sessions, which would silently
# change which output unit stands for which class.
unique_labels = sorted(set(labels))
label_to_index = {label: idx for idx, label in enumerate(unique_labels)}
label_indices = [label_to_index[label] for label in labels]
label_seq = to_categorical(label_indices, num_classes=len(unique_labels))

# Tokenize and pad combined texts (vocabulary capped at 20000 words, unknown
# words mapped to "<OOV>", every sequence padded/cut to 50 token ids)
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(combined_texts)
sequences = tokenizer.texts_to_sequences(combined_texts)
# NOTE(review): pad_sequences truncates from the front by default
# (truncating='pre'), so a combined text longer than 50 tokens keeps only its
# last 50 tokens and loses the claim at the start - confirm this is intended.
# Any change must be mirrored wherever dev/test texts are padded.
padded_sequences = pad_sequences(sequences, padding='post', maxlen=50)

# No need to split the data as we will use the entire dataset for training
X_train = padded_sequences
y_train = label_seq


# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [24]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Build, compile and train the classifier with all ops placed on the CPU
with tf.device('/cpu:0'):
    layer_stack = [
        # Token ids -> 128-dimensional vectors; id 0 is masked as padding
        Embedding(input_dim=20000, output_dim=128, input_length=50, mask_zero=True),
        # First recurrent layer hands its full output sequence to the next one
        LSTM(128, return_sequences=True),
        # Second recurrent layer keeps only its final output
        LSTM(64),
        Dense(64, activation='relu'),
        # One softmax unit per column of the one-hot label matrix
        Dense(label_seq.shape[1], activation='softmax'),
    ]
    model = Sequential(layer_stack)

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    model.summary()

    # Fit on the whole training set for 10 epochs
    model.fit(X_train, y_train, epochs=10)

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 50, 128)           2560000   
                                                                 
 lstm (LSTM)                 (None, 50, 128)           131584    
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dense_11 (Dense)            (None, 64)                4160      
                                                                 
 dense_12 (Dense)            (None, 4)                 260       
                                                                 
Total params: 2,745,412
Trainable params: 2,745,412
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


In [25]:
import json

# Load the development claims.
# The file is opened as UTF-8 explicitly: JSON is UTF-8 by specification and
# the claim texts contain non-ASCII characters, so the platform's default
# locale encoding must not be relied on.
with open('dev-claims.json', encoding='utf-8') as file:
    dev_data = json.load(file)

# Assuming 'evidence.json' has been loaded as evidence_data (you should have this from your earlier steps)
# Reuse the get_evidence_text function if available, else define it again
def get_evidence_text(evidence_ids, evidence_lookup=None):
    """Concatenate the evidence passages for ``evidence_ids``.

    Args:
        evidence_ids: Iterable of evidence ID keys.
        evidence_lookup: Optional mapping from evidence ID to evidence text;
            defaults to the module-level ``evidence_data`` mapping so that
            one-argument calls keep working unchanged.

    Returns:
        A single string with the passages separated by one space each, in the
        order of ``evidence_ids``. Unknown IDs contribute an empty string
        rather than raising ``KeyError``.
    """
    if evidence_lookup is None:
        evidence_lookup = evidence_data
    return ' '.join(evidence_lookup.get(eid, '') for eid in evidence_ids)

# Prepare the combined texts and labels for the development set.
# Only the claim details are needed, so iterate over the values; the order
# still matches the key order of dev_data.
dev_combined_texts = []
dev_labels = []
for claim_details in dev_data.values():
    evidence_text = get_evidence_text(claim_details['evidences'])
    dev_combined_texts.append(claim_details['claim_text'] + " " + evidence_text)
    dev_labels.append(claim_details['claim_label'])

# Convert labels of the development set to categorical, reusing the mapping
# built from the training labels. num_classes is pinned to the size of that
# mapping: without it to_categorical sizes the matrix from the largest index
# present, so a dev set lacking the highest-indexed class would produce fewer
# columns than the model has output units and evaluation would fail.
dev_label_indices = [label_to_index[label] for label in dev_labels]
dev_label_seq = to_categorical(dev_label_indices, num_classes=len(label_to_index))

# Tokenize and pad the development combined texts with the tokenizer fitted on
# the training texts, using the same padding settings as for training
dev_sequences = tokenizer.texts_to_sequences(dev_combined_texts)
dev_padded_sequences = pad_sequences(dev_sequences, padding='post', maxlen=50)

# Evaluate the model on the development set; evaluation holds the loss
# followed by the compiled metrics
with tf.device('/cpu:0'):
    evaluation = model.evaluate(dev_padded_sequences, dev_label_seq)
    print(f'Model Loss: {evaluation[0]}, Model Accuracy: {evaluation[1]}')


Model Loss: 1.9598561525344849, Model Accuracy: 0.4285714328289032


In [26]:
import numpy as np

# Score the development set and keep, for every example, the index of the
# class with the highest predicted probability
with tf.device('/cpu:0'):
    predictions = model.predict(dev_padded_sequences)
    predicted_labels_indices = predictions.argmax(axis=1)

# Collapse the one-hot gold labels back to class indices so they can be
# compared element-wise with the predictions
true_labels_indices = dev_label_seq.argmax(axis=1)




In [27]:
# Positions of the development examples whose predicted class differs from the
# gold class
misclassified_indices = np.where(predicted_labels_indices != true_labels_indices)[0]
# Inverse of label_to_index, used to print label names instead of indices
index_to_label = {v: k for k, v in label_to_index.items()}

# Materialise the claim IDs once; rebuilding the key list inside the loop would
# repeat the same work for every reported example
dev_claim_ids = list(dev_data)

for index in misclassified_indices[:10]:  # Limiting to the first 10 errors for initial review
    print("Claim ID:", dev_claim_ids[index])
    # The printed text is the claim followed by its concatenated evidence
    print("Claim Text:", dev_combined_texts[index])
    print("Predicted Label:", index_to_label[predicted_labels_indices[index]])
    print("True Label:", index_to_label[true_labels_indices[index]])
    print("----")

Claim ID: claim-752
Claim Text: [South Australia] has the most expensive electricity in the world. [citation needed] South Australia has the highest retail price for electricity in the country. "South Australia has the highest power prices in the world".
Predicted Label: NOT_ENOUGH_INFO
True Label: SUPPORTS
----
Claim ID: claim-375
Claim Text: when 3 per cent of total annual global emissions of carbon dioxide are from humans and Australia prod­uces 1.3 per cent of this 3 per cent, then no amount of emissions reductio­n here will have any effect on global climate. The 2011 UNEP Green Economy report states that "[a]agricultural operations, excluding land use changes, produce approximately 13 per cent of anthropogenic global GHG emissions. With a market share of 30% and (potentially) clean electricity, heat pumps could reduce global CO 2 emissions by 8% annually. In the modern era, emissions to the atmosphere from volcanoes are approximately 0.645 billion tonnes of CO 2 per year, whereas 

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*