In [1]:
# Mount Google Drive at /content/drive so the dataset and the saved-responses
# file under /content/drive/MyDrive/... are reachable from later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import json
import random
import re
import torch

# Record which accelerator is visible. NOTE(review): `device` is only computed
# here; nothing in this cell moves the model onto it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer and seq2seq weights both come from the same FLAN-T5 XXL checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-xxl")

# Run identifier; it becomes part of the saved-responses file name.
run = 2

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/9.60G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/6.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# # Load the COPA data from a JSONL file
# def load_copa_data(file_path):
#     with open(file_path, 'r') as file:
#         return [json.loads(line) for line in file]

# Function to load the dataset from a JSON file
def load_copa_data(file_path, sample_size=100000, seed=None):
    """Load the dataset from a JSON file, optionally drawing a random subset.

    Args:
        file_path: Path to a UTF-8 JSON file whose top-level value is a
            sequence of items.
        sample_size: Maximum number of items to return. ``None``, or any value
            at least as large as the dataset, returns every item in file order.
        seed: Optional seed for the subsample. When given, the same seed always
            selects the same items and the global ``random`` state is left
            untouched. When ``None`` the global ``random`` module is used, as
            before.

    Returns:
        The full list of items, or a list of ``sample_size`` items sampled
        without replacement.

    Raises:
        ValueError: If ``sample_size`` is negative (raised by ``sample``).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    if sample_size is None or sample_size >= len(data):
        return data

    # A dedicated generator makes a seeded draw reproducible regardless of what
    # else has consumed the global random state.
    sampler = random.Random(seed) if seed is not None else random
    return sampler.sample(data, sample_size)

# Format the COPA query for the T5 model
def format_copa_query(item):
    """Build the three-way multiple-choice prompt for one dataset item.

    Reads ``context``, ``choice_id: 0``..``choice_id: 2`` and ``ask-for`` from
    ``item``. The question asks for the "cause" only when ``ask-for`` equals
    ``"cause"``; any other value asks for the "effect".
    """
    premise = item['context']
    choices = [item[f'choice_id: {position}'] for position in range(3)]

    relation = "effect"
    if item['ask-for'] == "cause":
        relation = "cause"

    # Hypotheses are presented 1-based although the dataset keys are 0-based.
    options = "; OR ".join(
        f"hypothesis {number}: {text}" for number, text in enumerate(choices, start=1)
    )
    return f"Given the premise: {premise}; what is the {relation}? Is it {options}?"

# Send the query to the T5 model and get the response
def query_t5(query):
    """Generate the model's answer to ``query`` and return it as plain text.

    Uses the module-level ``tokenizer`` and ``model``. Decoding is beam search
    with 4 beams, stopping early, capped at ``max_length=40``.

    Args:
        query: Prompt string, e.g. the output of ``format_copa_query``.

    Returns:
        The first generated sequence decoded with special tokens removed.
    """
    # The tokenizer returns CPU tensors; place them wherever the model's
    # weights live so generation also works once the model is on a GPU.
    inputs = tokenizer.encode(query, return_tensors="pt").to(model.device)
    outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)



def normalize_text(text):
    """Return ``text`` lowercased, without punctuation, single-spaced and trimmed.

    "Punctuation" means every character that is neither a word character nor
    whitespace, so underscores and digits are kept.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    # Collapse every whitespace run (spaces, tabs, newlines) to one space
    # before trimming the ends.
    return re.sub(r'\s+', ' ', without_punctuation).strip()

# Accumulates one record per judged response; written to disk by the caller.
saved_responses = []


def is_response_correct(response, item):
    """Decide whether the model's ``response`` selects the gold hypothesis.

    Side effect: appends a record (item fields plus the normalized response)
    to the module-level ``saved_responses`` list on every call.

    The decision is made in three stages, first match wins:
      1. The normalized response equals the normalized gold hypothesis text.
      2. The response names "hypothesis 1", "hypothesis 2" or "hypothesis 3"
         as a whole phrase; it is correct when that number is ``label + 1``.
         If several are named, the lowest number is used.
      3. More than half of the gold hypothesis' words appear as whole words in
         the response.

    Args:
        response: Raw text generated by the model.
        item: Dataset item with ``index``, ``context``, ``ask-for``,
            ``choice_id: 0``..``choice_id: 2`` and a 0-based ``label``.

    Returns:
        True when the response is judged correct, otherwise False.
    """
    correct_hypothesis = item['choice_id: ' + str(item['label'])]
    response_text = normalize_text(response)
    correct_hypothesis_normalized = normalize_text(correct_hypothesis)

    saved_responses.append({
        'index': item['index'],
        'premise': item['context'],
        'task': item['ask-for'],
        'hypothesis1': item['choice_id: 0'],
        'hypothesis2': item['choice_id: 1'],
        'hypothesis3': item['choice_id: 2'],
        'label': item['label'],
        'generated_response': response_text
    })

    # Direct comparison for exact matches after normalization
    if response_text == correct_hypothesis_normalized:
        return True

    # Explicit references to "hypothesis 1", "hypothesis 2" or "hypothesis 3".
    # The word boundaries stop "hypothesis 10" from counting as "hypothesis 1".
    for hypothesis_number in (1, 2, 3):
        if re.search(rf'\bhypothesis {hypothesis_number}\b', response_text):
            return hypothesis_number == item['label'] + 1

    # Word-overlap fallback for free-text answers.
    key_phrases = correct_hypothesis_normalized.split()
    if not key_phrases:
        # Gold text has no words to compare against (e.g. only punctuation);
        # without an exact match above there is nothing to credit.
        return False

    # Compare whole words: a substring test would let short tokens such as
    # "a" or "in" match inside unrelated words.
    response_words = set(response_text.split())
    matching_phrases = [phrase for phrase in key_phrases if phrase in response_words]
    match_ratio = len(matching_phrases) / len(key_phrases)

    # Consider a response correct if a significant portion of key phrases match
    return match_ratio > 0.5  # Adjust this threshold based on experimentation and desired strictness


def compute_metrics(y_true, y_pred):
    """Score predictions against ground truth with scikit-learn.

    Returns a list ``[accuracy, f1, precision, recall]``; F1, precision and
    recall use ``average='weighted'``.
    """
    weighted = {'average': 'weighted'}
    return [
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, **weighted),
        precision_score(y_true, y_pred, **weighted),
        recall_score(y_true, y_pred, **weighted),
    ]

responses_file = '/content/drive/MyDrive/Datasets/causalnet_t5_responses_' + str(run) + '.jsonl'


def _load_cached_responses(path):
    """Read a saved-responses JSONL file back into (item, response) pairs.

    Each record is mapped back to the dataset key names expected by
    ``is_response_correct``. Returns None when the file is missing or holds no
    records, so the caller falls back to querying the model.
    """
    try:
        with open(path, 'r', encoding='utf-8') as file:
            records = [json.loads(line) for line in file if line.strip()]
    except FileNotFoundError:
        return None
    if not records:
        return None

    return [
        (
            {
                'index': record['index'],
                'context': record['premise'],
                'ask-for': record['task'],
                'choice_id: 0': record['hypothesis1'],
                'choice_id: 1': record['hypothesis2'],
                'choice_id: 2': record['hypothesis3'],
                'label': record['label'],
            },
            record['generated_response'],
        )
        for record in records
    ]


def evaluate_accuracy(data):
    """Judge every item and return the fraction answered correctly.

    If ``responses_file`` already exists, its saved responses are re-scored
    instead of querying the model again; otherwise each item in ``data`` is
    sent to the model. Either way every item goes through
    ``is_response_correct``, so the module-level ``saved_responses`` list is
    (re)populated, and the module-level ``acts`` / ``preds`` lists receive one
    entry per item.

    ``is_response_correct`` only reports right or wrong, so for a wrong answer
    ``preds`` gets a placeholder label that merely differs from the truth
    (1 when the label is 0, otherwise 0), not the option the model picked.

    Args:
        data: Sequence of dataset items; ignored when cached responses exist.

    Returns:
        Accuracy in [0, 1]; 0.0 when there is nothing to evaluate.
    """
    cached = _load_cached_responses(responses_file)
    if cached is not None:
        print("Loaded saved responses.")
        items = [item for item, _ in cached]
    else:
        items = data

    total_items = len(items)
    if total_items == 0:
        print("\nNo items to evaluate.")
        return 0.0

    correct_count = 0
    for index, item in enumerate(items):
        print(f"\nProcessing item {index + 1}/{total_items}...")

        if cached is not None:
            response = cached[index][1]
        else:
            response = query_t5(format_copa_query(item))
        print(f"Generated Response: {response}")
        acts.append(item['label'])

        correct_hypothesis = item['choice_id: ' + str(item['label'])]
        print(f"Correct Hypothesis: hypothesis {item['label'] + 1}, {correct_hypothesis}")

        if is_response_correct(response, item):
            print(f"Item {index + 1}: Correct")
            correct_count += 1
            preds.append(item['label'])
        else:
            print(f"Item {index + 1}: Incorrect")
            # Placeholder wrong label; see the docstring.
            preds.append(0 if item['label'] else 1)

    accuracy = correct_count / total_items
    print(f"\nAccuracy: {accuracy * 100:.2f}%")
    return accuracy


# def is_response_correct_loaded(item, response):
#     try:
#         pred_hyp = int(response.split(' ')[-1])
#     except ValueError:
#         pred_hyp = -1
#     return pred_hyp == item['label'] + 1

# Load the evaluation items from Drive. No sample_size is passed, so the
# loader's default applies and the file is only subsampled if it holds more
# items than that default.
copa_data = load_copa_data('/content/drive/MyDrive/Datasets/causalnetcd.json')

# Evaluate accuracy
# If model has already been run
# with open(responses_file, 'r', encoding='utf-8') as file:
#     saved_responses = [json.loads(line) for line in file]
# print("Loaded saved responses.")
# preds, golds = [], []
# for item in saved_responses:
#     if is_response_correct_loaded(item, item['generated_response']):
#         preds.append(item['label'])
#     else:
#         preds.append(1 - item['label'])  # Assuming binary labels (0 and 1)
#     golds.append(item['label'])

# sklearn_accuracy, f1val, precisionval, recallval = compute_metrics(golds, preds)
# print(f"sklearn Accuracy: {sklearn_accuracy * 100:.3f}%")
# print(f"F1: {f1val * 100:.3f}%")
# print(f"Precision: {precisionval * 100:.3f}%")
# print(f"Recall: {recallval * 100:.3f}%")
# End if model has already been run

# Fresh accumulators for this run; the evaluation helpers append to them.
saved_responses = []
acts = []   # ground-truth labels
preds = []  # labels credited to the model
accuracy = evaluate_accuracy(copa_data)

metrics = compute_metrics(acts, preds)
sklearn_accuracy, f1val, precisionval, recallval = metrics
print(f"Accuracy: {accuracy * 100:.3f}%")
print(f"sklearn Accuracy: {sklearn_accuracy * 100:.3f}%")
print(f"F1: {f1val * 100:.3f}%")
print(f"Precision: {precisionval * 100:.3f}%")
print(f"Recall: {recallval * 100:.3f}%")

# Persist one JSON object per line so a later run can reuse the responses.
with open(responses_file, 'w', encoding='utf-8') as out_file:
    out_file.writelines(json.dumps(record) + '\n' for record in saved_responses)
print('Responses saved')


Processing item 1/1000...
Generated Response: hypothesis 2
Correct Hypothesis: hypothesis 2, The celebrity's influence changed teenagers' perceptions more significantly than expected.
Item 1: Correct

Processing item 2/1000...
Generated Response: hypothesis 2
Correct Hypothesis: hypothesis 2, The road improved access to educational resources and opportunities.
Item 2: Correct

Processing item 3/1000...
Generated Response: hypothesis 3
Correct Hypothesis: hypothesis 3, Improved agricultural efficiency made rural living more sustainable, reducing the need for urban migration.
Item 3: Correct

Processing item 4/1000...
Generated Response: hypothesis 2
Correct Hypothesis: hypothesis 2, The rise in public transportation usage reduced the number of cars on the road.
Item 4: Correct

Processing item 5/1000...
Generated Response: hypothesis 2
Correct Hypothesis: hypothesis 2, The jellyfish blooms could have deterred tourists despite the beach's accolades.
Item 5: Correct

Processing item 6/10

In [None]:
# Final cell: hand the Colab runtime back once the evaluation has finished.
# NOTE(review): presumably this terminates the session and discards in-memory
# state — confirm the responses file has been written before this runs.
from google.colab import runtime
runtime.unassign()