In [None]:
!pip install -q accelerate==0.21.0 bitsandbytes==0.40.2
# !pip install -q transformers==4.30.0
# !pip install -q git+https://github.com/NousResearch/Llama.git

In [None]:
# Mount Google Drive at /content/drive; the dataset and the cached model
# responses used further down are read from and written to paths under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json
import random
import transformers
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Generation below samples tokens (do_sample=True) and the loader may sub-sample
# the dataset, so seed Python, NumPy and PyTorch once for repeatable runs.
SEED = 42
transformers.set_seed(SEED)

# Load model and tokenizer
model_name = "NousResearch/llama-2-7b-chat-hf"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
# Hand the tokenizer to the pipeline so it is loaded only once and the
# `tokenizer.eos_token_id` used at generation time matches the pipeline's own.
# NOTE(review): float32 weights need about 4 bytes per parameter; consider
# torch.float16 if memory is tight. Left unchanged to keep outputs comparable.
model_pipeline = transformers.pipeline(
    "text-generation",
    model=model_name,
    tokenizer=tokenizer,
    torch_dtype=torch.float32,
    device_map="auto",  # Use GPU if available
)

In [None]:
def compute_metrics(y_true, y_pred):
    """Score predictions against gold labels with scikit-learn.

    F1, precision and recall are computed with ``average='weighted'``.

    Args:
        y_true: Gold labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        A list ``[accuracy, f1, precision, recall]``.
    """
    averaging = {'average': 'weighted'}
    return [
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, **averaging),
        precision_score(y_true, y_pred, **averaging),
        recall_score(y_true, y_pred, **averaging),
    ]

# Check if the response correctly aligns with a given hypothesis
def is_response_correct(response_text, correct_hypothesis, threshold=0.5):
    """Heuristically decide whether a response restates the correct hypothesis.

    The hypothesis is lower-cased and split on whitespace; each resulting token
    is then looked up as a *substring* of the lower-cased response. Short tokens
    such as "a" therefore match almost any response.

    Args:
        response_text: Text generated by the model.
        correct_hypothesis: Text of the gold answer.
        threshold: Fraction of hypothesis tokens that must be exceeded for the
            response to count as correct. Defaults to 0.5.

    Returns:
        True when strictly more than ``threshold`` of the hypothesis tokens occur
        in the response. False when the hypothesis contains no tokens at all.
    """
    response_lower = response_text.lower()
    tokens = correct_hypothesis.lower().split()

    if not tokens:
        # An empty or whitespace-only hypothesis gives nothing to match against;
        # bail out instead of dividing by zero below.
        return False

    hits = sum(1 for token in tokens if token in response_lower)
    return hits / len(tokens) > threshold

# Load the dataset from a JSON file and randomly sample at most `sample_size` items
def load_ecare_data(file_path, sample_size=100000, seed=None):
    """Load a dataset from a JSON file and optionally down-sample it.

    Args:
        file_path: Path to a UTF-8 encoded JSON file. The parsed top-level value
            must support ``len()`` and ``random.sample`` (presumably a list of
            item dicts; confirm against the data file).
        sample_size: Maximum number of items to return.
        seed: Optional seed for the sampling step. With the default ``None`` the
            module-level ``random`` state is used, as before.

    Returns:
        The full parsed data when it holds no more than ``sample_size`` items,
        otherwise a list of ``sample_size`` items drawn without replacement.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    if sample_size >= len(data):
        return data

    # A dedicated generator makes the draw repeatable without touching the
    # global random state.
    rng = random if seed is None else random.Random(seed)
    return rng.sample(data, sample_size)

# Function to calculate F1 score
def compute_f1_score(golds, preds):
    """Compute binary F1 with label ``1`` as the positive class.

    Only pairs whose gold and predicted labels are 0 or 1 contribute to the
    counts; any other label value is ignored.

    Args:
        golds: Sequence of gold labels.
        preds: Sequence of predicted labels, aligned with ``golds``.

    Returns:
        The F1 score, or ``0`` when precision and recall are both zero.
    """
    true_pos = false_pos = false_neg = 0
    for gold, pred in zip(golds, preds):
        if gold == 1 and pred == 1:
            true_pos += 1
        elif gold == 0 and pred == 1:
            false_pos += 1
        elif gold == 1 and pred == 0:
            false_neg += 1

    predicted_pos = true_pos + false_pos
    actual_pos = true_pos + false_neg
    precision = true_pos / predicted_pos if predicted_pos > 0 else 0
    recall = true_pos / actual_pos if actual_pos > 0 else 0

    if (precision + recall) > 0:
        return 2 * (precision * recall) / (precision + recall)
    return 0

# Load the dataset
run = 1
file_path = '/content/drive/MyDrive/Datasets/causalnetcd.json'
responses_file = '/content/drive/MyDrive/Datasets/llama2_causalnet_responses_' + str(run) + '.jsonl'
dataset = load_ecare_data(file_path)
total_items = len(dataset)
preds, golds = [], []

# Step 1: obtain one generated response per item. Reuse the cached responses
# when the file exists; otherwise query the model and write the cache.
try:
    with open(responses_file, 'r', encoding='utf-8') as file:
        saved_responses = [json.loads(line) for line in file if line.strip()]
    print("Loaded saved responses.")
except FileNotFoundError:
    saved_responses = []
    for idx, item in enumerate(dataset):
        print(f"\nProcessing item {idx + 1}/{total_items}...")
        task = "cause" if item["ask-for"] == "cause" else "effect"
        query = f"{item['context']}; what is the {task}?: 1. {item['choice_id: 0']} OR 2. {item['choice_id: 1']} OR 3. {item['choice_id: 2']}? Answer:"
        # NOTE(review): max_length bounds prompt + generated tokens, so a long
        # prompt leaves little room for the answer; max_new_tokens may fit better.
        response = model_pipeline(
            query,
            do_sample=True,
            top_k=10,
            top_p=0.9,
            temperature=0.2,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            max_length=150,
        )
        generated_text = response[0]['generated_text']
        # The pipeline output starts with the prompt; keep only the continuation
        if len(generated_text) > len(query):
            generated_text = generated_text[len(query):]
        print(f"Generated Response: {generated_text}")

        saved_responses.append({
            'index': item['index'],
            'premise': item['context'],
            'hypothesis1': item['choice_id: 0'],
            'hypothesis2': item['choice_id: 1'],
            'hypothesis3': item['choice_id: 2'],
            'label': item['label'],
            'generated_response': generated_text
        })

    # Save the generated responses
    with open(responses_file, 'w', encoding='utf-8') as file:
        for response in saved_responses:
            file.write(json.dumps(response) + '\n')
    print("Responses saved.")

# Step 2: score every response. This runs for both fresh and cached responses;
# previously golds/preds were only filled while generating, so a cached run
# crashed with a division by zero.
for idx, item in enumerate(saved_responses):
    # Saved records store the choices as hypothesis1..hypothesis3 (label + 1)
    correct_hypothesis = item['hypothesis' + str(item['label'] + 1)]
    print(f"Correct Hypothesis: hypothesis {item['label'] + 1}, {correct_hypothesis}")
    golds.append(item['label'])

    if is_response_correct(item['generated_response'], correct_hypothesis):
        print(f"Item {idx}: Correct")
        preds.append(item['label'])
    else:
        print(f"Item {idx}: Incorrect")
        # The heuristic only says right or wrong, so record some label that
        # differs from the gold one (gold 0 -> 1, gold 1 or 2 -> 0).
        preds.append(0 if item['label'] else 1)

if not golds:
    raise ValueError("No responses to score: the dataset or the cached responses file is empty.")

# Calculate F1 score and accuracy
# compute_f1_score counts only 0/1 labels, so items with label 2 do not
# contribute to it; the sklearn metrics below cover all labels.
f1score = compute_f1_score(golds, preds)
accuracy = sum(1 for g, p in zip(golds, preds) if g == p) / len(golds)

# Print the results
print(f"F1 Score: {f1score:.2f}")
print(f"Accuracy: {accuracy:.2f}")

print('---')

sklearn_accuracy, f1val, precisionval, recallval = compute_metrics(golds, preds)
print(f"sklearn Accuracy: {sklearn_accuracy * 100:.3f}%")
print(f"F1: {f1val * 100:.3f}%")
print(f"Precision: {precisionval * 100:.3f}%")
print(f"Recall: {recallval * 100:.3f}%")

In [None]:
def compute_metrics(y_true, y_pred):
    """Score predictions against gold labels with scikit-learn.

    Every scorer is called with its default settings.

    Args:
        y_true: Gold labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        A list ``[accuracy, f1, precision, recall]``.
    """
    scorers = (accuracy_score, f1_score, precision_score, recall_score)
    return [scorer(y_true, y_pred) for scorer in scorers]

# Check if the response correctly aligns with a given hypothesis
def is_response_correct(response_text, correct_hypothesis, threshold=0.5):
    """Heuristically decide whether a response restates the correct hypothesis.

    The hypothesis is lower-cased and split on whitespace; each resulting token
    is then looked up as a *substring* of the lower-cased response. Short tokens
    such as "a" therefore match almost any response.

    Args:
        response_text: Text generated by the model.
        correct_hypothesis: Text of the gold answer.
        threshold: Fraction of hypothesis tokens that must be exceeded for the
            response to count as correct. Defaults to 0.5.

    Returns:
        True when strictly more than ``threshold`` of the hypothesis tokens occur
        in the response. False when the hypothesis contains no tokens at all.
    """
    response_lower = response_text.lower()
    tokens = correct_hypothesis.lower().split()

    if not tokens:
        # An empty or whitespace-only hypothesis gives nothing to match against;
        # bail out instead of dividing by zero below.
        return False

    hits = sum(1 for token in tokens if token in response_lower)
    return hits / len(tokens) > threshold

# Load the e-CARE data from a JSONL file and randomly sample at most `sample_size` items
def load_ecare_data(file_path, sample_size=100000, seed=None):
    """Load a JSONL dataset (one JSON value per line) and optionally down-sample it.

    Args:
        file_path: Path to a UTF-8 encoded JSONL file.
        sample_size: Maximum number of items to return.
        seed: Optional seed for the sampling step. With the default ``None`` the
            module-level ``random`` state is used, as before.

    Returns:
        A list with every parsed line when the file holds no more than
        ``sample_size`` items, otherwise ``sample_size`` items drawn without
        replacement.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Skip blank lines (e.g. a trailing newline), which json.loads rejects
        data = [json.loads(line) for line in file if line.strip()]

    if sample_size >= len(data):
        return data

    # A dedicated generator makes the draw repeatable without touching the
    # global random state.
    rng = random if seed is None else random.Random(seed)
    return rng.sample(data, sample_size)

# Function to calculate F1 score
def compute_f1_score(golds, preds):
    """Compute binary F1, treating label ``1`` as the positive class.

    Args:
        golds: Sequence of gold labels.
        preds: Sequence of predicted labels, aligned with ``golds``.

    Returns:
        The F1 score, or ``0`` when precision and recall are both zero.
    """
    pairs = list(zip(golds, preds))

    def count(gold_value, pred_value):
        # Number of (gold, pred) pairs equal to the requested combination
        return len([1 for g, p in pairs if g == gold_value and p == pred_value])

    tp, fp, fn = count(1, 1), count(0, 1), count(1, 0)

    if (tp + fp) > 0:
        precision = tp / (tp + fp)
    else:
        precision = 0

    if (tp + fn) > 0:
        recall = tp / (tp + fn)
    else:
        recall = 0

    if (precision + recall) > 0:
        f1score = 2 * (precision * recall) / (precision + recall)
    else:
        f1score = 0
    return f1score

# Load the dataset
run = 2
file_path = '/kaggle/input/e-care/ecare_' + str(run) + '.json'
responses_file = '/kaggle/working/llama2_responses_' + str(run) + '.jsonl'
dataset = load_ecare_data(file_path)
total_items = len(dataset)
preds, golds = [], []

# Step 1: obtain one generated response per item. Reuse the cached responses
# when the file exists; otherwise query the model and write the cache.
try:
    with open(responses_file, 'r', encoding='utf-8') as file:
        saved_responses = [json.loads(line) for line in file if line.strip()]
    print("Loaded saved responses.")
except FileNotFoundError:
    saved_responses = []
    for idx, item in enumerate(dataset):
        print(f"\nProcessing item {idx + 1}/{total_items}...")
        task = "cause" if item["ask-for"] == "cause" else "effect"
        query = f"Given the premise {item['premise']}; what is the {task}?: {item['hypothesis1']} OR {item['hypothesis2']}? Answer:"
        # NOTE(review): max_length bounds prompt + generated tokens, so a long
        # prompt leaves little room for the answer; max_new_tokens may fit better.
        response = model_pipeline(
            query,
            do_sample=True,
            top_k=10,
            top_p=0.9,
            temperature=0.2,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            max_length=100,
        )
        generated_text = response[0]['generated_text']
        # The pipeline output starts with the prompt; keep only the continuation
        if len(generated_text) > len(query):
            generated_text = generated_text[len(query):]
        print(f"Generated Response: {generated_text}")

        saved_responses.append({
            'index': item['index'],
            'premise': item['premise'],
            'hypothesis1': item['hypothesis1'],
            'hypothesis2': item['hypothesis2'],
            'label': item['label'],
            'generated_response': generated_text
        })

    # Save the generated responses
    with open(responses_file, 'w', encoding='utf-8') as file:
        for response in saved_responses:
            file.write(json.dumps(response) + '\n')
    print("Responses saved.")

# Step 2: score every response. This runs for both fresh and cached responses;
# previously golds/preds were only filled while generating, so a cached run
# crashed with a division by zero.
for idx, item in enumerate(saved_responses):
    correct_hypothesis = item['hypothesis1'] if item['label'] == 0 else item['hypothesis2']
    print(f"Correct Hypothesis: hypothesis {item['label'] + 1}, {correct_hypothesis}")
    golds.append(item['label'])

    if is_response_correct(item['generated_response'], correct_hypothesis):
        print(f"Item {idx}: Correct")
        preds.append(item['label'])
    else:
        print(f"Item {idx}: Incorrect")
        preds.append(1 - item['label'])  # Assuming binary labels (0 and 1)

if not golds:
    raise ValueError("No responses to score: the dataset or the cached responses file is empty.")

# Calculate F1 score and accuracy
f1score = compute_f1_score(golds, preds)
accuracy = sum(1 for g, p in zip(golds, preds) if g == p) / len(golds)

# Print the results
print(f"F1 Score: {f1score:.2f}")
print(f"Accuracy: {accuracy:.2f}")

print('---')

sklearn_accuracy, f1val, precisionval, recallval = compute_metrics(golds, preds)
print(f"sklearn Accuracy: {sklearn_accuracy * 100:.3f}%")
print(f"F1: {f1val * 100:.3f}%")
print(f"Precision: {precisionval * 100:.3f}%")
print(f"Recall: {recallval * 100:.3f}%")

In [None]:
def compute_metrics(y_true, y_pred):
    """Evaluate predictions with four scikit-learn scorers at default settings.

    Args:
        y_true: Gold labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        A list ``[accuracy, f1, precision, recall]``.
    """
    results = []
    for score_fn in (accuracy_score, f1_score, precision_score, recall_score):
        results.append(score_fn(y_true, y_pred))
    return results

# Check if the response correctly aligns with a given hypothesis
def is_response_correct(response_text, correct_hypothesis, threshold=0.5):
    """Heuristically decide whether a response restates the correct hypothesis.

    The hypothesis is lower-cased and split on whitespace; each resulting token
    is then looked up as a *substring* of the lower-cased response. Short tokens
    such as "a" therefore match almost any response.

    Args:
        response_text: Text generated by the model.
        correct_hypothesis: Text of the gold answer.
        threshold: Fraction of hypothesis tokens that must be exceeded for the
            response to count as correct. Defaults to 0.5.

    Returns:
        True when strictly more than ``threshold`` of the hypothesis tokens occur
        in the response. False when the hypothesis contains no tokens at all.
    """
    response_lower = response_text.lower()
    tokens = correct_hypothesis.lower().split()

    if not tokens:
        # An empty or whitespace-only hypothesis gives nothing to match against;
        # bail out instead of dividing by zero below.
        return False

    hits = sum(1 for token in tokens if token in response_lower)
    return hits / len(tokens) > threshold

# Load the e-CARE data from a JSONL file and randomly sample at most `sample_size` items
def load_ecare_data(file_path, sample_size=100000, seed=None):
    """Load a JSONL dataset (one JSON value per line) and optionally down-sample it.

    Args:
        file_path: Path to a UTF-8 encoded JSONL file.
        sample_size: Maximum number of items to return.
        seed: Optional seed for the sampling step. With the default ``None`` the
            module-level ``random`` state is used, as before.

    Returns:
        A list with every parsed line when the file holds no more than
        ``sample_size`` items, otherwise ``sample_size`` items drawn without
        replacement.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Skip blank lines (e.g. a trailing newline), which json.loads rejects
        data = [json.loads(line) for line in file if line.strip()]

    if sample_size >= len(data):
        return data

    # A dedicated generator makes the draw repeatable without touching the
    # global random state.
    rng = random if seed is None else random.Random(seed)
    return rng.sample(data, sample_size)

# Function to calculate F1 score
def compute_f1_score(golds, preds):
    """Compute binary F1 where ``1`` is the positive label and ``0`` the negative.

    Args:
        golds: Sequence of gold labels.
        preds: Sequence of predicted labels, aligned with ``golds``.

    Returns:
        The F1 score, or ``0`` when precision and recall are both zero.
    """
    tally = {'tp': 0, 'fp': 0, 'fn': 0}
    for gold, pred in zip(golds, preds):
        if pred == 1:
            if gold == 1:
                tally['tp'] += 1
            elif gold == 0:
                tally['fp'] += 1
        elif pred == 0 and gold == 1:
            tally['fn'] += 1

    tp, fp, fn = tally['tp'], tally['fp'], tally['fn']
    precision = 0 if (tp + fp) <= 0 else tp / (tp + fp)
    recall = 0 if (tp + fn) <= 0 else tp / (tp + fn)

    if not (precision + recall) > 0:
        return 0
    return 2 * (precision * recall) / (precision + recall)

# Load the dataset
run = 3
file_path = '/kaggle/input/e-care/ecare_' + str(run) + '.json'
responses_file = '/kaggle/working/llama2_responses_' + str(run) + '.jsonl'
dataset = load_ecare_data(file_path)
total_items = len(dataset)
preds, golds = [], []

# Step 1: obtain one generated response per item. Reuse the cached responses
# when the file exists; otherwise query the model and write the cache.
try:
    with open(responses_file, 'r', encoding='utf-8') as file:
        saved_responses = [json.loads(line) for line in file if line.strip()]
    print("Loaded saved responses.")
except FileNotFoundError:
    saved_responses = []
    for idx, item in enumerate(dataset):
        print(f"\nProcessing item {idx + 1}/{total_items}...")
        task = "cause" if item["ask-for"] == "cause" else "effect"
        query = f"Given the premise {item['premise']}; what is the {task}?: {item['hypothesis1']} OR {item['hypothesis2']}? Answer:"
        # NOTE(review): max_length bounds prompt + generated tokens, so a long
        # prompt leaves little room for the answer; max_new_tokens may fit better.
        response = model_pipeline(
            query,
            do_sample=True,
            top_k=10,
            top_p=0.9,
            temperature=0.2,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            max_length=100,
        )
        generated_text = response[0]['generated_text']
        # The pipeline output starts with the prompt; keep only the continuation
        if len(generated_text) > len(query):
            generated_text = generated_text[len(query):]
        print(f"Generated Response: {generated_text}")

        saved_responses.append({
            'index': item['index'],
            'premise': item['premise'],
            'hypothesis1': item['hypothesis1'],
            'hypothesis2': item['hypothesis2'],
            'label': item['label'],
            'generated_response': generated_text
        })

    # Save the generated responses
    with open(responses_file, 'w', encoding='utf-8') as file:
        for response in saved_responses:
            file.write(json.dumps(response) + '\n')
    print("Responses saved.")

# Step 2: score every response. This runs for both fresh and cached responses;
# previously golds/preds were only filled while generating, so a cached run
# crashed with a division by zero.
for idx, item in enumerate(saved_responses):
    correct_hypothesis = item['hypothesis1'] if item['label'] == 0 else item['hypothesis2']
    print(f"Correct Hypothesis: hypothesis {item['label'] + 1}, {correct_hypothesis}")
    golds.append(item['label'])

    if is_response_correct(item['generated_response'], correct_hypothesis):
        print(f"Item {idx}: Correct")
        preds.append(item['label'])
    else:
        print(f"Item {idx}: Incorrect")
        preds.append(1 - item['label'])  # Assuming binary labels (0 and 1)

if not golds:
    raise ValueError("No responses to score: the dataset or the cached responses file is empty.")

# Calculate F1 score and accuracy
f1score = compute_f1_score(golds, preds)
accuracy = sum(1 for g, p in zip(golds, preds) if g == p) / len(golds)

# Print the results
print(f"F1 Score: {f1score:.2f}")
print(f"Accuracy: {accuracy:.2f}")

print('---')

sklearn_accuracy, f1val, precisionval, recallval = compute_metrics(golds, preds)
print(f"sklearn Accuracy: {sklearn_accuracy * 100:.3f}%")
print(f"F1: {f1val * 100:.3f}%")
print(f"Precision: {precisionval * 100:.3f}%")
print(f"Recall: {recallval * 100:.3f}%")