In [1]:
# Mount Google Drive at /content/drive; later cells read the dataset from, and
# write the generated responses to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# !pip install -U -q transformers
!pip install -U -q accelerate
!pip install -U -q bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [4]:
# Run identifier; it is appended to the responses filename so each run number
# reads/writes its own .jsonl file.
run = 1

In [6]:
# Load the Mistral model and tokenizer, then wrap them in a text-generation pipeline.
model_name = 'mistralai/Mistral-7B-v0.1'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# trust_remote_code is deliberately not enabled: it allows Python code shipped in
# the model repository to run locally, and this checkpoint is presumably handled
# by the installed transformers' built-in Mistral support — TODO confirm on the
# target transformers version.
#
# NOTE(review): float32 stores 4 bytes per parameter, roughly twice the size of
# the ~14.5 GB of checkpoint shards downloaded for this model. With
# device_map="auto" anything that does not fit on the accelerator is placed
# elsewhere, which can make generation very slow; consider torch.float16 if the
# runtime is memory-constrained.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    device_map="auto",
)

# The model object is already instantiated and placed, so dtype/device_map are
# not repeated here; they only matter when the pipeline loads the model itself.
# EOS is reused as the padding id (presumably because the tokenizer defines no
# dedicated pad token — verify).
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [9]:
import json
import random
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def load_dataset(file_path, sample_size=100000, seed=None):
    """Load a dataset from a JSON file, optionally down-sampling it.

    Args:
        file_path: Path to a UTF-8 encoded JSON file. The top-level value must
            support ``len()`` and ``random.sample`` (presumably a list of
            records — verify against the data file).
        sample_size: Maximum number of items to return. ``None`` disables
            sampling and returns everything.
        seed: Optional seed for the sampler. When given, the same file and
            ``sample_size`` always yield the same sample. When ``None`` the
            module-level ``random`` generator is used, exactly as before.

    Returns:
        The parsed data unchanged (original order) when it has at most
        ``sample_size`` items, otherwise a list of ``sample_size`` items drawn
        without replacement.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    if sample_size is None or sample_size >= len(data):
        return data

    # A private Random instance keeps seeded sampling reproducible without
    # disturbing the global random state used elsewhere.
    rng = random.Random(seed) if seed is not None else random
    return rng.sample(data, sample_size)

def compute_f1_score(golds, preds):
    """Binary F1 score that treats label 1 as the positive class.

    Only pairs involving labels 0 and 1 contribute: (gold=1, pred=1) is a true
    positive, (gold=0, pred=1) a false positive and (gold=1, pred=0) a false
    negative. Any other label combination is ignored.

    Args:
        golds: Sequence of gold labels.
        preds: Sequence of predicted labels, aligned with ``golds``.

    Returns:
        The F1 score, or 0 when precision + recall is not positive.
    """
    true_pos = false_pos = false_neg = 0
    for gold, pred in zip(golds, preds):
        if pred == 1:
            if gold == 1:
                true_pos += 1
            elif gold == 0:
                false_pos += 1
        elif pred == 0 and gold == 1:
            false_neg += 1

    predicted_pos = true_pos + false_pos
    actual_pos = true_pos + false_neg

    # Guard each ratio so an empty denominator yields 0 instead of raising.
    precision = true_pos / predicted_pos if predicted_pos > 0 else 0
    recall = true_pos / actual_pos if actual_pos > 0 else 0

    if (precision + recall) > 0:
        return 2 * (precision * recall) / (precision + recall)
    return 0

def compute_metrics(y_true, y_pred):
    """Compute accuracy, F1, precision and recall with scikit-learn.

    F1, precision and recall are requested with ``average='weighted'``.

    Args:
        y_true: Gold labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        A list ``[accuracy, f1, precision, recall]``.
    """
    weighted = {'average': 'weighted'}
    return [
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, **weighted),
        precision_score(y_true, y_pred, **weighted),
        recall_score(y_true, y_pred, **weighted),
    ]

def is_response_correct(response_text, correct_hypothesis, threshold=0.5):
    """Check whether a generated response overlaps enough with the correct hypothesis.

    The hypothesis is lower-cased and split on whitespace; each resulting word
    is looked up in the lower-cased response with a substring test (so a short
    word also matches inside a longer one). The response counts as correct when
    the fraction of hypothesis words found is strictly greater than
    ``threshold``.

    Args:
        response_text: Text generated by the model.
        correct_hypothesis: Text of the gold answer.
        threshold: Minimum (exclusive) fraction of hypothesis words that must
            appear in the response. Defaults to 0.5.

    Returns:
        True if the match ratio exceeds ``threshold``; False otherwise,
        including when the hypothesis contains no words at all.
    """
    response_lower = response_text.lower()
    key_phrases = correct_hypothesis.lower().split()

    # An empty or whitespace-only hypothesis has nothing to match; returning
    # False here avoids dividing by zero below.
    if not key_phrases:
        return False

    matched = sum(1 for phrase in key_phrases if phrase in response_lower)
    return matched / len(key_phrases) > threshold


# Load the dataset
file_path = '/content/drive/MyDrive/Datasets/causalnetcd.json'  # Update this path
dataset = load_dataset(file_path)

# File to save the generated responses (one JSON object per line)
responses_file = '/content/drive/MyDrive/Datasets/mistral_causalnet_responses_' + str(run) + '.jsonl'
total_items = len(dataset)

# Reload whatever an earlier (possibly interrupted) run already produced.
try:
    with open(responses_file, 'r', encoding='utf-8') as file:
        saved_responses = [json.loads(line) for line in file if line.strip()]
    print("Loaded saved responses.")
except FileNotFoundError:
    saved_responses = []

# Only items whose 'index' is not in the responses file still need generation.
# Assumes 'index' is hashable and unique per dataset item — TODO confirm.
done_indices = {entry['index'] for entry in saved_responses}
pending = [
    (position, item)
    for position, item in enumerate(dataset)
    if item['index'] not in done_indices
]

# Generate predictions, appending each response to the file as soon as it is
# produced so an interrupted run keeps its progress and can be resumed.
if pending:
    with open(responses_file, 'a', encoding='utf-8') as file:
        for position, item in pending:
            print(f"\nProcessing item {position + 1}/{total_items}...")
            task = "cause for" if item["ask-for"] == "cause" else "effect of"
            query = f"{item['context']}; what is the {task}?: 1. {item['choice_id: 0']} OR 2. {item['choice_id: 1']} OR 3. {item['choice_id: 2']}? Answer:"
            # NOTE(review): per the transformers generation docs, max_length
            # bounds prompt + generated tokens together; consider
            # max_new_tokens if long prompts leave too little room to answer.
            response = pipe(query, max_length=150)[0]['generated_text']
            # The pipeline output starts with the prompt; keep only the continuation.
            if len(response) > len(query):
                response = response[len(query):]
            print(f'Generated Response: {response}')
            print(f'Correct Response: {item["choice_id: " + str(item["label"])]}')
            record = {
                'index': item['index'],
                'premise': item['context'],
                'hypothesis1': item['choice_id: 0'],
                'hypothesis2': item['choice_id: 1'],
                'hypothesis3': item['choice_id: 2'],
                'label': item['label'],
                'generated_response': response
            }
            saved_responses.append(record)
            file.write(json.dumps(record) + '\n')
            file.flush()
    print("Responses saved.")

# Calculate predictions, F1 score, and accuracy
preds, golds = [], []
for item in saved_responses:
    label = int(item['label'])
    # Saved records store the choices as hypothesis1..hypothesis3, so label 0
    # maps to 'hypothesis1', label 1 to 'hypothesis2', and so on.
    correct_hypothesis = item[f'hypothesis{label + 1}']
    if is_response_correct(item['generated_response'], correct_hypothesis):
        print('Correct')
        preds.append(label)
    else:
        print('Incorrect')
        # Record some label other than the gold one (gold 0 -> 1, gold 1 or 2 -> 0).
        preds.append(0 if label else 1)
    golds.append(label)

if golds:
    # NOTE: compute_f1_score counts only label 1 as the positive class, so with
    # three choices it is not a multi-class F1; the weighted sklearn metrics
    # printed below take every label into account.
    f1score = compute_f1_score(golds, preds)
    accuracy = sum(1 for g, p in zip(golds, preds) if g == p) / len(golds)

    print(f"F1 Score: {f1score:.2f}")
    print(f"Accuracy: {accuracy:.2f}")

    print('---')

    sklearn_accuracy, f1val, precisionval, recallval = compute_metrics(golds, preds)
    print(f"sklearn Accuracy: {sklearn_accuracy * 100:.3f}%")
    print(f"F1: {f1val * 100:.3f}%")
    print(f"Precision: {precisionval * 100:.3f}%")
    print(f"Recall: {recallval * 100:.3f}%")
else:
    # Nothing was loaded or generated; skip the metrics instead of dividing by zero.
    print("No responses to evaluate.")




Processing item 1/1000...
Generated Response:  2. The celebrity's influence changed teenagers' perceptions more significantly than expected.

10. A new policy in a country led to the unexpected decline in tobacco use among teenagers. At the same time
Correct Response: The celebrity's influence changed teenagers' perceptions more significantly than expected.

Processing item 2/1000...
Generated Response:  2.

10. A new law was passed that required all drivers to wear seat belts. The number of traffic accidents decreased by 20% in the following year. What is the cause for?: 1. The law was passed. OR 2. The law was passed and people started wearing seat belts. OR 3.
Correct Response: The road improved access to educational resources and opportunities.

Processing item 3/1000...


KeyboardInterrupt: 