In [1]:
!pip install openai

Collecting openai
  Downloading openai-1.12.0-py3-none-any.whl.metadata (18 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.26.0-py3-none-any.whl.metadata (7.6 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.2-py3-none-any.whl.metadata (20 kB)
Downloading openai-1.12.0-py3-none-any.whl (226 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.7/226.7 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.26.0-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.9/75.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.2-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.9/76.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: httpcore, httpx, openai
Successfully installed httpcore-1.0.2 httpx-0.26.0 openai-1.12.0


In [2]:
from kaggle_secrets import UserSecretsClient
# Name of the Kaggle user secret to look up.
secret_label = "openai_api"
# Fetch the secret's value through the Kaggle secrets client; a later cell
# uses it as the OpenAI API key.
secret_value = UserSecretsClient().get_secret(secret_label)

In [3]:
import openai
from openai import OpenAI

# Register the API key on the openai module, then build the explicit client
# from that same module-level value.
openai.api_key = secret_value  # replace with your own OpenAI key if not using Kaggle secrets

client = OpenAI(api_key=openai.api_key)

In [52]:
# Run number; it is embedded in the name of the saved-responses file.
run = 2

In [53]:
import json
import random


def load_copa_data(file_path, sample_size=100000):
    """Read a JSON dataset from disk, optionally down-sampling it at random.

    Args:
        file_path: Path of a UTF-8 encoded JSON file.
        sample_size: Maximum number of entries to return.

    Returns:
        The parsed JSON value unchanged when it holds ``sample_size`` entries
        or fewer; otherwise a list of ``sample_size`` entries drawn with
        ``random.sample`` (so the result depends on the ``random`` state).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    needs_sampling = sample_size < len(records)
    return random.sample(records, sample_size) if needs_sampling else records

# Build the three-way multiple-choice prompt for one dataset item
def format_copa_query(item):
    """Turn a dataset item into the prompt text sent to the chat model.

    Args:
        item: Mapping with the keys ``'context'``, ``'choice_id: 0'``,
            ``'choice_id: 1'``, ``'choice_id: 2'`` and ``'ask-for'``.

    Returns:
        A prompt stating the premise and the three hypotheses and asking for
        the "cause" when ``item['ask-for']`` equals ``"cause"``; any other
        value asks for the "effect".
    """
    premise, hypothesis1, hypothesis2, hypothesis3, ask_for = (
        item[key]
        for key in ('context', 'choice_id: 0', 'choice_id: 1', 'choice_id: 2', 'ask-for')
    )

    if ask_for == "cause":
        task = "cause"
    else:
        task = "effect"

    return f"Given the premise: {premise}; what is the {task}? Is it hypothesis 1: {hypothesis1}; OR hypothesis 2: {hypothesis2}; OR hypothesis 3: {hypothesis3}? Pick one and do not explain."

# Ask gpt-3.5-turbo one question and hand back the raw API response
def query_gpt3_5_turbo(query):
    """Send ``query`` as the user turn of a two-message chat completion.

    Args:
        query: Prompt text for the user message.

    Returns:
        The response object from ``client.chat.completions.create``, returned
        as-is; callers read the reply text from
        ``response.choices[0].message.content``.
    """
    system_turn = {"role": "system", "content": "You are a helpful assistant capable of causal discovery."}
    user_turn = {"role": "user", "content": query}
    return client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[system_turn, user_turn],
    )

# Every response scored by is_response_correct is also recorded here so that
# the evaluation cell can write the records to disk afterwards.
saved_responses = []

def is_response_correct(response, correct_hypothesis, item):
    """Record a model response and judge whether it matches the correct hypothesis.

    The reply text is lowercased, printed, and appended to ``saved_responses``
    together with the item's fields. The hypothesis is lowercased and split on
    whitespace; the reply counts as correct when more than half of those
    tokens occur in it.

    Args:
        response: Chat completion response; the reply is read from
            ``response.choices[0].message.content`` (``None`` is treated as
            an empty reply).
        correct_hypothesis: Text of the gold hypothesis.
        item: Dataset item with the keys ``'index'``, ``'context'``,
            ``'ask-for'``, ``'choice_id: 0'``..``'choice_id: 2'`` and ``'label'``.

    Returns:
        True when the match ratio is strictly greater than 0.5, else False.
        False when the hypothesis contains no tokens (previously this raised
        ZeroDivisionError).
    """
    content = response.choices[0].message.content
    # The API types the content as optional; fall back to an empty reply
    # instead of failing on None.lower().
    response_text = (content or "").lower()
    print(f'Generated Response: {response_text}')

    saved_responses.append({
        'index': item['index'],
        'premise': item['context'],
        'task': item['ask-for'],
        'hypothesis1': item['choice_id: 0'],
        'hypothesis2': item['choice_id: 1'],
        'hypothesis3': item['choice_id: 2'],
        'label': item['label'],
        'generated_response': response_text
    })

    key_phrases = correct_hypothesis.lower().split()
    if not key_phrases:
        # Nothing to compare against: an empty hypothesis cannot be matched.
        return False

    # NOTE: this is a substring test, not a whole-word test, so a short token
    # such as "the" also matches inside longer words such as "hypothesis".
    matching_count = sum(1 for phrase in key_phrases if phrase in response_text)
    match_ratio = matching_count / len(key_phrases)

    return match_ratio > 0.5

def is_response_correct_loaded(response_text, correct_hypothesis, item):
    """Judge whether an already-extracted response text matches the correct hypothesis.

    The hypothesis is lowercased and split on whitespace; the response counts
    as correct when more than half of those tokens occur in the lowercased
    response text.

    Args:
        response_text: Model reply as plain text (``None`` is treated as an
            empty reply). It is lowercased here so mixed-case text is judged
            the same way as in ``is_response_correct``.
        correct_hypothesis: Text of the gold hypothesis.
        item: Record the response belongs to; not used by the check.

    Returns:
        True when the match ratio is strictly greater than 0.5, else False.
        False when the hypothesis contains no tokens (previously this raised
        ZeroDivisionError).
    """
    response_lower = (response_text or "").lower()

    key_phrases = correct_hypothesis.lower().split()
    if not key_phrases:
        # Nothing to compare against: an empty hypothesis cannot be matched.
        return False

    # NOTE: this is a substring test, not a whole-word test, so a short token
    # such as "the" also matches inside longer words such as "hypothesis".
    matching_count = sum(1 for phrase in key_phrases if phrase in response_lower)
    match_ratio = matching_count / len(key_phrases)

    return match_ratio > 0.5

# Load the dataset from the causalnetcd Kaggle input, using load_copa_data's
# default sample size.
copa_data = load_copa_data('/kaggle/input/causalnetcd/causalnetcd.json')

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time

def compute_metrics(y_true, y_pred):
    """Compute accuracy plus weighted F1, precision and recall.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        A list ``[accuracy, f1, precision, recall]``; the last three use
        ``average='weighted'``.
    """
    weighted = {'average': 'weighted'}
    return [
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, **weighted),
        precision_score(y_true, y_pred, **weighted),
        recall_score(y_true, y_pred, **weighted),
    ]

# JSON Lines file holding the saved model responses; the run number is part
# of the file name.
responses_file = '/kaggle/working/causalnetcd_responses_' + str(run) + '.jsonl'

def _load_cached_responses(path):
    """Return the records stored in the JSON Lines file at ``path`` ([] if the file is missing)."""
    try:
        with open(path, 'r', encoding='utf-8') as file:
            # Skip blank lines so a trailing newline cannot break parsing.
            return [json.loads(line) for line in file if line.strip()]
    except FileNotFoundError:
        return []


def _score_cached_responses(records):
    """Re-score saved response records, appending to ``golds``/``preds``; return the fraction correct."""
    correct_count = 0
    for idx, record in enumerate(records, 1):
        label = record['label']
        # Saved records store the choices as hypothesis1..hypothesis3.
        correct_hypothesis = record['hypothesis' + str(label + 1)]
        golds.append(label)
        if is_response_correct_loaded(record['generated_response'], correct_hypothesis, record):
            print(f'Item {idx}: Correct')
            correct_count += 1
            preds.append(label)
        else:
            print(f'Item {idx}: Incorrect')
            # Same placeholder as the live path: any label other than the gold one.
            preds.append(0 if label else 1)
    return correct_count / len(records)


def evaluate_accuracy(data):
    """Score the model and return the fraction of items judged correct.

    When ``responses_file`` already holds saved responses they are re-scored
    with ``is_response_correct_loaded`` and the API is not called; note that
    in this case the saved records are scored, not ``data``. Otherwise every
    item of ``data`` is sent to the model and checked with
    ``is_response_correct``. A missing or empty cache file counts as "no cache".

    Both paths append to the module-level ``golds`` and ``preds`` lists.
    Previously the cached path returned the list of records, which the caller
    then tried to format as a number.

    Args:
        data: Dataset items with ``'label'`` and ``'choice_id: N'`` keys.

    Returns:
        A float in [0, 1]; 0.0 when there is nothing to evaluate.
    """
    cached_responses = _load_cached_responses(responses_file)
    if cached_responses:
        print("Loaded saved responses.")
        return _score_cached_responses(cached_responses)

    total = len(data)
    if total == 0:
        # Avoid dividing by zero when there is nothing to evaluate.
        return 0.0

    correct_count = 0
    for idx, item in enumerate(data, 1):
        print(f'Processing item: {idx}/{total}')
        query = format_copa_query(item)
        response_object = query_gpt3_5_turbo(query)
        time.sleep(0.5)  # short pause between consecutive API requests
        correct_hypothesis = item['choice_id: ' + str(item['label'])]
        print(f"Correct Hypothesis: Hypothesis: {item['label'] + 1}, {correct_hypothesis}")
        golds.append(item['label'])
        # Use the function to check if the response aligns with the correct hypothesis
        if is_response_correct(response_object, correct_hypothesis, item):
            print(f'Item {idx}: Correct')
            correct_count += 1
            preds.append(item['label'])
        else:
            print(f'Item {idx}: Incorrect')
            # The reply is not parsed into a choice number, so a placeholder
            # label that differs from the gold one is recorded
            # (0 for gold 1 or 2, 1 for gold 0).
            preds.append(0 if item['label'] else 1)

    return correct_count / total

# Evaluate accuracy

golds, preds = [], []  # golds: ground-truth labels; preds: labels credited to the model

# Evaluate first and save afterwards. Writing the responses file before the
# evaluation ran created an empty cache, which the evaluation then loaded
# instead of querying the model.
responses_before = len(saved_responses)
result = evaluate_accuracy(copa_data)
new_responses = saved_responses[responses_before:]

if isinstance(result, list):
    # A cache hit may hand back the saved records rather than a score;
    # re-score them here so the metrics below have data.
    correct_count = 0
    for idx, item in enumerate(result, 1):
        correct_hypothesis = item['hypothesis' + str(item['label'] + 1)]
        golds.append(item['label'])
        if is_response_correct_loaded(item['generated_response'], correct_hypothesis, item):
            print(f'Item {idx}: Correct')
            correct_count += 1
            preds.append(item['label'])
        else:
            print(f'Item {idx}: Incorrect')
            # Placeholder wrong label: any label other than the gold one.
            preds.append(0 if item['label'] else 1)
    accuracy = correct_count / len(result) if result else 0.0
else:
    accuracy = result

# Only write the file when this run produced new responses, so an existing
# cache is never truncated or overwritten with stale in-kernel data.
if new_responses:
    with open(responses_file, 'w', encoding='utf-8') as file:
        for response in new_responses:
            file.write(json.dumps(response) + '\n')
    print("Responses saved.")

if golds:
    sklearn_accuracy, f1val, precisionval, recallval = compute_metrics(golds, preds)
    print(f"Accuracy: {accuracy * 100:.3f}%")
    print(f"sklearn Accuracy: {sklearn_accuracy * 100:.3f}%")
    print(f"F1: {f1val * 100:.3f}%")
    print(f"Precision: {precisionval * 100:.3f}%")
    print(f"Recall: {recallval * 100:.3f}%")
else:
    print("No items were evaluated; if the responses file exists but is empty, delete it and re-run.")


Processing item: 1/1000
Correct Hypothesis: Hypothesis: 2, The celebrity's influence changed teenagers' perceptions more significantly than expected.
Generated Response: hypothesis 3: online educational content was the sole reason for this decline.
Item 1: Incorrect
Processing item: 2/1000
Correct Hypothesis: Hypothesis: 2, The road improved access to educational resources and opportunities.
Generated Response: hypothesis 2: the road improved access to educational resources and opportunities.
Item 2: Correct
Processing item: 3/1000
Correct Hypothesis: Hypothesis: 3, Improved agricultural efficiency made rural living more sustainable, reducing the need for urban migration.
Generated Response: hypothesis 3: improved agricultural efficiency made rural living more sustainable, reducing the need for urban migration.
Item 3: Correct
Processing item: 4/1000
Correct Hypothesis: Hypothesis: 2, The rise in public transportation usage reduced the number of cars on the road.
Generated Response: hy