In [1]:
!pip install openai

Collecting openai
  Downloading openai-1.12.0-py3-none-any.whl.metadata (18 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.26.0-py3-none-any.whl.metadata (7.6 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.2-py3-none-any.whl.metadata (20 kB)
Downloading openai-1.12.0-py3-none-any.whl (226 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.7/226.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading httpx-0.26.0-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.9/75.9 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.2-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.9/76.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: httpcore, httpx, openai
Successfully installed httpcore-1.0.2 httpx-0.26.0 openai-1.12.0


In [2]:
from kaggle_secrets import UserSecretsClient
# Name of the Kaggle secret that holds the OpenAI API key.
secret_label = "openai_api"
# Look the key up through the secrets client instead of hard-coding it here.
secrets_client = UserSecretsClient()
secret_value = secrets_client.get_secret(secret_label)

In [3]:
import openai
from openai import OpenAI

# Register the API key on the openai module (supply your own OpenAI key here).
openai.api_key = secret_value

# Client object through which the chat-completion requests are sent.
client = OpenAI(api_key=openai.api_key)

In [4]:
# Index of the e-CARE data file to evaluate: it selects the input file
# ecare_<run>.json and the suffix of the saved ecare_responses_<run>.jsonl.
run = 3

In [5]:
import json
import random


def load_copa_data(file_path, sample_size=100000):
    """Load a JSON Lines dataset and optionally down-sample it.

    Args:
        file_path: Path to a UTF-8 text file holding one JSON document per
            line. Blank or whitespace-only lines are ignored.
        sample_size: Maximum number of records to return. ``None`` returns
            every record.

    Returns:
        list: All parsed records in file order when ``sample_size`` is
        ``None`` or at least the number of records; otherwise
        ``sample_size`` records drawn without replacement by
        ``random.sample``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        ValueError: If ``sample_size`` is negative (raised by
            ``random.sample``).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Skip blank lines (e.g. an empty line at the end of the file)
        # instead of letting json.loads fail on them.
        data = [json.loads(line) for line in file if line.strip()]

    if sample_size is None or sample_size >= len(data):
        return data

    return random.sample(data, sample_size)

# Format the COPA query for the GPT-3.5 Turbo model
def format_copa_query(item):
    """Build the user prompt for one two-choice causal-reasoning item.

    Args:
        item: Mapping with the keys ``'premise'``, ``'hypothesis1'``,
            ``'hypothesis2'`` and ``'ask-for'``.

    Returns:
        str: A prompt asking which hypothesis is the cause of the premise
        when ``item['ask-for']`` equals ``"cause"``, and which is the effect
        for any other value.
    """
    premise = item['premise']
    hypothesis1, hypothesis2 = item['hypothesis1'], item['hypothesis2']

    # Anything other than an explicit "cause" request is treated as "effect".
    if item['ask-for'] == "cause":
        task = "cause"
    else:
        task = "effect"

    prompt = f"Given the premise: {premise}; what is the {task}? Is it hypothesis 1: {hypothesis1}; OR hypothesis 2: {hypothesis2}? Pick one and do not explain."
    return prompt

# Send the query to the GPT-3.5 Turbo model and get the response
def query_gpt3_5_turbo(query):
    """Send one prompt to the ``gpt-3.5-turbo`` chat model.

    Args:
        query: Text sent as the user message.

    Returns:
        The object returned by ``client.chat.completions.create``,
        unmodified; callers extract the generated text themselves.
    """
    system_message = {"role": "system", "content": "You are a helpful assistant capable of causal discovery."}
    user_message = {"role": "user", "content": query}
    return client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[system_message, user_message],
    )

saved_responses = []


def _word_tokens(text):
    """Split ``text`` into lowercase alphanumeric tokens, dropping punctuation."""
    return ''.join(ch if ch.isalnum() else ' ' for ch in text.lower()).split()


def is_response_correct(response, correct_hypothesis, item):
    """Decide whether a model reply selects the correct hypothesis.

    The lower-cased reply is printed and appended, together with the item's
    fields, to the module-level ``saved_responses`` list.

    Scoring:
      1. If the reply names exactly one of "hypothesis 1" / "hypothesis 2"
         and ``correct_hypothesis`` equals exactly one of the item's two
         hypotheses, the named number must be the correct one. This lets a
         bare "Hypothesis 2" reply be scored, and stops a reply that picks
         the wrong option from passing on shared words.
      2. Otherwise, the reply counts as correct when more than half of the
         words of ``correct_hypothesis`` occur in it as whole words.

    Args:
        response: Chat-completion result; the text is read from
            ``response.choices[0].message.content`` (``None`` is treated as
            an empty reply).
        correct_hypothesis: Text of the hypothesis that should be chosen.
        item: Mapping with the keys ``'index'``, ``'premise'``, ``'ask-for'``,
            ``'hypothesis1'``, ``'hypothesis2'`` and ``'label'``.

    Returns:
        bool: ``True`` when the reply is judged to pick the correct
        hypothesis, ``False`` otherwise (including when the hypothesis has
        no words to match against).
    """
    # The API may return no text at all; treat that as an empty reply rather
    # than failing on None.lower().
    content = response.choices[0].message.content
    response_text = (content or "").lower()
    print(f'Generated Response: {response_text}')

    saved_responses.append({
        'index': item['index'],
        'premise': item['premise'],
        'task': item['ask-for'],
        'hypothesis1': item['hypothesis1'],
        'hypothesis2': item['hypothesis2'],
        'label': item['label'],
        'generated_response': response_text
    })

    # 1) Explicit choice: compare the hypothesis number named in the reply
    #    (whitespace-insensitive) with the number of the correct hypothesis.
    compact_reply = ''.join(response_text.split())
    chosen = {number for number in ('1', '2') if f'hypothesis{number}' in compact_reply}
    expected = {number for number in ('1', '2') if item[f'hypothesis{number}'] == correct_hypothesis}
    if len(chosen) == 1 and len(expected) == 1:
        return chosen == expected

    # 2) Fallback: whole-word overlap. Matching tokens (not substrings) keeps
    #    short words such as "he" from matching inside "the".
    key_tokens = _word_tokens(correct_hypothesis)
    if not key_tokens:
        # Nothing to match against; also avoids dividing by zero.
        return False
    response_tokens = set(_word_tokens(response_text))
    matched = sum(1 for token in key_tokens if token in response_tokens)

    return matched / len(key_tokens) > 0.5

# Read the e-CARE records from the data file selected by `run`.
copa_data = load_copa_data(f'/kaggle/input/e-care/ecare_{run}.json')

In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time

def compute_metrics(y_true, y_pred):
    """Score predictions against ground-truth labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        list: ``[accuracy, f1, precision, recall]``, each computed by the
        matching ``sklearn.metrics`` function called with its default
        settings.
    """
    scorers = (accuracy_score, f1_score, precision_score, recall_score)
    return [scorer(y_true, y_pred) for scorer in scorers]


def evaluate_accuracy(data):
    """Query the model for every item and return the fraction judged correct.

    For each item the prompt is built with ``format_copa_query``, sent with
    ``query_gpt3_5_turbo`` and judged with ``is_response_correct``.

    Side effects:
        * Prints progress and the verdict for every item.
        * Sleeps 0.5 s after each request.
        * Appends to the module-level lists ``acts`` (the item's label) and
          ``preds`` (the label when the reply is judged correct, otherwise
          the opposite 0/1 label). Both lists must exist before the call.

    Args:
        data: Sized iterable of items; each needs ``'label'``,
            ``'hypothesis1'`` and ``'hypothesis2'`` plus whatever the helper
            functions read.

    Returns:
        float: ``correct / len(data)``, or ``0.0`` when ``data`` is empty.
    """
    total = len(data)
    if total == 0:
        # Nothing to evaluate; return early instead of dividing by zero.
        return 0.0

    correct_count = 0
    for idx, item in enumerate(data, 1):
        print(f'Processing item: {idx}/{total}')
        query = format_copa_query(item)
        response_object = query_gpt3_5_turbo(query)
        # Short pause between consecutive API requests.
        time.sleep(0.5)
        # Label 0 designates hypothesis1, any other label hypothesis2.
        correct_hypothesis = item['hypothesis1'] if item['label'] == 0 else item['hypothesis2']
        print(f"Correct Hypothesis: Hypothesis: {item['label'] + 1}, {correct_hypothesis}")
        acts.append(item['label'])
        # Use the function to check if the response aligns with the correct hypothesis
        if is_response_correct(response_object, correct_hypothesis, item):
            print(f'Item {idx}: Correct')
            correct_count += 1
            preds.append(item['label'])
        else:
            print(f'Item {idx}: Incorrect')
            # A wrong answer in a two-choice task implies the other label.
            preds.append(0 if item['label'] else 1)

    return correct_count / total

# Run the evaluation. `acts` collects the ground-truth labels and `preds` the
# labels implied by each verdict; evaluate_accuracy appends to both.
acts = []
preds = []
accuracy = evaluate_accuracy(copa_data)

# Persist every recorded reply as one JSON document per line.
responses_file = f'/kaggle/working/ecare_responses_{run}.jsonl'
with open(responses_file, 'w', encoding='utf-8') as file:
    for response in saved_responses:
        serialized = json.dumps(response)
        file.write(serialized + '\n')
    print("Responses saved.")

# Report the fraction returned by evaluate_accuracy next to the sklearn metrics.
metric_values = compute_metrics(acts, preds)
sklearn_accuracy, f1val, precisionval, recallval = metric_values
print(f"Accuracy: {accuracy * 100:.3f}%")
print(f"sklearn Accuracy: {sklearn_accuracy * 100:.3f}%")
print(f"F1: {f1val * 100:.3f}%")
print(f"Precision: {precisionval * 100:.3f}%")
print(f"Recall: {recallval * 100:.3f}%")


Processing item: 1/1000
Correct Hypothesis: Hypothesis: 2, Megakaryocytes pinched off small cell fragments.
Generated Response: the cause is hypothesis 2: megakaryocytes pinched off small cell fragments.
Item 1: Correct
Processing item: 2/1000
Correct Hypothesis: Hypothesis: 1, Linda is against superstition.
Generated Response: hypothesis 1: linda is against superstition.
Item 2: Correct
Processing item: 3/1000
Correct Hypothesis: Hypothesis: 1, He experiences acute asthma.
Generated Response: hypothesis 1: he experiences acute asthma.
Item 3: Correct
Processing item: 4/1000
Correct Hypothesis: Hypothesis: 2, Bats send out sound waves and make sense of their environment from the echoes they receive back.
Generated Response: hypothesis 2: bats send out sound waves and make sense of their environment from the echoes they receive back.
Item 4: Correct
Processing item: 5/1000
Correct Hypothesis: Hypothesis: 2, Tom felt fear for a moment.
Generated Response: hypothesis 2: tom felt fear for 