In [1]:
!pip install -q -U google-generativeai

In [2]:
import pathlib
import textwrap

import google.generativeai as genai

# Used to securely store your API key
from google.colab import userdata

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  """Render ``text`` as a Markdown block quote for notebook display.

  Every '•' character is replaced with '  *' (a Markdown list marker), then
  every line -- blank lines included -- is prefixed with '> '.

  Args:
    text: String to render.

  Returns:
    The result of calling ``Markdown`` on the quoted text.
  """
  bulleted = text.replace('•', '  *')
  # Prefix each line unconditionally; keepends=True preserves the original
  # line terminators exactly.
  quoted = ''.join('> ' + line for line in bulleted.splitlines(True))
  return Markdown(quoted)



In [3]:
# Read the Gemini API key from Colab's secret store instead of embedding it in
# the notebook: a key written into the source is exposed to every reader of the
# file and of its version history.
# NOTE(review): the literal key that used to be hard-coded on this line should
# be treated as compromised and revoked.
# Requires a Colab secret named 'GOOGLE_API_KEY' with notebook access enabled.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)
gemini_model = genai.GenerativeModel('gemini-pro')

In [4]:
import pandas as pd
import json

# Fixed seed for the row shuffle, so the evaluation order (and therefore which
# rows are reached before the evaluation loop's item cap) is the same on every
# run.
SHUFFLE_SEED = 42

dataset = pd.read_csv("data_full_v1.csv")
# Binary target: 1 where the 'label' column equals 'yes', otherwise 0.
dataset['ground_truth'] = (dataset['label'] == 'yes').astype(int)
dataset = dataset.sample(frac=1, random_state=SHUFFLE_SEED)
# Round-trip through JSON to obtain a list of plain-Python dicts, one per row.
dataset = json.loads(dataset.to_json(orient='records'))

In [5]:
def format_query(item):
    """Return the text sent to the model for one dataset record.

    The record's 'prompt' entry is used as-is, with nothing appended.

    Args:
        item: Mapping with a 'prompt' entry.

    Returns:
        The prompt formatted as a string.
    """
    # Suffixes that were tried previously and are currently disabled:
    #   ". Yes or No?"
    #   ". No or Yes?"
    #   ". Output only single digit 0 or 1. 0 for no and 1 for yes. Output
    #    should have only single digit. Do not output any text other than a
    #    single digit"
    prompt = item['prompt']
    return f"{prompt}"

# Preview the prompt built for the first record of the dataset.
format_query(dataset[0])

'Imagine a self-contained, hypothetical world with only the following conditions, and without any unmentioned factors or causal relationships: Zory has a direct effect on zuph and glimx. Jyka has a direct effect on zuph. Zuph has a direct effect on glimx. Zory is unobserved.\nThe overall probability of zuph is 43%. The probability of not zuph and glimx is 27%. The probability of zuph and glimx is 17%.\nIs the chance of glimx smaller when observing zuph?'

In [6]:
def query_gemini(query, model):
    """Send ``query`` as the only message of a fresh chat and return the reply text.

    Args:
        query: Prompt to send.
        model: Object exposing ``start_chat(history=...)`` whose result has a
            ``send_message`` method.

    Returns:
        The ``text`` attribute of the object returned by ``send_message``.
    """
    # A new chat with empty history is started per call, so no context is
    # carried over between queries.
    return model.start_chat(history=[]).send_message(query).text

In [7]:
# Smoke test: send the first record's prompt to the model and show the raw reply.
query_gemini(format_query(dataset[0]), gemini_model)

'Yes, the chance of glimx is smaller when observing zuph.\n\nGiven the probability of Zuph (P(Z)) is 43%, the probability of not zuph (¬Z) is 1 - P(Z) = 57%.\n\nThe probability of glimx given zuph (P(G|Z)) can be calculated as follows:\nP(G|Z) = P(Z and G) / P(Z)\n= 0.17 / 0.43\n= 0.4\n\nThe probability of glimx given not zuph (P(G|¬Z)) can be calculated as follows:\nP(G|¬Z) = P(¬Z and G) / P(¬Z)\n= (0.27 - 0.17) / 0.57\n= 0.175\n\nTherefore, the chance of glimx when observing zuph (P(G|Z) = 0.4) is smaller than the chance of glimx when not observing zuph (P(G|¬Z) = 0.175).'

In [8]:
from tenacity import retry, stop_after_attempt, wait_random_exponential, RetryError
import re

# Module-level evaluation state shared by the evaluation functions below.
# Counters: number of correct predictions / number of records evaluated
# without error.
correct_count = processed_items = 0
# Records evaluated successfully, records whose evaluation failed, and the
# parallel lists of ground-truth and predicted labels.
processed, failed_items, y_true, y_pred = [], [], [], []

# Score a single dataset record against the model's yes/no answer.
# NOTE(review): stop_after_attempt(1) means a single attempt is made, so the
# exponential wait never takes effect; the decorator's practical role here is
# to surface any failure as tenacity's RetryError -- confirm this is intended.
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(1))
def evaluate_accuracy(item):
    """Query the model for one record and record whether its answer is correct.

    The reply is lower-cased and split into word tokens. It counts as a "yes"
    (1) when the token 'yes' occurs and 'no' does not, and as a "no" (0) in the
    opposite case.

    Side effects: appends to the module-level ``y_true`` / ``y_pred`` lists,
    increments the module-level ``correct_count`` on a correct prediction, and
    prints the prediction and the running count.

    Args:
        item: Dataset record with 'prompt' and 'ground_truth' entries.

    Raises:
        Exception: When the reply contains both or neither of 'yes' / 'no'
            (the token list is printed first).
    """
    global correct_count
    query = format_query(item)

    chat = gemini_model.start_chat(history=[])
    reply = chat.send_message(query).text

    # Drop a leading echo of the prompt, if the reply starts with one.
    reply = reply.removeprefix(query).lower()
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    tokens = [word for word in re.split(r'\W', reply) if word]

    # Tokens cannot contain punctuation after the split above, so only the
    # bare words need to be checked.
    has_yes = 'yes' in tokens
    has_no = 'no' in tokens
    if has_yes == has_no:
        # Both words or neither: the answer is ambiguous.
        print(tokens)
        raise Exception("failure")
    response = 1 if has_yes else 0

    print(response)
    correct_hypothesis = response == item['ground_truth']
    y_true.append(item['ground_truth'])
    y_pred.append(response)
    correct_count += correct_hypothesis
    print("correct_count is ", correct_count)


# Evaluate accuracy
def evaluate_accuracy_wrapper(data, max_items=1000):
    """Evaluate records from ``data`` and return the accuracy over evaluated items.

    Each record is passed to ``evaluate_accuracy``. Records that raise
    ``RetryError`` are collected in the module-level ``failed_items`` and do
    not count as processed; successful ones are appended to ``processed`` and
    increment the module-level ``processed_items`` counter. Because the
    counters are module-level, they accumulate across calls.

    Args:
        data: Iterable of dataset records.
        max_items: Stop once ``processed_items`` reaches this many successful
            evaluations; ``None`` disables the cap.

    Returns:
        ``correct_count / processed_items``, or 0.0 when nothing has been
        processed. The denominator is the number of records actually
        evaluated, not ``len(data)``, since the loop may stop early and failed
        records produce no prediction.
    """
    global processed_items
    for item in data:
        try:
            evaluate_accuracy(item)
        except RetryError:
            print("failed")
            failed_items.append(item)
            continue

        processed_items += 1
        processed.append(item)
        # '>=' rather than '==' so the cap still applies if the counter was
        # already at or past the limit when this call started.
        if max_items is not None and processed_items >= max_items:
            break
        # Report progress only when the counter has just advanced, so a run of
        # failures does not repeat the same line.
        if processed_items % 10 == 0:
            print('processed_items ', processed_items)
    print("correct_count in wrapper is ", correct_count)
    if processed_items == 0:
        return 0.0
    return correct_count / processed_items

# Run the evaluation over the dataset and print the returned accuracy as a
# percentage with two decimals.
accuracy = evaluate_accuracy_wrapper(dataset)
print(f"Accuracy: {accuracy * 100:.2f}%")

1
correct_count is  1
1
correct_count is  1
1
correct_count is  2
['the', 'information', 'provided', 'does', 'not', 'allow', 'for', 'a', 'definitive', 'conclusion', 'on', 'whether', 'yupt', 'negatively', 'affects', 'muvq', 'through', 'zupj', 'and', 'xyfo', 'the', 'provided', 'information', 'establishes', 'that', 'yupt', 'has', 'a', 'direct', 'effect', 'on', 'xyfo', 'and', 'zupj', 'zupj', 'has', 'a', 'direct', 'effect', 'on', 'muvq', 'and', 'xyfo', 'has', 'a', 'direct', 'effect', 'on', 'muvq', 'however', 'it', 'does', 'not', 'specify', 'the', 'nature', 'or', 'strength', 'of', 'these', 'effects', 'nor', 'does', 'it', 'provide', 'any', 'information', 'about', 'the', 'potential', 'indirect', 'effects', 'of', 'yupt', 'on', 'muvq', 'through', 'zupj', 'and', 'xyfo', 'additionally', 'the', 'provided', 'information', 'does', 'not', 'account', 'for', 'the', 'potential', 'influence', 'of', 'other', 'factors', 'that', 'may', 'also', 'affect', 'the', 'probability', 'of', 'muvq', 'therefore', 'it', 

In [9]:
# Persist the ground-truth labels, one per line.
with open("y_true.txt", "w") as f:
    f.writelines(str(label) + "\n" for label in y_true)

In [10]:
# Persist the predicted labels, one per line.
with open("y_pred.txt", "w") as f:
    f.writelines(str(label) + "\n" for label in y_pred)

In [11]:
# Persist the string form of every successfully evaluated record, one per line.
with open("processed.txt", "w") as f:
    f.writelines(str(record) + "\n" for record in processed)

In [12]:
from sklearn.metrics import classification_report
# Print scikit-learn's classification report for the collected predictions
# against the ground-truth labels.
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.41      0.50       496
           1       0.57      0.76      0.65       504

    accuracy                           0.59      1000
   macro avg       0.60      0.59      0.58      1000
weighted avg       0.60      0.59      0.58      1000

