# Naive GPT-4o-mini

## Preparation

In [1]:
!pip install --upgrade openai

Collecting openai
  Downloading openai-1.55.0-py3-none-any.whl.metadata (24 kB)
Downloading openai-1.55.0-py3-none-any.whl (389 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m389.5/389.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.54.4
    Uninstalling openai-1.54.4:
      Successfully uninstalled openai-1.54.4
Successfully installed openai-1.55.0


In [2]:
import time
import json
import os
from openai import OpenAI
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, recall_score

In [3]:
if not os.path.exists("data"):
  !pip install github-clone
  !ghclone https://github.com/yiw008/nondet-project/tree/main/data

Collecting github-clone
  Downloading github_clone-1.2.0-py3-none-any.whl.metadata (3.7 kB)
Collecting docopt>=0.6.2 (from github-clone)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading github_clone-1.2.0-py3-none-any.whl (9.1 kB)
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=c97f0826c59144f8a0ca61f240080d9d794b16348671c339f8e7f72b162ea7f0
  Stored in directory: /root/.cache/pip/wheels/fc/ab/d4/5da2067ac95b36618c629a5f93f809425700506f72c9732fac
Successfully built docopt
Installing collected packages: docopt, github-clone
Successfully installed docopt-0.6.2 github-clone-1.2.0
Cloning into 'data'...
done.


In [4]:
def str_to_bool(string):
  if string == "True":
    return True
  elif string == "False":
    return False
  return False

In [5]:
api_key = "" # TODO
os.environ['OPENAI_API_KEY'] = api_key

In [6]:
client = OpenAI()
our_model = "gpt-4o-mini-2024-07-18"

In [19]:
def test_project(project_name):
  print(f"Test project: {project_name}")
  test_set = []
  y_test = []
  with open(f"data/{project_name}/test_set.jsonl", "r") as file:
    for line in file:
      data = json.loads(line)
      test_set.append(data['messages'])
      y_test.append(str_to_bool(data['messages'][2]['content']))

  y_pred = []
  start = time.time()

  for i in range(len(test_set)):
    completion = client.chat.completions.create(
      model=our_model,
      messages=test_set[i]
    )
    y_pred.append(str_to_bool(completion.choices[0].message.content))

  end = time.time()
  print(f"Testing time: {end - start:.4f} seconds")

  print("Y_test:")
  print(y_test)
  print("Y_pred:")
  print(y_pred)

  confusion_matrix_res = confusion_matrix(y_test, y_pred, labels=[False, True])
  print("Confusion Matrix:")
  print(confusion_matrix_res)

  accuracy = accuracy_score(y_test, y_pred)
  print(f"Accuracy: {accuracy}")

  precision = precision_score(y_test, y_pred)
  print(f"Precision: {precision}")

  f1 = f1_score(y_test, y_pred)
  print(f"F1 Score: {f1}")

  recall = recall_score(y_test, y_pred)
  print(f"Recall: {recall}")

## Let's Go

In [20]:
test_project("Butter.MAS.PythonAPI")

Test project: Butter.MAS.PythonAPI
Testing time: 23.7184 seconds
Y_test:
[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
Y_pred:
[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
Confusion Matrix:
[[ 0  0]
 [ 0 53]]
Accuracy: 1.0
Precision: 1.0
F1 Score: 1.0
Recall: 1.0


In [21]:
test_project("centreon-sdk-python")

Test project: centreon-sdk-python
Testing time: 18.4172 seconds
Y_test:
[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
Y_pred:
[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
Confusion Matrix:
[[ 0  0]
 [ 0 41]]
Accuracy: 1.0
Precision: 1.0
F1 Score: 1.0
Recall: 1.0


In [22]:
test_project("cloudnetpy")

Test project: cloudnetpy
Testing time: 27.5286 seconds
Y_test:
[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
Y_pred:
[True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
Confusion Matrix:
[[ 0  0]
 [ 1 50]]
Accuracy: 0.9803921568627451
Precision: 1.0
F1 Score: 0.9900990099009901
Recall: 0.9803921568627451


In [24]:
test_project("crom")

Test project: crom
Testing time: 16.2622 seconds
Y_test:
[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
Y_pred:
[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
Confusion Matrix:
[[ 0  0]
 [ 2 36]]
Accuracy: 0.9473684210526315
Precision: 1.0
F1 Score: 0.972972972972973
Recall: 0.9473684210526315


In [25]:
test_project("easypy")

Test project: easypy
Testing time: 22.8664 seconds
Y_test:
[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
Y_pred:
[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True]
Confusion Matrix:
[[ 0  0]
 [ 1 43]]
Accuracy: 0.9772727272727273
Precision: 1.0
F1 Score: 0.9885057471264368
Recall: 0.9772727272727273


In [26]:
test_project("eppy")

Test project: eppy
Testing time: 34.0420 seconds
Y_test:
[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
Y_pred:
[True, True, True, True, True, True, True, True, False, False, False, False, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, False, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, T

In [23]:
test_project("reframe")

Test project: reframe
Testing time: 86.8873 seconds
Y_test:
[True, True, True, True, True, True, True, True, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, True, True, True, True, True, True, True, True, False, True, False, True, True, False, True, True, True, False, False, False, True, True, True, True, True, True, False, True, True, True, True, True, True, False, False, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, False, True, True, True, True, True, True, False, True, True, True, True, True, False, True, True, True, False, True, True, True, True, True, Tru

In [27]:
test_project("region_cache")

Test project: region_cache
Testing time: 20.0905 seconds
Y_test:
[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
Y_pred:
[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False]
Confusion Matrix:
[[36  1]
 [ 0  0]]
Accuracy: 0.972972972972973
Precision: 0.0
F1 Score: 0.0
Recall: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:
test_project("taxi")

Test project: taxi
Testing time: 32.3331 seconds
Y_test:
[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
Y_pred:
[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, Fa

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [29]:
test_project("wkr.py")

Test project: wkr.py
Testing time: 14.0638 seconds
Y_test:
[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
Y_pred:
[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
Confusion Matrix:
[[34  0]
 [ 0  0]]
Accuracy: 1.0
Precision: 0.0
F1 Score: 0.0
Recall: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
