# Naive GPT-4o-mini

## Preparation

In [None]:
!pip install --upgrade openai

Collecting openai
  Downloading openai-1.55.3-py3-none-any.whl.metadata (24 kB)
Downloading openai-1.55.3-py3-none-any.whl (389 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m389.6/389.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.54.4
    Uninstalling openai-1.54.4:
      Successfully uninstalled openai-1.54.4
Successfully installed openai-1.55.3


In [None]:
import getpass
import json
import os
import time

from openai import OpenAI
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, recall_score

In [None]:
# Fetch the dataset only when no local "data" path exists yet, so re-running
# this cell does not download it again.
if not os.path.exists("data"):
  # Presumably github-clone is what provides the `ghclone` command used below.
  !pip install github-clone
  # Clone only the data/ sub-directory of the repository.
  !ghclone https://github.com/yiw008/nondet-project/tree/main/data

Collecting github-clone
  Downloading github_clone-1.2.0-py3-none-any.whl.metadata (3.7 kB)
Collecting docopt>=0.6.2 (from github-clone)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading github_clone-1.2.0-py3-none-any.whl (9.1 kB)
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=d06029fe79b7029befea440e4bd445d1ce620ad99992c1f52541b54264cdd673
  Stored in directory: /root/.cache/pip/wheels/fc/ab/d4/5da2067ac95b36618c629a5f93f809425700506f72c9732fac
Successfully built docopt
Installing collected packages: docopt, github-clone
Successfully installed docopt-0.6.2 github-clone-1.2.0
Cloning into 'data'...
done.


In [None]:
def str_to_bool(string):
  """Convert a textual "True"/"False" answer into a bool.

  Surrounding whitespace and letter case are ignored, so a reply such as
  "true" or "True" followed by a newline still counts as True. Everything
  else yields False: "False", unrecognised text, and non-string values
  such as None.

  Args:
    string: The text to interpret (typically a label or a model reply).

  Returns:
    True only when the text is "true" after trimming and lower-casing.
  """
  # A reply can be missing altogether (None); treat that as a negative answer
  # instead of raising on .strip().
  if not isinstance(string, str):
    return False
  return string.strip().lower() == "true"

In [None]:
# Never hard-code the key in the notebook: reuse OPENAI_API_KEY when it is
# already set in the environment, otherwise prompt for it without echoing.
api_key = os.environ.get('OPENAI_API_KEY') or getpass.getpass("OpenAI API key: ")
os.environ['OPENAI_API_KEY'] = api_key

In [None]:
# Model snapshot that every evaluation request is sent to.
our_model = "gpt-4o-mini-2024-07-18"
# Shared API client, built with no explicit arguments
# (presumably it picks up the OPENAI_API_KEY environment variable set earlier).
client = OpenAI()

In [None]:
def test_project(project_name):
  """Evaluate the model on one project's test set; print and return its metrics.

  Reads ``data/{project_name}/test_set.jsonl`` (one JSON object per line, each
  with a ``messages`` list), requests one completion per example from
  ``our_model`` through the global ``client``, converts replies to booleans
  with ``str_to_bool`` and scores them against the expected labels.

  Args:
    project_name: Name of the sub-directory of ``data/`` holding the test set.

  Returns:
    A tuple ``(accuracy, precision, f1, recall)``.
  """
  print(f"Test project: {project_name}")
  # The expected answer is stored as the third message of every example.
  label_index = 2
  test_set = []
  y_test = []
  with open(f"data/{project_name}/test_set.jsonl", "r", encoding="utf-8") as file:
    for line in file:
      if not line.strip():
        continue  # tolerate blank lines such as a trailing newline
      messages = json.loads(line)['messages']
      # Send only the messages that precede the label. Passing the full list
      # would put the expected answer into the prompt and leak it to the model.
      test_set.append(messages[:label_index])
      y_test.append(str_to_bool(messages[label_index]['content']))

  y_pred = []
  start = time.time()

  for prompt_messages in test_set:
    completion = client.chat.completions.create(
      model=our_model,
      messages=prompt_messages
    )
    y_pred.append(str_to_bool(completion.choices[0].message.content))

  end = time.time()
  print(f"Testing time: {end - start:.4f} seconds")

  print("Y_test:")
  print(y_test)
  print("Y_pred:")
  print(y_pred)

  # Fix the label order so the matrix layout is the same for every project.
  confusion_matrix_res = confusion_matrix(y_test, y_pred, labels=[False, True])
  print("Confusion Matrix:")
  print(confusion_matrix_res)

  accuracy = accuracy_score(y_test, y_pred)
  print(f"Accuracy: {accuracy}")

  precision = precision_score(y_test, y_pred)
  print(f"Precision: {precision}")

  f1 = f1_score(y_test, y_pred)
  print(f"F1 Score: {f1}")

  recall = recall_score(y_test, y_pred)
  print(f"Recall: {recall}")

  return accuracy, precision, f1, recall

## Let's Go

In [None]:
# One slot per evaluated project (10 in total); every metric gets its own list.
accuracy_values, precision_values, f1_values, recall_values = (
  [0] * 10 for _ in range(4)
)

In [None]:
# Evaluate Butter.MAS.PythonAPI and keep its metrics in slot 0.
(accuracy_values[0],
 precision_values[0],
 f1_values[0],
 recall_values[0]) = test_project("Butter.MAS.PythonAPI")

Test project: Butter.MAS.PythonAPI
Testing time: 26.0813 seconds
Y_test:
[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, False, True, True, True, False, True, False, False, False, True, True, False, True, True, True, False, False, True, False]
Y_pred:
[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, False, True, True, True, False, True, False, False, False, True, True, False, True, True, True, False, False, True, False]
Confusion Matrix:
[[11  0]
 [ 0 53]]
Accuracy: 1.0
Precision: 1.0
F1 Score: 1.0
Recall: 1.0


In [None]:
# Evaluate flask-multi-redis and keep its metrics in slot 1.
(accuracy_values[1],
 precision_values[1],
 f1_values[1],
 recall_values[1]) = test_project("flask-multi-redis")

Test project: flask-multi-redis
Testing time: 13.7050 seconds
Y_test:
[True, True, True, True, True, True, True, True, True, False, True, True, False, True, True, True, False, False, False, True, True, True, True, False, True, True, True, True, False, True, True, True, True]
Y_pred:
[False, True, True, True, True, True, False, True, True, False, True, True, False, True, True, True, False, False, False, False, True, True, True, False, True, True, True, True, False, True, True, True, True]
Confusion Matrix:
[[ 7  0]
 [ 3 23]]
Accuracy: 0.9090909090909091
Precision: 1.0
F1 Score: 0.9387755102040817
Recall: 0.8846153846153846


In [None]:
# Evaluate centreon-sdk-python and keep its metrics in slot 2.
(accuracy_values[2],
 precision_values[2],
 f1_values[2],
 recall_values[2]) = test_project("centreon-sdk-python")

Test project: centreon-sdk-python
Testing time: 18.2732 seconds
Y_test:
[True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
Y_pred:
[True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
Confusion Matrix:
[[ 1  0]
 [ 0 41]]
Accuracy: 1.0
Precision: 1.0
F1 Score: 1.0
Recall: 1.0


In [None]:
# Evaluate cloudnetpy and keep its metrics in slot 3.
(accuracy_values[3],
 precision_values[3],
 f1_values[3],
 recall_values[3]) = test_project("cloudnetpy")

Test project: cloudnetpy
Testing time: 195.5107 seconds
Y_test:
[False, True, True, True, True, True, False, True, True, False, False, False, False, False, True, False, False, False, True, True, False, False, True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, True, True, False, False, False, False, True, True, False, True, False, False, False, False, False, True, False, False, False, False, True, True, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, True, False, False, False, True, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, T

In [None]:
# Evaluate crom and keep its metrics in slot 4.
(accuracy_values[4],
 precision_values[4],
 f1_values[4],
 recall_values[4]) = test_project("crom")

Test project: crom
Testing time: 35.8987 seconds
Y_test:
[True, False, False, False, True, False, True, False, True, False, True, False, False, False, True, False, False, True, True, False, True, False, False, False, True, True, False, False, True, True, False, False, False, True, True, False, True, True, True, False, True, False, True, False, True, True, False, False, True, True, False, False, False, True, True, False, False, True, True, False, False, False, True, False, False, True, False, False, False, False, False, False, False, True, True, True, False, True, True, True, False, True, False, True]
Y_pred:
[True, False, False, False, False, False, True, False, True, False, True, False, False, False, True, False, False, True, True, False, True, False, False, False, True, True, False, False, True, True, False, False, False, True, False, False, True, False, True, False, True, False, True, False, True, True, False, False, True, True, False, False, False, True, True, False, False, True, T

In [None]:
# Evaluate easypy and keep its metrics in slot 5.
(accuracy_values[5],
 precision_values[5],
 f1_values[5],
 recall_values[5]) = test_project("easypy")

Test project: easypy
Testing time: 134.3147 seconds
Y_test:
[False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, True, True, True, False, False, True, False, True, False, False, True, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, True, False, True, False, False, False, True, True, False, True, True, False, False, True, True, False, False, False, True, True, False, False, False, True, False, False, False, True, False, False, False, True, True, False, False, False, True, False, False, False, False, False, False, True, False, False, False, True, False, False, True, False, True, False, False, False, False, True, False, False, False, False, False, False, True, False, True, True, False, True, False, False, False, False, False, False, False, False, False, False, False, F

In [None]:
# Evaluate eppy and keep its metrics in slot 6.
(accuracy_values[6],
 precision_values[6],
 f1_values[6],
 recall_values[6]) = test_project("eppy")

Test project: eppy
Testing time: 93.7987 seconds
Y_test:
[True, True, False, True, False, True, False, True, False, False, False, True, False, False, True, False, False, True, False, True, True, False, False, False, False, False, True, False, False, False, True, False, False, True, True, True, True, True, False, False, False, True, True, True, False, False, False, True, False, False, True, True, False, False, True, True, False, True, True, False, True, True, False, False, False, False, False, False, True, True, True, False, True, False, True, False, False, False, True, True, False, False, True, False, True, False, False, True, False, True, False, True, False, False, False, True, True, False, True, False, True, True, True, False, False, False, False, False, True, False, True, True, False, True, True, False, False, False, True, True, True, False, True, False, False, True, True, False, False, False, False, False, False, False, False, True, False, False, False, False, True, True, True, Tru

In [None]:
# Evaluate pykicad and keep its metrics in slot 7.
(accuracy_values[7],
 precision_values[7],
 f1_values[7],
 recall_values[7]) = test_project("pykicad")

Test project: pykicad
Testing time: 21.3315 seconds
Y_test:
[False, False, False, True, False, True, True, True, False, True, True, False, True, True, False, False, False, False, True, True, True, True, True, True, False, True, False, True, False, False, True, True, True, False, False, False, True, True, True, False, True, True, True, True, True, True, False, False]
Y_pred:
[False, False, False, True, False, False, True, True, False, False, False, False, False, False, False, False, False, False, True, False, True, True, True, True, False, True, False, True, False, False, True, True, False, False, False, False, True, False, True, False, False, True, True, True, True, True, False, False]
Confusion Matrix:
[[20  0]
 [ 9 19]]
Accuracy: 0.8125
Precision: 1.0
F1 Score: 0.8085106382978723
Recall: 0.6785714285714286


In [None]:
# Evaluate reframe and keep its metrics in slot 8.
(accuracy_values[8],
 precision_values[8],
 f1_values[8],
 recall_values[8]) = test_project("reframe")

Test project: reframe
Testing time: 318.7577 seconds
Y_test:
[False, True, False, False, False, False, True, False, True, False, False, False, False, False, False, False, True, True, False, True, False, False, False, False, False, False, False, False, True, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, True, False, True, False, False, False, False, False, False, False, True, False, True, True, False, False, False, False, False, True, True, True, True, True, False, False, False, False, False, False, False, False, False, True, False, True, False, True, False, True, False, True, True, True, False, False, False, False, False, False, False, False, False, False, False, True, True, False, True, False, False, False, False, False, True, False, True, False, False, False, True, False, True, True, False, False, False, False, True, True, False, False, False, True, False, False, False, False, Fal

In [None]:
# Evaluate webssh and keep its metrics in slot 9.
(accuracy_values[9],
 precision_values[9],
 f1_values[9],
 recall_values[9]) = test_project("webssh")

Test project: webssh
Testing time: 34.8538 seconds
Y_test:
[True, False, False, True, False, False, False, False, True, True, False, False, True, True, False, True, False, False, True, False, False, False, False, False, True, False, True, True, False, True, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, True, False, False, True, False, True, True, False, False, False, False, False, False, True, False, False, True, False, False, False, True, True, False, True, False, False, False, False, False, False, True, False, False, False, True, True, False, True, False, True]
Y_pred:
[True, False, True, True, False, False, False, False, True, True, False, False, True, True, False, True, False, False, True, False, False, True, False, False, True, False, True, True, False, True, False, False, False, True, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, True, False,

In [None]:
# Unweighted mean of each metric across all project slots.
avg_accuracy, avg_precision, avg_f1, avg_recall = (
  sum(scores) / len(scores)
  for scores in (accuracy_values, precision_values, f1_values, recall_values)
)

# Report the four averages, one per line.
print(f"Average Accuracy: {avg_accuracy}")
print(f"Average Precision: {avg_precision}")
print(f"Average F1 Score: {avg_f1}")
print(f"Average Recall: {avg_recall}")

Average Accuracy: 0.9510782469966192
Average Precision: 0.9723057160566737
Average F1 Score: 0.9428149065932324
Average Recall: 0.921176531734875
