<a href="https://colab.research.google.com/github/yiw008/nondet-project/blob/main/Go_Through_IPFlakies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import os
import requests
import csv
import random
from copy import deepcopy
import json

In [2]:
# Zenodo download URL of the iPFlakies `Test_Status.csv` file; the next cell
# passes it straight to `pd.read_csv`.
url = "https://zenodo.org/records/6176417/files/Test_Status.csv"

In [3]:
# Download and parse the CSV into a DataFrame (this performs a network request).
# The crawl loop further down reads the Project_Name, Project_URL, Project_Hash,
# Test_id and Detected columns of each row.
data = pd.read_csv(url)

In [4]:
# Sanity check: show the first five rows as plain text to confirm the download
# worked and the expected columns are present.
print(data.head())

  Project_Name                               Project_URL  \
0   BT-Tracker  https://github.com/nordwind80/BT-Tracker   
1      Breathe         https://github.com/mrob95/Breathe   
2      Breathe         https://github.com/mrob95/Breathe   
3      Breathe         https://github.com/mrob95/Breathe   
4      Breathe         https://github.com/mrob95/Breathe   

                               Project_Hash  \
0  558c15b399871c1ca11d0c4ae1eb598e3060931e   
1  4600818e24f4156cd7bb8cc0f43886b27323968e   
2  4600818e24f4156cd7bb8cc0f43886b27323968e   
3  4600818e24f4156cd7bb8cc0f43886b27323968e   
4  4600818e24f4156cd7bb8cc0f43886b27323968e   

                                             Test_id  Detected  Have_Patch  \
0  Tracker/tests/test_event.py::TestEvent::test_o...      True        True   
1  tests/test_command_context.py::test_manual_con...      True       False   
2                tests/test_loading.py::test_loading      True        True   
3        tests/test_loading.py::test_loading

In [5]:
def match_class(content, class_name):
  start_index = content.find(f"class {class_name}")
  if start_index == -1:
    return None
  return content[start_index:]

In [6]:
def match_method(content, test_name):
  start_index = content.find(f"def {test_name}")
  if start_index == -1:
    return None

  end_index = -1
  next_def_index = content.find("def ", start_index + len(test_name))
  main_block_index = content.find("if __name__ == '__main__':", start_index)

  if next_def_index != -1:
    end_index = next_def_index
  elif main_block_index != -1:
    end_index = main_block_index
  else:
    end_index = len(content)

  return content[start_index:end_index]

In [7]:
def remove_square_brackets(test_name):
  """Drop everything from the first '[' onward from `test_name`.

  The name is only shortened when it contains both '[' and ']' somewhere;
  otherwise it is returned unchanged.
  """
  has_both_brackets = '[' in test_name and ']' in test_name
  if not has_both_brackets:
    return test_name
  return test_name[:test_name.index('[')]

In [8]:
def read_python_file(url, timeout=30):
  """Download a text file over HTTP and return its contents.

  Args:
    url: Address of the file to fetch.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout a single stalled connection would block the whole crawl.

  Returns:
    The response body as text, or None if the request failed (connection
    error, timeout, or a non-2xx status). Failures are printed, not raised.
  """
  try:
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Turn HTTP error statuses into exceptions
    return response.text
  except requests.exceptions.RequestException as e:
    # Timeouts are RequestException subclasses, so they are reported here too.
    print(f"Error fetching the file: {e}")
    return None

In [9]:
def special_cases(project_name, repo_url, commit_hash, test_file_path, class_name, test_name):
  """Apply hand-written, per-project overrides to a test's lookup coordinates.

  Each rule redirects the lookup to a different file, class, function name or
  repository URL for one specific project (and usually one specific commit).
  Presumably these are cases where the test body lives somewhere other than
  the location named in the test id, e.g. in a shared base class; confirm
  against the repositories. Inputs that match no rule pass through unchanged,
  apart from a trailing '.git' being stripped from `repo_url`.

  Args:
    project_name: Project name from the dataset.
    repo_url: Repository URL.
    commit_hash: Commit the test id refers to.
    test_file_path: Repository-relative path of the test file.
    class_name: Test class name, or '' for a module-level test.
    test_name: Test function name.

  Returns:
    A 6-tuple (project_name, repo_url, commit_hash, test_file_path,
    class_name, test_name) with any overrides applied.
  """
  if project_name == 'Butter.MAS.PythonAPI' and commit_hash == 'f86ebe75df3826f62a268645cdbe4400b43fab07' and test_file_path.startswith('butter/mas/tests/clients/'):
    test_file_path = 'butter/mas/tests/clients/client_test.py'
    class_name = ''

  if project_name == 'SNData' and commit_hash == 'e4854f0dc357484b437b15f9dac15f7c589eff58' and (class_name == 'Sako18Parsing' or class_name == 'DR1Parsing'):
    test_file_path = 'tests/data_parsing_template_tests.py'
    class_name = ''
    if test_name == 'test_standard_column_names':
      class_name = 'PhotometricDataParsing'

  if project_name == 'bootstrap_env' and commit_hash == 'ab68025d8f6b9a17d8feeed83e8aae26e3f28769' and test_file_path.startswith('bootstrap_env/tests/'):
    test_file_path = 'bootstrap_env/tests/base.py'

  # Raw-file URLs are built from the repository URL, so drop a '.git' suffix.
  if repo_url.endswith('.git'):
    repo_url = repo_url[:-4]

  if project_name == 'data-pypes' and test_name == 'pypes.logsetup.get_logconfig':
    test_name = 'get_logconfig'

  if project_name == 'elife-tools':
    # Strip the generated suffix only when it is present. Slicing with the -1
    # that find() returns on a miss would drop the last character of the name.
    suffix_index = test_name.find('_1_elife_02833_v2_xml')
    if suffix_index != -1:
      test_name = test_name[:suffix_index]

  if project_name == 'noipy':
    repo_url = 'https://github.com/pv8/noipy'

  if project_name == 'pymq' and commit_hash == '101857bca2b705c328d3bda3b26797b51e8ffb70':
    if class_name == 'SimplePubSubTest':
      test_file_path = 'tests/base/pubsub.py'
      class_name = 'AbstractPubSubTest'
    if class_name == 'IpcQueueTest':
      test_file_path = 'tests/base/queue.py'
      class_name = 'AbstractQueueTest'
    if class_name == 'IpcRpcTest' or class_name == 'SimpleRpcTest':
      test_file_path = 'tests/base/rpc.py'
      class_name = 'AbstractRpcTest'

  if project_name == 'pyswarms' and commit_hash == '08756526f39699eef28e515cac2ead17cef55710' and class_name == 'TestLocalBestOptimizer' and test_name == 'test_obj_with_kwargs':
    test_file_path = 'tests/optimizers/abc_test_optimizer.py'
    class_name = 'ABCTestOptimizer'

  if project_name == 'python-openflow' and commit_hash == 'a3387a7b28d529a3605aa1506a028e03394e4526' and class_name == 'TestFlowMod' and test_name == 'test_minimum_size':
    test_file_path = 'tests/unit/test_struct.py'
    class_name = 'TestStruct'

  return project_name, repo_url, commit_hash, test_file_path, class_name, test_name

In [10]:
# Crawl every dataset row: download the test file at the recorded commit,
# cut out the test function's source, and collect one record per test.
all_test_methods = []
project_to_num_tests = dict()
print('The indices below corresponds to the row number in the csv file shown on https://sites.google.com/view/ipflakies.')

for index, row in data.iterrows():
  # Offset used in all messages; presumably header line + 1-based numbering.
  row_in_csv = index + 2
  project_name, repo_url, commit_hash = row['Project_Name'], row['Project_URL'], row['Project_Hash']

  # Test ids look like "path::Class::test" or "path::test".
  id_parts = row['Test_id'].split('::')
  test_file_path = id_parts[0]
  if len(id_parts) > 2:
    class_name, test_name = id_parts[1], id_parts[2]
  else:
    class_name, test_name = '', id_parts[1]
  test_name = remove_square_brackets(test_name)
  detected = row['Detected']

  # Keep the coordinates as written in the dataset; special_cases may redirect
  # the lookup, but the stored record reports these original values.
  test_file_path_rec, class_name_rec, test_name_rec = test_file_path, class_name, test_name

  project_name, repo_url, commit_hash, test_file_path, class_name, test_name = special_cases(project_name, repo_url, commit_hash, test_file_path, class_name, test_name)

  new_url = repo_url.replace('github.com', 'raw.githubusercontent.com') + '/' + commit_hash + '/' + test_file_path

  file_text = read_python_file(new_url)
  if file_text is None:
    print(f"Row {row_in_csv}: {repo_url}/blob/{commit_hash}/{test_file_path}, Error fetching the file\n")
    continue

  # Narrow the search to the class (when the test id names one).
  search_scope = file_text
  if class_name != '':
    search_scope = match_class(file_text, class_name)
    if search_scope is None:
      print(f"Row {row_in_csv}: In {repo_url}/blob/{commit_hash}/{test_file_path}, Class '{class_name}' not found.")
      continue

  # Cut out the test function itself.
  method_text = match_method(search_scope, test_name)
  if method_text is None:
    print(f"Row {row_in_csv}: In {repo_url}/blob/{commit_hash}/{test_file_path}, Class '{class_name}' , Test function '{test_name}' not found.")
    continue

  all_test_methods.append({
      'Row': row_in_csv,
      'Project_Name': project_name,
      'URL': repo_url + '/blob/' + commit_hash + '/' + test_file_path_rec,
      'Class': class_name_rec,
      'Test': test_name_rec,
      'Content': method_text,
      'Detected': detected,
  })
  project_to_num_tests[project_name] = project_to_num_tests.get(project_name, 0) + 1

The indices below corresponds to the row number in the csv file shown on https://sites.google.com/view/ipflakies.
Error fetching the file: 404 Client Error: Not Found for url: https://raw.githubusercontent.com/AshtonUPS/Py-MI-PS/2d22327c75bac1b58a4804a61e7a703ecc5ba978/src/PyMIPS/tests/register_test.py
Row 135: https://github.com/AshtonUPS/Py-MI-PS/blob/2d22327c75bac1b58a4804a61e7a703ecc5ba978/src/PyMIPS/tests/register_test.py, Error fetching the file

Error fetching the file: 404 Client Error: Not Found for url: https://raw.githubusercontent.com/cryptowatch/cw-sdk-python/92bd90db16dfc116c0708d19d27208d9bfc990c1/tests/test_api.py
Row 516: https://github.com/cryptowatch/cw-sdk-python/blob/92bd90db16dfc116c0708d19d27208d9bfc990c1/tests/test_api.py, Error fetching the file

Error fetching the file: 404 Client Error: Not Found for url: https://raw.githubusercontent.com/cryptowatch/cw-sdk-python/92bd90db16dfc116c0708d19d27208d9bfc990c1/tests/test_api.py
Row 517: https://github.com/cryptowat

In [11]:
# Keep only projects with more than 30 successfully extracted test methods.
selected_projects = [
    name
    for name, extracted_count in project_to_num_tests.items()
    if extracted_count > 30
]

In [12]:
# Group the extracted test records by project, keeping selected projects only.
selected_methods = dict()
for test_method in all_test_methods:
  owner = test_method['Project_Name']
  if owner not in selected_projects:
    continue
  selected_methods.setdefault(owner, []).append(test_method)

In [13]:
# Persist the selected records to one CSV file (a single header row followed
# by every project's records) so later cells can reload them without crawling.
filename = "selected_methods.csv"
# encoding is explicit because the extracted test source may contain non-ASCII
# characters, which a non-UTF-8 platform default could fail to encode.
# newline="" is what the csv module requires for files it writes.
with open(filename, mode="w", newline="", encoding="utf-8") as file:
  writer = None
  for project in selected_methods:
    project_rows = selected_methods[project]
    if writer is None:
      # All records share the same keys, so the first one defines the columns.
      writer = csv.DictWriter(file, fieldnames=list(project_rows[0].keys()))
      writer.writeheader()
    writer.writerows(project_rows)

In [14]:
# Restart entry point: after a fresh kernel you can start from this cell
# instead of re-running the crawl above. It makes sure `selected_methods` is
# available, loading it from selected_methods.csv when needed.
# On Google Colab a missing file triggers an upload prompt. On Jupyter Notebook,
# place selected_methods.csv in the working directory before running this cell.
try:
  from google.colab import files
except ImportError:
  files = None  # not running on Colab; the upload prompt is unavailable
import pandas as pd
import os
import requests
import csv
import random
from copy import deepcopy
import json

needs_upload = not os.path.exists('selected_methods.csv')
if needs_upload:
  if files is None:
    raise FileNotFoundError("selected_methods.csv not found; copy it into the working directory and re-run this cell.")
  ## It will prompt you to select a local file. Click on “Choose Files” then select and upload the file.
  ## Wait for the file to be 100% uploaded. You should see the name of the file once Colab has uploaded it.
  uploaded = files.upload()

# Load from disk after an upload, and also when the file already exists but
# the kernel holds no `selected_methods`. Previously that second case left the
# variable undefined. An in-memory copy from the cells above is left untouched.
if needs_upload or 'selected_methods' not in globals():
  selected_methods = dict()
  # newline='' keeps line breaks embedded in the quoted 'Content' field intact.
  with open('selected_methods.csv', 'r', newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
      # Every value read back is a string, including 'Row' and 'Detected'.
      project_name = row['Project_Name']
      selected_methods.setdefault(project_name, []).append(row)

In [15]:
# List the names of the selected projects, one per line.
for project in selected_methods.keys():
  print(project)

Butter.MAS.PythonAPI
centreon-sdk-python
cloudnetpy
crom
easypy
eppy
reframe
region_cache
taxi
wkr.py


In [16]:
# Pick the first record of the first project as a sample (None if no projects).
example = next(
    (project_methods[0] for project_methods in selected_methods.values()),
    None,
)

In [17]:
# Display the sample record to inspect its fields.
example

{'Row': 14,
 'Project_Name': 'Butter.MAS.PythonAPI',
 'URL': 'https://github.com/bennymeg/Butter.MAS.PythonAPI/blob/f86ebe75df3826f62a268645cdbe4400b43fab07/butter/mas/tests/clients/client_http_test.py',
 'Class': 'TestHttpClientApiMethods',
 'Test': 'testGetAvailableAnimations',
 'Content': 'def testGetAvailableAnimations(self):\n        self.assertIsNotNone(self.client.getAvailableAnimations())\n\n    ',
 'Detected': True}

In [18]:
# Count, per project and overall, how many records are labelled Detected.
total_true_size = 0
total_false_size = 0

for project, project_methods in selected_methods.items():
  # 'Detected' may be a boolean or the string 'True' (records read back from
  # the CSV hold strings), so both spellings count as positive.
  true_size = sum(
      1 for method in project_methods
      if method['Detected'] == True or method['Detected'] == 'True'
  )
  false_size = len(project_methods) - true_size
  total_true_size += true_size
  total_false_size += false_size
  print(f"{project}: True size: {true_size}, False size: {false_size}")

print(f"Total True size: {total_true_size}, Total False size: {total_false_size}, Total: {total_true_size + total_false_size}")

Butter.MAS.PythonAPI: True size: 53, False size: 0
centreon-sdk-python: True size: 41, False size: 0
cloudnetpy: True size: 51, False size: 0
crom: True size: 38, False size: 0
easypy: True size: 44, False size: 0
eppy: True size: 84, False size: 0
reframe: True size: 136, False size: 53
region_cache: True size: 0, False size: 37
taxi: True size: 0, False size: 77
wkr.py: True size: 0, False size: 34
Total True size: 447, Total False size: 201, Total: 648


In [19]:
# Create the output directory tree: data/<project>/ for every selected project.
# exist_ok=True lets the cell be re-run without raising FileExistsError when
# the directories are already there.
os.makedirs("data", exist_ok=True)
for project in selected_methods:
  os.makedirs(f"data/{project}", exist_ok=True)

In [20]:
# Leave-one-project-out dataset generation. For each project: hold it out as
# the test set, split the remaining records 80/20 into training/validation,
# oversample the minority label in the training set until both labels have
# the same count, and write all three sets as chat-format JSONL files.

RANDOM_SEED = 42      # fixed seed so the exact same splits can be regenerated
TRAIN_FRACTION = 0.8  # share of the non-held-out records used for training


def _is_detected(test_method):
  """Return True if the record is labelled Detected (bool or the string 'True')."""
  return test_method['Detected'] == True or test_method['Detected'] == 'True'


def _oversample(minority, target_size, rng):
  """Grow `minority` to `target_size` records by repetition.

  Uses as many full copies as fit, then a random sample (without replacement)
  for the remainder. Returns [] for an empty input instead of looping forever.
  """
  if not minority:
    return []
  full_copies, remainder = divmod(target_size, len(minority))
  oversampled = []
  for _ in range(full_copies):
    oversampled += deepcopy(minority)
  oversampled += rng.sample(deepcopy(minority), remainder)
  return oversampled


def _to_chat_example(test_method):
  """Build the system/user/assistant message triple for one record."""
  messages = [
      {"role": "system", "content": "You need to identify flaky tests."},
      {"role": "user", "content": f"Is this a flaky test? Only answer True or False.\n{test_method['Content']}"},
      {"role": "assistant", "content": str(test_method['Detected'])}
  ]
  return {"messages": messages}


def _write_jsonl(path, test_methods):
  """Write one JSON chat example per line for every record in `test_methods`."""
  with open(path, 'w') as jsonl_file:
    for test_method in test_methods:
      jsonl_file.write(json.dumps(_to_chat_example(test_method)) + "\n")


# A private generator keeps the splits reproducible without touching the
# global `random` state.
rng = random.Random(RANDOM_SEED)

for test_project in selected_methods:
  training_and_validation_set = []
  test_set = []
  for project in selected_methods:
    if project == test_project:
      test_set = deepcopy(selected_methods[project])
    else:
      training_and_validation_set += deepcopy(selected_methods[project])

  rng.shuffle(training_and_validation_set)
  split_index = int(len(training_and_validation_set) * TRAIN_FRACTION)
  training_set = training_and_validation_set[:split_index]
  validation_set = training_and_validation_set[split_index:]

  training_set_true = [m for m in training_set if _is_detected(m)]
  training_set_false = [m for m in training_set if not _is_detected(m)]
  training_set_true_size = len(training_set_true)
  training_set_false_size = len(training_set_false)

  print(f"Test Project - {test_project}: Training set size: {len(training_set)}, Validation set size: {len(validation_set)}, Test set size: {len(test_set)}")
  print(f"Training set true size: {training_set_true_size}, Training set false size: {training_set_false_size}")

  if training_set_true_size == training_set_false_size:
    # Already balanced: use the training set as is.
    balanced_training_set = training_set
  elif min(training_set_true_size, training_set_false_size) == 0:
    # Nothing to oversample from; the original loop would never terminate here.
    print("Warning: training set contains a single label; it cannot be balanced and is used as is.")
    balanced_training_set = training_set
  else:
    target_size = max(training_set_true_size, training_set_false_size)
    if training_set_true_size > training_set_false_size:
      balanced_true = training_set_true
      balanced_false = _oversample(training_set_false, target_size, rng)
    else:
      balanced_true = _oversample(training_set_true, target_size, rng)
      balanced_false = training_set_false
    balanced_training_set = balanced_true + balanced_false
    rng.shuffle(balanced_training_set)
    print(f"Balanced training set size: {len(balanced_training_set)}")
    print(f"Balanced training set true size: {len(balanced_true)}, Balanced training set false size: {len(balanced_false)}")

  # Only the training set is rebalanced; validation and test keep their
  # natural label distribution.
  _write_jsonl(f"data/{test_project}/training_set.jsonl", balanced_training_set)
  _write_jsonl(f"data/{test_project}/validation_set.jsonl", validation_set)
  _write_jsonl(f"data/{test_project}/test_set.jsonl", test_set)

  print("------------------------")

Test Project - Butter.MAS.PythonAPI: Training set size: 476, Validation set size: 119, Test set size: 53
Training set true size: 322, Training set false size: 154
Balanced training set size: 644
Balanced training set true size: 322, Balanced training set false size: 322
------------------------
Test Project - centreon-sdk-python: Training set size: 485, Validation set size: 122, Test set size: 41
Training set true size: 331, Training set false size: 154
Balanced training set size: 662
Balanced training set true size: 331, Balanced training set false size: 331
------------------------
Test Project - cloudnetpy: Training set size: 477, Validation set size: 120, Test set size: 51
Training set true size: 314, Training set false size: 163
Balanced training set size: 628
Balanced training set true size: 314, Balanced training set false size: 314
------------------------
Test Project - crom: Training set size: 488, Validation set size: 122, Test set size: 38
Training set true size: 330, Train

In [21]:
# Shell escape: recursively archive /content/data into /content/data.zip.
# Both paths are absolute and hard-coded.
# NOTE(review): /content looks like the Colab working directory; confirm before running elsewhere.
!zip -r /content/data.zip /content/data

  adding: content/data/ (stored 0%)
  adding: content/data/cloudnetpy/ (stored 0%)
  adding: content/data/cloudnetpy/validation_set.jsonl (deflated 83%)
  adding: content/data/cloudnetpy/test_set.jsonl (deflated 93%)
  adding: content/data/cloudnetpy/training_set.jsonl (deflated 85%)
  adding: content/data/crom/ (stored 0%)
  adding: content/data/crom/validation_set.jsonl (deflated 82%)
  adding: content/data/crom/test_set.jsonl (deflated 82%)
  adding: content/data/crom/training_set.jsonl (deflated 85%)
  adding: content/data/Butter.MAS.PythonAPI/ (stored 0%)
  adding: content/data/Butter.MAS.PythonAPI/validation_set.jsonl (deflated 83%)
  adding: content/data/Butter.MAS.PythonAPI/test_set.jsonl (deflated 97%)
  adding: content/data/Butter.MAS.PythonAPI/training_set.jsonl (deflated 85%)
  adding: content/data/reframe/ (stored 0%)
  adding: content/data/reframe/validation_set.jsonl (deflated 83%)
  adding: content/data/reframe/test_set.jsonl (deflated 90%)
  adding: content/data/refram