<a href="https://colab.research.google.com/github/yiw008/nondet-project/blob/main/Go_Through_IPFlakies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Go Through IPFlakies

## Preparation (Always Run These Cells First)

In [None]:
import csv
import json
import os
import random
import re
from copy import deepcopy

import pandas as pd
import requests
from google.colab import files

In [None]:
# URL of the CSV file: iPFlakies
url = "https://zenodo.org/records/6176417/files/Test_Status.csv"

In [None]:
# Read the CSV file from the URL
data = pd.read_csv(url)

In [None]:
# Display the first few rows of the data
print(data.head())

  Project_Name                               Project_URL  \
0   BT-Tracker  https://github.com/nordwind80/BT-Tracker   
1      Breathe         https://github.com/mrob95/Breathe   
2      Breathe         https://github.com/mrob95/Breathe   
3      Breathe         https://github.com/mrob95/Breathe   
4      Breathe         https://github.com/mrob95/Breathe   

                               Project_Hash  \
0  558c15b399871c1ca11d0c4ae1eb598e3060931e   
1  4600818e24f4156cd7bb8cc0f43886b27323968e   
2  4600818e24f4156cd7bb8cc0f43886b27323968e   
3  4600818e24f4156cd7bb8cc0f43886b27323968e   
4  4600818e24f4156cd7bb8cc0f43886b27323968e   

                                             Test_id  Detected  Have_Patch  \
0  Tracker/tests/test_event.py::TestEvent::test_o...      True        True   
1  tests/test_command_context.py::test_manual_con...      True       False   
2                tests/test_loading.py::test_loading      True        True   
3        tests/test_loading.py::test_loading

In [None]:
def match_class(content, class_name):
  """Return `content` starting at the definition of class `class_name`.

  Args:
    content: Source text of a Python file.
    class_name: Exact name of the class to locate.

  Returns:
    The tail of `content` beginning at the `class <class_name>` statement,
    or None when no such class statement is present.
  """
  # The trailing \b keeps a lookup of `Foo` from stopping at an earlier
  # `class FooBar`; the leading \b skips words that merely end in "class".
  found = re.search(rf"\bclass {re.escape(class_name)}\b", content)
  if found is None:
    return None
  return content[found.start():]

In [None]:
def match_method(content, test_name):
  start_index = content.find(f"def {test_name}")
  if start_index == -1:
    return None

  end_index = -1
  next_def_index = content.find("def ", start_index + len(test_name))
  next_decorator_index = content.find("@pytest", start_index + len(test_name))
  next_class_index = content.find("class ", start_index + len(test_name))
  main_block_index = content.find("if __name__ == '__main__':", start_index)

  if next_def_index == -1:
    next_def_index = len(content)
  if next_decorator_index == -1:
    next_decorator_index = len(content)
  if next_class_index == -1:
    next_class_index = len(content)
  if main_block_index == -1:
    main_block_index = len(content)

  end_index = min(next_def_index, next_decorator_index, next_class_index, main_block_index)

  last_decorator_index = content.rfind("@pytest", 0, start_index)
  if last_decorator_index != -1 and content.find("def ", last_decorator_index, start_index) == -1:
    start_index = last_decorator_index

  return content[start_index:end_index]

In [None]:
def get_every_test_method(content, starter="def test"):
  methods = []
  start_index = 0
  while True:
    start_index = content.find(starter, start_index)
    if start_index == -1:
      break

    end_index = -1
    next_def_index = content.find("def ", start_index + len(starter))
    next_decorator_index = content.find("@pytest", start_index + len(starter))
    next_class_index = content.find("class ", start_index + len(starter))
    main_block_index = content.find("if __name__ == '__main__':", start_index)

    if next_def_index == -1:
      next_def_index = len(content)
    if next_decorator_index == -1:
      next_decorator_index = len(content)
    if next_class_index == -1:
      next_class_index = len(content)
    if main_block_index == -1:
      main_block_index = len(content)

    end_index = min(next_def_index, next_decorator_index, next_class_index, main_block_index)

    last_decorator_index = content.rfind("@pytest", 0, start_index)
    if last_decorator_index != -1 and content.find("def ", last_decorator_index, start_index) == -1:
      start_index = last_decorator_index

    method_content = content[start_index:end_index]
    methods.append(method_content)
    start_index = end_index

  return methods

In [None]:
def remove_square_brackets(test_name):
  """Drop everything from the first '[' onwards when the name also contains a ']'.

  Names lacking either bracket are returned unchanged.
  """
  has_both_brackets = '[' in test_name and ']' in test_name
  return test_name.partition('[')[0] if has_both_brackets else test_name

In [None]:
def read_python_file(url, timeout=30):
  """Download the text at `url`.

  Args:
    url: Address of the file to fetch.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout a stalled connection would block the notebook indefinitely.

  Returns:
    The response body as text, or None when the request fails (connection
    error, timeout, or HTTP error status). Failures are printed, not raised.
  """
  try:
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Turn HTTP error statuses into exceptions
    return response.text
  except requests.exceptions.RequestException as e:
    print(f"Error fetching the file: {e}")
    return None

In [None]:
def special_cases(project_name, repo_url, commit_hash, test_file_path, class_name, test_name):
  """Apply per-project overrides to the location of a test.

  For the projects and commits listed below, the file path, class name,
  test name or repository URL is replaced by a hard-coded alternative.
  A trailing `.git` is also stripped from every repository URL.

  Returns:
    The tuple (project_name, repo_url, commit_hash, test_file_path,
    class_name, test_name) with the overrides applied.
  """
  if project_name == 'Butter.MAS.PythonAPI' and commit_hash == 'f86ebe75df3826f62a268645cdbe4400b43fab07' and test_file_path.startswith('butter/mas/tests/clients/'):
    test_file_path = 'butter/mas/tests/clients/client_test.py'
    class_name = ''

  if project_name == 'SNData' and commit_hash == 'e4854f0dc357484b437b15f9dac15f7c589eff58' and (class_name == 'Sako18Parsing' or class_name == 'DR1Parsing'):
    test_file_path = 'tests/data_parsing_template_tests.py'
    class_name = ''
    if test_name == 'test_standard_column_names':
      class_name = 'PhotometricDataParsing'

  if project_name == 'bootstrap_env' and commit_hash == 'ab68025d8f6b9a17d8feeed83e8aae26e3f28769' and test_file_path.startswith('bootstrap_env/tests/'):
    test_file_path = 'bootstrap_env/tests/base.py'

  if repo_url.endswith('.git'):
    repo_url = repo_url[:-4]

  if project_name == 'data-pypes' and test_name == 'pypes.logsetup.get_logconfig':
    test_name = 'get_logconfig'

  if project_name == 'elife-tools':
    end_index = test_name.find('_1_elife_02833_v2_xml')
    # Only truncate when the marker is present: slicing with find()'s -1
    # would silently chop the last character off an unrelated test name.
    if end_index != -1:
      test_name = test_name[:end_index]

  if project_name == 'noipy':
    repo_url = 'https://github.com/pv8/noipy'

  if project_name == 'pymq' and commit_hash == '101857bca2b705c328d3bda3b26797b51e8ffb70':
    if class_name == 'SimplePubSubTest':
      test_file_path = 'tests/base/pubsub.py'
      class_name = 'AbstractPubSubTest'
    if class_name == 'IpcQueueTest':
      test_file_path = 'tests/base/queue.py'
      class_name = 'AbstractQueueTest'
    if class_name == 'IpcRpcTest' or class_name == 'SimpleRpcTest':
      test_file_path = 'tests/base/rpc.py'
      class_name = 'AbstractRpcTest'

  if project_name == 'pyswarms' and commit_hash == '08756526f39699eef28e515cac2ead17cef55710' and class_name == 'TestLocalBestOptimizer' and test_name == 'test_obj_with_kwargs':
    test_file_path = 'tests/optimizers/abc_test_optimizer.py'
    class_name = 'ABCTestOptimizer'

  if project_name == 'python-openflow' and commit_hash == 'a3387a7b28d529a3605aa1506a028e03394e4526' and class_name == 'TestFlowMod' and test_name == 'test_minimum_size':
    test_file_path = 'tests/unit/test_struct.py'
    class_name = 'TestStruct'

  return project_name, repo_url, commit_hash, test_file_path, class_name, test_name

## Get all the test methods

In [None]:
# Walk every row of the dataset, download the referenced test file and keep
# the source text of the named test method.
all_test_methods = []
print('The indices below corresponds to the row number in the csv file shown on https://sites.google.com/view/ipflakies.')

for index, row in data.iterrows():
  # Offset of 2 presumably accounts for the CSV header line plus 1-based row
  # numbering -- TODO confirm against the published sheet.
  row_in_csv = index + 2
  project_name = row['Project_Name']
  repo_url = row['Project_URL']
  commit_hash = row['Project_Hash']

  # Test_id is split on "::"; with more than two parts the second is taken as
  # the class name, otherwise the test has no class.
  test_id_parts = row['Test_id'].split('::')
  test_file_path = test_id_parts[0]
  if len(test_id_parts) > 2:
    class_name, test_name = test_id_parts[1], test_id_parts[2]
  else:
    class_name, test_name = '', test_id_parts[1]
  test_name = remove_square_brackets(test_name)
  detected = row['Detected']

  # Keep the identifiers as listed in the dataset; special_cases may rewrite
  # the working copies below.
  test_file_path_rec, class_name_rec, test_name_rec = test_file_path, class_name, test_name

  project_name, repo_url, commit_hash, test_file_path, class_name, test_name = special_cases(
      project_name, repo_url, commit_hash, test_file_path, class_name, test_name)

  raw_repo_url = repo_url.replace('github.com', 'raw.githubusercontent.com')
  new_url = raw_repo_url + '/' + commit_hash + '/' + test_file_path

  content = read_python_file(new_url)
  if content is None:
    print(f"Row {row_in_csv}: {repo_url}/blob/{commit_hash}/{test_file_path}, Error fetching the file\n")
    continue

  # Narrow the search to the class body first, when a class is named.
  if class_name:
    content = match_class(content, class_name)
    if content is None:
      print(f"Row {row_in_csv}: In {repo_url}/blob/{commit_hash}/{test_file_path}, Class '{class_name}' not found.")
      continue

  # Then cut out the test method itself.
  content = match_method(content, test_name)
  if content is None:
    print(f"Row {row_in_csv}: In {repo_url}/blob/{commit_hash}/{test_file_path}, Class '{class_name}' , Test function '{test_name}' not found.")
    continue

  test_method = {
      'Row': row_in_csv,
      'Project_Name': project_name,
      'URL': repo_url + '/blob/' + commit_hash + '/' + test_file_path_rec,
      'New URL': new_url,
      'Class': class_name_rec,
      'Test': test_name_rec,
      'Content': content,
      'Detected': detected,
  }
  all_test_methods.append(test_method)

The indices below corresponds to the row number in the csv file shown on https://sites.google.com/view/ipflakies.
Error fetching the file: 404 Client Error: Not Found for url: https://raw.githubusercontent.com/AshtonUPS/Py-MI-PS/2d22327c75bac1b58a4804a61e7a703ecc5ba978/src/PyMIPS/tests/register_test.py
Row 135: https://github.com/AshtonUPS/Py-MI-PS/blob/2d22327c75bac1b58a4804a61e7a703ecc5ba978/src/PyMIPS/tests/register_test.py, Error fetching the file

Error fetching the file: 404 Client Error: Not Found for url: https://raw.githubusercontent.com/cryptowatch/cw-sdk-python/92bd90db16dfc116c0708d19d27208d9bfc990c1/tests/test_api.py
Row 516: https://github.com/cryptowatch/cw-sdk-python/blob/92bd90db16dfc116c0708d19d27208d9bfc990c1/tests/test_api.py, Error fetching the file

Error fetching the file: 404 Client Error: Not Found for url: https://raw.githubusercontent.com/cryptowatch/cw-sdk-python/92bd90db16dfc116c0708d19d27208d9bfc990c1/tests/test_api.py
Row 517: https://github.com/cryptowat

In [None]:
# Persist the collected methods; the column order follows the keys of the first record.
filename = "all_test_methods.csv"
with open(filename, mode="w", newline="") as file:
  column_names = all_test_methods[0].keys()
  writer = csv.DictWriter(file, fieldnames=column_names)
  writer.writeheader()
  writer.writerows(all_test_methods)

In [None]:
# Only use this code block if you are using Google Colab.
# If you are using Jupyter Notebook, please ignore this code block. You can directly upload the file to your Jupyter Notebook file systems.

# When the CSV is not on disk, ask for an upload and rebuild the in-memory
# list from it. When the file exists, the current list is left untouched.
if not os.path.exists('all_test_methods.csv'):
  ## It will prompt you to select a local file. Click on “Choose Files” then select and upload the file.
  ## Wait for the file to be 100% uploaded. You should see the name of the file once Colab has uploaded it.
  uploaded = files.upload()
  all_test_methods = []
  with open('all_test_methods.csv', 'r') as f:
    all_test_methods.extend(csv.DictReader(f))

## Select Projects

In [None]:
# Count, per project, the tests marked as detected. 'Detected' is a bool when
# the records come from the crawl and the string 'True' when they were
# reloaded from CSV, so both forms are accepted.
project_to_num_true_tests = dict()
for test_method in all_test_methods:
  project_name = test_method['Project_Name']
  if test_method['Detected'] in (True, 'True'):
    previous_count = project_to_num_true_tests.get(project_name, 0)
    project_to_num_true_tests[project_name] = previous_count + 1

In [None]:
# Keep only projects with more than 20 detected tests.
selected_projects = [
    name
    for name, detected_count in project_to_num_true_tests.items()
    if detected_count > 20
]

In [None]:
# Group the methods of the selected projects by project name, preserving order.
selected_methods = dict()
for test_method in all_test_methods:
  owner = test_method['Project_Name']
  if owner in selected_projects:
    selected_methods.setdefault(owner, []).append(test_method)

In [None]:
# Write all selected methods into one CSV: a single header line, then the
# rows of each project in turn.
filename = "selected_methods.csv"
header_written = False
with open(filename, mode="w", newline="") as file:
  for project, rows_of_project in selected_methods.items():
    writer = csv.DictWriter(file, fieldnames=rows_of_project[0].keys())
    if not header_written:
      writer.writeheader()
      header_written = True
    writer.writerows(rows_of_project)

In [None]:
# Only use this code block if you are using Google Colab.
# If you are using Jupyter Notebook, please ignore this code block. You can directly upload the file to your Jupyter Notebook file systems.

# When the CSV is not on disk, ask for an upload and rebuild the per-project
# mapping from its rows.
if not os.path.exists('selected_methods.csv'):
  ## It will prompt you to select a local file. Click on “Choose Files” then select and upload the file.
  ## Wait for the file to be 100% uploaded. You should see the name of the file once Colab has uploaded it.
  uploaded = files.upload()
  selected_methods = dict()
  with open('selected_methods.csv', 'r') as f:
    for row in csv.DictReader(f):
      selected_methods.setdefault(row['Project_Name'], []).append(row)

## Refined Selected Methods

In [None]:
for project in selected_methods:
  print(project)

Butter.MAS.PythonAPI
flask-multi-redis
centreon-sdk-python
cloudnetpy
crom
easypy
eppy
pykicad
reframe
webssh


In [None]:
refined_selected_methods = dict()

In [None]:
def add_multiple_methods(project_name, project_methods, text_files, url_starter, new_url_starter):
  """Build records for test methods in `text_files` that are not yet in `project_methods`.

  Each listed file is downloaded, every `def test...` method in it is
  extracted, and a new record with 'Detected' set to 'False' is produced for
  each method whose name is not already recorded for that file.

  Args:
    project_name: Value stored in the 'Project_Name' field of new records.
    project_methods: Existing method records; their 'New URL' and 'Test'
      fields decide which methods count as already present.
    text_files: File names (relative to the two URL prefixes) to scan.
    url_starter: Prefix used to build the 'URL' field of new records.
    new_url_starter: Prefix used to build the download URL ('New URL').

  Returns:
    A list of new method records; `project_methods` is not modified.
  """
  # Names of already-recorded tests, keyed by the file name at the end of 'New URL'.
  known_tests_by_file = dict()
  for method in project_methods:
    method_file = method['New URL'].split('/')[-1]
    known_tests_by_file.setdefault(method_file, set()).add(method['Test'])

  res = []
  # Iterate the `text_files` argument. The body used to read a notebook-level
  # `test_files` variable, which only worked while a global of that name
  # happened to hold the same list.
  for test_file in text_files:
    existed_methods = known_tests_by_file.get(test_file, set())

    new_url = new_url_starter + test_file
    content = read_python_file(new_url)
    if content is None:
      # Download failed (read_python_file already printed the error); skip
      # this file rather than crash while parsing None.
      continue

    for method in get_every_test_method(content):
      # The first line containing "def " is the header; the name sits between
      # "def " and the opening parenthesis.
      header_line = next((line for line in method.split('\n') if 'def ' in line), '')
      method_name = header_line[header_line.find('def ') + len('def '): header_line.find('(')]

      if method_name not in existed_methods:
        res.append({
            'Row': -1,  # -1 marks records that do not come from the dataset CSV
            'Project_Name': project_name,
            'URL': url_starter + test_file,
            'New URL': new_url,
            'Class': '',
            'Test': method_name,
            'Content': method,
            'Detected': 'False',
        })

  return res

### 1. Butter.MAS.PythonAPI

In [None]:
project_1 = 'Butter.MAS.PythonAPI'
project_1_methods = deepcopy(selected_methods[project_1])

In [None]:
# Insert a line constructing the matching client right after the header of
# every test that belongs to one of the three client test classes.
client_for_class = {
    'TestHttpClientApiMethods': 'HttpClient',
    'TestTcpClientApiMethods': 'TcpClient',
    'TestUdpClientApiMethods': 'UdpClient',
}

for method in project_1_methods:
  content = method['Content']
  client_name = client_for_class.get(method['Class'])
  if client_name is None:
    continue
  lines = content.splitlines()
  setup_line = f"        self.client = {client_name}('localhost')"
  content = "\n".join([lines[0], setup_line] + lines[1:])
  method['Content'] = content

In [None]:
# Derive a UDP variant of testGetMotorRegisterRange from the first recorded
# copy of that test and add it as a non-detected method.
new_method = None
for method in project_1_methods:
  if method['Test'] == 'testGetMotorRegisterRange':
    new_method = deepcopy(method)
    new_method['Row'] = -1
    new_method['Class'] = 'TestUdpClientApiMethods'
    # str.replace returns a new string, so the result has to be stored back;
    # calling it without assignment left the content unchanged.
    new_method['Content'] = new_method['Content'].replace('HttpClient', 'UdpClient')
    new_method['Detected'] = 'False'
    break

# Append only a real record: a placeholder in the list would break the later
# code that treats every element as a dict.
if new_method is not None:
  project_1_methods.append(new_method)
else:
  print("testGetMotorRegisterRange not found; no UDP variant added.")

In [None]:
test_files = ['packet_builder_test.py', 'packet_http_test.py', 'packet_tcp_test.py', 'packet_udp_test.py']

In [None]:
multiple_added_methods = add_multiple_methods(project_1, project_1_methods, test_files,
                                              'https://github.com/butter-robotics/Butter.MAS.PythonAPI/blob/f86ebe75df3826f62a268645cdbe4400b43fab07/butter/mas/tests/packets/',
                                              'https://raw.githubusercontent.com/butter-robotics/Butter.MAS.PythonAPI/f86ebe75df3826f62a268645cdbe4400b43fab07/butter/mas/tests/packets/')

In [None]:
project_1_methods += multiple_added_methods
refined_selected_methods[project_1] = project_1_methods

### 2. flask-multi-redis

In [None]:
project_2 = 'flask-multi-redis'
project_2_methods = deepcopy(selected_methods[project_2])
new_url = project_2_methods[0]['New URL']
content = read_python_file(new_url)
methods = get_every_test_method(content)

In [None]:
# Map every test method found in the file to its source text, keyed by name.
all_in_project_2 = dict()
for method in methods:
  # The first line containing "def " is the header; the name sits between
  # "def " and the opening parenthesis.
  header_line = next((line for line in method.split('\n') if 'def ' in line), '')
  method_name = header_line[header_line.find('def ') + len('def '): header_line.find('(')]
  all_in_project_2[method_name] = method

# Names of the methods the dataset marks as detected.
flaky_method_names = {
    method['Test']
    for method in project_2_methods
    if method['Detected'] in (True, 'True')
}

# Every remaining method in the file is added as a non-detected record.
for method_name, method_source in all_in_project_2.items():
  if method_name in flaky_method_names:
    continue
  project_2_methods.append({
      'Row': -1,
      'Project_Name': project_2,
      'URL': project_2_methods[0]['URL'],
      'New URL': new_url,
      'Class': '',
      'Test': method_name,
      'Content': method_source,
      'Detected': 'False',
  })

In [None]:
test_files = ["test_flask_multi_redis.py"]

In [None]:
multiple_added_methods = add_multiple_methods(project_2, project_2_methods, test_files,
                                              'https://github.com/max-k/flask-multi-redis/blob/fa781d3598448a6429309a686de9a8adb53f9f34/test/unit/',
                                              'https://raw.githubusercontent.com/max-k/flask-multi-redis/fa781d3598448a6429309a686de9a8adb53f9f34/test/unit/')

In [None]:
project_2_methods += multiple_added_methods
refined_selected_methods[project_2] = project_2_methods

### 3. centreon-sdk-python

In [None]:
project_3 = 'centreon-sdk-python'
project_3_methods = deepcopy(selected_methods[project_3])

In [None]:
# Add `test_connection` (class TestConnect) as a non-detected method, using
# the file of the first TestResourceCFG record as its source.
new_method = None
for method in project_3_methods:
  if method['Class'] == 'TestResourceCFG':
    source = read_python_file(method['New URL'])
    # read_python_file returns None on a failed download; match_method cannot
    # search None, so guard before calling it.
    connection_test = match_method(source, 'test_connection') if source is not None else None
    if connection_test is not None:
      new_method = deepcopy(method)
      new_method['Row'] = -1
      new_method['Class'] = 'TestConnect'
      new_method['Test'] = 'test_connection'
      new_method['Content'] = connection_test
      new_method['Detected'] = 'False'
    break

# Append only a real record: a placeholder or a record without content would
# break the later code that treats every element as a dict with source text.
if new_method is not None:
  project_3_methods.append(new_method)
else:
  print("test_connection could not be extracted; no extra method added.")
refined_selected_methods[project_3] = project_3_methods

### 4. cloudnetpy

In [None]:
test_files = [
    "test_atmos.py",
    "test_ceilo.py",
    "test_ceilometer.py",
    "test_classify.py",
    "test_cloudnetarray.py",
    "test_datasource.py",
    "test_drizzle.py",
    "test_drizzle_error.py",
    "test_droplet.py",
    "test_falling.py",
    "test_freezing.py",
    "test_insects.py",
    "test_iwc.py",
    "test_jenoptik.py",
    "test_lidar.py",
    "test_lwc.py",
    "test_melting.py",
    "test_meta_for_old_files.py",
    "test_mira.py",
    "test_model.py",
    "test_mwr.py",
    "test_output.py",
    "test_plotting.py",
    "test_product_tools.py",
    "test_radar.py",
    "test_rpg.py",
    "test_utils.py",
    "test_vaisala.py",
]

In [None]:
project_4 = 'cloudnetpy'
project_4_methods = deepcopy(selected_methods[project_4])

In [None]:
multiple_added_methods = add_multiple_methods(project_4, project_4_methods, test_files,
                                              'https://github.com/tukiains/cloudnetpy/blob/26f2607b890630146469cfa410fce99438ceee3f/tests/unit/',
                                              'https://raw.githubusercontent.com/tukiains/cloudnetpy/26f2607b890630146469cfa410fce99438ceee3f/tests/unit/')

In [None]:
project_4_methods += multiple_added_methods
refined_selected_methods[project_4] = project_4_methods

### 5. crom

In [None]:
test_files = [
    "test_add_classification.py",
    "test_context.json",
    "test_currency.py",
    "test_dimensions.py",
    "test_model.py",
    "test_multiple_instantiation.py",
    "test_reader.py",
    "test_vocab.py",
]

In [None]:
project_5 = 'crom'
project_5_methods = deepcopy(selected_methods[project_5])

In [None]:
multiple_added_methods = add_multiple_methods(project_5, project_5_methods, test_files,
                                              'https://github.com/thegetty/crom/blob/98bb6be4e32b4c81eb7e0b5e841a915b015abaf0/tests/',
                                              'https://raw.githubusercontent.com/thegetty/crom/98bb6be4e32b4c81eb7e0b5e841a915b015abaf0/tests/')

In [None]:
project_5_methods += multiple_added_methods
refined_selected_methods[project_5] = project_5_methods

### 6. easypy

In [None]:
test_files = [
    "test_aliasing.py",
    "test_bunch.py",
    "test_caching.py",
    "test_collections.py",
    "test_colors.py",
    "test_concurrency.py",
    "test_contexts.py",
    "test_decorations.py",
    "test_deprecation.py",
    "test_exceptions.py",
    "test_humanize.py",
    "test_lockstep.py",
    "test_logging.py",
    "test_meta.py",
    "test_misc.py",
    "test_randutils.py",
    "test_rwlock.py",
    "test_semver.py",
    "test_signals.py",
    "test_sync.py",
    "test_timing.py",
    "test_typed_struct.py",
    "test_units.py",
    "test_ziplog.py",
]

In [None]:
project_6 = 'easypy'
project_6_methods = deepcopy(selected_methods[project_6])

In [None]:
multiple_added_methods = add_multiple_methods(project_6, project_6_methods, test_files,
                                              'https://github.com/weka/easypy/blob/9501c3ee03dcb1630e58cbb73b1647056907bcea/tests/',
                                              'https://raw.githubusercontent.com/weka/easypy/9501c3ee03dcb1630e58cbb73b1647056907bcea/tests/')

In [None]:
project_6_methods += multiple_added_methods
refined_selected_methods[project_6] = project_6_methods

### 7. eppy

In [None]:
test_files = [
    "test_IDF.py",
    "test_bunch_subclass.py",
    "test_bunchhelpers.py",
    "test_case_insensitive.py",
    "test_easyopen.py",
    "test_eppy.py",
    "test_examples.py",
    "test_fanpower.py",
    "test_function_helpers.py",
    "test_hvacbuilder.py",
    "test_idd_helpers.py",
    "test_iddgaps.py",
    "test_idf_helpers.py",
    "test_idfreader.py",
    "test_json_functions.py",
    "test_loopdiagram.py",
    "test_modeleditor.py",
    "test_modeleditor1.py",
    "test_parse_error.py",
    "test_readhtml.py",
    "test_reproduce_bugs.py",
    "test_runner.py",
    "test_simpleread.py",
    "test_thermal_properties.py",
    "test_walk_hvac.py",
]

In [None]:
project_7 = 'eppy'
project_7_methods = deepcopy(selected_methods[project_7])

In [None]:
multiple_added_methods = add_multiple_methods(project_7, project_7_methods, test_files,
                                              'https://github.com/santoshphilip/eppy/blob/98e58583dce6c0fcec9c7b1ff1142bae0a67ddc7/eppy/tests/',
                                              'https://raw.githubusercontent.com/santoshphilip/eppy/98e58583dce6c0fcec9c7b1ff1142bae0a67ddc7/eppy/tests/')

In [None]:
project_7_methods += multiple_added_methods
refined_selected_methods[project_7] = project_7_methods

### 8. pykicad

In [None]:
test_files = [
    "test_module.py",
    "test_pcb.py",
    "test_sexpr.py",
]

In [None]:
project_8 = 'pykicad'
project_8_methods = deepcopy(selected_methods[project_8])

In [None]:
multiple_added_methods = add_multiple_methods(project_8, project_8_methods, test_files,
                                              'https://github.com/dvc94ch/pykicad/blob/cdebcaeb4ab6c8903ebecfd0748f826ea406923f/tests/',
                                              'https://raw.githubusercontent.com/dvc94ch/pykicad/cdebcaeb4ab6c8903ebecfd0748f826ea406923f/tests/')

In [None]:
project_8_methods += multiple_added_methods
refined_selected_methods[project_8] = project_8_methods

### 9. reframe

In [None]:
test_files = [
    "test_argparser.py",
    "test_buildsystems.py",
    "test_check_filters.py",
    "test_cli.py",
    "test_color.py",
    "test_config.py",
    "test_containers.py",
    "test_deferrable.py",
    "test_dependencies.py",
    "test_environments.py",
    "test_exceptions.py",
    "test_fields.py",
    "test_launchers.py",
    "test_loader.py",
    "test_logging.py",
    "test_modules.py",
    "test_pipeline.py",
    "test_policies.py",
    "test_sanity_functions.py",
    "test_schedulers.py",
    "test_shell.py",
    "test_typecheck.py",
    "test_utility.py",
    "test_versioning.py",
]

In [None]:
project_9 = 'reframe'
project_9_methods = deepcopy(selected_methods[project_9])
multiple_added_methods = add_multiple_methods(project_9, project_9_methods, test_files,
                                              'https://github.com/reframe-hpc/reframe/blob/576eb3f1dcc015d1e6d7a10602c748d4f810da68/unittests/',
                                              'https://raw.githubusercontent.com/reframe-hpc/reframe/576eb3f1dcc015d1e6d7a10602c748d4f810da68/unittests/')

In [None]:
project_9_methods += multiple_added_methods
refined_selected_methods[project_9] = project_9_methods

### 10. webssh

In [None]:
test_files = [
    "test_app.py",
    "test_handler.py",
    "test_main.py",
    "test_policy.py",
    "test_settings.py",
    "test_utils.py",
]

In [None]:
project_10 = 'webssh'
project_10_methods = deepcopy(selected_methods[project_10])
multiple_added_methods = add_multiple_methods(project_10, project_10_methods, test_files,
                                              'https://github.com/huashengdun/webssh/blob/51d527fe75a62aed239126a6697749a30baecb30/tests/',
                                              'https://raw.githubusercontent.com/huashengdun/webssh/51d527fe75a62aed239126a6697749a30baecb30/tests/')

In [None]:
project_10_methods += multiple_added_methods
refined_selected_methods[project_10] = project_10_methods

## Store Refined Selected Methods

In [None]:
# Write all refined methods into one CSV: a single header line, then the
# rows of each project in turn.
filename = "refined_selected_methods.csv"
header_written = False
with open(filename, mode="w", newline="") as file:
  for project, rows_of_project in refined_selected_methods.items():
    writer = csv.DictWriter(file, fieldnames=rows_of_project[0].keys())
    if not header_written:
      writer.writeheader()
      header_written = True
    writer.writerows(rows_of_project)

In [None]:
# Only use this code block if you are using Google Colab.
# If you are using Jupyter Notebook, please ignore this code block. You can directly upload the file to your Jupyter Notebook file systems.

# When the refined CSV is not on disk, ask for an upload and rebuild the
# per-project mapping from its rows.
if not os.path.exists('refined_selected_methods.csv'):
  ## It will prompt you to select a local file. Click on “Choose Files” then select and upload the file.
  ## Wait for the file to be 100% uploaded. You should see the name of the file once Colab has uploaded it.
  uploaded = files.upload()
  refined_selected_methods = dict()
  # Read the refined file that was just checked for and uploaded. This cell
  # used to open 'selected_methods.csv', which holds the unrefined data.
  with open('refined_selected_methods.csv', 'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
      project_name = row['Project_Name']
      refined_selected_methods.setdefault(project_name, []).append(row)

## Statistics

In [None]:
# First method record of the first project, or None when there are no projects.
example = next(
    (methods_of_project[0] for methods_of_project in refined_selected_methods.values()),
    None,
)

In [None]:
example

{'Row': 14,
 'Project_Name': 'Butter.MAS.PythonAPI',
 'URL': 'https://github.com/bennymeg/Butter.MAS.PythonAPI/blob/f86ebe75df3826f62a268645cdbe4400b43fab07/butter/mas/tests/clients/client_http_test.py',
 'New URL': 'https://raw.githubusercontent.com/bennymeg/Butter.MAS.PythonAPI/f86ebe75df3826f62a268645cdbe4400b43fab07/butter/mas/tests/clients/client_test.py',
 'Class': 'TestHttpClientApiMethods',
 'Test': 'testGetAvailableAnimations',
 'Content': "def testGetAvailableAnimations(self):\n        self.client = HttpClient('localhost')\n        self.assertIsNotNone(self.client.getAvailableAnimations())\n\n    ",
 'Detected': True}

In [None]:
# Per-project and overall counts of detected vs. non-detected methods before refinement.
total_true_size = 0
total_false_size = 0

print('Statistics Before Refinement')

for project, methods_of_project in selected_methods.items():
  # 'Detected' may be a bool or the string 'True' (after a CSV round trip).
  true_size = sum(1 for test_method in methods_of_project if test_method['Detected'] in (True, 'True'))
  false_size = len(methods_of_project) - true_size
  total_true_size += true_size
  total_false_size += false_size
  print(f"{project}: True size: {true_size}, False size: {false_size}")

print(f"Total True size: {total_true_size}, Total False size: {total_false_size}, Total: {total_true_size + total_false_size}")

Statistics Before Refinement
Butter.MAS.PythonAPI: True size: 53, False size: 0
flask-multi-redis: True size: 26, False size: 0
centreon-sdk-python: True size: 41, False size: 0
cloudnetpy: True size: 51, False size: 0
crom: True size: 38, False size: 0
easypy: True size: 44, False size: 0
eppy: True size: 84, False size: 0
pykicad: True size: 28, False size: 0
reframe: True size: 136, False size: 53
webssh: True size: 29, False size: 0
Total True size: 530, Total False size: 53, Total: 583


In [None]:
# Per-project and overall counts of detected vs. non-detected methods after refinement.
total_true_size = 0
total_false_size = 0

print('Statistics After Refinement')

for project, methods_of_project in refined_selected_methods.items():
  # 'Detected' may be a bool or the string 'True' (after a CSV round trip).
  true_size = sum(1 for test_method in methods_of_project if test_method['Detected'] in (True, 'True'))
  false_size = len(methods_of_project) - true_size
  total_true_size += true_size
  total_false_size += false_size
  print(f"{project}: True size: {true_size}, False size: {false_size}")

print(f"Total True size: {total_true_size}, Total False size: {total_false_size}, Total: {total_true_size + total_false_size}")

Statistics After Refinement
Butter.MAS.PythonAPI: True size: 53, False size: 11
flask-multi-redis: True size: 26, False size: 7
centreon-sdk-python: True size: 41, False size: 1
cloudnetpy: True size: 51, False size: 246
crom: True size: 38, False size: 46
easypy: True size: 44, False size: 130
eppy: True size: 84, False size: 110
pykicad: True size: 28, False size: 20
reframe: True size: 136, False size: 465
webssh: True size: 29, False size: 60
Total True size: 530, Total False size: 1096, Total: 1626


## Create Training/Test Sets

In [None]:
# Create one output directory per project under data/.
# exist_ok=True keeps this cell re-runnable: without it os.makedirs raises
# FileExistsError as soon as the directories exist from an earlier run.
os.makedirs("data", exist_ok=True)
for project in refined_selected_methods:
  os.makedirs(f"data/{project}", exist_ok=True)

In [None]:
def _oversample(minority, target_size):
  """Return deep copies of `minority` repeated until there are `target_size` items.

  Whole copies of the list are appended first; the remainder is drawn with
  `random.sample` (without replacement), so the copies stay as evenly
  distributed over the original entries as possible.

  Args:
    minority: non-empty list of entries of the under-represented class.
    target_size: desired length of the returned list.

  Returns:
    A new list with exactly `target_size` entries.

  Raises:
    ValueError: if `minority` is empty, since there is nothing to replicate.
  """
  if not minority:
    raise ValueError("cannot oversample an empty list")
  full_copies, remainder = divmod(target_size, len(minority))
  oversampled = []
  for _ in range(full_copies):
    oversampled += deepcopy(minority)
  oversampled += random.sample(deepcopy(minority), remainder)
  return oversampled


def _to_chat_example(test_method):
  """Build one chat record (system / user / assistant) for a test method.

  The prompt embeds `test_method['Content']`; the expected answer is
  `test_method['Detected']` converted to a string.
  """
  return {
      "messages": [
          {"role": "system", "content": "You need to identify flaky tests."},
          {"role": "user", "content": f"Is this a flaky test? Only answer True or False.\n{test_method['Content']}"},
          {"role": "assistant", "content": str(test_method['Detected'])}
      ]
  }


def _write_jsonl(path, entries):
  """Write `entries` to `path` as JSON Lines (one JSON document per line)."""
  with open(path, 'w') as jsonl_file:
    for entry in entries:
      jsonl_file.write(json.dumps(entry) + "\n")


# NOTE(review): shuffling and sampling use the global `random` state, so the
# generated files differ between runs unless a seed is set earlier in the
# notebook -- confirm whether reproducible splits are required.
for test_project in refined_selected_methods:
  # Leave-one-project-out split: the held-out project becomes the test set and
  # every other project contributes to the training set.
  training_set = []
  test_set = []
  for project in refined_selected_methods:
    if project == test_project:
      test_set = deepcopy(refined_selected_methods[project])
    else:
      training_set += deepcopy(refined_selected_methods[project])

  random.shuffle(training_set)
  random.shuffle(test_set)

  # 'Detected' is compared against both the bool and its string form.
  training_set_true = []
  training_set_false = []
  for test_method in training_set:
    if test_method['Detected'] == True or test_method['Detected'] == 'True':
      training_set_true.append(test_method)
    else:
      training_set_false.append(test_method)

  training_set_true_size = len(training_set_true)
  training_set_false_size = len(training_set_false)

  print(f"Test Project - {test_project}: Training set size: {len(training_set)}, Test set size: {len(test_set)}")
  print(f"Training set true size: {training_set_true_size}, Training set false size: {training_set_false_size}")

  # Balance the two classes by oversampling the smaller one up to the size of
  # the larger one.
  if training_set_true_size == training_set_false_size:
    balanced_training_set = training_set
  elif min(training_set_true_size, training_set_false_size) == 0:
    # One class is absent, so there is nothing to replicate. The previous
    # implementation looped forever here; keep the set as-is and say so.
    print("Warning: one class is empty; training set left unbalanced")
    balanced_training_set = training_set
  else:
    if training_set_true_size > training_set_false_size:
      training_set_false_new = _oversample(training_set_false, training_set_true_size)
      balanced_training_set = training_set_true + training_set_false_new
      balanced_true_size = training_set_true_size
      balanced_false_size = len(training_set_false_new)
    else:
      training_set_true_new = _oversample(training_set_true, training_set_false_size)
      balanced_training_set = training_set_true_new + training_set_false
      balanced_true_size = len(training_set_true_new)
      balanced_false_size = training_set_false_size
    print(f"Balanced training set size: {len(balanced_training_set)}")
    print(f"Balanced training set true size: {balanced_true_size}, Balanced training set false size: {balanced_false_size}")

  # A single shuffle mixes original and replicated entries.
  random.shuffle(balanced_training_set)

  # Make sure the output directory exists so this cell does not depend on the
  # directory-creation cell having been run exactly once beforehand.
  os.makedirs(f"data/{test_project}", exist_ok=True)

  training_messages_list = [_to_chat_example(method) for method in balanced_training_set]
  _write_jsonl(f"data/{test_project}/training_set.jsonl", training_messages_list)

  test_messages_list = [_to_chat_example(method) for method in test_set]
  _write_jsonl(f"data/{test_project}/test_set.jsonl", test_messages_list)

  print("------------------------")

Test Project - Butter.MAS.PythonAPI: Training set size: 1562, Test set size: 64
Training set true size: 477, Training set false size: 1085
Balanced training set size: 2170
Balanced training set true size: 1085, Balanced training set false size: 1085
------------------------
Test Project - flask-multi-redis: Training set size: 1593, Test set size: 33
Training set true size: 504, Training set false size: 1089
Balanced training set size: 2178
Balanced training set true size: 1089, Balanced training set false size: 1089
------------------------
Test Project - centreon-sdk-python: Training set size: 1584, Test set size: 42
Training set true size: 489, Training set false size: 1095
Balanced training set size: 2190
Balanced training set true size: 1095, Balanced training set false size: 1095
------------------------
Test Project - cloudnetpy: Training set size: 1329, Test set size: 297
Training set true size: 479, Training set false size: 850
Balanced training set size: 1700
Balanced training

In [None]:
# Recursively archive the generated data directory into /content/data.zip.
# NOTE(review): the cells above write to the relative path "data/", so this
# assumes the notebook's working directory is /content -- confirm before
# running outside that environment.
!zip -r /content/data.zip /content/data

  adding: content/data/ (stored 0%)
  adding: content/data/pykicad/ (stored 0%)
  adding: content/data/pykicad/test_set.jsonl (deflated 89%)
  adding: content/data/pykicad/training_set.jsonl (deflated 83%)
  adding: content/data/centreon-sdk-python/ (stored 0%)
  adding: content/data/centreon-sdk-python/test_set.jsonl (deflated 93%)
  adding: content/data/centreon-sdk-python/training_set.jsonl (deflated 83%)
  adding: content/data/eppy/ (stored 0%)
  adding: content/data/eppy/test_set.jsonl (deflated 85%)
  adding: content/data/eppy/training_set.jsonl (deflated 85%)
  adding: content/data/reframe/ (stored 0%)
  adding: content/data/reframe/test_set.jsonl (deflated 85%)
  adding: content/data/reframe/training_set.jsonl (deflated 84%)
  adding: content/data/cloudnetpy/ (stored 0%)
  adding: content/data/cloudnetpy/test_set.jsonl (deflated 88%)
  adding: content/data/cloudnetpy/training_set.jsonl (deflated 83%)
  adding: content/data/flask-multi-redis/ (stored 0%)
  adding: content/data/f