KNN

In [1]:
import os
import time
import zipfile
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Unpack every member of the archive ``zip_file`` into ``extract_to``."""
    archive = zipfile.ZipFile(zip_file, 'r')
    try:
        archive.extractall(extract_to)
    finally:
        # Release the archive handle whether or not extraction succeeded.
        archive.close()

def getDataPoints(path):
    """Collect the text of every non-empty ``.py`` file found under ``path``.

    The tree is walked recursively. Directories and file names are visited
    in sorted order so the returned list is the same on every platform and
    run; ``os.walk`` on its own yields entries in arbitrary filesystem
    order, which would make any later index-based split of the documents
    irreproducible.

    Args:
        path: Root directory to scan.

    Returns:
        A list with the whitespace-stripped contents of each non-empty
        ``.py`` file. The list is empty when ``path`` does not exist or
        holds no usable file; a diagnostic is printed in both cases.

    Raises:
        UnicodeDecodeError: If a ``.py`` file is not valid UTF-8.
    """
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        # Sorting ``dirs`` in place fixes the order in which os.walk descends.
        dirs.sort()
        for dataPointName in sorted(files):
            if not dataPointName.endswith(".py"):
                continue
            file_path = os.path.join(root, dataPointName)
            with open(file_path, encoding="utf-8") as fileIn:
                dp = fileIn.read().strip()
            if dp:  # Whitespace-only files carry no tokens; skip them.
                dataPointsList.append(dp)
            else:
                print(f"Empty file: {file_path}")

    if not dataPointsList:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList

def flastVectorization(dataPoints):
    """Return the bag-of-words count matrix of ``dataPoints``.

    The vocabulary is fitted on the very documents being transformed, and
    no stop words are filtered out.
    """
    vectorizer = CountVectorizer(stop_words=None)
    return vectorizer.fit_transform(dataPoints)

###############################################################################
# KNN with GridSearchCV and Multiple Scoring Metrics

def flastKNNWithGridSearchCV(outDir, flakyZip, nonFlakyZip, extractDir, n_splits):
    """Grid-search a k-nearest-neighbours classifier on the flaky/non-flaky corpus.

    Both archives are unpacked under ``extractDir``, every ``.py`` file is
    turned into a bag-of-words vector, and each hyperparameter combination
    is scored with stratified ``n_splits``-fold cross-validation. The mean
    test metrics of every combination are written to
    ``<outDir>/params-knn-<n_splits>-folds.csv``.

    Args:
        outDir: Directory that receives the CSV report (created if missing).
        flakyZip: Path to the zip archive holding the flaky test files.
        nonFlakyZip: Path to the zip archive holding the non-flaky test files.
        extractDir: Directory under which ``flaky/`` and ``nonFlaky/`` are
            created and the archives are extracted.
        n_splits: Number of stratified cross-validation folds.

    Returns:
        A ``(best_params, best_score)`` tuple: the parameter combination
        with the highest mean test accuracy, and that accuracy.

    Raises:
        ValueError: If the two archives together yield no documents.
    """
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)

    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")

    if not dataPoints:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorization; labels follow the concatenation order (flaky = 1 first).
    Z = flastVectorization(dataPoints)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define the KNN model
    knn = KNeighborsClassifier()

    # Define parameter grid for hyperparameter tuning
    param_grid = {
        'n_neighbors': [3, 5, 7],  # Hyperparameter for k in KNN
        'metric': ['cosine', 'euclidean'],  # Distance metrics
    }

    # zero_division=0 scores a fold without positive predictions as 0
    # instead of emitting a warning, matching the SVM experiment.
    scoring = {
        'precision': make_scorer(precision_score, zero_division=0),
        'recall': make_scorer(recall_score, zero_division=0),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=0)
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV with multiple scoring metrics; accuracy picks the winner.
    grid_search = GridSearchCV(knn, param_grid, cv=skf, scoring=scoring, refit='accuracy', verbose=1, return_train_score=True)
    grid_search.fit(Z, dataLabelsList)

    # Get the best parameters and the best score for accuracy
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best Accuracy Score: {best_score}")

    # Save the results
    outFile = f"params-knn-{n_splits}-folds.csv"
    outPath = os.path.join(outDir, outFile)
    if outDir:
        # Do not rely on the caller having created the report directory.
        os.makedirs(outDir, exist_ok=True)

    results = grid_search.cv_results_
    # Per-document share of the extraction, loading and vectorization time;
    # it does not depend on the hyperparameters, so compute it once.
    preparationTime = vecTime / len(dataPoints)
    rows = zip(
        results['params'],
        results['mean_test_accuracy'],
        results['mean_test_precision'],
        results['mean_test_recall'],
        results['mean_test_f1'],
    )
    with open(outPath, "w") as fo:
        fo.write("n_neighbors,metric,accuracy,precision,recall,f1,preparationTime\n")
        for param, accuracy, precision, recall, f1 in rows:
            fo.write(f"{param['n_neighbors']},{param['metric']},{accuracy},{precision},{recall},{f1},{preparationTime}\n")

    # Report the full path so the message points at the file actually written.
    print(f"KNN analysis completed for {n_splits}-folds. Results saved to: {outPath}")
    return best_params, best_score

if __name__ == "__main__":
    # Parameters setup. The archives are addressed relative to the working
    # directory, like the other experiments in this notebook, rather than
    # through an absolute path that exists on a single machine only.
    flakyZip = "compressedDataset/cleaned_flaky_files.zip"
    nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
    extractDir = "extracted"
    outDir = "results/"
    os.makedirs(outDir, exist_ok=True)
    os.makedirs(extractDir, exist_ok=True)

    # Perform KNN analysis for 5 folds and 3 folds
    print("Starting 5-fold analysis...")
    best_params_5folds, best_score_5folds = flastKNNWithGridSearchCV(outDir, flakyZip, nonFlakyZip, extractDir, 5)

    print("Starting 3-fold analysis...")
    best_params_3folds, best_score_3folds = flastKNNWithGridSearchCV(outDir, flakyZip, nonFlakyZip, extractDir, 3)

    # Recap both runs once all searches have finished.
    print("Best results for 5-fold:")
    print(f"Best Parameters: {best_params_5folds}")
    print(f"Best Accuracy Score: {best_score_5folds}")

    print("Best results for 3-fold:")
    print(f"Best Parameters: {best_params_3folds}")
    print(f"Best Accuracy Score: {best_score_3folds}")


Starting 5-fold analysis...
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Sample document: # -*- coding: utf-8 -*-

# This code is part of Qiskit.
#
# (C) Copyright IBM 2020.
#
# This code is licensed under the Apache License, Version 2.0. You may
# obtain a copy of this license in the LICENSE.txt file in the root directory
# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
#
# Any modifications or derivative works of this code must retain this
# copyright notice, and modified files need to carry a notice indicating
# that they have been altered from the originals.

"""General utility functions for testing."""

from qiskit import QuantumCircuit
from qiskit.providers.ibmq.ibmqbackend import IBMQBackend


def most_busy_backend(provider):
    """Return the most busy backend for the provider given.

    Return the most busy available backend for those that
    have a `pending_jobs` in their `status`. Backends such as
    

SVM

In [2]:
import os
import time
import zipfile
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Unpack the whole zip archive ``zip_file`` into the ``extract_to`` directory."""
    archive = zipfile.ZipFile(zip_file, 'r')
    try:
        archive.extractall(extract_to)
    finally:
        # Close the archive even when extraction raises.
        archive.close()

def getDataPoints(path):
    """Collect the text of every non-empty ``.py`` file found under ``path``.

    The tree is walked recursively. Directories and file names are visited
    in sorted order so the returned list is the same on every platform and
    run; ``os.walk`` on its own yields entries in arbitrary filesystem
    order, which would make any later index-based split of the documents
    irreproducible.

    Args:
        path: Root directory to scan.

    Returns:
        A list with the whitespace-stripped contents of each non-empty
        ``.py`` file. The list is empty when ``path`` does not exist or
        holds no usable file; a diagnostic is printed in both cases.

    Raises:
        UnicodeDecodeError: If a ``.py`` file is not valid UTF-8.
    """
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        # Sorting ``dirs`` in place fixes the order in which os.walk descends.
        dirs.sort()
        for dataPointName in sorted(files):
            if not dataPointName.endswith(".py"):
                continue
            file_path = os.path.join(root, dataPointName)
            with open(file_path, encoding="utf-8") as fileIn:
                dp = fileIn.read().strip()
            if dp:  # Whitespace-only files carry no tokens; skip them.
                dataPointsList.append(dp)
            else:
                print(f"Empty file: {file_path}")

    if not dataPointsList:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList

def flastVectorization(dataPoints):
    """Turn ``dataPoints`` into a token-count matrix.

    The vocabulary is learned from the same documents that are transformed;
    stop-word filtering is disabled.
    """
    vectorizer = CountVectorizer(stop_words=None)
    return vectorizer.fit_transform(dataPoints)

###############################################################################
# SVM with GridSearchCV and Multiple Scoring Metrics

def flastSVMWithGridSearchCV(outDir, flakyZip, nonFlakyZip, extractDir, n_splits):
    """Grid-search a support-vector classifier on the flaky/non-flaky corpus.

    Both archives are unpacked under ``extractDir``, every ``.py`` file is
    turned into a bag-of-words vector, and each hyperparameter combination
    is scored with stratified ``n_splits``-fold cross-validation. The mean
    test metrics of every combination are written to
    ``<outDir>/params-svm-<n_splits>-folds.csv``.

    Args:
        outDir: Directory that receives the CSV report (created if missing).
        flakyZip: Path to the zip archive holding the flaky test files.
        nonFlakyZip: Path to the zip archive holding the non-flaky test files.
        extractDir: Directory under which ``flaky/`` and ``nonFlaky/`` are
            created and the archives are extracted.
        n_splits: Number of stratified cross-validation folds.

    Returns:
        A ``(best_params, best_score)`` tuple: the parameter combination
        with the highest mean test accuracy, and that accuracy.

    Raises:
        ValueError: If the two archives together yield no documents.
    """
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)

    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")

    if not dataPoints:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorization; labels follow the concatenation order (flaky = 1 first).
    Z = flastVectorization(dataPoints)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define the SVM model
    svm = SVC()

    # Define parameter grid for hyperparameter tuning
    param_grid = {
        'C': [0.1, 1.0, 10.0],  # Regularization parameter
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid']  # Kernel types
    }

    # zero_division=0 scores a fold without positive predictions as 0
    # instead of emitting a warning.
    scoring = {
        'precision': make_scorer(precision_score, zero_division=0),
        'recall': make_scorer(recall_score, zero_division=0),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=0)
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV with multiple scoring metrics; accuracy picks the winner.
    grid_search = GridSearchCV(svm, param_grid, cv=skf, scoring=scoring, refit='accuracy', verbose=1, return_train_score=True)
    grid_search.fit(Z, dataLabelsList)

    # Get the best parameters and the best score for accuracy
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best Accuracy Score: {best_score}")

    # Save the results
    outFile = f"params-svm-{n_splits}-folds.csv"
    outPath = os.path.join(outDir, outFile)
    if outDir:
        # Do not rely on the caller having created the report directory.
        os.makedirs(outDir, exist_ok=True)

    results = grid_search.cv_results_
    # Per-document share of the extraction, loading and vectorization time;
    # it does not depend on the hyperparameters, so compute it once.
    preparationTime = vecTime / len(dataPoints)
    rows = zip(
        results['params'],
        results['mean_test_accuracy'],
        results['mean_test_precision'],
        results['mean_test_recall'],
        results['mean_test_f1'],
    )
    with open(outPath, "w") as fo:
        fo.write("C,kernel,accuracy,precision,recall,f1,preparationTime\n")
        for param, accuracy, precision, recall, f1 in rows:
            fo.write(f"{param['C']},{param['kernel']},{accuracy},{precision},{recall},{f1},{preparationTime}\n")

    # Report the full path so the message points at the file actually written.
    print(f"SVM analysis completed for {n_splits}-folds. Results saved to: {outPath}")
    return best_params, best_score

if __name__ == "__main__":
    # Input archives, scratch directory and report directory.
    flakyZip = "compressedDataset/cleaned_flaky_files.zip"
    nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
    extractDir = "extracted"
    outDir = "results/"
    for workDir in (outDir, extractDir):
        os.makedirs(workDir, exist_ok=True)

    # Arguments shared by both runs; only the fold count differs.
    commonArgs = (outDir, flakyZip, nonFlakyZip, extractDir)

    print("Starting 5-fold analysis...")
    best_params_5folds, best_score_5folds = flastSVMWithGridSearchCV(*commonArgs, 5)

    print("Starting 3-fold analysis...")
    best_params_3folds, best_score_3folds = flastSVMWithGridSearchCV(*commonArgs, 3)

    # Recap both runs once all searches have finished.
    recap = (
        ("5-fold", best_params_5folds, best_score_5folds),
        ("3-fold", best_params_3folds, best_score_3folds),
    )
    for foldLabel, foldParams, foldScore in recap:
        print(f"Best results for {foldLabel}:")
        print(f"Best Parameters: {foldParams}")
        print(f"Best Accuracy Score: {foldScore}")


Starting 5-fold analysis...
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'svm__C': 10.0, 'svm__kernel': 'rbf'}
Best Accuracy Score: 0.7444444444444445
SVM analysis completed for 5-folds. Results saved to: params-svm-5-folds.csv
Starting 3-fold analysis...
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best Parameters: {'svm__C': 10.0, 'svm__kernel': 'rbf'}
Best Accuracy Score: 0.7234543010752689
SVM analysis completed for 3-folds. Results saved to: params-svm-3-folds.csv
Best results for 5-fold:
Best Parameters: {'svm__C': 10.0, 'svm__kernel': 'rbf'}
Best Accuracy Score: 0.7444444444444445
Best results for 3-fold:
Best Parameters: {'svm__C': 10.0, 'svm__kernel': 'rbf'}
Best Accuracy Score: 0.7234543010752689


NB

In [5]:
import os
import time
import zipfile
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extract all members of ``zip_file`` beneath the directory ``extract_to``."""
    archive = zipfile.ZipFile(zip_file, 'r')
    try:
        archive.extractall(extract_to)
    finally:
        # The handle is released on success and on failure alike.
        archive.close()

def getDataPoints(path):
    """Collect the text of every non-empty ``.py`` file found under ``path``.

    The tree is walked recursively. Directories and file names are visited
    in sorted order so the returned list is the same on every platform and
    run; ``os.walk`` on its own yields entries in arbitrary filesystem
    order, which would make any later index-based split of the documents
    irreproducible.

    Args:
        path: Root directory to scan.

    Returns:
        A list with the whitespace-stripped contents of each non-empty
        ``.py`` file. The list is empty when ``path`` does not exist or
        holds no usable file; a diagnostic is printed in both cases.

    Raises:
        UnicodeDecodeError: If a ``.py`` file is not valid UTF-8.
    """
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        # Sorting ``dirs`` in place fixes the order in which os.walk descends.
        dirs.sort()
        for dataPointName in sorted(files):
            if not dataPointName.endswith(".py"):  # Only consider Python files
                continue
            file_path = os.path.join(root, dataPointName)
            with open(file_path, encoding="utf-8") as fileIn:
                dp = fileIn.read().strip()
            if dp:  # Whitespace-only files carry no tokens; skip them.
                dataPointsList.append(dp)
            else:
                print(f"Empty file: {file_path}")

    if not dataPointsList:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList

def flastVectorization(dataPoints):
    """Return raw token counts for ``dataPoints`` with no dimensionality reduction.

    The vocabulary is fitted on the documents being transformed, and
    stop-word filtering is disabled.
    """
    vectorizer = CountVectorizer(stop_words=None)
    return vectorizer.fit_transform(dataPoints)

###############################################################################
# Naive Bayes with GridSearchCV and Multiple Scoring Metrics

def flastNBWithGridSearchCV(outDir, flakyZip, nonFlakyZip, extractDir, n_splits):
    """Grid-search a multinomial Naive Bayes classifier on the flaky/non-flaky corpus.

    Both archives are unpacked under ``extractDir``, every ``.py`` file is
    turned into a bag-of-words vector, and each ``alpha`` value is scored
    with stratified ``n_splits``-fold cross-validation. The mean test
    metrics of every candidate are written to
    ``<outDir>/params-nb-<n_splits>-folds.csv``.

    Args:
        outDir: Directory that receives the CSV report (created if missing).
        flakyZip: Path to the zip archive holding the flaky test files.
        nonFlakyZip: Path to the zip archive holding the non-flaky test files.
        extractDir: Directory under which ``flaky/`` and ``nonFlaky/`` are
            created and the archives are extracted.
        n_splits: Number of stratified cross-validation folds.

    Returns:
        A ``(best_params, best_score)`` tuple: the parameter combination
        with the highest mean test accuracy, and that accuracy.

    Raises:
        ValueError: If the two archives together yield no documents.
    """
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)

    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")

    if not dataPoints:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorization without Random Projection; labels follow the
    # concatenation order (flaky = 1 first).
    Z = flastVectorization(dataPoints)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define the Naive Bayes model
    nb = MultinomialNB()

    # Define parameter grid for hyperparameter tuning
    param_grid = {
        'alpha': [0.01, 0.1, 1.0, 10.0],  # Laplace smoothing parameter
    }

    # zero_division=0 scores a fold without positive predictions as 0
    # instead of emitting a warning, matching the SVM experiment.
    scoring = {
        'precision': make_scorer(precision_score, zero_division=0),
        'recall': make_scorer(recall_score, zero_division=0),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=0)
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV with multiple scoring metrics; accuracy picks the winner.
    grid_search = GridSearchCV(nb, param_grid, cv=skf, scoring=scoring, refit='accuracy', verbose=1, return_train_score=True)
    grid_search.fit(Z, dataLabelsList)

    # Get the best parameters and the best score for accuracy
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best Accuracy Score: {best_score}")

    # Save the results
    outFile = f"params-nb-{n_splits}-folds.csv"
    outPath = os.path.join(outDir, outFile)
    if outDir:
        # Do not rely on the caller having created the report directory.
        os.makedirs(outDir, exist_ok=True)

    results = grid_search.cv_results_
    # Per-document share of the extraction, loading and vectorization time;
    # it does not depend on the hyperparameters, so compute it once.
    preparationTime = vecTime / len(dataPoints)
    rows = zip(
        results['params'],
        results['mean_test_accuracy'],
        results['mean_test_precision'],
        results['mean_test_recall'],
        results['mean_test_f1'],
    )
    with open(outPath, "w") as fo:
        fo.write("alpha,accuracy,precision,recall,f1,preparationTime\n")
        for param, accuracy, precision, recall, f1 in rows:
            fo.write(f"{param['alpha']},{accuracy},{precision},{recall},{f1},{preparationTime}\n")

    # Report the full path so the message points at the file actually written.
    print(f"Naive Bayes analysis completed for {n_splits}-folds. Results saved to: {outPath}")
    return best_params, best_score

if __name__ == "__main__":
    # Input archives, scratch directory and report directory.
    flakyZip = "compressedDataset/cleaned_flaky_files.zip"
    nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
    extractDir = "extracted"
    outDir = "results/"
    for workDir in (outDir, extractDir):
        os.makedirs(workDir, exist_ok=True)

    # Arguments shared by both runs; only the fold count differs.
    commonArgs = (outDir, flakyZip, nonFlakyZip, extractDir)

    print("Starting 5-fold analysis...")
    best_params_5folds, best_score_5folds = flastNBWithGridSearchCV(*commonArgs, 5)

    print("Starting 3-fold analysis...")
    best_params_3folds, best_score_3folds = flastNBWithGridSearchCV(*commonArgs, 3)

    # Recap both runs once all searches have finished.
    recap = (
        ("5-fold", best_params_5folds, best_score_5folds),
        ("3-fold", best_params_3folds, best_score_3folds),
    )
    for foldLabel, foldParams, foldScore in recap:
        print(f"Best results for {foldLabel}:")
        print(f"Best Parameters: {foldParams}")
        print(f"Best Accuracy Score: {foldScore}")


Starting 5-fold analysis...
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Parameters: {'nb__alpha': 0.01}
Best Accuracy Score: 0.7760233918128655
Naive Bayes analysis completed for 5-folds. Results saved to: params-nb-5-folds.csv
Starting 3-fold analysis...
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best Parameters: {'nb__alpha': 0.1}
Best Accuracy Score: 0.7345430107526881
Naive Bayes analysis completed for 3-folds. Results saved to: params-nb-3-folds.csv
Best results for 5-fold:
Best Parameters: {'nb__alpha': 0.01}
Best Accuracy Score: 0.7760233918128655
Best results for 3-fold:
Best Parameters: {'nb__alpha': 0.1}
Best Accuracy Score: 0.7345430107526881


XGBoost

In [1]:
import os
import time
import warnings
import pickle
import numpy as np
import zipfile

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.random_projection import johnson_lindenstrauss_min_dim
from sklearn.random_projection import SparseRandomProjection
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.exceptions import UndefinedMetricWarning
from itertools import product
from sklearn.model_selection import StratifiedKFold, GridSearchCV

from xgboost import XGBClassifier  # Import XGBoost Classifier

import csv

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

###############################################################################
# read data from file

def extract_zip(zip_file, extract_to):
    """Unpack the zip archive at ``zip_file`` into the directory ``extract_to``."""
    archive = zipfile.ZipFile(zip_file, 'r')
    try:
        archive.extractall(extract_to)
    finally:
        # Always close the archive, including when extraction fails.
        archive.close()
        
def getDataPoints(path):
    """Collect the text of every non-empty ``.py`` file found under ``path``.

    The tree is walked recursively. Directories and file names are visited
    in sorted order so the returned list is the same on every platform and
    run; ``os.walk`` on its own yields entries in arbitrary filesystem
    order, which would make any later index-based split of the documents
    irreproducible.

    Args:
        path: Root directory to scan.

    Returns:
        A list with the whitespace-stripped contents of each non-empty
        ``.py`` file. The list is empty when ``path`` does not exist or
        holds no usable file; a diagnostic is printed in both cases.

    Raises:
        UnicodeDecodeError: If a ``.py`` file is not valid UTF-8.
    """
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        # Sorting ``dirs`` in place fixes the order in which os.walk descends.
        dirs.sort()
        for dataPointName in sorted(files):
            if not dataPointName.endswith(".py"):  # Only consider Python files
                continue
            file_path = os.path.join(root, dataPointName)
            with open(file_path, encoding="utf-8") as fileIn:
                dp = fileIn.read().strip()
            if dp:  # Whitespace-only files carry no tokens; skip them.
                dataPointsList.append(dp)
            else:
                print(f"Empty file: {file_path}")

    if not dataPointsList:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList

def vectorization(dataPoints, dim=0, eps=0.3, random_state=42):
    """Vectorize documents as token counts, optionally randomly projected.

    Args:
        dataPoints: Iterable of document strings.
        dim: Target dimensionality of the random projection. When ``<= 0``
            it is derived from the number of documents and ``eps`` via
            ``johnson_lindenstrauss_min_dim``.
        eps: Distortion tolerance of the projection. ``0`` disables the
            projection and returns the raw count matrix.
        random_state: Seed for the random projection matrix. The default
            of 42 matches the seed used by the rest of the experiment so
            repeated runs produce the same features; pass ``None`` for a
            fresh random projection on every call.

    Returns:
        The count matrix, or its random projection when ``eps != 0``.
    """
    countVec = CountVectorizer(stop_words=None)  # Disable stop words filtering
    Z_full = countVec.fit_transform(dataPoints)
    if eps == 0:
        return Z_full

    if dim <= 0:
        dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
    # Seeded so the projected features do not change between runs.
    srp = SparseRandomProjection(n_components=dim, random_state=random_state)
    return srp.fit_transform(Z_full)

def xgboostWithGridSearch(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, dim, eps):
    """Grid-search an XGBoost classifier on the flaky/non-flaky corpus.

    Both archives are unpacked under ``extractDir``, every ``.py`` file is
    vectorized (token counts, optionally randomly projected), and each
    hyperparameter combination is scored with stratified ``n_splits``-fold
    cross-validation. The mean test metrics of every combination are
    written to ``<outDir>/params-xgb-<n_splits>-folds.csv``.

    Args:
        outDir: Directory that receives the CSV report (created if missing).
        flakyZip: Path to the zip archive holding the flaky test files.
        nonFlakyZip: Path to the zip archive holding the non-flaky test files.
        extractDir: Directory under which ``flaky/`` and ``nonFlaky/`` are
            created and the archives are extracted.
        n_splits: Number of stratified cross-validation folds.
        dim: Projection dimensionality forwarded to ``vectorization``.
        eps: Projection tolerance forwarded to ``vectorization``.

    Returns:
        A ``(best_params, best_score)`` tuple: the parameter combination
        with the highest mean test accuracy, and that accuracy.

    Raises:
        ValueError: If the two archives together yield no documents.
    """
    v0 = time.perf_counter()

    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)

    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")

    if not dataPoints:
        raise ValueError("No documents available for vectorization. Please check the input directories.")
    print(f"Sample document: {dataPoints[0]}")

    # Vectorization
    Z = vectorization(dataPoints, dim=dim, eps=eps)
    # Densify in a single call; the former row-by-row conversion produced
    # the same (n_documents, n_features) array after reshaping.
    dataPointsList = Z.toarray()
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    classifier = XGBClassifier(random_state=42)

    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1],
        'gamma': [0, 0.1],
        'subsample': [0.8, 1],
        'colsample_bytree': [0.8, 1],
        'min_child_weight': [1, 3]
    }

    # zero_division=0 scores a fold without positive predictions as 0
    # instead of emitting a warning, matching the SVM experiment.
    scoring = {
        'precision': make_scorer(precision_score, zero_division=0),
        'recall': make_scorer(recall_score, zero_division=0),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=0)
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV with multiple scoring metrics; accuracy picks the winner.
    grid_search = GridSearchCV(classifier, param_grid, cv=skf, scoring=scoring, refit='accuracy', verbose=1, return_train_score=True)
    grid_search.fit(dataPointsList, dataLabelsList)

    # Get the best parameters and the best score for accuracy
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best Accuracy Score: {best_score}")

    # Save the results
    outFile = f"params-xgb-{n_splits}-folds.csv"
    outPath = os.path.join(outDir, outFile)
    if outDir:
        # Do not rely on the caller having created the report directory.
        os.makedirs(outDir, exist_ok=True)

    results = grid_search.cv_results_
    # Per-document share of the extraction, loading and vectorization time;
    # it does not depend on the hyperparameters, so compute it once.
    preparationTime = vecTime / len(dataPoints)
    paramColumns = (
        'n_estimators', 'max_depth', 'learning_rate', 'gamma',
        'subsample', 'colsample_bytree', 'min_child_weight',
    )
    rows = zip(
        results['params'],
        results['mean_test_accuracy'],
        results['mean_test_precision'],
        results['mean_test_recall'],
        results['mean_test_f1'],
    )
    with open(outPath, "w") as fo:
        fo.write("n_estimators,max_depth,learning_rate,gamma,subsample,colsample_bytree,min_child_weight,accuracy,precision,recall,f1,preparationTime\n")
        for param, accuracy, precision, recall, f1 in rows:
            # A parameter absent from a candidate is written as '-'.
            paramCells = ",".join(str(param.get(name, '-')) for name in paramColumns)
            fo.write(f"{paramCells},{accuracy},{precision},{recall},{f1},{preparationTime}\n")

    # Report the full path so the message points at the file actually written.
    print(f"XGBoost analysis completed for {n_splits}-folds. Results saved to: {outPath}")
    return best_params, best_score

if __name__ == "__main__":
    # Input archives, scratch directory and report directory.
    flakyZip = "compressedDataset/cleaned_flaky_files.zip"
    nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
    extractDir = "extracted"
    outDir = "results/results_XGBoost"
    for workDir in (outDir, extractDir):
        os.makedirs(workDir, exist_ok=True)

    # Random-projection settings shared by both runs.
    dim = 100  # Example value for JL dimensionality reduction
    eps = 0.3  # JL epsilon

    # Leading arguments shared by both runs; only the fold count differs.
    commonArgs = (outDir, flakyZip, nonFlakyZip, extractDir)

    print("Starting 5-fold analysis with XGBoost...")
    best_params_5folds, best_score_5folds = xgboostWithGridSearch(*commonArgs, 5, dim, eps)

    print("Starting 3-fold analysis with XGBoost...")
    best_params_3folds, best_score_3folds = xgboostWithGridSearch(*commonArgs, 3, dim, eps)

    # Recap both runs once all searches have finished.
    recap = (
        ("5-fold", best_params_5folds, best_score_5folds),
        ("3-fold", best_params_3folds, best_score_3folds),
    )
    for foldLabel, foldParams, foldScore in recap:
        print(f"Best results for {foldLabel}:")
        print(f"Best Parameters: {foldParams}")
        print(f"Best Accuracy Score: {foldScore}")


Starting 5-fold analysis with XGBoost...
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Sample document: # Copyright 2021 The NetKet Authors - All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import dataclasses
import abc

from flax.core import freeze


@dataclasses.dataclass(frozen=True)
class ModuleFramework(abc.ABC):
    @staticmethod
    @abc.abstractmethod
    def is_loaded() -> bool:
        pass

    @staticmethod
    @abc.a

Random Forest

In [2]:
import os
import time
import zipfile
import pickle
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.random_projection import johnson_lindenstrauss_min_dim, SparseRandomProjection
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extract the complete contents of ``zip_file`` into ``extract_to``."""
    archive = zipfile.ZipFile(zip_file, 'r')
    try:
        archive.extractall(extract_to)
    finally:
        # Close the archive no matter how extraction ends.
        archive.close()

def getDataPoints(path):
    """Collect the text of every non-empty ``.py`` file found under ``path``.

    The tree is walked recursively. Directories and file names are visited
    in sorted order so the returned list is the same on every platform and
    run; ``os.walk`` on its own yields entries in arbitrary filesystem
    order, which would make any later index-based split of the documents
    irreproducible.

    Args:
        path: Root directory to scan.

    Returns:
        A list with the whitespace-stripped contents of each non-empty
        ``.py`` file. The list is empty when ``path`` does not exist or
        holds no usable file; a diagnostic is printed in both cases.

    Raises:
        UnicodeDecodeError: If a ``.py`` file is not valid UTF-8.
    """
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        # Sorting ``dirs`` in place fixes the order in which os.walk descends.
        dirs.sort()
        for dataPointName in sorted(files):
            if not dataPointName.endswith(".py"):  # Only consider Python files
                continue
            file_path = os.path.join(root, dataPointName)
            with open(file_path, encoding="utf-8") as fileIn:
                dp = fileIn.read().strip()
            if dp:  # Whitespace-only files carry no tokens; skip them.
                dataPointsList.append(dp)
            else:
                print(f"Empty file: {file_path}")

    if not dataPointsList:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList

def flastVectorization(dataPoints, dim=0, eps=0.3, random_state=42):
    """Vectorize documents as token counts, optionally randomly projected.

    Args:
        dataPoints: Iterable of document strings.
        dim: Target dimensionality of the random projection. When ``<= 0``
            it is derived from the number of documents and ``eps`` via
            ``johnson_lindenstrauss_min_dim``.
        eps: Distortion tolerance of the projection. ``0`` disables the
            projection and returns the raw count matrix.
        random_state: Seed for the random projection matrix. The default
            of 42 matches the seed used by the rest of the experiment so
            repeated runs produce the same features; pass ``None`` for a
            fresh random projection on every call.

    Returns:
        The count matrix, or its random projection when ``eps != 0``.
    """
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z_full = countVec.fit_transform(dataPoints)
    if eps == 0:
        return Z_full

    if dim <= 0:
        dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
    # Seeded so the projected features do not change between runs.
    srp = SparseRandomProjection(n_components=dim, random_state=random_state)
    return srp.fit_transform(Z_full)

###############################################################################
# Random Forest with GridSearchCV and Multiple Scoring Metrics

def flastRandomForestWithGridSearchCV(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, dim, eps):
    """Grid-search a Random Forest on flaky vs. non-flaky files and log all results.

    Both archives are extracted below ``extractDir``, every ``.py`` file is
    vectorized, and a ``GridSearchCV`` over a fixed parameter grid is run
    with stratified, shuffled cross-validation. The mean test metrics of
    every parameter combination are written to
    ``<outDir>/params-rf-<n_splits>-folds.csv``.

    NOTE(review): the archives are extracted on top of whatever already
    exists in ``extractDir``; files left over from an earlier run are not
    removed and will be read as data points too.

    Args:
        outDir: Directory receiving the CSV file (created if missing).
        flakyZip: Path of the zip archive holding the flaky (label 1) files.
        nonFlakyZip: Path of the zip archive holding the non-flaky (label 0) files.
        extractDir: Directory under which ``flaky/`` and ``nonFlaky/`` are extracted.
        n_splits: Number of cross-validation folds.
        dim: Projection dimensionality forwarded to ``flastVectorization``.
        eps: JL epsilon forwarded to ``flastVectorization``.

    Returns:
        Tuple ``(best_params, best_score)`` where the score is the best mean
        cross-validated accuracy (``refit='accuracy'``).

    Raises:
        ValueError: If no document was found in either extracted archive.
    """
    v0 = time.perf_counter()

    # Extract the zip files into one sub-directory per class
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)

    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")

    if not dataPoints:
        raise ValueError("No documents available for vectorization. Please check the input directories.")
    print(f"Sample document: {dataPoints[0]}")

    # Vectorization
    Z = flastVectorization(dataPoints, dim=dim, eps=eps)
    # Densify once into a 2-D (documents x features) array instead of
    # converting row by row and reshaping afterwards.
    dataPointsList = Z.toarray() if hasattr(Z, "toarray") else np.asarray(Z)
    # Labels follow the concatenation order above: flaky first, then non-flaky.
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define the Random Forest classifier
    classifier = RandomForestClassifier(random_state=42)

    # Parameter grid; keys are plain estimator parameters (no pipeline prefix)
    param_grid = {
        "n_estimators": [50, 100, 200],
        "criterion": ["gini", "entropy"],
        "max_depth": [10, 30, 50, 100, 300, 500],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2]
    }

    # Scorers evaluated for every parameter combination
    scoring = {
        'precision': make_scorer(precision_score),
        'recall': make_scorer(recall_score),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score)
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV with multiple scoring metrics; the model is
    # refit on, and best_score_ refers to, accuracy.
    grid_search = GridSearchCV(
        classifier,
        param_grid,
        cv=skf,
        scoring=scoring,
        refit='accuracy',
        verbose=1,
        return_train_score=True
    )

    grid_search.fit(dataPointsList, dataLabelsList)

    # Get the best parameters and the best score for accuracy
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best Accuracy Score: {best_score}")

    # Save the results
    if outDir:
        os.makedirs(outDir, exist_ok=True)
    outFile = f"params-rf-{n_splits}-folds.csv"
    outPath = os.path.join(outDir, outFile)
    cvResults = grid_search.cv_results_
    # Average preparation time per document; identical for every row.
    preparationTime = vecTime / len(dataPointsList)
    with open(outPath, "w") as fo:
        # min_samples_split / min_samples_leaf are part of the grid, so they
        # must be recorded to tell rows apart. They are appended after the
        # pre-existing columns so those keep their positions.
        fo.write("n_estimators,max_depth,criterion,accuracy,precision,recall,f1,preparationTime,min_samples_split,min_samples_leaf\n")
        for idx, param in enumerate(cvResults['params']):
            accuracy = cvResults['mean_test_accuracy'][idx]
            precision = cvResults['mean_test_precision'][idx]
            recall = cvResults['mean_test_recall'][idx]
            f1 = cvResults['mean_test_f1'][idx]
            fo.write(f"{param['n_estimators']},{param['max_depth']},{param['criterion']},{accuracy},{precision},{recall},{f1},{preparationTime},{param['min_samples_split']},{param['min_samples_leaf']}\n")

    print(f"Random Forest analysis completed for {n_splits}-folds. Results saved to: {outPath}")
    return best_params, best_score

# Script entry point: runs the Random Forest grid search with 5 and with 3
# cross-validation folds on the same archives and prints both best results.
if __name__ == "__main__":
    # Parameters setup (all paths are relative to the current working directory)
    flakyZip = "compressedDataset/cleaned_flaky_files.zip"
    nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
    extractDir = "extracted"
    # NOTE(review): "RandomForst" looks like a typo of "RandomForest"; it is
    # left untouched because renaming changes where results are written.
    outDir = "results/results_RandomForst"
    os.makedirs(outDir, exist_ok=True)
    os.makedirs(extractDir, exist_ok=True)

    # Perform Random Forest analysis for 5 folds and 3 folds
    dim = 100  # Example value for JL dimensionality reduction
    eps = 0.3  # JL epsilon

    print("Starting 5-fold analysis...")
    best_params_5folds, best_score_5folds = flastRandomForestWithGridSearchCV(outDir, flakyZip, nonFlakyZip, extractDir, 5, dim, eps)

    print("Starting 3-fold analysis...")
    best_params_3folds, best_score_3folds = flastRandomForestWithGridSearchCV(outDir, flakyZip, nonFlakyZip, extractDir, 3, dim, eps)

    # Summary of both runs (each run has already printed its own best result)
    print("Best results for 5-fold:")
    print(f"Best Parameters: {best_params_5folds}")
    print(f"Best Accuracy Score: {best_score_5folds}")

    print("Best results for 3-fold:")
    print(f"Best Parameters: {best_params_3folds}")
    print(f"Best Accuracy Score: {best_score_3folds}")

    # The bare string below is a no-op expression kept as a to-do list.
    '''
    Take best parameters
    Fit them inside model
    Hyper-tune
    Visualize using heatmap
    Overleaf (writing)
    '''


Starting 5-fold analysis...
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Sample document: # Copyright 2021 The NetKet Authors - All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import dataclasses
import abc

from flax.core import freeze


@dataclasses.dataclass(frozen=True)
class ModuleFramework(abc.ABC):
    @staticmethod
    @abc.abstractmethod
    def is_loaded() -> bool:
        pass

    @staticmethod
    @abc.abstractmethod

Decision Tree

In [1]:
import os
import time
import zipfile
import pickle
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.random_projection import johnson_lindenstrauss_min_dim, SparseRandomProjection
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Unpack every member of the archive ``zip_file`` into ``extract_to``.

    The archive is opened read-only and is closed again once extraction has
    finished, whether or not it succeeded.
    """
    archive = zipfile.ZipFile(zip_file, mode='r')
    try:
        archive.extractall(path=extract_to)
    finally:
        archive.close()

def getDataPoints(path):
    """Collect the stripped text of every non-empty ``.py`` file under ``path``.

    The tree is walked recursively in sorted order so the returned list is
    the same on every run; ``os.walk`` otherwise yields entries in whatever
    order the filesystem reports them.

    Args:
        path: Root directory to scan.

    Returns:
        A list of file contents with leading/trailing whitespace removed.
        Files that are empty after stripping, and files that cannot be
        decoded as UTF-8, are reported on stdout and skipped. An empty list
        is returned when ``path`` does not exist.
    """
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        # Sorting in place fixes the order in which os.walk descends.
        dirs.sort()
        for dataPointName in sorted(files):
            if not dataPointName.endswith(".py"):  # Only consider Python files
                continue
            file_path = os.path.join(root, dataPointName)
            try:
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
            except UnicodeDecodeError:
                # A single undecodable file should not abort the whole scan.
                print(f"Skipping file that is not valid UTF-8: {file_path}")
                continue
            if dp:  # Ensure the document is not empty
                dataPointsList.append(dp)
            else:
                print(f"Empty file: {file_path}")

    if not dataPointsList:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList

def flastVectorization(dataPoints, dim=0, eps=0.3, random_state=None):
    """Vectorize documents as token counts, optionally randomly projected.

    Args:
        dataPoints: Iterable of document strings.
        dim: Number of components of the random projection. A value <= 0
            means "derive it from ``eps``" via
            ``johnson_lindenstrauss_min_dim`` for the number of documents.
        eps: Distortion tolerance used when ``dim`` has to be derived.
            ``eps == 0`` disables the projection altogether.
        random_state: Seed forwarded to ``SparseRandomProjection``. The
            default ``None`` keeps the previous unseeded behaviour; pass an
            int to obtain the same projection on every call.

    Returns:
        A matrix with one row per document: the raw ``CountVectorizer``
        output when ``eps == 0``, otherwise its random projection.
    """
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z_full = countVec.fit_transform(dataPoints)
    if eps == 0:
        return Z_full

    if dim <= 0:
        # Z_full.shape[0] is the number of documents (samples).
        dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
    srp = SparseRandomProjection(n_components=dim, random_state=random_state)
    return srp.fit_transform(Z_full)

###############################################################################
# Decision Tree with GridSearchCV and Multiple Scoring Metrics

def flastDecisionTreeWithGridSearchCV(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, dim, eps):
    """Grid-search a Decision Tree on flaky vs. non-flaky files and log all results.

    Both archives are extracted below ``extractDir``, every ``.py`` file is
    vectorized, and a ``GridSearchCV`` over a fixed parameter grid is run
    with stratified, shuffled cross-validation. The mean test metrics of
    every parameter combination are written to
    ``<outDir>/params-dt-<n_splits>-folds.csv``.

    NOTE(review): the archives are extracted on top of whatever already
    exists in ``extractDir``; files left over from an earlier run are not
    removed and will be read as data points too.

    Args:
        outDir: Directory receiving the CSV file (created if missing).
        flakyZip: Path of the zip archive holding the flaky (label 1) files.
        nonFlakyZip: Path of the zip archive holding the non-flaky (label 0) files.
        extractDir: Directory under which ``flaky/`` and ``nonFlaky/`` are extracted.
        n_splits: Number of cross-validation folds.
        dim: Projection dimensionality forwarded to ``flastVectorization``.
        eps: JL epsilon forwarded to ``flastVectorization``.

    Returns:
        Tuple ``(best_params, best_score)`` where the score is the best mean
        cross-validated accuracy (``refit='accuracy'``).

    Raises:
        ValueError: If no document was found in either extracted archive.
    """
    v0 = time.perf_counter()

    # Extract the zip files into one sub-directory per class
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)

    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")

    if not dataPoints:
        raise ValueError("No documents available for vectorization. Please check the input directories.")
    print(f"Sample document: {dataPoints[0]}")

    # Vectorization
    Z = flastVectorization(dataPoints, dim=dim, eps=eps)
    # Densify once into a 2-D (documents x features) array instead of
    # converting row by row and reshaping afterwards.
    dataPointsList = Z.toarray() if hasattr(Z, "toarray") else np.asarray(Z)
    # Labels follow the concatenation order above: flaky first, then non-flaky.
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define the Decision Tree classifier
    classifier = DecisionTreeClassifier(random_state=42)

    # Define parameter grid for hyperparameter tuning
    param_grid = {
        "criterion": ["gini", "entropy"],
        "max_depth": [None, 10, 20, 30, 40, 50],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "max_features": [None, "sqrt", "log2"]
    }

    # Scorers evaluated for every parameter combination
    scoring = {
        'precision': make_scorer(precision_score),
        'recall': make_scorer(recall_score),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score)
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV with multiple scoring metrics; the model is
    # refit on, and best_score_ refers to, accuracy.
    grid_search = GridSearchCV(
        classifier,
        param_grid,
        cv=skf,
        scoring=scoring,
        refit='accuracy',
        verbose=1,
        return_train_score=True
    )

    grid_search.fit(dataPointsList, dataLabelsList)

    # Get the best parameters and the best score for accuracy
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best Accuracy Score: {best_score}")

    # Save the results
    if outDir:
        os.makedirs(outDir, exist_ok=True)
    outFile = f"params-dt-{n_splits}-folds.csv"
    outPath = os.path.join(outDir, outFile)
    cvResults = grid_search.cv_results_
    # Average preparation time per document; identical for every row.
    preparationTime = vecTime / len(dataPointsList)
    with open(outPath, "w") as fo:
        fo.write("criterion,max_depth,min_samples_split,min_samples_leaf,max_features,accuracy,precision,recall,f1,preparationTime\n")
        for idx, param in enumerate(cvResults['params']):
            accuracy = cvResults['mean_test_accuracy'][idx]
            precision = cvResults['mean_test_precision'][idx]
            recall = cvResults['mean_test_recall'][idx]
            f1 = cvResults['mean_test_f1'][idx]
            fo.write(f"{param['criterion']},{param['max_depth']},{param['min_samples_split']},{param['min_samples_leaf']},{param['max_features']},{accuracy},{precision},{recall},{f1},{preparationTime}\n")

    print(f"Decision Tree analysis completed for {n_splits}-folds. Results saved to: {outPath}")
    return best_params, best_score

# Script entry point: runs the Decision Tree grid search with 5 and with 3
# cross-validation folds on the same archives and prints both best results.
if __name__ == "__main__":
    # Parameters setup (all paths are relative to the current working directory)
    flakyZip = "compressedDataset/cleaned_flaky_files.zip"
    nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
    extractDir = "extracted"
    outDir = "results/results_DecisionTree"
    os.makedirs(outDir, exist_ok=True)
    os.makedirs(extractDir, exist_ok=True)

    # Perform Decision Tree analysis for 5 folds and 3 folds
    dim = 100  # Example value for JL dimensionality reduction
    eps = 0.3  # JL epsilon

    print("Starting 5-fold analysis with Decision Tree...")
    best_params_5folds, best_score_5folds = flastDecisionTreeWithGridSearchCV(outDir, flakyZip, nonFlakyZip, extractDir, 5, dim, eps)

    print("Starting 3-fold analysis with Decision Tree...")
    best_params_3folds, best_score_3folds = flastDecisionTreeWithGridSearchCV(outDir, flakyZip, nonFlakyZip, extractDir, 3, dim, eps)

    # Summary of both runs (each run has already printed its own best result)
    print("Best results for 5-fold:")
    print(f"Best Parameters: {best_params_5folds}")
    print(f"Best Accuracy Score: {best_score_5folds}")

    print("Best results for 3-fold:")
    print(f"Best Parameters: {best_params_3folds}")
    print(f"Best Accuracy Score: {best_score_3folds}")


Starting 5-fold analysis with Decision Tree...
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Sample document: # -*- coding: utf-8 -*-

# This code is part of Qiskit.
#
# (C) Copyright IBM 2020.
#
# This code is licensed under the Apache License, Version 2.0. You may
# obtain a copy of this license in the LICENSE.txt file in the root directory
# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
#
# Any modifications or derivative works of this code must retain this
# copyright notice, and modified files need to carry a notice indicating
# that they have been altered from the originals.

"""General utility functions for testing."""

from qiskit import QuantumCircuit
from qiskit.providers.ibmq.ibmqbackend import IBMQBackend


def most_busy_backend(provider):
    """Return the most busy backend for the provider given.

    Return the most busy available backend for those that
    have a `pending_jobs` in their `status`. Ba