KNN

In [2]:
import os
import time
import zipfile
import pickle
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.random_projection import johnson_lindenstrauss_min_dim, SparseRandomProjection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):  # Only consider Python files
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")
    
    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")
    
    return dataPointsList

def computeResults(testLabels, predictLabels):
    try:
        precision = precision_score(testLabels, predictLabels)
        recall = recall_score(testLabels, predictLabels)
        accuracy = accuracy_score(testLabels, predictLabels)
        print(f"Precision: {precision}, Recall: {recall}, Accuracy: {accuracy}")
    except Exception as e:
        print(f"Error computing metrics: {e}")
        precision = recall = accuracy = "-"
    return precision, recall, accuracy

###############################################################################
# FLAST functions

def flastVectorization(dataPoints, dim=0, eps=0.3):
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z_full = countVec.fit_transform(dataPoints)
    if eps == 0:
        Z = Z_full
    else:
        if dim <= 0:
            dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
        srp = SparseRandomProjection(n_components=dim)
        Z = srp.fit_transform(Z_full)
    return Z

def flastClassification(trainData, trainLabels, testData, sigma, k, params):
    kNN = KNeighborsClassifier(
        algorithm=params["algorithm"],
        metric=params["metric"],
        weights=params["weights"],
        n_neighbors=k,
        n_jobs=1
    )
    t0 = time.perf_counter()
    kNN.fit(trainData, trainLabels)
    trainTime = time.perf_counter() - t0

    t0 = time.perf_counter()
    predictLabels = kNN.predict(testData)
    testTime = time.perf_counter() - t0
    
    return trainTime, testTime, predictLabels

###############################################################################
# FLAST KNN analysis

def flastKNN(outDir, flakyZip, nonFlakyZip, extractDir, kf, dim, eps, sigma, params):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)
    
    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")
    if len(dataPoints) > 0:
        print(f"Sample document: {dataPoints[0]}")

    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    Z = flastVectorization(dataPoints, dim=dim, eps=eps)
    dataPointsList = np.array([Z[i].toarray() for i in range(Z.shape[0])])
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Storage calculation
    kNN = (dataPointsList, dataLabelsList)
    pickleDumpKNN = os.path.join(outDir, f"flast-sigma{sigma}.pickle")
    with open(pickleDumpKNN, "wb") as pickleFile:
        pickle.dump(kNN, pickleFile)
    storage = os.path.getsize(pickleDumpKNN)
    os.remove(pickleDumpKNN)

    avgP, avgR, avgA = 0, 0, 0
    avgTPrep, avgTPred = 0, 0
    avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest = 0, 0, 0, 0
    successFold, precisionFold = 0, 0

    for trnIdx, tstIdx in kf.split(dataPointsList, dataLabelsList):
        try:
            trainData, testData = dataPointsList[trnIdx], dataPointsList[tstIdx]
            trainLabels, testLabels = dataLabelsList[trnIdx], dataLabelsList[tstIdx]
            if sum(trainLabels) == 0 or sum(testLabels) == 0:
                print("Skipping fold with no flaky/non-flaky examples...")
                continue

            successFold += 1
            avgFlakyTrain += sum(trainLabels)
            avgNonFlakyTrain += len(trainLabels) - sum(trainLabels)
            avgFlakyTest += sum(testLabels)
            avgNonFlakyTest += len(testLabels) - sum(testLabels)

            trainData = trainData.reshape((trainData.shape[0], -1))
            testData = testData.reshape((testData.shape[0], -1))

            trainTime, testTime, predictLabels = flastClassification(trainData, trainLabels, testData, sigma, params["n_neighbors"], params)
            preparationTime = (vecTime * len(trainData) / len(dataPoints)) + trainTime
            predictionTime = (vecTime / len(dataPoints)) + (testTime / len(testData))
            precision, recall, accuracy = computeResults(testLabels, predictLabels)

            print(f"Precision: {precision}, Recall: {recall}, Accuracy: {accuracy}")
            if precision != "-":
                precisionFold += 1
                avgP += precision
                avgA += accuracy
            avgR += recall
            avgTPrep += preparationTime
            avgTPred += predictionTime
        except Exception as e:
            print(f"An error occurred during KNN analysis: {e}")
            continue

    if precisionFold > 0:
        avgP /= precisionFold
        avgA /= precisionFold
    else:
        avgP = avgA = "-"
    avgR /= successFold
    avgTPrep /= successFold
    avgTPred /= successFold
    avgFlakyTrain /= successFold
    avgNonFlakyTrain /= successFold
    avgFlakyTest /= successFold
    avgNonFlakyTest /= successFold

    return avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest, avgP, avgR, avgA, storage, avgTPrep, avgTPred

if __name__ == "__main__":
    # Parameters setup
    flakyZip = "C:/Users/kdeep/Downloads/Flakiness ML/cleaned_flaky_files.zip"
    nonFlakyZip = "C:/Users/kdeep/Downloads/Flakiness ML/reduced_nonflaky_files.zip"
    extractDir = "extracted"
    outDir = "results/"
    os.makedirs(outDir, exist_ok=True)
    os.makedirs(extractDir, exist_ok=True)

    numSplit = 5  # Change number of splits for cross-validation
    kf = StratifiedKFold(n_splits=numSplit, shuffle=True, random_state=42)

    outFile = "modified-params-knn.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write("distance,k,sigma,eps,avgFlakyTrain,avgNonFlakyTrain,avgFlakyTest,avgNonFlakyTest,precision,recall,accuracy,storage,preparationTime,predictionTime\n")

    # Define parameter grid for hyperparameter tuning
    param_grid = {
        'metric': ['cosine', 'euclidean'],
        'n_neighbors': [3, 5, 7],
        'dim': [0, 100, 200],  # Number of dimensions (0: JL with error eps)
        'eps': [0.1, 0.3, 0.5],  # JL epsilon
        'sigma': [0.5]  # Can add more values if needed to tune sigma
    }

    best_score = -1
    best_params = None

    # Grid search for hyperparameter tuning
    for metric in param_grid['metric']:
        for k in param_grid['n_neighbors']:
            for dim in param_grid['dim']:
                for eps in param_grid['eps']:
                    for sigma in param_grid['sigma']:
                        print(f"Testing: {metric=}, {k=}, {dim=}, {eps=}, {sigma=}")
                        params = {"algorithm": "brute", "metric": metric, "weights": "uniform", "n_neighbors": k}
                        try:
                            results = flastKNN(outDir, flakyZip, nonFlakyZip, extractDir, kf, dim, eps, sigma, params)
                            
                            # Flatten the results tuple and write to CSV
                            results_flat = ','.join(map(str, results))
                            print(f"Results to be written: {metric}, {k}, {sigma}, {eps}, {results_flat}")
                            
                            with open(os.path.join(outDir, outFile), "a") as fo:
                                fo.write(f"{metric},{k},{sigma},{eps},{results_flat}\n")
                            
                            # Determine the best parameters based on precision or another metric
                            if results[4] != '-' and results[4] > best_score:
                                best_score = results[4]
                                best_params = (metric, k, sigma, eps, dim)
                        except Exception as e:
                            print(f"Failed to complete KNN analysis: {e}")

    print("KNN analysis completed. Results saved to:", outFile)
    if best_params:
        print(f"Best Parameters: Metric={best_params[0]}, k={best_params[1]}, sigma={best_params[2]}, eps={best_params[3]}, dim={best_params[4]}")
    else:
        print("No valid results were found.")

Testing: metric='cosine', k=3, dim=0, eps=0.1, sigma=0.5
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Sample document: # Copyright 2021 The NetKet Authors - All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import dataclasses
import abc

from flax.core import freeze


@dataclasses.dataclass(frozen=True)
class ModuleFramework(abc.ABC):
    @staticmethod
    @abc.abstractmethod
    def is_loaded() -> bool:
        pass

    @staticm

  _warn_prf(average, modifier, msg_start, len(result))


SVM

In [3]:
import os
import time
import zipfile
import pickle
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.random_projection import johnson_lindenstrauss_min_dim, SparseRandomProjection
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.model_selection import StratifiedKFold

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):  # Only consider Python files
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")
    
    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")
    
    return dataPointsList

def computeResults(testLabels, predictLabels):
    precision = precision_score(testLabels, predictLabels, zero_division=0)
    recall = recall_score(testLabels, predictLabels, zero_division=0)
    accuracy = accuracy_score(testLabels, predictLabels)
    return precision, recall, accuracy

###############################################################################
# FLAST functions

def flastVectorization(dataPoints, dim=0, eps=0.3):
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z_full = countVec.fit_transform(dataPoints)
    if eps == 0:
        Z = Z_full
    else:
        if dim <= 0:
            dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
        srp = SparseRandomProjection(n_components=dim)
        Z = srp.fit_transform(Z_full)
    return Z

def flastSVMClassification(trainData, trainLabels, testData, C, kernel):
    t0 = time.perf_counter()
    svm = SVC(C=C, kernel=kernel)
    svm.fit(trainData, trainLabels)
    trainTime = time.perf_counter() - t0

    t0 = time.perf_counter()
    predictLabels = svm.predict(testData)
    testTime = time.perf_counter() - t0

    return trainTime, testTime, predictLabels

###############################################################################
# FLAST SVM analysis

def flastSVM(outDir, flakyZip, nonFlakyZip, extractDir, kf, dim, eps, C, kernel):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)
    
    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")
    if len(dataPoints) > 0:
        print(f"Sample document: {dataPoints[0]}")

    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    Z = flastVectorization(dataPoints, dim=dim, eps=eps)
    dataPointsList = np.array([Z[i].toarray() for i in range(Z.shape[0])])
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Storage calculation
    svm = (dataPointsList, dataLabelsList)
    pickleDumpSVM = os.path.join(outDir, f"flast-svm-C{C}-kernel{kernel}.pickle")
    with open(pickleDumpSVM, "wb") as pickleFile:
        pickle.dump(svm, pickleFile)
    storage = os.path.getsize(pickleDumpSVM)
    os.remove(pickleDumpSVM)

    avgP, avgR, avgA = 0, 0, 0
    avgTPrep, avgTPred = 0, 0
    avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest = 0, 0, 0, 0
    successFold, precisionFold = 0, 0

    for trnIdx, tstIdx in kf.split(dataPointsList, dataLabelsList):
        trainData, testData = dataPointsList[trnIdx], dataPointsList[tstIdx]
        trainLabels, testLabels = dataLabelsList[trnIdx], dataLabelsList[tstIdx]
        if sum(trainLabels) == 0 or sum(testLabels) == 0:
            print("Skipping fold with no flaky/non-flaky examples...")
            continue

        successFold += 1
        avgFlakyTrain += sum(trainLabels)
        avgNonFlakyTrain += len(trainLabels) - sum(trainLabels)
        avgFlakyTest += sum(testLabels)
        avgNonFlakyTest += len(testLabels) - sum(testLabels)

        trainData = trainData.reshape((trainData.shape[0], -1))
        testData = testData.reshape((testData.shape[0], -1))

        trainTime, testTime, predictLabels = flastSVMClassification(trainData, trainLabels, testData, C, kernel)
        preparationTime = (vecTime * len(trainData) / len(dataPoints)) + trainTime
        predictionTime = (vecTime / len(dataPoints)) + (testTime / len(testData))
        precision, recall, accuracy = computeResults(testLabels, predictLabels)

        print(f"Precision: {precision}, Recall: {recall}, Accuracy: {accuracy}")
        if precision != "-":
            precisionFold += 1
            avgP += precision
            avgA += accuracy
        avgR += recall
        avgTPrep += preparationTime
        avgTPred += predictionTime

    if precisionFold > 0:
        avgP /= precisionFold
        avgA /= precisionFold
    else:
        avgP = avgA = "-"
    avgR /= successFold
    avgTPrep /= successFold
    avgTPred /= successFold
    avgFlakyTrain /= successFold
    avgNonFlakyTrain /= successFold
    avgFlakyTest /= successFold
    avgNonFlakyTest /= successFold

    return avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest, avgP, avgR, avgA, storage, avgTPrep, avgTPred

if __name__ == "__main__":
    # Parameters setup
    flakyZip = "C:/Users/kdeep/Downloads/Flakiness ML/cleaned_flaky_files.zip"
    nonFlakyZip = "C:/Users/kdeep/Downloads/Flakiness ML/reduced_nonflaky_files.zip"
    extractDir = "extracted"
    outDir = "results-svm/"
    os.makedirs(outDir, exist_ok=True)
    os.makedirs(extractDir, exist_ok=True)

    numSplit = 5  # Reduced number of splits for cross-validation
    kf = StratifiedKFold(n_splits=numSplit, shuffle=True, random_state=42)

    outFile = "params-svm.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write("kernel,C,sigma,eps,avgFlakyTrain,avgNonFlakyTrain,avgFlakyTest,avgNonFlakyTest,precision,recall,accuracy,storage,preparationTime,predictionTime\n")

    # SVM parameters to vary
    C_values = [0.1, 1.0, 10.0]  # Regularization parameter
    kernel_types = ["linear", "rbf", "poly", "sigmoid"]  # Kernel types
    dim_values = [0, 100, 200]  # Dimensionality reduction
    eps_values = [0.1, 0.3, 0.5]  # JL epsilon

    best_score = -1
    best_params = None

    # Grid search for hyperparameter tuning
    for C in C_values:
        for kernel in kernel_types:
            for dim in dim_values:
                for eps in eps_values:
                    print(f"Testing: kernel={kernel}, C={C}, dim={dim}, eps={eps}")
                    results = flastSVM(outDir, flakyZip, nonFlakyZip, extractDir, kf, dim, eps, C, kernel)
                    avg_accuracy = results[6]  # Assuming accuracy is at index 6 in the returned results

                    # Check if this configuration has the best accuracy so far
                    if avg_accuracy > best_score:
                        best_score = avg_accuracy
                        best_params = (kernel, C, dim, eps)

                    print(f"Results: kernel={kernel}, C={C}, dim={dim}, eps={eps}, Accuracy={avg_accuracy}")
                    with open(os.path.join(outDir, outFile), "a") as fo:
                        fo.write("{},{},{},{},{},{},{},{},{},{},{},{},{},{}\n".format(kernel, C, dim, eps, *results))

    print("SVM analysis completed. Results saved to:", outFile)
    print(f"Best parameters found: kernel={best_params[0]}, C={best_params[1]}, dim={best_params[2]}, eps={best_params[3]}")
    print(f"Best accuracy: {best_score}")


Testing: kernel=linear, C=0.1, dim=0, eps=0.1
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Sample document: # Copyright 2021 The NetKet Authors - All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import dataclasses
import abc

from flax.core import freeze


@dataclasses.dataclass(frozen=True)
class ModuleFramework(abc.ABC):
    @staticmethod
    @abc.abstractmethod
    def is_loaded() -> bool:
        pass

    @staticmethod
    @

NB

In [5]:
import os
import time
import zipfile
import pickle
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.random_projection import johnson_lindenstrauss_min_dim, SparseRandomProjection
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.model_selection import StratifiedKFold

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):  # Only consider Python files
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")
    
    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")
    
    return dataPointsList

def computeResults(testLabels, predictLabels):
    precision = precision_score(testLabels, predictLabels, zero_division=0)
    recall = recall_score(testLabels, predictLabels, zero_division=0)
    accuracy = accuracy_score(testLabels, predictLabels)
    return precision, recall, accuracy

###############################################################################
# FLAST functions

def flastVectorization(dataPoints, dim=0, eps=0.3):
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z_full = countVec.fit_transform(dataPoints)
    if eps == 0:
        Z = Z_full
    else:
        if dim <= 0:
            dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
        srp = SparseRandomProjection(n_components=dim)
        Z = srp.fit_transform(Z_full)
    return Z

def flastNBClassification(trainData, trainLabels, testData):
    trainData = np.abs(trainData)
    testData = np.abs(testData)

    t0 = time.perf_counter()
    nb = MultinomialNB()
    nb.fit(trainData, trainLabels)
    trainTime = time.perf_counter() - t0

    t0 = time.perf_counter()
    predictLabels = nb.predict(testData)
    testTime = time.perf_counter() - t0

    return trainTime, testTime, predictLabels

###############################################################################
# FLAST Naive Bayes analysis

def flastNB(outDir, flakyZip, nonFlakyZip, extractDir, kf, dim, eps):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)
    
    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")
    if len(dataPoints) > 0:
        print(f"Sample document: {dataPoints[0]}")

    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    Z = flastVectorization(dataPoints, dim=dim, eps=eps)
    dataPointsList = np.array([Z[i].toarray() for i in range(Z.shape[0])])
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Storage calculation
    nb = (dataPointsList, dataLabelsList)
    pickleDumpNB = os.path.join(outDir, f"flast-nb-dim{dim}-eps{eps}.pickle")
    with open(pickleDumpNB, "wb") as pickleFile:
        pickle.dump(nb, pickleFile)
    storage = os.path.getsize(pickleDumpNB)
    os.remove(pickleDumpNB)

    avgP, avgR, avgA = 0, 0, 0
    avgTPrep, avgTPred = 0, 0
    avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest = 0, 0, 0, 0
    successFold, precisionFold = 0, 0

    for trnIdx, tstIdx in kf.split(dataPointsList, dataLabelsList):
        trainData, testData = dataPointsList[trnIdx], dataPointsList[tstIdx]
        trainLabels, testLabels = dataLabelsList[trnIdx], dataLabelsList[tstIdx]
        if sum(trainLabels) == 0 or sum(testLabels) == 0:
            print("Skipping fold with no flaky/non-flaky examples...")
            continue

        successFold += 1
        avgFlakyTrain += sum(trainLabels)
        avgNonFlakyTrain += len(trainLabels) - sum(trainLabels)
        avgFlakyTest += sum(testLabels)
        avgNonFlakyTest += len(testLabels) - sum(testLabels)

        trainData = trainData.reshape((trainData.shape[0], -1))
        testData = testData.reshape((testData.shape[0], -1))

        trainTime, testTime, predictLabels = flastNBClassification(trainData, trainLabels, testData)
        preparationTime = (vecTime * len(trainData) / len(dataPoints)) + trainTime
        predictionTime = (vecTime / len(dataPoints)) + (testTime / len(testData))
        precision, recall, accuracy = computeResults(testLabels, predictLabels)

        print(f"Precision: {precision}, Recall: {recall}, Accuracy: {accuracy}")
        if precision != "-":
            precisionFold += 1
            avgP += precision
            avgA += accuracy
        avgR += recall
        avgTPrep += preparationTime
        avgTPred += predictionTime

    if precisionFold > 0:
        avgP /= precisionFold
        avgA /= precisionFold
    else:
        avgP = avgA = "-"
    avgR /= successFold
    avgTPrep /= successFold
    avgTPred /= successFold
    avgFlakyTrain /= successFold
    avgNonFlakyTrain /= successFold
    avgFlakyTest /= successFold
    avgNonFlakyTest /= successFold

    return avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest, avgP, avgR, avgA, storage, avgTPrep, avgTPred

if __name__ == "__main__":
    # Parameters setup
    flakyZip = "C:/Users/kdeep/Downloads/Flakiness ML/cleaned_flaky_files.zip"
    nonFlakyZip = "C:/Users/kdeep/Downloads/Flakiness ML/reduced_nonflaky_files.zip"
    extractDir = "extracted"
    outDir = "results-nb/"
    os.makedirs(outDir, exist_ok=True)
    os.makedirs(extractDir, exist_ok=True)

    numSplit = 30
    kf = StratifiedKFold(n_splits=numSplit)

    outFile = "params-nb.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        # Updated header to match the number of parameters being recorded
        fo.write("dim,eps,avgFlakyTrain,avgNonFlakyTrain,avgFlakyTest,avgNonFlakyTest,precision,recall,accuracy,storage,preparationTime,predictionTime\n")

    # Naive Bayes parameters to vary
    dim_values = [0, 100, 200]  # Dimensionality reduction
    eps_values = [0.1, 0.3, 0.5]  # JL epsilon

    best_score = 0
    best_params = None

    for dim in dim_values:
        for eps in eps_values:
            print(f"Testing: dim={dim}, eps={eps}")
            results = flastNB(outDir, flakyZip, nonFlakyZip, extractDir, kf, dim, eps)
            # Flatten the results tuple
            avgP, avgR, avgA = results[4], results[5], results[6]
            # Determine the best parameters based on the average accuracy or precision
            if avgA != '-' and avgA > best_score:
                best_score = avgA
                best_params = (dim, eps)

            # Write results to the CSV file
            with open(os.path.join(outDir, outFile), "a") as fo:
                fo.write("{},{},{},{},{},{},{},{},{},{},{},{}\n".format(dim, eps, *results))

    print("Naive Bayes analysis completed. Results saved to:", outFile)
    if best_params:
        print(f"Best params based on accuracy: dim={best_params[0]}, eps={best_params[1]} with score={best_score}")
    else:
        print("No valid results were found to determine the best parameters.")


Testing: dim=0, eps=0.1
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Sample document: # Copyright 2021 The NetKet Authors - All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import dataclasses
import abc

from flax.core import freeze


@dataclasses.dataclass(frozen=True)
class ModuleFramework(abc.ABC):
    @staticmethod
    @abc.abstractmethod
    def is_loaded() -> bool:
        pass

    @staticmethod
    @abc.abstractmethod
   