In [5]:
import os
import time
import zipfile
import pickle
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.random_projection import johnson_lindenstrauss_min_dim, SparseRandomProjection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import StratifiedShuffleSplit

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Unpack every member of the archive ``zip_file`` into ``extract_to``."""
    with zipfile.ZipFile(zip_file, mode='r') as archive:
        archive.extractall(path=extract_to)

def getDataPoints(path):
    """Return the stripped text of each non-empty ``.py`` file found under ``path``.

    The directory tree is walked recursively. A missing directory, empty
    files, and an empty overall result are each reported on stdout; a missing
    directory yields an empty list.
    """
    documents = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return documents

    for currentDir, _subDirs, fileNames in os.walk(path):
        for fileName in fileNames:
            # Anything that is not a Python source file is ignored.
            if not fileName.endswith(".py"):
                continue
            file_path = os.path.join(currentDir, fileName)
            with open(file_path, encoding="utf-8") as handle:
                text = handle.read().strip()
            if not text:
                # Whitespace-only files are reported and left out.
                print(f"Empty file: {file_path}")
                continue
            documents.append(text)

    if not documents:
        print(f"No valid documents found in directory: {path}")

    return documents

def computeResults(testLabels, predictLabels):
    """Compute precision and recall of ``predictLabels`` against ``testLabels``.

    Returns:
        A ``(precision, recall)`` tuple. A metric that scikit-learn rejects
        with ``ValueError`` is reported as the string ``"-"`` so the caller
        can leave it out of its averages.
    """
    # zero_division=0 yields the same 0.0 that scikit-learn returns by default
    # for an undefined metric, but without emitting UndefinedMetricWarning.
    # Only ValueError (invalid input) is mapped to the "-" sentinel; a bare
    # ``except`` would also hide KeyboardInterrupt and programming errors.
    try:
        precision = precision_score(testLabels, predictLabels, zero_division=0)
    except ValueError:
        precision = "-"
    try:
        recall = recall_score(testLabels, predictLabels, zero_division=0)
    except ValueError:
        recall = "-"
    return precision, recall

###############################################################################
# FLAST functions

def flastVectorization(dataPoints, dim=0, eps=0.3):
    """Turn the documents into token-count vectors, optionally randomly projected.

    Args:
        dataPoints: the text documents to vectorize.
        dim: target number of dimensions for the projection; a value <= 0
            means the dimension is derived from the Johnson-Lindenstrauss
            bound for the number of documents and ``eps``.
        eps: JL distortion parameter; ``eps == 0`` skips the projection and
            returns the raw count matrix.

    Returns:
        The count matrix, or its sparse random projection.
    """
    # No stop word removal.
    bagOfWords = CountVectorizer(stop_words=None).fit_transform(dataPoints)
    if eps == 0:
        return bagOfWords

    if dim > 0:
        targetDim = dim
    else:
        targetDim = johnson_lindenstrauss_min_dim(bagOfWords.shape[0], eps=eps)
    projector = SparseRandomProjection(n_components=targetDim)
    return projector.fit_transform(bagOfWords)

def flastClassification(trainData, trainLabels, testData, sigma, k, params):
    """Fit a k-NN index and label each test point by inverse-distance voting.

    Args:
        trainData, trainLabels: training vectors and their labels; a label
            equal to 1 counts towards the positive vote, anything else
            towards the negative vote.
        testData: vectors to classify.
        sigma: minimum share of the total vote weight the positive class
            needs for a prediction of 1.
        k: number of neighbours queried per test point.
        params: dict providing the ``"algorithm"``, ``"metric"`` and
            ``"weights"`` settings of the ``KNeighborsClassifier``.

    Returns:
        ``(trainTime, testTime, predictLabels)``: the seconds spent fitting,
        the seconds spent querying and voting, and the list of 0/1 predictions.
    """
    classifier = KNeighborsClassifier(
        algorithm=params["algorithm"],
        metric=params["metric"],
        weights=params["weights"],
        n_neighbors=k,
        n_jobs=1,
    )

    fitStart = time.perf_counter()
    classifier.fit(trainData, trainLabels)
    trainTime = time.perf_counter() - fitStart

    queryStart = time.perf_counter()
    allDistances, allIndices = classifier.kneighbors(testData)
    infinity = float("Inf")
    predictLabels = []

    for rowDistances, rowIndices in zip(allDistances, allIndices):
        positiveWeight, negativeWeight = 0, 0
        for dist, idx in zip(rowDistances, rowIndices):
            # A neighbour at distance zero gets infinite weight.
            weight = infinity if dist == 0 else 1 / dist
            if trainLabels[idx] == 1:
                positiveWeight += weight
            else:
                negativeWeight += weight

        if negativeWeight == infinity:
            # An exact negative match wins, even against an exact positive match.
            label = 0
        elif positiveWeight == infinity:
            label = 1
        else:
            total = positiveWeight + negativeWeight
            # With no vote weight at all the prediction falls back to 0.
            label = 1 if total != 0 and positiveWeight / total >= sigma else 0
        predictLabels.append(label)

    testTime = time.perf_counter() - queryStart
    return trainTime, testTime, predictLabels

###############################################################################
# FLAST KNN analysis

def flastKNN(outDir, flakyZip, nonFlakyZip, extractDir, kf, dim, eps, k, sigma, params):
    """Run the FLAST k-NN evaluation over every split produced by ``kf``.

    Both archives are extracted below ``extractDir``, their ``.py`` files are
    vectorized together, and each split is classified with
    ``flastClassification``. Documents from ``flakyZip`` are labelled 1,
    documents from ``nonFlakyZip`` are labelled 0.

    Args:
        outDir: existing directory used for the temporary pickle that
            measures storage.
        flakyZip, nonFlakyZip: paths of the two input archives.
        extractDir: directory under which ``flaky/`` and ``nonFlaky/`` are
            created and filled.
        kf: splitter exposing ``split(X, y)``.
        dim, eps: forwarded to ``flastVectorization``.
        k, sigma, params: forwarded to ``flastClassification``.

    Returns:
        ``(avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest,
        avgP, avgR, storage, avgTPrep, avgTPred)``. ``avgP`` / ``avgR`` are
        ``"-"`` when no fold produced that metric; ``storage`` is the size in
        bytes of the pickled vectors and labels.

    Raises:
        ValueError: if no documents were found, or if every split was skipped.
    """
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)

    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of documents: {len(dataPoints)}")
    if not dataPoints:
        raise ValueError("No documents available for vectorization. Please check the input directories.")
    print(f"Sample document: {dataPoints[0]}")

    Z = flastVectorization(dataPoints, dim=dim, eps=eps)
    # Each row is converted on its own, so every entry keeps an extra axis;
    # the per-fold reshape below flattens it again.
    dataPointsList = np.array([Z[i].toarray() for i in range(Z.shape[0])])
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    # vecTime covers extraction, file reading and vectorization.
    vecTime = time.perf_counter() - v0

    # Storage calculation: size of the pickled (vectors, labels) pair.
    kNN = (dataPointsList, dataLabelsList)
    pickleDumpKNN = os.path.join(outDir, f"flast-k{k}-sigma{sigma}.pickle")
    with open(pickleDumpKNN, "wb") as pickleFile:
        pickle.dump(kNN, pickleFile)
    storage = os.path.getsize(pickleDumpKNN)
    os.remove(pickleDumpKNN)

    avgP, avgR = 0, 0
    avgTPrep, avgTPred = 0, 0
    avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest = 0, 0, 0, 0
    successFold, precisionFold, recallFold = 0, 0, 0

    for trnIdx, tstIdx in kf.split(dataPointsList, dataLabelsList):
        trainData, testData = dataPointsList[trnIdx], dataPointsList[tstIdx]
        trainLabels, testLabels = dataLabelsList[trnIdx], dataLabelsList[tstIdx]
        # Labels are 0/1, so the sum is the number of flaky examples.
        flakyTrain = int(np.sum(trainLabels))
        flakyTest = int(np.sum(testLabels))
        if flakyTrain == 0 or flakyTest == 0:
            print("Skipping fold with no flaky/non-flaky examples...")
            continue

        successFold += 1
        avgFlakyTrain += flakyTrain
        avgNonFlakyTrain += len(trainLabels) - flakyTrain
        avgFlakyTest += flakyTest
        avgNonFlakyTest += len(testLabels) - flakyTest

        trainData = trainData.reshape((trainData.shape[0], -1))
        testData = testData.reshape((testData.shape[0], -1))

        trainTime, testTime, predictLabels = flastClassification(trainData, trainLabels, testData, sigma, k, params)
        # Vectorization cost is attributed proportionally to the training set,
        # and per single document for prediction.
        preparationTime = (vecTime * len(trainData) / len(dataPoints)) + trainTime
        predictionTime = (vecTime / len(dataPoints)) + (testTime / len(testData))
        precision, recall = computeResults(testLabels, predictLabels)

        print(f"Precision: {precision}, Recall: {recall}")
        # computeResults may report "-" for a metric it could not compute;
        # such folds are left out of that metric's average instead of
        # crashing on str + number.
        if precision != "-":
            precisionFold += 1
            avgP += precision
        if recall != "-":
            recallFold += 1
            avgR += recall
        avgTPrep += preparationTime
        avgTPred += predictionTime

    if successFold == 0:
        # Without this guard the averages below fail with ZeroDivisionError.
        raise ValueError("No usable folds: every split lacked flaky examples in its train or test part.")

    avgP = avgP / precisionFold if precisionFold > 0 else "-"
    avgR = avgR / recallFold if recallFold > 0 else "-"
    avgTPrep /= successFold
    avgTPred /= successFold
    avgFlakyTrain /= successFold
    avgNonFlakyTrain /= successFold
    avgFlakyTest /= successFold
    avgNonFlakyTest /= successFold

    return avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest, avgP, avgR, storage, avgTPrep, avgTPred

if __name__ == "__main__":
    # Parameters setup
    flakyZip = "flaky_methods.zip"
    nonFlakyZip = "non-flakyMethods.zip"
    extractDir = "extracted"
    outDir = "results/"
    os.makedirs(outDir, exist_ok=True)
    os.makedirs(extractDir, exist_ok=True)

    numSplit = 30
    testSetSize = 0.2
    kf = StratifiedShuffleSplit(n_splits=numSplit, test_size=testSetSize)

    outFile = "params-knn.csv"
    outPath = os.path.join(outDir, outFile)
    with open(outPath, "w") as fo:
        fo.write("distance,k,sigma,eps,precision,recall,storage,preparationTime,predictionTime\n")

    # KNN parameters (the metric and k are set by the loops below)
    sigma = 0.5
    dim = 0  # Number of dimensions (0: JL with error eps)
    eps = 0.3  # JL epsilon
    params = {"algorithm": "brute", "metric": "cosine", "weights": "uniform"}

    for metric in ["cosine", "euclidean"]:
        for k in [3, 7]:
            print(f"{metric=}, {k=}")
            params["metric"] = metric
            results = flastKNN(outDir, flakyZip, nonFlakyZip, extractDir, kf, dim, eps, k, sigma, params)
            # flastKNN returns four class-count averages first; the CSV only
            # has columns for the five values after them. Passing *results to
            # a nine-placeholder format string silently dropped the last four
            # values and wrote the counts under the metric headers.
            row = (params["metric"], k, sigma, eps, *results[4:])
            with open(outPath, "a") as fo:
                fo.write(",".join(str(value) for value in row) + "\n")

    print("KNN analysis completed. Results saved to:", outPath)


metric='cosine', k=3
Number of documents: 301
Sample document: # Copyright 2021 The NetKet Authors - All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import dataclasses
import abc

from flax.core import freeze


@dataclasses.dataclass(frozen=True)
class ModuleFramework(abc.ABC):
    @staticmethod
    @abc.abstractmethod
    def is_loaded() -> bool:
        pass

    @staticmethod
    @abc.abstractmethod
    def is_my_module(module):
        pass

    @staticmethod
    @abc.abst

  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.6666666666666666, Recall: 0.2
Precision: 1.0, Recall: 0.4
Precision: 1.0, Recall: 0.3
Precision: 0.3333333333333333, Recall: 0.1
Precision: 0.4, Recall: 0.2
Precision: 1.0, Recall: 0.2
Precision: 1.0, Recall: 0.4
Precision: 1.0, Recall: 0.2
Precision: 1.0, Recall: 0.5
Precision: 1.0, Recall: 0.2
Precision: 1.0, Recall: 0.6
Precision: 0.75, Recall: 0.3
Precision: 1.0, Recall: 0.1
Precision: 0.6, Recall: 0.3
Precision: 0.75, Recall: 0.3
Precision: 1.0, Recall: 0.3
metric='euclidean', k=3
Number of documents: 301
Sample document: # Copyright 2021 The NetKet Authors - All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR COND

  _warn_prf(average, modifier, msg_start, len(result))


Precision: 1.0, Recall: 0.3
Precision: 0.8333333333333334, Recall: 0.5
Precision: 0.6666666666666666, Recall: 0.2
Precision: 0.0, Recall: 0.0
Precision: 1.0, Recall: 0.1
Precision: 0.3333333333333333, Recall: 0.2
Precision: 1.0, Recall: 0.3
Precision: 1.0, Recall: 0.3
Precision: 0.3333333333333333, Recall: 0.1
Precision: 0.6666666666666666, Recall: 0.4
Precision: 1.0, Recall: 0.1
Precision: 1.0, Recall: 0.3
Precision: 0.0, Recall: 0.0
Precision: 1.0, Recall: 0.1
Precision: 1.0, Recall: 0.1
Precision: 1.0, Recall: 0.2
Precision: 1.0, Recall: 0.2
Precision: 1.0, Recall: 0.2
Precision: 1.0, Recall: 0.2
Precision: 1.0, Recall: 0.2
Precision: 0.6666666666666666, Recall: 0.2
Precision: 1.0, Recall: 0.2
Precision: 0.0, Recall: 0.0
Precision: 0.6666666666666666, Recall: 0.2
Precision: 1.0, Recall: 0.1
Precision: 1.0, Recall: 0.2
KNN analysis completed. Results saved to: params-knn.csv


SVM

In [7]:
import os
import time
import zipfile
import pickle
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.random_projection import johnson_lindenstrauss_min_dim, SparseRandomProjection
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import StratifiedShuffleSplit

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Unpack every member of the archive ``zip_file`` into ``extract_to``."""
    with zipfile.ZipFile(zip_file, mode='r') as archive:
        archive.extractall(path=extract_to)

def getDataPoints(path):
    """Return the stripped text of each non-empty ``.py`` file found under ``path``.

    The directory tree is walked recursively. A missing directory, empty
    files, and an empty overall result are each reported on stdout; a missing
    directory yields an empty list.
    """
    documents = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return documents

    for currentDir, _subDirs, fileNames in os.walk(path):
        for fileName in fileNames:
            # Anything that is not a Python source file is ignored.
            if not fileName.endswith(".py"):
                continue
            file_path = os.path.join(currentDir, fileName)
            with open(file_path, encoding="utf-8") as handle:
                text = handle.read().strip()
            if not text:
                # Whitespace-only files are reported and left out.
                print(f"Empty file: {file_path}")
                continue
            documents.append(text)

    if not documents:
        print(f"No valid documents found in directory: {path}")

    return documents

def computeResults(testLabels, predictLabels):
    """Return ``(precision, recall)`` of the predictions, using 0 where a metric is undefined."""
    return (
        precision_score(testLabels, predictLabels, zero_division=0),
        recall_score(testLabels, predictLabels, zero_division=0),
    )

###############################################################################
# FLAST functions

def flastVectorization(dataPoints, dim=0, eps=0.3):
    """Turn the documents into token-count vectors, optionally randomly projected.

    Args:
        dataPoints: the text documents to vectorize.
        dim: target number of dimensions for the projection; a value <= 0
            means the dimension is derived from the Johnson-Lindenstrauss
            bound for the number of documents and ``eps``.
        eps: JL distortion parameter; ``eps == 0`` skips the projection and
            returns the raw count matrix.

    Returns:
        The count matrix, or its sparse random projection.
    """
    # No stop word removal.
    bagOfWords = CountVectorizer(stop_words=None).fit_transform(dataPoints)
    if eps == 0:
        return bagOfWords

    if dim > 0:
        targetDim = dim
    else:
        targetDim = johnson_lindenstrauss_min_dim(bagOfWords.shape[0], eps=eps)
    projector = SparseRandomProjection(n_components=targetDim)
    return projector.fit_transform(bagOfWords)

def flastSVMClassification(trainData, trainLabels, testData, C, kernel, params):
    """Train an ``SVC`` and predict labels for ``testData``, timing both steps.

    Args:
        trainData, trainLabels: training vectors and labels.
        testData: vectors to classify.
        C: regularization parameter passed to ``SVC``.
        kernel: kernel name passed to ``SVC``.
        params: accepted but not used by this function.

    Returns:
        ``(trainTime, testTime, predictLabels)``; times are in seconds and the
        training time includes constructing the classifier.
    """
    fitStart = time.perf_counter()
    classifier = SVC(C=C, kernel=kernel)
    classifier.fit(trainData, trainLabels)
    fitEnd = time.perf_counter()

    predictStart = time.perf_counter()
    predictLabels = classifier.predict(testData)
    predictEnd = time.perf_counter()

    return fitEnd - fitStart, predictEnd - predictStart, predictLabels

###############################################################################
# FLAST SVM analysis

def flastSVM(outDir, flakyZip, nonFlakyZip, extractDir, kf, dim, eps, C, kernel, params):
    """Run the FLAST SVM evaluation over every split produced by ``kf``.

    Both archives are extracted below ``extractDir``, their ``.py`` files are
    vectorized together, and each split is classified with
    ``flastSVMClassification``. Documents from ``flakyZip`` are labelled 1,
    documents from ``nonFlakyZip`` are labelled 0.

    Args:
        outDir: existing directory used for the temporary pickle that
            measures storage.
        flakyZip, nonFlakyZip: paths of the two input archives.
        extractDir: directory under which ``flaky/`` and ``nonFlaky/`` are
            created and filled.
        kf: splitter exposing ``split(X, y)``.
        dim, eps: forwarded to ``flastVectorization``.
        C, kernel, params: forwarded to ``flastSVMClassification``.

    Returns:
        ``(avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest,
        avgP, avgR, storage, avgTPrep, avgTPred)``. ``avgP`` / ``avgR`` are
        ``"-"`` when no fold produced that metric; ``storage`` is the size in
        bytes of the pickled vectors and labels.

    Raises:
        ValueError: if no documents were found, or if every split was skipped.
    """
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)

    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of documents: {len(dataPoints)}")
    if not dataPoints:
        raise ValueError("No documents available for vectorization. Please check the input directories.")
    print(f"Sample document: {dataPoints[0]}")

    Z = flastVectorization(dataPoints, dim=dim, eps=eps)
    # Each row is converted on its own, so every entry keeps an extra axis;
    # the per-fold reshape below flattens it again.
    dataPointsList = np.array([Z[i].toarray() for i in range(Z.shape[0])])
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    # vecTime covers extraction, file reading and vectorization.
    vecTime = time.perf_counter() - v0

    # Storage calculation: size of the pickled (vectors, labels) pair.
    svm = (dataPointsList, dataLabelsList)
    pickleDumpSVM = os.path.join(outDir, f"flast-svm-C{C}-kernel{kernel}.pickle")
    with open(pickleDumpSVM, "wb") as pickleFile:
        pickle.dump(svm, pickleFile)
    storage = os.path.getsize(pickleDumpSVM)
    os.remove(pickleDumpSVM)

    avgP, avgR = 0, 0
    avgTPrep, avgTPred = 0, 0
    avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest = 0, 0, 0, 0
    successFold, precisionFold, recallFold = 0, 0, 0

    for trnIdx, tstIdx in kf.split(dataPointsList, dataLabelsList):
        trainData, testData = dataPointsList[trnIdx], dataPointsList[tstIdx]
        trainLabels, testLabels = dataLabelsList[trnIdx], dataLabelsList[tstIdx]
        # Labels are 0/1, so the sum is the number of flaky examples.
        flakyTrain = int(np.sum(trainLabels))
        flakyTest = int(np.sum(testLabels))
        if flakyTrain == 0 or flakyTest == 0:
            print("Skipping fold with no flaky/non-flaky examples...")
            continue

        successFold += 1
        avgFlakyTrain += flakyTrain
        avgNonFlakyTrain += len(trainLabels) - flakyTrain
        avgFlakyTest += flakyTest
        avgNonFlakyTest += len(testLabels) - flakyTest

        trainData = trainData.reshape((trainData.shape[0], -1))
        testData = testData.reshape((testData.shape[0], -1))

        trainTime, testTime, predictLabels = flastSVMClassification(trainData, trainLabels, testData, C, kernel, params)
        # Vectorization cost is attributed proportionally to the training set,
        # and per single document for prediction.
        preparationTime = (vecTime * len(trainData) / len(dataPoints)) + trainTime
        predictionTime = (vecTime / len(dataPoints)) + (testTime / len(testData))
        precision, recall = computeResults(testLabels, predictLabels)

        print(f"Precision: {precision}, Recall: {recall}")
        # A "-" marks a metric that could not be computed; such folds are
        # left out of that metric's average.
        if precision != "-":
            precisionFold += 1
            avgP += precision
        if recall != "-":
            recallFold += 1
            avgR += recall
        avgTPrep += preparationTime
        avgTPred += predictionTime

    if successFold == 0:
        # Without this guard the averages below fail with ZeroDivisionError.
        raise ValueError("No usable folds: every split lacked flaky examples in its train or test part.")

    avgP = avgP / precisionFold if precisionFold > 0 else "-"
    avgR = avgR / recallFold if recallFold > 0 else "-"
    avgTPrep /= successFold
    avgTPred /= successFold
    avgFlakyTrain /= successFold
    avgNonFlakyTrain /= successFold
    avgFlakyTest /= successFold
    avgNonFlakyTest /= successFold

    return avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest, avgP, avgR, storage, avgTPrep, avgTPred

if __name__ == "__main__":
    # Parameters setup
    flakyZip = "flaky_methods.zip"
    nonFlakyZip = "non-flakyMethods.zip"
    extractDir = "extracted"
    outDir = "results-svm/"
    os.makedirs(outDir, exist_ok=True)
    os.makedirs(extractDir, exist_ok=True)

    numSplit = 30
    testSetSize = 0.2
    kf = StratifiedShuffleSplit(n_splits=numSplit, test_size=testSetSize)

    outFile = "params-svm.csv"
    outPath = os.path.join(outDir, outFile)
    with open(outPath, "w") as fo:
        # The third column holds the projection dimension written below,
        # so it is labelled "dim" (it was previously mislabelled "sigma").
        fo.write("kernel,C,dim,eps,precision,recall,storage,preparationTime,predictionTime\n")

    # SVM parameters to vary
    C_values = [0.1, 1.0, 10.0]  # Regularization parameter
    kernel_types = ["linear", "rbf", "poly", "sigmoid"]  # Kernel types
    dim_values = [0, 100, 200]  # Dimensionality reduction
    eps_values = [0.1, 0.3, 0.5]  # JL epsilon

    for C in C_values:
        for kernel in kernel_types:
            for dim in dim_values:
                for eps in eps_values:
                    print(f"{kernel=}, {C=}, {dim=}, {eps=}")
                    results = flastSVM(outDir, flakyZip, nonFlakyZip, extractDir, kf, dim, eps, C, kernel, {})
                    # flastSVM returns four class-count averages first; the
                    # CSV only has columns for the five values after them.
                    # Passing *results to a nine-placeholder format string
                    # silently dropped the last four values and wrote the
                    # counts under the metric headers.
                    row = (kernel, C, dim, eps, *results[4:])
                    with open(outPath, "a") as fo:
                        fo.write(",".join(str(value) for value in row) + "\n")

    print("SVM analysis completed. Results saved to:", outPath)


kernel='linear', C=0.1, dim=0, eps=0.1
Number of documents: 301
Sample document: # Copyright 2021 The NetKet Authors - All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import dataclasses
import abc

from flax.core import freeze


@dataclasses.dataclass(frozen=True)
class ModuleFramework(abc.ABC):
    @staticmethod
    @abc.abstractmethod
    def is_loaded() -> bool:
        pass

    @staticmethod
    @abc.abstractmethod
    def is_my_module(module):
        pass

    @staticme

In [8]:
import os
import time
import zipfile
import pickle
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.random_projection import johnson_lindenstrauss_min_dim, SparseRandomProjection
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import StratifiedShuffleSplit

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Unpack every member of the archive ``zip_file`` into ``extract_to``."""
    with zipfile.ZipFile(zip_file, mode='r') as archive:
        archive.extractall(path=extract_to)

def getDataPoints(path):
    """Return the stripped text of each non-empty ``.py`` file found under ``path``.

    The directory tree is walked recursively. A missing directory, empty
    files, and an empty overall result are each reported on stdout; a missing
    directory yields an empty list.
    """
    documents = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return documents

    for currentDir, _subDirs, fileNames in os.walk(path):
        for fileName in fileNames:
            # Anything that is not a Python source file is ignored.
            if not fileName.endswith(".py"):
                continue
            file_path = os.path.join(currentDir, fileName)
            with open(file_path, encoding="utf-8") as handle:
                text = handle.read().strip()
            if not text:
                # Whitespace-only files are reported and left out.
                print(f"Empty file: {file_path}")
                continue
            documents.append(text)

    if not documents:
        print(f"No valid documents found in directory: {path}")

    return documents

def computeResults(testLabels, predictLabels):
    """Return ``(precision, recall)`` of the predictions, using 0 where a metric is undefined."""
    return (
        precision_score(testLabels, predictLabels, zero_division=0),
        recall_score(testLabels, predictLabels, zero_division=0),
    )

###############################################################################
# FLAST functions

def flastVectorization(dataPoints, dim=0, eps=0.3):
    """Turn the documents into token-count vectors, optionally randomly projected.

    Args:
        dataPoints: the text documents to vectorize.
        dim: target number of dimensions for the projection; a value <= 0
            means the dimension is derived from the Johnson-Lindenstrauss
            bound for the number of documents and ``eps``.
        eps: JL distortion parameter; ``eps == 0`` skips the projection and
            returns the raw count matrix.

    Returns:
        The count matrix, or its sparse random projection.
    """
    # No stop word removal.
    bagOfWords = CountVectorizer(stop_words=None).fit_transform(dataPoints)
    if eps == 0:
        return bagOfWords

    if dim > 0:
        targetDim = dim
    else:
        targetDim = johnson_lindenstrauss_min_dim(bagOfWords.shape[0], eps=eps)
    projector = SparseRandomProjection(n_components=targetDim)
    return projector.fit_transform(bagOfWords)

def flastNBClassification(trainData, trainLabels, testData):
    """Train a ``MultinomialNB`` and predict labels for ``testData``, timing both steps.

    Both feature matrices are replaced by their absolute values before use.
    NOTE(review): presumably because the random projection can produce
    negative entries, which MultinomialNB does not accept - confirm.

    Returns:
        ``(trainTime, testTime, predictLabels)``; times are in seconds, the
        training time includes constructing the classifier, and the absolute
        value step is not timed.
    """
    nonNegativeTrain = np.abs(trainData)
    nonNegativeTest = np.abs(testData)

    fitStart = time.perf_counter()
    classifier = MultinomialNB()
    classifier.fit(nonNegativeTrain, trainLabels)
    fitEnd = time.perf_counter()

    predictStart = time.perf_counter()
    predictLabels = classifier.predict(nonNegativeTest)
    predictEnd = time.perf_counter()

    return fitEnd - fitStart, predictEnd - predictStart, predictLabels

###############################################################################
# FLAST Naive Bayes analysis

def flastNB(outDir, flakyZip, nonFlakyZip, extractDir, kf, dim, eps):
    """Run the FLAST Naive Bayes evaluation over every split produced by ``kf``.

    Both archives are extracted below ``extractDir``, their ``.py`` files are
    vectorized together, and each split is classified with
    ``flastNBClassification``. Documents from ``flakyZip`` are labelled 1,
    documents from ``nonFlakyZip`` are labelled 0.

    Args:
        outDir: existing directory used for the temporary pickle that
            measures storage.
        flakyZip, nonFlakyZip: paths of the two input archives.
        extractDir: directory under which ``flaky/`` and ``nonFlaky/`` are
            created and filled.
        kf: splitter exposing ``split(X, y)``.
        dim, eps: forwarded to ``flastVectorization``.

    Returns:
        ``(avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest,
        avgP, avgR, storage, avgTPrep, avgTPred)``. ``avgP`` / ``avgR`` are
        ``"-"`` when no fold produced that metric; ``storage`` is the size in
        bytes of the pickled vectors and labels.

    Raises:
        ValueError: if no documents were found, or if every split was skipped.
    """
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)

    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of documents: {len(dataPoints)}")
    if not dataPoints:
        raise ValueError("No documents available for vectorization. Please check the input directories.")
    print(f"Sample document: {dataPoints[0]}")

    Z = flastVectorization(dataPoints, dim=dim, eps=eps)
    # Each row is converted on its own, so every entry keeps an extra axis;
    # the per-fold reshape below flattens it again.
    dataPointsList = np.array([Z[i].toarray() for i in range(Z.shape[0])])
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    # vecTime covers extraction, file reading and vectorization.
    vecTime = time.perf_counter() - v0

    # Storage calculation: size of the pickled (vectors, labels) pair.
    nb = (dataPointsList, dataLabelsList)
    pickleDumpNB = os.path.join(outDir, f"flast-nb-dim{dim}-eps{eps}.pickle")
    with open(pickleDumpNB, "wb") as pickleFile:
        pickle.dump(nb, pickleFile)
    storage = os.path.getsize(pickleDumpNB)
    os.remove(pickleDumpNB)

    avgP, avgR = 0, 0
    avgTPrep, avgTPred = 0, 0
    avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest = 0, 0, 0, 0
    successFold, precisionFold, recallFold = 0, 0, 0

    for trnIdx, tstIdx in kf.split(dataPointsList, dataLabelsList):
        trainData, testData = dataPointsList[trnIdx], dataPointsList[tstIdx]
        trainLabels, testLabels = dataLabelsList[trnIdx], dataLabelsList[tstIdx]
        # Labels are 0/1, so the sum is the number of flaky examples.
        flakyTrain = int(np.sum(trainLabels))
        flakyTest = int(np.sum(testLabels))
        if flakyTrain == 0 or flakyTest == 0:
            print("Skipping fold with no flaky/non-flaky examples...")
            continue

        successFold += 1
        avgFlakyTrain += flakyTrain
        avgNonFlakyTrain += len(trainLabels) - flakyTrain
        avgFlakyTest += flakyTest
        avgNonFlakyTest += len(testLabels) - flakyTest

        trainData = trainData.reshape((trainData.shape[0], -1))
        testData = testData.reshape((testData.shape[0], -1))

        trainTime, testTime, predictLabels = flastNBClassification(trainData, trainLabels, testData)
        # Vectorization cost is attributed proportionally to the training set,
        # and per single document for prediction.
        preparationTime = (vecTime * len(trainData) / len(dataPoints)) + trainTime
        predictionTime = (vecTime / len(dataPoints)) + (testTime / len(testData))
        precision, recall = computeResults(testLabels, predictLabels)

        print(f"Precision: {precision}, Recall: {recall}")
        # A "-" marks a metric that could not be computed; such folds are
        # left out of that metric's average.
        if precision != "-":
            precisionFold += 1
            avgP += precision
        if recall != "-":
            recallFold += 1
            avgR += recall
        avgTPrep += preparationTime
        avgTPred += predictionTime

    if successFold == 0:
        # Without this guard the averages below fail with ZeroDivisionError.
        raise ValueError("No usable folds: every split lacked flaky examples in its train or test part.")

    avgP = avgP / precisionFold if precisionFold > 0 else "-"
    avgR = avgR / recallFold if recallFold > 0 else "-"
    avgTPrep /= successFold
    avgTPred /= successFold
    avgFlakyTrain /= successFold
    avgNonFlakyTrain /= successFold
    avgFlakyTest /= successFold
    avgNonFlakyTest /= successFold

    return avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest, avgP, avgR, storage, avgTPrep, avgTPred

if __name__ == "__main__":
    # Parameters setup
    flakyZip = "flaky_methods.zip"
    nonFlakyZip = "non-flakyMethods.zip"
    extractDir = "extracted"
    outDir = "results-nb/"
    os.makedirs(outDir, exist_ok=True)
    os.makedirs(extractDir, exist_ok=True)

    numSplit = 30
    testSetSize = 0.2
    kf = StratifiedShuffleSplit(n_splits=numSplit, test_size=testSetSize)

    outFile = "params-nb.csv"
    outPath = os.path.join(outDir, outFile)
    with open(outPath, "w") as fo:
        fo.write("dim,eps,precision,recall,storage,preparationTime,predictionTime\n")

    # Naive Bayes parameters to vary
    dim_values = [0, 100, 200]  # Dimensionality reduction
    eps_values = [0.1, 0.3, 0.5]  # JL epsilon

    for dim in dim_values:
        for eps in eps_values:
            print(f"dim={dim}, eps={eps}")
            results = flastNB(outDir, flakyZip, nonFlakyZip, extractDir, kf, dim, eps)
            # flastNB returns four class-count averages first; the CSV only
            # has columns for the five values after them. Passing *results to
            # a seven-placeholder format string silently dropped the last four
            # values and wrote the counts under the metric headers.
            row = (dim, eps, *results[4:])
            with open(outPath, "a") as fo:
                fo.write(",".join(str(value) for value in row) + "\n")

    print("Naive Bayes analysis completed. Results saved to:", outPath)


dim=0, eps=0.1
Number of documents: 301
Sample document: # Copyright 2021 The NetKet Authors - All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import dataclasses
import abc

from flax.core import freeze


@dataclasses.dataclass(frozen=True)
class ModuleFramework(abc.ABC):
    @staticmethod
    @abc.abstractmethod
    def is_loaded() -> bool:
        pass

    @staticmethod
    @abc.abstractmethod
    def is_my_module(module):
        pass

    @staticmethod
    @abc.abstractme