In [3]:
import os
import time
import warnings
import pickle
import numpy as np
from scipy import spatial
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.random_projection import johnson_lindenstrauss_min_dim
from sklearn.random_projection import SparseRandomProjection
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score,f1_score,matthews_corrcoef
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.exceptions import UndefinedMetricWarning

# Install a process-wide filter that silences scikit-learn's
# UndefinedMetricWarning so ill-defined metric values do not clutter the
# output of the experiments below.
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

###############################################################################
# read data from file

def getDataPoints(path):
    """Read every visible regular file directly inside ``path``.

    Entries whose name starts with "." are ignored. Entries that are not
    regular files (sub-directories, dangling links, ...) are reported and
    skipped instead of making ``open`` fail. Files are visited in sorted
    name order so the returned list does not depend on the order in which
    the filesystem happens to enumerate the directory.

    Args:
        path: Directory containing one text file per data point.

    Returns:
        A list with the full UTF-8 decoded content of each file read.

    Raises:
        OSError: If ``path`` itself cannot be listed.
    """
    dataPointsList = []
    for dataPointName in sorted(os.listdir(path)):
        if dataPointName.startswith("."):
            continue
        filePath = os.path.join(path, dataPointName)
        print(f"Attempting to open file: {filePath}")
        # isfile() is False both for missing entries and for directories.
        if not os.path.isfile(filePath):
            print(f"Not a regular file, skipping: {filePath}")
            continue
        with open(filePath, encoding="utf-8") as fileIn:
            dataPointsList.append(fileIn.read())
    return dataPointsList

def getDataPointsInfo(projectBasePath, projectName):
    """Load the tokenized test methods of one project.

    The project lives in ``<projectBasePath>/<projectName>`` and is read
    from its ``flakyMethods`` and ``nonflakyMethods`` sub-directories via
    ``getDataPoints``.

    Returns:
        A pair ``(flaky, nonFlaky)`` of lists of file contents.
    """
    projectRoot = os.path.join(projectBasePath, projectName)
    flaky = getDataPoints(os.path.join(projectRoot, "flakyMethods"))
    nonFlaky = getDataPoints(os.path.join(projectRoot, "nonflakyMethods"))
    return flaky, nonFlaky

# Example usage
projectBasePath = r"dataset"
projectName = "project"  # Replace with the actual project name

flakyMethods, nonFlakyMethods = getDataPointsInfo(projectBasePath, projectName)

# Report only how many methods were loaded. Printing the full body of every
# method floods the notebook output (the recorded run hit
# "IOPub data rate exceeded").
print("Flaky Methods:", len(flakyMethods))
print("Non-Flaky Methods:", len(nonFlakyMethods))

###############################################################################
# compute effectiveness metrics

def computeResults(testLabels, predictLabels):
    """Compute precision and recall, flagging ill-defined values with "-".

    Warnings are temporarily promoted to errors so that a metric scikit-learn
    would otherwise silently set (e.g. "precision set to 0.0") is reported
    as the placeholder string "-" instead of a number.

    The promotion happens inside ``warnings.catch_warnings()``, so the
    filters that were active before the call are restored afterwards.

    Args:
        testLabels: Ground-truth labels.
        predictLabels: Labels predicted by the classifier.

    Returns:
        A tuple ``(precision, recall)``; each item is a float, or "-" when
        computing it raised a warning or a ValueError.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("error")  # warnings become exceptions in here
        try:
            precision = precision_score(testLabels, predictLabels)
        except (Warning, ValueError):
            precision = "-"
        try:
            recall = recall_score(testLabels, predictLabels)
        except (Warning, ValueError):
            recall = "-"
    return precision, recall

###############################################################################
# FLAST

def vectorization(dataPoints, dim=0, eps=0.3):
    """Turn documents into token-count vectors, optionally randomly projected.

    Args:
        dataPoints: Iterable of document strings.
        dim: Number of projected dimensions; a value <= 0 asks for the
            Johnson-Lindenstrauss minimum dimension for ``eps``.
        eps: Value forwarded to ``johnson_lindenstrauss_min_dim``;
            ``0`` disables the projection entirely.

    Returns:
        The count matrix itself when ``eps == 0``, otherwise its sparse
        random projection.
    """
    tokenCounts = CountVectorizer().fit_transform(dataPoints)
    if eps == 0:
        return tokenCounts

    targetDim = dim
    if targetDim <= 0:
        targetDim = johnson_lindenstrauss_min_dim(tokenCounts.shape[0], eps=eps)
    projector = SparseRandomProjection(n_components=targetDim)
    return projector.fit_transform(tokenCounts)

def classificationRandomForest(trainData, trainLabels, testData, params):
    """Fit a random forest and predict the test set, timing both phases.

    Args:
        trainData: Training feature matrix.
        trainLabels: Training labels.
        testData: Test feature matrix.
        params: Dict of optional hyper-parameters; missing keys fall back to
            the defaults listed below.

    Returns:
        ``(trainTime, testTime, predictLabels)`` where the times are in
        seconds as measured by ``time.perf_counter``.
    """
    # training (classifier construction is part of the measured time)
    fitStart = time.perf_counter()
    forestOptions = {
        "n_estimators": params.get("n_estimators", 100),
        "criterion": params.get("criterion", "gini"),
        "max_depth": params.get("max_depth"),
        "min_samples_split": params.get("min_samples_split", 2),
        "min_samples_leaf": params.get("min_samples_leaf", 1),
        "n_jobs": params.get("n_jobs", -1),
        "random_state": params.get("random_state", 42),
    }
    clf = RandomForestClassifier(**forestOptions)
    clf.fit(trainData, trainLabels)
    trainTime = time.perf_counter() - fitStart

    # prediction
    predictStart = time.perf_counter()
    predictLabels = clf.predict(testData)
    testTime = time.perf_counter() - predictStart

    return trainTime, testTime, predictLabels

def _averageOrDash(total, count):
    """Return ``total / count``, or the placeholder "-" when ``count`` is 0."""
    return total / count if count else "-"


def randomForest(outDir, projectBasePath, projectName, kf, dim, eps, params):
    """Evaluate a random forest on one project over the splits produced by ``kf``.

    The project's flaky / non-flaky methods are loaded and vectorized once,
    the size of the pickled data set is measured, and then a classifier is
    trained and evaluated on every split. Splits whose train or test part
    contains no flaky sample are skipped.

    Args:
        outDir: Existing directory used for the temporary pickle file.
        projectBasePath: Directory that contains the project directory.
        projectName: Name of the project directory.
        kf: Splitter exposing ``split(X, y)``.
        dim: Projection dimension forwarded to ``vectorization``.
        eps: JL error forwarded to ``vectorization``.
        params: Hyper-parameters forwarded to ``classificationRandomForest``.

    Returns:
        ``(avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest,
        avgP, avgR, storage, avgTPrep, avgTPred)``. An average for which no
        fold contributed a value is reported as "-"; in particular every
        average is "-" when all folds were skipped.
    """
    v0 = time.perf_counter()
    dataPointsFlaky, dataPointsNonFlaky = getDataPointsInfo(projectBasePath, projectName)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky
    Z = vectorization(dataPoints, dim=dim, eps=eps)
    dataPointsList = np.array([Z[i].toarray() for i in range(Z.shape[0])])
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # storage: size on disk of the pickled (data, labels) pair
    pickleDumpRandomForest = os.path.join(outDir, "random-forest.pickle")
    try:
        with open(pickleDumpRandomForest, "wb") as pickleFile:
            pickle.dump((dataPointsList, dataLabelsList), pickleFile)
        storage = os.path.getsize(pickleDumpRandomForest)
    finally:
        # never leave the temporary dump behind, even if dumping failed
        if os.path.exists(pickleDumpRandomForest):
            os.remove(pickleDumpRandomForest)

    avgP, avgR = 0, 0
    avgTPrep, avgTPred = 0, 0
    avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest = 0, 0, 0, 0
    successFold, precisionFold, recallFold = 0, 0, 0
    for (trnIdx, tstIdx) in kf.split(dataPointsList, dataLabelsList):
        trainData, testData = dataPointsList[trnIdx], dataPointsList[tstIdx]
        trainLabels, testLabels = dataLabelsList[trnIdx], dataLabelsList[tstIdx]
        flakyTrain = int(trainLabels.sum())
        flakyTest = int(testLabels.sum())
        if flakyTrain == 0 or flakyTest == 0:
            print("Skipping fold...")
            print(" Flaky Train Tests", flakyTrain)
            print(" Flaky Test Tests", flakyTest)
            continue

        successFold += 1
        avgFlakyTrain += flakyTrain
        avgNonFlakyTrain += len(trainLabels) - flakyTrain
        avgFlakyTest += flakyTest
        avgNonFlakyTest += len(testLabels) - flakyTest

        # prepare the data in the right format for Random Forest:
        # flatten each sample's trailing axes into one feature vector
        trainData = trainData.reshape((len(trainData), -1))
        testData = testData.reshape((len(testData), -1))

        trainTime, testTime, predictLabels = classificationRandomForest(trainData, trainLabels, testData, params)
        preparationTime = (vecTime * len(trainData) / len(dataPoints)) + trainTime
        predictionTime = (vecTime / len(dataPoints)) + (testTime / len(testData))
        (precision, recall) = computeResults(testLabels, predictLabels)

        print(precision, recall)
        # "-" marks a metric that was undefined for this fold; leave it out
        # of the corresponding average instead of adding a string to a number
        if precision != "-":
            precisionFold += 1
            avgP += precision
        if recall != "-":
            recallFold += 1
            avgR += recall
        avgTPrep += preparationTime
        avgTPred += predictionTime

    if successFold == 0:
        print("No usable fold for project", projectName)

    avgP = _averageOrDash(avgP, precisionFold)
    avgR = _averageOrDash(avgR, recallFold)
    avgTPrep = _averageOrDash(avgTPrep, successFold)
    avgTPred = _averageOrDash(avgTPred, successFold)
    avgFlakyTrain = _averageOrDash(avgFlakyTrain, successFold)
    avgNonFlakyTrain = _averageOrDash(avgNonFlakyTrain, successFold)
    avgFlakyTest = _averageOrDash(avgFlakyTest, successFold)
    avgNonFlakyTest = _averageOrDash(avgNonFlakyTest, successFold)

    return (avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest, avgP, avgR, storage, avgTPrep, avgTPred)

if __name__ == "__main__":
    # NOTE(review): the example usage earlier in this file loads the project
    # from the "dataset" directory, whereas an empty base path resolves the
    # project relative to the current working directory; confirm which one
    # is intended.
    projectBasePath = ""
    projectList = [
        "project"  # Replace with the actual project name
    ]
    outDir = "results-RandomForest"
    outFile = "result_Random_Forest.csv"
    os.makedirs(outDir, exist_ok=True)
    resultPath = os.path.join(outDir, outFile)
    # The header lists exactly the ten values written per project below.
    with open(resultPath, "w") as fo:
        fo.write("dataset,flakyTrain,nonFlakyTrain,flakyTest,nonFlakyTest,precision,recall,storage,preparationTime,predictionTime\n")

    numSplit = 30
    testSetSize = 0.2
    kf = StratifiedShuffleSplit(n_splits=numSplit, test_size=testSetSize)

    # FLAST
    dim = 0  # number of dimensions (0: JL with error eps)
    eps = 0.3  # JL eps
    # "criterion" was listed twice ("entropy" then "gini") with a missing
    # comma, which is a syntax error. "gini" is kept because the later entry
    # is the one a dict literal would retain.
    params = {
        "criterion": "gini",
        "max_depth": 300,
        "min_samples_split": 5,
        "min_samples_leaf": 1
    }
    for projectName in projectList:
        print(projectName.upper(), "FLAST")
        (flakyTrain, nonFlakyTrain, flakyTest, nonFlakyTest, avgP, avgR, storage, avgTPrep, avgTPred) = randomForest(outDir, projectBasePath, projectName, kf, dim, eps, params)
        with open(resultPath, "a") as fo:
            fo.write("{},{},{},{},{},{},{},{},{},{}\n".format(projectName, flakyTrain, nonFlakyTrain, flakyTest, nonFlakyTest, avgP, avgR, storage, avgTPrep, avgTPred))

            
            


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Attempting to open file: dataset\project\nonflakyMethods\Qiskit_qiskit_PR3307_0a45dea238c813ade4e9421a553f86f1bbe5d068_qiskit_pulse_reschedule.txt
Attempting to open file: dataset\project\nonflakyMethods\Qiskit_qiskit_PR3307_0a45dea238c813ade4e9421a553f86f1bbe5d068_test_python_pulse_test_reschedule.txt
Attempting to open file: dataset\project\nonflakyMethods\Qiskit_qiskit_PR3348_d3bf2374bbe5f2abdf19d97a078bb3bdcfe7a07a_qiskit_quantum_info_synthesis_one_qubit_decompose.txt
Attempting to open file: dataset\project\nonflakyMethods\Qiskit_qiskit_PR3348_d3bf2374bbe5f2abdf19d97a078bb3bdcfe7a07a_qiskit_transpiler_passes_ms_basis_decomposer.txt
Attempting to open file: dataset\project\nonflakyMethods\Qiskit_qiskit_PR338_9364c2b98cfdda74993df973b030db9bb006477d_qiskit_dagcircuit__dagcircuit.txt
Attempting to open file: dataset\project\nonflakyMethods\Qiskit_qiskit_PR338_9364c2b98cfdda74993df973b030db9bb006477d_qiskit_extensions_standard_barrier.txt
Attempting to open file: dataset\project\nonfl

1.0 0.6111111111111112
1.0 0.4444444444444444
1.0 0.5555555555555556
0.9 0.5
0.875 0.3888888888888889
1.0 0.5
1.0 0.4444444444444444
1.0 0.6111111111111112
1.0 0.5
1.0 0.4444444444444444
1.0 0.5555555555555556
1.0 0.5555555555555556
0.9 0.5
1.0 0.3888888888888889
1.0 0.7222222222222222
1.0 0.5555555555555556
1.0 0.5
1.0 0.4444444444444444
1.0 0.3888888888888889
1.0 0.5555555555555556
1.0 0.5
0.8888888888888888 0.4444444444444444
1.0 0.3888888888888889
0.9 0.5
0.8888888888888888 0.4444444444444444
1.0 0.6666666666666666
1.0 0.3333333333333333
1.0 0.1111111111111111
0.9 0.5
1.0 0.3333333333333333


In [None]:
import os
import time
import zipfile
import pickle
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.random_projection import SparseRandomProjection

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Unpack every member of the archive ``zip_file`` into ``extract_to``."""
    archive = zipfile.ZipFile(zip_file, mode='r')
    with archive:
        archive.extractall(path=extract_to)

def getDataPoints(path):
    """Collect the stripped, non-empty content of all ``.py`` files under ``path``.

    The directory tree is walked recursively. Sub-directories and files are
    visited in sorted name order so the returned list is the same on every
    machine instead of following the filesystem's enumeration order.

    Args:
        path: Root directory to search.

    Returns:
        A list of file contents (leading/trailing whitespace removed). Files
        that are empty after stripping are reported and left out. An empty
        list is returned when ``path`` does not exist.
    """
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        dirs.sort()  # in place, so os.walk descends in sorted order
        for dataPointName in sorted(files):
            if not dataPointName.endswith(".py"):  # Only consider Python files
                continue
            file_path = os.path.join(root, dataPointName)
            with open(file_path, encoding="utf-8") as fileIn:
                dp = fileIn.read().strip()
            if dp:  # Ensure the document is not empty
                dataPointsList.append(dp)
            else:
                print(f"Empty file: {file_path}")

    if not dataPointsList:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList

def flastVectorization(dataPoints, dim=0, eps=0.3):
    """Vectorize documents with CountVectorizer, optionally randomly projected.

    Args:
        dataPoints: Iterable of document strings.
        dim: Number of projected dimensions; a value <= 0 asks for the
            Johnson-Lindenstrauss minimum dimension for ``eps``.
        eps: Value forwarded to ``johnson_lindenstrauss_min_dim``;
            ``0`` disables the projection entirely.

    Returns:
        The count matrix itself when ``eps == 0``, otherwise its sparse
        random projection.
    """
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z_full = countVec.fit_transform(dataPoints)
    if eps == 0:
        return Z_full

    if dim <= 0:
        # Imported locally: the import list that precedes this function does
        # not bring this name into scope, so the dim <= 0 path would otherwise
        # fail with a NameError when this code is run on its own.
        from sklearn.random_projection import johnson_lindenstrauss_min_dim
        dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
    srp = SparseRandomProjection(n_components=dim)
    return srp.fit_transform(Z_full)

###############################################################################
# Random Forest with GridSearchCV and Multiple Scoring Metrics including MCC

def flastRFWithGridSearchCV(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, dim, eps, combination_label):
    """Grid-search random forest hyper-parameters on a flaky / non-flaky corpus.

    Both archives are extracted below ``extractDir``, their ``.py`` files are
    vectorized, and a ``GridSearchCV`` over a fixed parameter grid is run with
    stratified ``n_splits``-fold cross-validation. The mean test score of
    every parameter combination is written to
    ``<outDir>/<combination_label>-params-rf-<n_splits>-folds.csv``.

    Args:
        outDir: Directory for the CSV report (created if missing).
        flakyZip: Zip archive with the flaky files (label 1).
        nonFlakyZip: Zip archive with the non-flaky files (label 0).
        extractDir: Directory under which ``flaky/`` and ``nonFlaky/`` are
            created and filled.
        n_splits: Number of cross-validation folds.
        dim: Projection dimension forwarded to ``flastVectorization``.
        eps: JL error forwarded to ``flastVectorization``.
        combination_label: Prefix of the report file name.

    Returns:
        ``(best_params, best_score)`` of the candidate with the best mean F1.

    Raises:
        ValueError: If no document was found in either archive.
    """
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)

    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")

    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorization
    Z = flastVectorization(dataPoints, dim=dim, eps=eps)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    # Measured from v0, so this covers extraction, reading and vectorization.
    vecTime = time.perf_counter() - v0

    # Define Random Forest model
    rf_model = RandomForestClassifier()

    # Define parameter grid for hyperparameter tuning
    param_grid = {
        'n_estimators': [10, 50, 100, 300, 500],  # Number of trees
        'max_depth': [10, 30, 50, 100, 300, 500],  # Maximum depth of each tree
        'min_samples_split': [2, 5],  # Minimum number of samples required to split an internal node
        'min_samples_leaf': [1, 2],  # Minimum number of samples required to be at a leaf node
        "criterion": ["gini", "entropy"],  # Function to measure the quality of a split
    }

    # Custom scoring functions including MCC
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': make_scorer(matthews_corrcoef)
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV with multiple scoring metrics; the candidate with
    # the best mean F1 is refit. Train scores are not requested because only
    # the mean *test* scores are reported below.
    grid_search = GridSearchCV(rf_model, param_grid, cv=skf, scoring=scoring, refit='f1', verbose=1, return_train_score=False)

    # Fit on the whole data set; the train/test splitting is done by skf.
    grid_search.fit(Z, dataLabelsList)

    # Get the best parameters and the best score for F1
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score}")

    # Save the results
    os.makedirs(outDir, exist_ok=True)
    outFile = f"{combination_label}-params-rf-{n_splits}-folds.csv"
    results = grid_search.cv_results_
    # Per-document preparation time is the same for every candidate.
    preparationTime = vecTime / len(dataPoints)
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write("n_estimators,max_depth,min_samples_split,min_samples_leaf,criterion,accuracy,precision,recall,f1,mcc,preparationTime\n")
        for idx, param in enumerate(results['params']):
            accuracy = results['mean_test_accuracy'][idx]
            precision = results['mean_test_precision'][idx]
            recall = results['mean_test_recall'][idx]
            f1 = results['mean_test_f1'][idx]
            mcc = results['mean_test_mcc'][idx]
            fo.write(f"{param['n_estimators']},{param['max_depth']},{param['min_samples_split']},{param['min_samples_leaf']},{param['criterion']},{accuracy},{precision},{recall},{f1},{mcc},{preparationTime}\n")

    print(f"Random Forest analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    return best_params, best_score

if __name__ == "__main__":
    # Input archives: one flaky set, compared against a reduced and a full
    # non-flaky set
    flakyZip = "compressedDataset/flaky_files.zip"
    nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
    largerNonFlakyZip = "compressedDataset/all_nonflaky_files.zip"

    # Each combination gets its own result and extraction directory so the
    # two experiments never share files
    outDirEqual = "results/equal_flaky_nonflaky/"
    outDirLarger = "results/larger_nonflaky/"
    extractDirEqual = "extracted/equal_flaky_nonflaky/"
    extractDirLarger = "extracted/larger_nonflaky/"
    for workDir in (outDirEqual, outDirLarger, extractDirEqual, extractDirLarger):
        os.makedirs(workDir, exist_ok=True)

    # First combination: flaky vs smaller non-flaky, 5 folds then 3 folds
    print("Starting Random Forest analysis for flaky vs smaller non-flaky files (47 each)...")
    best_params_5folds_1, best_score_5folds_1 = flastRFWithGridSearchCV(
        outDir=outDirEqual, flakyZip=flakyZip, nonFlakyZip=nonFlakyZip,
        extractDir=extractDirEqual, n_splits=5, dim=100, eps=0.3,
        combination_label="equal")
    best_params_3folds_1, best_score_3folds_1 = flastRFWithGridSearchCV(
        outDir=outDirEqual, flakyZip=flakyZip, nonFlakyZip=nonFlakyZip,
        extractDir=extractDirEqual, n_splits=3, dim=100, eps=0.3,
        combination_label="equal")

    for heading, bestParams, bestScore in (
        ("Best results for 5-fold on equal combination:", best_params_5folds_1, best_score_5folds_1),
        ("Best results for 3-fold on equal combination:", best_params_3folds_1, best_score_3folds_1),
    ):
        print(heading)
        print(f"Best Parameters: {bestParams}")
        print(f"Best F1 Score: {bestScore}")

    # Second combination: flaky vs larger non-flaky, 5 folds then 3 folds
    print("Starting Random Forest analysis for flaky vs larger non-flaky files...")
    best_params_5folds_2, best_score_5folds_2 = flastRFWithGridSearchCV(
        outDir=outDirLarger, flakyZip=flakyZip, nonFlakyZip=largerNonFlakyZip,
        extractDir=extractDirLarger, n_splits=5, dim=100, eps=0.3,
        combination_label="larger")
    best_params_3folds_2, best_score_3folds_2 = flastRFWithGridSearchCV(
        outDir=outDirLarger, flakyZip=flakyZip, nonFlakyZip=largerNonFlakyZip,
        extractDir=extractDirLarger, n_splits=3, dim=100, eps=0.3,
        combination_label="larger")

    for heading, bestParams, bestScore in (
        ("Best results for 5-fold on larger non-flaky combination:", best_params_5folds_2, best_score_5folds_2),
        ("Best results for 3-fold on larger non-flaky combination:", best_params_3folds_2, best_score_3folds_2),
    ):
        print(heading)
        print(f"Best Parameters: {bestParams}")
        print(f"Best F1 Score: {bestScore}")