In [15]:
import os
import time
import warnings
import pickle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.random_projection import johnson_lindenstrauss_min_dim
from sklearn.random_projection import SparseRandomProjection
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

###############################################################################
# read data from file

def getDataPoints(path):
    dataPointsList = []
    for dataPointName in os.listdir(path):
        if dataPointName[0] == ".":
            continue
        filePath = os.path.join(path, dataPointName)
        print(f"Attempting to open file: {filePath}")
        if not os.path.exists(filePath):
            print(f"File does not exist: {filePath}")
            continue
        with open(filePath, encoding="utf-8") as fileIn:
            dp = fileIn.read()
        dataPointsList.append(dp)
    return dataPointsList

def getDataPointsInfo(projectBasePath, projectName):
    # get list of tokenized test methods
    projectPath = os.path.join(projectBasePath, projectName)
    flakyPath = os.path.join(projectPath, "flakyMethods")
    nonFlakyPath = os.path.join(projectPath, "nonflakyMethods")  # Updated path
    return getDataPoints(flakyPath), getDataPoints(nonFlakyPath)

# Example usage
projectBasePath = r"dataset"
projectName = "project"  # Replace with the actual project name

flakyMethods, nonFlakyMethods = getDataPointsInfo(projectBasePath, projectName)

print("Flaky Methods:")
for method in flakyMethods:
    print(method)

print("\nNon-Flaky Methods:")
for method in nonFlakyMethods:
    print(method)

###############################################################################
# compute effectiveness metrics

def computeResults(testLabels, predictLabels):
    warnings.filterwarnings("error")  # to catch warnings, e.g., "prec set to 0.0"
    try:
        precision = precision_score(testLabels, predictLabels)
    except:
        precision = "-"
    try:
        recall = recall_score(testLabels, predictLabels)
    except:
        recall = "-"
    warnings.resetwarnings()  # warnings are no more errors
    return precision, recall

###############################################################################
# FLAST

def vectorization(dataPoints, dim=0, eps=0.3):
    countVec = CountVectorizer(stop_words=None)  # Disable stop words filtering
    Z_full = countVec.fit_transform(dataPoints)
    if eps == 0:
        Z = Z_full
    else:
        if dim <= 0:
            dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
        srp = SparseRandomProjection(n_components=dim)
        Z = srp.fit_transform(Z_full)
    return Z

def classificationDecisionTree(trainData, trainLabels, testData, params):
    # training
    t0 = time.perf_counter()
    clf = DecisionTreeClassifier(
        criterion=params.get("criterion", "gini"),
        splitter=params.get("splitter", "best"),
        max_depth=params.get("max_depth"),
        min_samples_split=params.get("min_samples_split", 2),
        min_samples_leaf=params.get("min_samples_leaf", 1),
    )
    clf.fit(trainData, trainLabels)
    t1 = time.perf_counter()
    trainTime = t1 - t0

    t0 = time.perf_counter()
    predictLabels = clf.predict(testData)
    t1 = time.perf_counter()
    testTime = t1 - t0

    return trainTime, testTime, predictLabels

def decisionTree(outDir, projectBasePath, projectName, kf, dim, eps, params):
    v0 = time.perf_counter()
    dataPointsFlaky, dataPointsNonFlaky = getDataPointsInfo(projectBasePath, projectName)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky
    print("Data points before vectorization:", dataPoints) 
    Z = vectorization(dataPoints, dim=dim, eps=eps)
    dataPointsList = np.array([Z[i].toarray() for i in range(Z.shape[0])])
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    v1 = time.perf_counter()
    vecTime = v1 - v0

    # storage
    decisionTree = (dataPointsList, dataLabelsList)
    pickleDumpDecisionTree = os.path.join(outDir, "decision-tree.pickle")
    with open(pickleDumpDecisionTree, "wb") as pickleFile:
        pickle.dump(decisionTree, pickleFile)
    storage = os.path.getsize(pickleDumpDecisionTree)
    os.remove(pickleDumpDecisionTree)

    avgP, avgR = 0, 0
    avgTPrep, avgTPred = 0, 0
    avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest = 0, 0, 0, 0
    successFold, precisionFold = 0, 0
    for (trnIdx, tstIdx) in kf.split(dataPointsList, dataLabelsList):
        trainData, testData = dataPointsList[trnIdx], dataPointsList[tstIdx]
        trainLabels, testLabels = dataLabelsList[trnIdx], dataLabelsList[tstIdx]
        if sum(trainLabels) == 0 or sum(testLabels) == 0:
            print("Skipping fold...")
            print(" Flaky Train Tests", sum(trainLabels))
            print(" Flaky Test Tests", sum(testLabels))
            continue

        successFold += 1
        avgFlakyTrain += sum(trainLabels)
        avgNonFlakyTrain += len(trainLabels) - sum(trainLabels)
        avgFlakyTest += sum(testLabels)
        avgNonFlakyTest += len(testLabels) - sum(testLabels)

        # prepare the data in the right format for Decision Tree
        nSamplesTrainData, nxTrain, nyTrain = trainData.shape
        trainData = trainData.reshape((nSamplesTrainData, nxTrain * nyTrain))
        nSamplesTestData, nxTest, nyTest = testData.shape
        testData = testData.reshape((nSamplesTestData, nxTest * nyTest))

        trainTime, testTime, predictLabels = classificationDecisionTree(trainData, trainLabels, testData, params)
        preparationTime = (vecTime * len(trainData) / len(dataPoints)) + trainTime
        predictionTime = (vecTime / len(dataPoints)) + (testTime / len(testData))
        (precision, recall) = computeResults(testLabels, predictLabels)

        print(precision, recall)
        if precision != "-":
            precisionFold += 1
            avgP += precision
        avgR += recall
        avgTPrep += preparationTime
        avgTPred += predictionTime

    if precisionFold == 0:
        avgP = "-"
    else:
        avgP /= precisionFold
    avgR /= successFold
    avgTPrep /= successFold
    avgTPred /= successFold
    avgFlakyTrain /= successFold
    avgNonFlakyTrain /= successFold
    avgFlakyTest /= successFold
    avgNonFlakyTest /= successFold

    return (avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest, avgP, avgR, storage, avgTPrep, avgTPred)

if __name__ == "__main__":
    projectBasePath = "dataset"
    projectList = [
        "project"  # Replace with the actual project name
    ]
    outDir = "results-DecisionTree"
    outFile = "result_Decision_Tree.csv"
    os.makedirs(outDir, exist_ok=True)
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write("dataset,flakyTrain,nonFlakyTrain,flakyTest,nonFlakyTest,precision,recall,storage,preparationTime,predictionTime\n")

    numSplit = 30
    testSetSize = 0.2
    kf = StratifiedShuffleSplit(n_splits=numSplit, test_size=testSetSize)

    # FLAST
    dim = 0  # number of dimensions (0: JL with error eps)
    eps = 0.3  # JL eps
    params = {
        "criterion": "gini",
        "splitter": "best",
        "max_depth": None,
        "min_samples_split": 2,
        "min_samples_leaf": 1
    }
    for projectName in projectList:
        print(projectName.upper(), "FLAST")
        (flakyTrain, nonFlakyTrain, flakyTest, nonFlakyTest, avgP, avgR, storage, avgTPrep, avgTPred) = decisionTree(outDir, projectBasePath, projectName, kf, dim, eps, params)
        with open(os.path.join(outDir, outFile), "a") as fo:
            fo.write("{},{},{},{},{},{},{},{},{},{}\n".format(projectName, flakyTrain, nonFlakyTrain, flakyTest, nonFlakyTest, avgP, avgR, storage, avgTPrep, avgTPred))


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



0.7777777777777778 0.7777777777777778
0.6842105263157895 0.7222222222222222
0.75 0.8333333333333334
0.6 0.6666666666666666
0.7 0.7777777777777778
0.6875 0.6111111111111112
0.625 0.5555555555555556
0.6 0.5
0.7333333333333333 0.6111111111111112
0.6521739130434783 0.8333333333333334
0.9166666666666666 0.6111111111111112
0.65 0.7222222222222222
0.7 0.7777777777777778
0.56 0.7777777777777778
0.5416666666666666 0.7222222222222222
0.6086956521739131 0.7777777777777778
0.7368421052631579 0.7777777777777778
0.5882352941176471 0.5555555555555556
0.7272727272727273 0.4444444444444444
0.7 0.7777777777777778
0.8 0.6666666666666666
0.631578947368421 0.6666666666666666
0.7 0.7777777777777778
0.6111111111111112 0.6111111111111112
0.6923076923076923 0.5
0.6875 0.6111111111111112
0.631578947368421 0.6666666666666666
0.5454545454545454 0.3333333333333333
0.7142857142857143 0.5555555555555556
0.75 0.6666666666666666
