In [4]:
import os
import time
import warnings
import pickle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.random_projection import johnson_lindenstrauss_min_dim
from sklearn.random_projection import SparseRandomProjection
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

###############################################################################
# read data from file

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):  # Only consider Python files
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")
    
    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")
    
    return dataPointsList

def computeResults(testLabels, predictLabels):
    try:
        precision = precision_score(testLabels, predictLabels)
        recall = recall_score(testLabels, predictLabels)
        accuracy = accuracy_score(testLabels, predictLabels)
        print(f"Precision: {precision}, Recall: {recall}, Accuracy: {accuracy}")
    except Exception as e:
        print(f"Error computing metrics: {e}")
        precision = recall = accuracy = "-"
    return precision, recall, accuracy

###############################################################################
# FLAST

def vectorization(dataPoints, dim=0, eps=0.3):
    countVec = CountVectorizer(stop_words=None)  # Disable stop words filtering
    Z_full = countVec.fit_transform(dataPoints)
    if eps == 0:
        Z = Z_full
    else:
        if dim <= 0:
            dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
        srp = SparseRandomProjection(n_components=dim)
        Z = srp.fit_transform(Z_full)
    return Z

def classificationDecisionTree(trainData, trainLabels, testData, params):
    # training
    t0 = time.perf_counter()
    clf = DecisionTreeClassifier(
        criterion=params.get("criterion", "gini"),
        splitter=params.get("splitter", "best"),
        max_depth=params.get("max_depth"),
        min_samples_split=params.get("min_samples_split", 2),
        min_samples_leaf=params.get("min_samples_leaf", 1),
    )
    clf.fit(trainData, trainLabels)
    t1 = time.perf_counter()
    trainTime = t1 - t0

    t0 = time.perf_counter()
    predictLabels = clf.predict(testData)
    t1 = time.perf_counter()
    testTime = t1 - t0

    return trainTime, testTime, predictLabels

def decisionTree(outDir, projectBasePath, projectName, kf, dim, eps, params):
    v0 = time.perf_counter()
    
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky
    print("Data points before vectorization:", dataPoints) 
    Z = vectorization(dataPoints, dim=dim, eps=eps)
    dataPointsList = np.array([Z[i].toarray() for i in range(Z.shape[0])])
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    v1 = time.perf_counter()
    vecTime = v1 - v0

    # storage
    decisionTree = (dataPointsList, dataLabelsList)
    pickleDumpDecisionTree = os.path.join(outDir, "decision-tree.pickle")
    with open(pickleDumpDecisionTree, "wb") as pickleFile:
        pickle.dump(decisionTree, pickleFile)
    storage = os.path.getsize(pickleDumpDecisionTree)
    os.remove(pickleDumpDecisionTree)

    avgP, avgR, avgAccuracy = 0, 0, 0
    avgTPrep, avgTPred = 0, 0
    avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest = 0, 0, 0, 0
    successFold, precisionFold = 0, 0
    for (trnIdx, tstIdx) in kf.split(dataPointsList, dataLabelsList):
        trainData, testData = dataPointsList[trnIdx], dataPointsList[tstIdx]
        trainLabels, testLabels = dataLabelsList[trnIdx], dataLabelsList[tstIdx]
        if sum(trainLabels) == 0 or sum(testLabels) == 0:
            print("Skipping fold...")
            print(" Flaky Train Tests", sum(trainLabels))
            print(" Flaky Test Tests", sum(testLabels))
            continue

        successFold += 1
        avgFlakyTrain += sum(trainLabels)
        avgNonFlakyTrain += len(trainLabels) - sum(trainLabels)
        avgFlakyTest += sum(testLabels)
        avgNonFlakyTest += len(testLabels) - sum(testLabels)

        # prepare the data in the right format for Decision Tree
        nSamplesTrainData, nxTrain, nyTrain = trainData.shape
        trainData = trainData.reshape((nSamplesTrainData, nxTrain * nyTrain))
        nSamplesTestData, nxTest, nyTest = testData.shape
        testData = testData.reshape((nSamplesTestData, nxTest * nyTest))

        trainTime, testTime, predictLabels = classificationDecisionTree(trainData, trainLabels, testData, params)
        preparationTime = (vecTime * len(trainData) / len(dataPoints)) + trainTime
        predictionTime = (vecTime / len(dataPoints)) + (testTime / len(testData))
        precision, recall, accuracy = computeResults(testLabels, predictLabels)

        print(precision, recall, accuracy)
        if precision != "-":
            precisionFold += 1
            avgP += precision
        avgR += recall
        avgAccuracy += accuracy
        avgTPrep += preparationTime
        avgTPred += predictionTime

    if precisionFold == 0:
        avgP = "-"
    else:
        avgP /= precisionFold
    avgR /= successFold
    avgAccuracy /= successFold
    avgTPrep /= successFold
    avgTPred /= successFold
    avgFlakyTrain /= successFold
    avgNonFlakyTrain /= successFold
    avgFlakyTest /= successFold
    avgNonFlakyTest /= successFold

    return (avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest, avgP, avgR, avgAccuracy, storage, avgTPrep, avgTPred)

if __name__ == "__main__":
    # Parameters setup
    flakyZip = "cleaned_flaky_files.zip"
    nonFlakyZip = "reduced_nonflaky_files.zip"
    extractDir = "extracted"
    outDir = "results-DecisionTree/"
    os.makedirs(outDir, exist_ok=True)
    os.makedirs(extractDir, exist_ok=True)

    numSplit = 30
    testSetSize = 0.2
    kf = StratifiedShuffleSplit(n_splits=numSplit, test_size=testSetSize)

    outFile = "params-DecisionTree.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        # Updated the header to match the number of columns
        fo.write("criterion,splitter,max_depth,min_samples_split,min_samples_leaf,avgFlakyTrain,avgNonFlakyTrain,avgFlakyTest,avgNonFlakyTest,precision,recall,accuracy,storage,preparationTime,predictionTime\n")

    params = {
        "criterion": "gini",
        "splitter": "best",
        "max_depth": None,
        "min_samples_split": 2,
        "min_samples_leaf": 1
    }

    try:
        results = decisionTree(outDir, extractDir, "projectName", kf, dim, eps, params)
        
        # Flatten the results tuple and write to CSV
        results_flat = ','.join(map(str, results))
        print(f"Results to be written: {params['criterion']},{params['splitter']},{params['max_depth']},{params['min_samples_split']},{params['min_samples_leaf']},{results_flat}")
        
        with open(os.path.join(outDir, outFile), "a") as fo:
            fo.write(f"{params['criterion']},{params['splitter']},{params['max_depth']},{params['min_samples_split']},{params['min_samples_leaf']},{results_flat}\n")
    except Exception as e:
        print(f"Failed to complete Decision Tree analysis: {e}")

    print("Decision Tree analysis completed. Results saved to:", outFile)


Failed to complete Decision Tree analysis: name 'flakyDir' is not defined
Decision Tree analysis completed. Results saved to: params-DecisionTree.csv
