# In [1]:
import os
import time
import warnings
import pickle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.random_projection import johnson_lindenstrauss_min_dim
from sklearn.random_projection import SparseRandomProjection
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.exceptions import UndefinedMetricWarning
from itertools import product

# Ignore scikit-learn's UndefinedMetricWarning for the whole process
# (presumably to keep the per-fold metric output readable when a score is
# ill-defined for a fold — confirm this is the intent).
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

###############################################################################
# read data from file

def extract_zip(zip_file, extract_to):
    """Extract every member of a zip archive into a directory.

    Args:
        zip_file: Path of the ``.zip`` archive to read.
        extract_to: Directory the archive members are written under.
    """
    # Imported here because the module-level imports never bring ``zipfile``
    # into scope; without it the first call failed with NameError.
    import zipfile

    with zipfile.ZipFile(zip_file, "r") as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Return the stripped text of every non-empty ``.py`` file under ``path``.

    The directory tree is walked recursively. Files whose content is empty
    after stripping whitespace are reported on stdout and left out. A missing
    directory, or one without any usable file, is also reported on stdout and
    yields an empty list.
    """
    collected = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return collected

    for currentDir, _subDirs, fileNames in os.walk(path):
        for fileName in fileNames:
            # Only Python sources count as data points.
            if not fileName.endswith(".py"):
                continue
            fullPath = os.path.join(currentDir, fileName)
            with open(fullPath, encoding="utf-8") as handle:
                content = handle.read().strip()
            if content:
                collected.append(content)
            else:
                print(f"Empty file: {fullPath}")

    if not collected:
        print(f"No valid documents found in directory: {path}")

    return collected


def computeResults(testLabels, predictLabels):
    """Score predictions against the true labels and print the scores.

    Returns:
        ``(precision, recall, accuracy, f1)``. If anything raises while the
        scores are computed or printed, the error is printed and all four
        entries are the placeholder string ``"-"``.
    """
    try:
        scores = tuple(
            metric(testLabels, predictLabels)
            for metric in (precision_score, recall_score, accuracy_score, f1_score)
        )
        precision, recall, accuracy, f1 = scores
        print(f"Precision: {precision}, Recall: {recall}, Accuracy: {accuracy}, F1 Score: {f1}")
    except Exception as err:
        print(f"Error computing metrics: {err}")
        # Callers test for "-" to detect a fold without usable metrics.
        scores = ("-",) * 4
    return scores

###############################################################################
# FLAST

def vectorization(dataPoints, dim=0, eps=0.3):
    """Build token-count vectors for the documents, optionally reduced.

    Args:
        dataPoints: Documents handed to ``CountVectorizer.fit_transform``.
        dim: Number of projection components; when ``<= 0`` it is derived
            from ``johnson_lindenstrauss_min_dim`` for the number of documents.
        eps: Passed to ``johnson_lindenstrauss_min_dim``; ``0`` disables the
            projection and returns the raw count matrix.

    Returns:
        The count matrix, or its ``SparseRandomProjection`` when ``eps != 0``.
    """
    tokenCounts = CountVectorizer().fit_transform(dataPoints)
    if eps == 0:
        return tokenCounts

    nComponents = (
        johnson_lindenstrauss_min_dim(tokenCounts.shape[0], eps=eps)
        if dim <= 0
        else dim
    )
    projector = SparseRandomProjection(n_components=nComponents)
    return projector.fit_transform(tokenCounts)

def classificationRandomForest(trainData, trainLabels, testData, params):
    """Fit a Random Forest on the training split and predict the test split.

    ``params`` is a mapping of hyperparameters; missing keys fall back to
    ``n_estimators=100``, ``criterion="gini"``, ``max_depth=None``,
    ``min_samples_split=2``, ``min_samples_leaf=1``, ``n_jobs=-1`` and
    ``random_state=42``.

    Returns:
        ``(trainTime, testTime, predictLabels)`` where both times are
        ``time.perf_counter`` differences in seconds.
    """
    # Timed region: model construction plus fitting.
    fitStart = time.perf_counter()
    forestOptions = {
        "n_estimators": params.get("n_estimators", 100),
        "criterion": params.get("criterion", "gini"),
        "max_depth": params.get("max_depth"),
        "min_samples_split": params.get("min_samples_split", 2),
        "min_samples_leaf": params.get("min_samples_leaf", 1),
        "n_jobs": params.get("n_jobs", -1),
        "random_state": params.get("random_state", 42),
    }
    model = RandomForestClassifier(**forestOptions)
    model.fit(trainData, trainLabels)
    trainTime = time.perf_counter() - fitStart

    # Timed region: prediction only.
    predictStart = time.perf_counter()
    predictLabels = model.predict(testData)
    testTime = time.perf_counter() - predictStart

    return trainTime, testTime, predictLabels

def randomForest(outDir, projectBasePath, projectName, kf, dim, eps, params):
    """Cross-validate a Random Forest on flaky / non-flaky tests and average the results.

    Args:
        outDir: Existing directory where a temporary pickle is written to
            measure the storage size of the vectorized data set.
        projectBasePath: Directory containing the ``flaky`` and ``nonFlaky``
            sub-directories read with ``getDataPoints``.
        projectName: Not used by this function; kept for interface
            compatibility.
        kf: Splitter whose ``split(X, y)`` yields ``(train_idx, test_idx)``.
        dim: Forwarded to ``vectorization``.
        eps: Forwarded to ``vectorization``.
        params: Hyperparameters forwarded to ``classificationRandomForest``.

    Returns:
        ``(avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest,
        avgP, avgR, avgAccuracy, avgF1, storage, avgTPrep, avgTPred)``.
        The four metric averages are ``"-"`` when no fold produced metrics.

    Raises:
        ValueError: If every fold was skipped because its training or test
            split contained no flaky sample.
    """
    v0 = time.perf_counter()
    # Read from the directory passed in; this previously relied on a
    # module-level ``extractDir`` global that only exists in the script.
    flakyDir = os.path.join(projectBasePath, 'flaky')
    nonFlakyDir = os.path.join(projectBasePath, 'nonFlaky')
    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky
    print("Data points before vectorization:",len(dataPoints))
    Z = vectorization(dataPoints, dim=dim, eps=eps)
    # One dense (1, n_features) row per document -> array of rank 3.
    dataPointsList = np.array([Z[i].toarray() for i in range(Z.shape[0])])
    # Label 1 = flaky, 0 = non-flaky, in the same order as dataPoints.
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    v1 = time.perf_counter()
    vecTime = v1 - v0

    # storage: size on disk of the pickled (vectors, labels) pair
    dataset = (dataPointsList, dataLabelsList)
    pickleDumpRandomForest = os.path.join(outDir, "random-forest.pickle")
    with open(pickleDumpRandomForest, "wb") as pickleFile:
        pickle.dump(dataset, pickleFile)
    storage = os.path.getsize(pickleDumpRandomForest)
    os.remove(pickleDumpRandomForest)

    # Flatten each sample to a 1-D feature vector once, instead of per fold.
    nSamples, nx, ny = dataPointsList.shape
    flatData = dataPointsList.reshape((nSamples, nx * ny))

    avgP, avgR, avgAccuracy, avgF1 = 0, 0, 0, 0
    avgTPrep, avgTPred = 0, 0
    avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest = 0, 0, 0, 0
    successFold, scoredFold = 0, 0
    for (trnIdx, tstIdx) in kf.split(dataPointsList, dataLabelsList):
        trainData, testData = flatData[trnIdx], flatData[tstIdx]
        trainLabels, testLabels = dataLabelsList[trnIdx], dataLabelsList[tstIdx]
        flakyTrain, flakyTest = sum(trainLabels), sum(testLabels)
        if flakyTrain == 0 or flakyTest == 0:
            print("Skipping fold...")
            print(" Flaky Train Tests", flakyTrain)
            print(" Flaky Test Tests", flakyTest)
            continue

        successFold += 1
        avgFlakyTrain += flakyTrain
        avgNonFlakyTrain += len(trainLabels) - flakyTrain
        avgFlakyTest += flakyTest
        avgNonFlakyTest += len(testLabels) - flakyTest

        trainTime, testTime, predictLabels = classificationRandomForest(trainData, trainLabels, testData, params)
        # Vectorization cost is attributed proportionally to the samples used.
        preparationTime = (vecTime * len(trainData) / len(dataPoints)) + trainTime
        predictionTime = (vecTime / len(dataPoints)) + (testTime / len(testData))
        precision, recall, accuracy, f1 = computeResults(testLabels, predictLabels)

        print(precision, recall, accuracy, f1)
        # computeResults reports "-" for all four metrics together when
        # scoring failed; adding those placeholders to the numeric sums
        # would raise TypeError, so such a fold contributes timings only.
        if precision != "-":
            scoredFold += 1
            avgP += precision
            avgR += recall
            avgAccuracy += accuracy
            avgF1 += f1
        avgTPrep += preparationTime
        avgTPred += predictionTime

    if successFold == 0:
        # Previously surfaced as an unexplained ZeroDivisionError.
        raise ValueError(
            "No usable fold: every split had no flaky sample in its "
            "training or test set"
        )

    if scoredFold == 0:
        avgP = avgR = avgAccuracy = avgF1 = "-"
    else:
        avgP /= scoredFold
        avgR /= scoredFold
        avgAccuracy /= scoredFold
        avgF1 /= scoredFold
    avgTPrep /= successFold
    avgTPred /= successFold
    avgFlakyTrain /= successFold
    avgNonFlakyTrain /= successFold
    avgFlakyTest /= successFold
    avgNonFlakyTest /= successFold

    return (avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest, avgP, avgR, avgAccuracy, avgF1, storage, avgTPrep, avgTPred)


if __name__ == "__main__":
    # Grid search over Random Forest hyperparameters: every combination is
    # cross-validated with randomForest() and appended as one CSV row.

    # Parameters setup
    # NOTE(review): the two archive names below are never passed to
    # extract_zip, so extractDir must already contain flaky/ and nonFlaky/ —
    # confirm whether extraction was meant to happen here.
    flakyZip = "cleaned_flaky_files.zip"
    nonFlakyZip = "reduced_nonflaky_files.zip"
    extractDir = "extracted"
    outDir = "results_Random_Forest"
    os.makedirs(outDir, exist_ok=True)
    os.makedirs(extractDir, exist_ok=True)

    numSplit = 30
    testSetSize = 0.2
    # Seeded so that every hyperparameter combination is evaluated on the
    # same train/test splits; unseeded, the comparison between combinations
    # was confounded by split-to-split variance.
    kf = StratifiedShuffleSplit(n_splits=numSplit, test_size=testSetSize, random_state=42)

    outFile = "params-random-forest.csv"
    outPath = os.path.join(outDir, outFile)
    with open(outPath, "w") as fo:
        # Header: 5 hyperparameter columns followed by the 11 result columns
        fo.write("n_estimators,criterion,max_depth,min_samples_split,min_samples_leaf,avgFlakyTrain,avgNonFlakyTrain,avgFlakyTest,avgNonFlakyTest,precision,recall,accuracy,f1,storage,preparationTime,predictionTime\n")

    # Define the hyperparameter grid (key order matches the CSV header)
    param_grid = {
        "n_estimators": [100, 200],
        "criterion": ["gini", "entropy"],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2]
    }

    # Flatten the parameter grid into all possible combinations
    keys, values = zip(*param_grid.items())
    combinations = [dict(zip(keys, v)) for v in product(*values)]

    best_params = None
    best_accuracy = 0

    for params in combinations:
        try:
            results = randomForest(outDir, extractDir, "projectName", kf, dim=0, eps=0.3, params=params)

            # Build the CSV row once: hyperparameters first, then the results
            row = ",".join(map(str, [*(params[k] for k in keys), *results]))
            print(f"Results to be written: {row}")

            with open(outPath, "a") as fo:
                fo.write(row + "\n")

            # Update the best parameters based on accuracy
            avg_accuracy = results[6]  # Index of accuracy in the results tuple
            # Skip non-numeric placeholders, and accept the first numeric
            # result even if it is 0 so best_params is not left as None.
            if not isinstance(avg_accuracy, str) and (
                best_params is None or avg_accuracy > best_accuracy
            ):
                best_accuracy = avg_accuracy
                best_params = params

        except Exception as e:
            # Top-level boundary: report the failure and try the next combination
            print(f"Failed for params {params}: {e}")

    print(f"Best hyperparameters found: {best_params} with accuracy: {best_accuracy}")
    # Report the path that was actually written, not just the file name
    print("Random Forest analysis completed. Results saved to:", outPath)


# --- Recorded notebook output (run was interrupted) ---
# Data points before vectorization: 94
# Precision: 0.6666666666666666, Recall: 0.8888888888888888, Accuracy: 0.7368421052631579, F1 Score: 0.761904761904762
# 0.6666666666666666 0.8888888888888888 0.7368421052631579 0.761904761904762
# Precision: 0.8888888888888888, Recall: 0.8, Accuracy: 0.8421052631578947, F1 Score: 0.8421052631578948
# 0.8888888888888888 0.8 0.8421052631578947 0.8421052631578948
# Precision: 0.6666666666666666, Recall: 0.8888888888888888, Accuracy: 0.7368421052631579, F1 Score: 0.761904761904762
# 0.6666666666666666 0.8888888888888888 0.7368421052631579 0.761904761904762
# Precision: 0.875, Recall: 0.7, Accuracy: 0.7894736842105263, F1 Score: 0.7777777777777777
# 0.875 0.7 0.7894736842105263 0.7777777777777777
# Precision: 0.8, Recall: 0.8888888888888888, Accuracy: 0.8421052631578947, F1 Score: 0.8421052631578948
# 0.8 0.8888888888888888 0.8421052631578947 0.8421052631578948
# Precision: 0.8571428571428571, Recall: 0.6666666666666666, Accuracy: 0.7894736842105263, F1 Score: 0.75
# 0.8571428
#
# KeyboardInterrupt: