In [14]:
import os
import time
import warnings
import pickle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.random_projection import johnson_lindenstrauss_min_dim
from sklearn.random_projection import SparseRandomProjection
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.model_selection import StratifiedShuffleSplit,GridSearchCV
from sklearn.exceptions import UndefinedMetricWarning
from itertools import product
import csv

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

###############################################################################
# read data from file

def extract_zip(zip_file, extract_to):
    """Extract every member of a zip archive into a directory.

    Args:
        zip_file: Path to the zip archive to read.
        extract_to: Directory that receives the extracted members.
    """
    # The module-level imports do not include zipfile, so import it here;
    # without this the function fails with NameError on first use.
    import zipfile

    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Return the stripped text of every non-empty ``.py`` file under *path*.

    The directory tree is walked recursively. Files whose content is empty
    after stripping whitespace are reported on stdout and skipped. A missing
    directory is reported and yields an empty list.
    """
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return []

    documents = []
    for root, _subdirs, filenames in os.walk(path):
        for name in filenames:
            # Only Python sources count as data points.
            if not name.endswith(".py"):
                continue
            file_path = os.path.join(root, name)
            with open(file_path, encoding="utf-8") as handle:
                text = handle.read().strip()
            if text:
                documents.append(text)
            else:
                print(f"Empty file: {file_path}")

    if not documents:
        print(f"No valid documents found in directory: {path}")

    return documents

def computeResults(testLabels, predictLabels):
    """Score predictions and return (precision, recall, accuracy, f1).

    The four metrics are printed on success. If anything raises while they
    are being computed or printed, the error is printed instead and every
    metric is returned as the placeholder string "-".
    """
    try:
        scorers = (precision_score, recall_score, accuracy_score, f1_score)
        precision, recall, accuracy, f1 = [
            score(testLabels, predictLabels) for score in scorers
        ]
        print(f"Precision: {precision}, Recall: {recall}, Accuracy: {accuracy}, F1 Score: {f1}")
    except Exception as e:
        # Best-effort: report the failure and fall back to placeholders.
        print(f"Error computing metrics: {e}")
        return "-", "-", "-", "-"
    return precision, recall, accuracy, f1

###############################################################################
# FLAST

def vectorization(dataPoints, dim=0, eps=0.3):
    """Turn raw documents into a token-count matrix, optionally projected.

    Args:
        dataPoints: Iterable of document strings.
        dim: Target number of projected components; when not positive it is
            derived from the number of documents and *eps* via
            ``johnson_lindenstrauss_min_dim``.
        eps: Distortion parameter for the projection. ``0`` skips the
            projection and returns the raw count matrix.

    Returns:
        The count matrix, or its sparse random projection.
    """
    # Stop-word filtering is disabled.
    term_counts = CountVectorizer(stop_words=None).fit_transform(dataPoints)
    if eps == 0:
        return term_counts

    n_components = dim
    if n_components <= 0:
        n_components = johnson_lindenstrauss_min_dim(term_counts.shape[0], eps=eps)
    projector = SparseRandomProjection(n_components=n_components)
    return projector.fit_transform(term_counts)

def classificationDecisionTree(trainData, trainLabels, testData, params):
    """Fit a decision tree with *params* and predict labels for *testData*.

    Args:
        trainData: Training feature matrix.
        trainLabels: Labels matching *trainData*.
        testData: Feature matrix to predict on.
        params: Mapping of hyperparameters; missing keys fall back to
            criterion="gini", splitter="best", max_depth=None,
            min_samples_split=2 and min_samples_leaf=1.

    Returns:
        Tuple ``(trainTime, testTime, predictLabels)`` with both times in
        seconds as measured by ``time.perf_counter``.
    """
    # Training: the timer covers estimator construction and fitting.
    fit_started = time.perf_counter()
    hyperparams = {
        "criterion": params.get("criterion", "gini"),
        "splitter": params.get("splitter", "best"),
        "max_depth": params.get("max_depth"),
        "min_samples_split": params.get("min_samples_split", 2),
        "min_samples_leaf": params.get("min_samples_leaf", 1),
    }
    model = DecisionTreeClassifier(**hyperparams)
    model.fit(trainData, trainLabels)
    trainTime = time.perf_counter() - fit_started

    # Prediction.
    predict_started = time.perf_counter()
    predictLabels = model.predict(testData)
    testTime = time.perf_counter() - predict_started

    return trainTime, testTime, predictLabels

def decisionTreeWithGridSearch(outDir, projectBasePath, projectName, kf, dim, eps, csv_filename):
    """Grid-search decision-tree hyperparameters per split and log results to CSV.

    Documents are loaded from the ``flaky`` and ``nonFlaky`` sub-directories of
    *projectBasePath*, vectorized, and labelled 1 (flaky) / 0 (non-flaky). For
    every train/test split produced by *kf*, and for each inner CV fold count
    in (3, 5), a ``GridSearchCV`` is fitted on the training part; a tree is
    then retrained with the winning parameters, scored on the test part, and
    one row is written to *csv_filename*.

    Args:
        outDir: Unused here; kept for interface compatibility.
        projectBasePath: Directory containing the ``flaky`` and ``nonFlaky``
            sub-directories.
        projectName: Unused here; kept for interface compatibility.
        kf: Splitter object whose ``split(X, y)`` yields (train, test) indices.
        dim: Projection dimension forwarded to ``vectorization``.
        eps: Projection distortion forwarded to ``vectorization``.
        csv_filename: Path of the CSV file to (over)write with the results.

    Returns:
        The best parameters of the last grid search that ran, or ``None`` if
        *kf* produced no splits.
    """
    # Use the caller-supplied base path. The previous code read a module-level
    # global (`extractDir`) that only exists when this file runs as a script.
    flakyDir = os.path.join(projectBasePath, 'flaky')
    nonFlakyDir = os.path.join(projectBasePath, 'nonFlaky')
    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky
    print("Data points before vectorization:", len(dataPoints))
    Z = vectorization(dataPoints, dim=dim, eps=eps)
    # Densify once into an (n_samples, n_features) array; this replaces the
    # per-row toarray() stacking followed by a reshape back to 2-D.
    dataPointsList = Z.toarray()
    dataLabelsList = np.array([1] * len(dataPointsFlaky) + [0] * len(dataPointsNonFlaky))

    # Define the parameter grid for GridSearchCV
    param_grid = {
        "criterion": ["gini", "entropy"],
        "splitter": ["best", "random"],
        "max_depth": [10, None, 20, 30],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2]
    }

    # Base estimator for the search; GridSearchCV clones it for every fit, so
    # a single instance can be shared across all splits.
    dt = DecisionTreeClassifier(random_state=42)

    # Defined up front so the return below is valid even with zero splits.
    best_params = None

    # Open the CSV file for writing results
    with open(csv_filename, mode='w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["criterion", "cvTest", "splitter", "max_depth", "min_samples_split", "min_samples_leaf", "precision", "recall", "accuracy", "f1"])  # CSV header

        # Split the data and perform GridSearchCV
        for train_index, test_index in kf.split(dataPointsList, dataLabelsList):
            trainData, testData = dataPointsList[train_index], dataPointsList[test_index]
            trainLabels, testLabels = dataLabelsList[train_index], dataLabelsList[test_index]

            for cvTest in [3, 5]:
                # GridSearchCV: searching for the best hyperparameters
                grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=cvTest, n_jobs=-1, verbose=1)
                grid_search.fit(trainData, trainLabels)

                # Retrieve the best parameters from the grid search
                best_params = grid_search.best_params_
                print(f"Best hyperparameters: {best_params}")

                # Retrain with the best parameters and score on the held-out
                # split. The measured train/test times are not written to
                # the CSV.
                _trainTime, _testTime, predictLabels = classificationDecisionTree(trainData, trainLabels, testData, best_params)
                precision, recall, accuracy, f1 = computeResults(testLabels, predictLabels)

                # Write the results to the CSV file
                writer.writerow([best_params['criterion'], cvTest, best_params['splitter'], best_params['max_depth'], best_params['min_samples_split'], best_params['min_samples_leaf'], precision, recall, accuracy, f1])

    return best_params

if __name__ == "__main__":
    # Input archive names and working directories.
    flakyZip = "cleaned_flaky_files.zip"
    nonFlakyZip = "reduced_nonflaky_files.zip"
    extractDir = "extracted"
    outDir = "results_DecisionTree"
    for directory in (outDir, extractDir):
        os.makedirs(directory, exist_ok=True)

    # Outer evaluation splits.
    numSplit = 30
    testSetSize = 0.2
    ### Review note: no split is needed here, because GridSearchCV is used.
    kf = StratifiedShuffleSplit(n_splits=numSplit, test_size=testSetSize)

    # CSV file that collects the per-split results.
    csv_filename = os.path.join(outDir, "params-DecisionTree.csv")

    # Review note: cross validation already exists, so remove this.
    # Run Decision Tree with Grid Search and save results to CSV
    best_params = decisionTreeWithGridSearch(
        outDir,
        extractDir,
        "projectName",
        kf,
        dim=0,
        eps=0.3,
        csv_filename=csv_filename,
    )
    print(f"Best hyperparameters found: {best_params}")
    print("Decision Tree analysis completed.")

Data points before vectorization: 94
Fitting 3 folds for each of 48 candidates, totalling 144 fits
Results for CV=3:
Mean score: 0.6703629032258065, Params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}, Train time: 0.0186s, Test time: 0.0013s
Mean score: 0.7550403225806451, Params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'random'}, Train time: 0.0113s, Test time: 0.0022s
Mean score: 0.6811155913978495, Params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'splitter': 'best'}, Train time: 0.0154s, Test time: 0.0010s
Mean score: 0.7029569892473119, Params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'splitter': 'random'}, Train time: 0.0034s, Test time: 0.0008s
Mean score: 0.6599462365591399, Params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'spli

Results for CV=5:
Mean score: 0.5327485380116959, Params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}, Train time: 0.0146s, Test time: 0.0009s
Mean score: 0.5637426900584795, Params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'random'}, Train time: 0.0045s, Test time: 0.0010s
Mean score: 0.5432748538011696, Params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'splitter': 'best'}, Train time: 0.0136s, Test time: 0.0008s
Mean score: 0.5403508771929825, Params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'splitter': 'random'}, Train time: 0.0040s, Test time: 0.0008s
Mean score: 0.5210526315789473, Params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'splitter': 'best'}, Train time: 0.0136s, Test time: 0.0009s
Mean score: 0.607017543859649, Params: {'cr