In [12]:
import os
import time
import zipfile
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    make_scorer, precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef
)
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.random_projection import SparseRandomProjection

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):  # Only consider Python files
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")
    
    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")
    
    return dataPointsList

def flastVectorization(dataPoints, dim=0, eps=0.3):
    """Performs vectorization using CountVectorizer with optional dimensionality reduction."""
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z_full = countVec.fit_transform(dataPoints)
    if eps == 0:
        Z = Z_full
    else:
        if dim <= 0:
            from sklearn.random_projection import johnson_lindenstrauss_min_dim
            dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
        srp = SparseRandomProjection(n_components=dim)
        Z = srp.fit_transform(Z_full)
    return Z

###############################################################################
# Decision Tree with Manual Cross-Validation

def flastThreshold(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, dim, eps, combination_label, params):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)
    
    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")
    
    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorization
    Z = flastVectorization(dataPoints, dim=dim, eps=eps)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define Decision Tree model with given parameters
    dt_model = DecisionTreeClassifier(
        criterion=params.get('criterion', 'entropy'),
        max_depth=params.get('max_depth', None),
        min_samples_split=params.get('min_samples_split', 2),
        min_samples_leaf=params.get('min_samples_leaf', 1),
        max_features=params.get('max_features', None),
        random_state=42
    )

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Initialize metrics
    thresholds = np.linspace(0.1, 0.9, 9)
    total_f1 = 0.0
    total_accuracy = 0.0
    total_precision = 0.0
    total_recall = 0.0
    total_mcc = 0.0
    total_preparationTime = 0.0
    best_thresholds = []
    successFold = 0

    for fold, (train_index, test_index) in enumerate(skf.split(Z, dataLabelsList)):
        X_train, X_test = Z[train_index], Z[test_index]
        y_train, y_test = dataLabelsList[train_index], dataLabelsList[test_index]

        if sum(y_train) == 0 or sum(y_test) == 0:
            print(f"Skipping fold {fold+1} due to no positive samples in train or test set")
            continue

        # Train the model
        dt_model.fit(X_train, y_train)

        # Predict probabilities on test set
        y_pred_proba = dt_model.predict_proba(X_test)

        # Find the best threshold for this fold
        best_threshold = 0.5
        '''
        best_f1 = 0.0
        for threshold in thresholds:
            y_pred = (y_pred_proba[:, 1] >= threshold).astype(int)
            f1 = f1_score(y_test, y_pred, zero_division=1)
            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold
                
                '''
         # Set the threshold to 0.5 and calculate F1 score
        threshold = 0.5  # Fixed threshold
        y_pred = (y_pred_proba[:, 1] >= threshold).astype(int)
        f1 = f1_score(y_test, y_pred, zero_division=1)

        best_f1 = f1
        best_threshold = threshold  # Always set to 0.5


        best_thresholds.append(best_threshold)
        print(f"Fold {fold+1}: Best Threshold: {best_threshold}, Best F1 Score: {best_f1}")

        # Calculate other metrics using the best threshold
        y_pred = (y_pred_proba[:, 1] >= best_threshold).astype(int)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=1)
        recall = recall_score(y_test, y_pred, zero_division=1)
        f1 = f1_score(y_test, y_pred, zero_division=1)  # Recalculate F1 score here
        mcc = matthews_corrcoef(y_test, y_pred)
        preparationTime = vecTime / len(dataPoints)

        total_f1 += f1  # Accumulate the recalculated F1 score
        total_accuracy += accuracy
        total_precision += precision
        total_recall += recall
        total_mcc += mcc
        total_preparationTime += preparationTime
        successFold += 1

    if successFold == 0:
        print("No valid folds. Exiting.")
        return params, None

    # Compute average metrics over successful folds
    avg_f1 = total_f1 / successFold
    avg_accuracy = total_accuracy / successFold
    avg_precision = total_precision / successFold
    avg_recall = total_recall / successFold
    avg_mcc = total_mcc / successFold
    avg_preparationTime = total_preparationTime / successFold
    avg_threshold = np.mean(best_thresholds)

    # Save the results
    outFile = f"{combination_label}-params-dt-{n_splits}-folds-Threshold.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        # Write the header
        fo.write("criterion,max_depth,min_samples_split,min_samples_leaf,max_features,threshold,accuracy,precision,recall,f1,mcc,preparationTime\n")
        # Write the data row
        fo.write(f"{params.get('criterion', 'entropy')},{params.get('max_depth', None)},{params.get('min_samples_split', 2)},"
                 f"{params.get('min_samples_leaf', 1)},{params.get('max_features', None)},{avg_threshold},"
                 f"{avg_accuracy},{avg_precision},{avg_recall},{avg_f1},{avg_mcc},{avg_preparationTime}\n")

    print(f"Decision Tree analysis completed for {successFold} folds. Results saved to: {outFile}")
    return params, avg_f1


if __name__ == "__main__":

    params = {
        "criterion": "entropy",
        "max_depth": 300,
        "min_samples_split": 5,
        "min_samples_leaf": 1,
        "max_features" : 'log2'
    }

    # Parameters setup for the first combination
    flakyZip = "compressedDataset/flaky_files.zip"
    nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
    largerNonFlakyZip = "compressedDataset/all_nonflaky_files.zip"

    # Create separate result directories for equal and larger non-flaky combinations
    outDirEqual = "results/equal_flaky_nonflaky/"
    outDirLarger = "results/larger_nonflaky/"
    os.makedirs(outDirEqual, exist_ok=True)
    os.makedirs(outDirLarger, exist_ok=True)

    # Create separate extract directories for each combination to avoid file confusion
    extractDirEqual = "extracted/equal_flaky_nonflaky/"
    extractDirLarger = "extracted/larger_nonflaky/"
    os.makedirs(extractDirEqual, exist_ok=True)
    os.makedirs(extractDirLarger, exist_ok=True)

    # Perform Decision Tree analysis for the first combination (flaky vs smaller non-flaky)
    print("Starting Decision Tree analysis for flaky vs smaller non-flaky files (47 each)...")
    best_params_5folds_1, best_score_5folds_1 = flastThreshold(
        outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 5, dim=100, eps=0.3, combination_label="equal", params=params)
    best_params_3folds_1, best_score_3folds_1 = flastThreshold(
        outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 3, dim=100, eps=0.3, combination_label="equal", params=params)

    print("Best results for 5-fold on equal combination:")
    print(f"Best Parameters: {best_params_5folds_1}")
    print(f"Best F1 Score: {best_score_5folds_1}")

    print("Best results for 3-fold on equal combination:")
    print(f"Best Parameters: {best_params_3folds_1}")
    print(f"Best F1 Score: {best_score_3folds_1}")

    # Perform Decision Tree analysis for the second combination (flaky vs larger non-flaky)
    print("Starting Decision Tree analysis for flaky vs larger non-flaky files...")
    best_params_5folds_2, best_score_5folds_2 = flastThreshold(
        outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 5, dim=100, eps=0.3, combination_label="larger", params=params)
    best_params_3folds_2, best_score_3folds_2 = flastThreshold(
        outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 3, dim=100, eps=0.3, combination_label="larger", params=params)

    print("Best results for 5-fold on larger non-flaky combination:")
    print(f"Best Parameters: {best_params_5folds_2}")
    print(f"Best F1 Score: {best_score_5folds_2}")

    print("Best results for 3-fold on larger non-flaky combination:")
    print(f"Best Parameters: {best_params_3folds_2}")
    print(f"Best F1 Score: {best_score_3folds_2}")




Starting Decision Tree analysis for flaky vs smaller non-flaky files (47 each)...
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Fold 1: Best Threshold: 0.5, Best F1 Score: 0.631578947368421
Fold 2: Best Threshold: 0.5, Best F1 Score: 0.7777777777777778
Fold 3: Best Threshold: 0.5, Best F1 Score: 0.7058823529411765
Fold 4: Best Threshold: 0.5, Best F1 Score: 0.7058823529411765
Fold 5: Best Threshold: 0.5, Best F1 Score: 0.5714285714285714
Decision Tree analysis completed for 5 folds. Results saved to: equal-params-dt-5-folds-Threshold.csv
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Fold 1: Best Threshold: 0.5, Best F1 Score: 0.46153846153846156
Fold 2: Best Threshold: 0.5, Best F1 Score: 0.5945945945945946
Fold 3: Best Threshold: 0.5, Best F1 Score: 0.5806451612903226
Decision Tree analysis completed for 3 folds. Results saved to: equal-params-dt-3-folds-Threshold.csv
Best results for 5-fol