In [6]:
import os
import time
import zipfile
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    make_scorer, precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef
)
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.random_projection import SparseRandomProjection

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):  # Only consider Python files
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")
    
    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")
    
    return dataPointsList

def flastVectorization(dataPoints, dim=0, eps=0.3):
    """Performs vectorization using CountVectorizer with optional dimensionality reduction."""
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z_full = countVec.fit_transform(dataPoints)
    if eps == 0:
        Z = Z_full
    else:
        if dim <= 0:
            from sklearn.random_projection import johnson_lindenstrauss_min_dim
            dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
        srp = SparseRandomProjection(n_components=dim)
        Z = srp.fit_transform(Z_full)
    return Z

###############################################################################
# Decision Tree with Manual Cross-Validation

def flastThreshold(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, dim, eps, combination_label, params):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)
    
    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")
    
    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorization
    Z = flastVectorization(dataPoints, dim=dim, eps=eps)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define Decision Tree model with given parameters
    dt_model = DecisionTreeClassifier(
        criterion=params.get('criterion', 'entropy'),
        max_depth=params.get('max_depth', None),
        min_samples_split=params.get('min_samples_split', 2),
        min_samples_leaf=params.get('min_samples_leaf', 1),
        max_features=params.get('max_features', None),
        random_state=42
    )

        # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Initialize metrics storage for each threshold
    thresholds = np.linspace(0.1, 0.9, 9)
    metrics_per_threshold = {threshold: {'f1': 0, 'accuracy': 0, 'precision': 0, 'recall': 0, 'mcc': 0}
                             for threshold in thresholds}
    successFold = 0

    for fold, (train_index, test_index) in enumerate(skf.split(Z, dataLabelsList)):
        X_train, X_test = Z[train_index], Z[test_index]
        y_train, y_test = dataLabelsList[train_index], dataLabelsList[test_index]

        if sum(y_train) == 0 or sum(y_test) == 0:
            print(f"Skipping fold {fold+1} due to no positive samples in train or test set")
            continue

        # Train the model
        dt_model.fit(X_train, y_train)

        # Predict probabilities on test set
        y_pred_proba = dt_model.predict_proba(X_test)

        # Calculate metrics for each threshold
        for threshold in thresholds:
            y_pred = (y_pred_proba[:, 1] >= threshold).astype(int)

            # Calculate metrics for this threshold
            f1 = f1_score(y_test, y_pred, zero_division=1)
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, zero_division=1)
            recall = recall_score(y_test, y_pred, zero_division=1)
            mcc = matthews_corrcoef(y_test, y_pred)

            # Accumulate metrics for the current threshold
            metrics_per_threshold[threshold]['f1'] += f1
            metrics_per_threshold[threshold]['accuracy'] += accuracy
            metrics_per_threshold[threshold]['precision'] += precision
            metrics_per_threshold[threshold]['recall'] += recall
            metrics_per_threshold[threshold]['mcc'] += mcc

        successFold += 1

    if successFold == 0:
        print("No valid folds. Exiting.")
        return params, None

    # Average metrics over all folds
    for threshold in thresholds:
        metrics_per_threshold[threshold]['f1'] /= successFold
        metrics_per_threshold[threshold]['accuracy'] /= successFold
        metrics_per_threshold[threshold]['precision'] /= successFold
        metrics_per_threshold[threshold]['recall'] /= successFold
        metrics_per_threshold[threshold]['mcc'] /= successFold

    # Save the results for each threshold
    outFile = f"{combination_label}-params-dt-{n_splits}-folds-Threshold.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        # Write the header
        fo.write("threshold,accuracy,precision,recall,f1,mcc\n")
        # Write the data for each threshold
        for threshold in thresholds:
            fo.write(f"{threshold},{metrics_per_threshold[threshold]['accuracy']},{metrics_per_threshold[threshold]['precision']},"
                     f"{metrics_per_threshold[threshold]['recall']},{metrics_per_threshold[threshold]['f1']},{metrics_per_threshold[threshold]['mcc']}\n")

    print(f"Decision Tree analysis completed for {successFold} folds. Results saved to: {outFile}")
    return params, metrics_per_threshold


  


if __name__ == "__main__":

    params = {
        "criterion": "entropy",
        "max_depth": 300,
        "min_samples_split": 5,
        "min_samples_leaf": 1,
        "max_features" : 'log2'
    }

    # Parameters setup for the first combination
    flakyZip = "compressedDataset/flaky_files.zip"
    nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
    largerNonFlakyZip = "compressedDataset/all_nonflaky_files.zip"

    # Create separate result directories for equal and larger non-flaky combinations
    outDirEqual = "results/equal_flaky_nonflaky/"
    outDirLarger = "results/larger_nonflaky/"
    os.makedirs(outDirEqual, exist_ok=True)
    os.makedirs(outDirLarger, exist_ok=True)

    # Create separate extract directories for each combination to avoid file confusion
    extractDirEqual = "extracted/equal_flaky_nonflaky/"
    extractDirLarger = "extracted/larger_nonflaky/"
    os.makedirs(extractDirEqual, exist_ok=True)
    os.makedirs(extractDirLarger, exist_ok=True)

    # Perform Decision Tree analysis for the first combination (flaky vs smaller non-flaky)
    print("Starting Decision Tree analysis for flaky vs smaller non-flaky files (47 each)...")
    best_params_5folds_1, best_score_5folds_1 = flastThreshold(
        outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 5, dim=100, eps=0.3, combination_label="equal", params=params)
    best_params_3folds_1, best_score_3folds_1 = flastThreshold(
        outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 3, dim=100, eps=0.3, combination_label="equal", params=params)

    print("Best results for 5-fold on equal combination:")
    print(f"Best Parameters: {best_params_5folds_1}")
    print(f"Best F1 Score: {best_score_5folds_1}")

    print("Best results for 3-fold on equal combination:")
    print(f"Best Parameters: {best_params_3folds_1}")
    print(f"Best F1 Score: {best_score_3folds_1}")

    # Perform Decision Tree analysis for the second combination (flaky vs larger non-flaky)
    print("Starting Decision Tree analysis for flaky vs larger non-flaky files...")
    best_params_5folds_2, best_score_5folds_2 = flastThreshold(
        outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 5, dim=100, eps=0.3, combination_label="larger", params=params)
    best_params_3folds_2, best_score_3folds_2 = flastThreshold(
        outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 3, dim=100, eps=0.3, combination_label="larger", params=params)

    print("Best results for 5-fold on larger non-flaky combination:")
    print(f"Best Parameters: {best_params_5folds_2}")
    print(f"Best F1 Score: {best_score_5folds_2}")

    print("Best results for 3-fold on larger non-flaky combination:")
    print(f"Best Parameters: {best_params_3folds_2}")
    print(f"Best F1 Score: {best_score_3folds_2}")




Starting Decision Tree analysis for flaky vs smaller non-flaky files (47 each)...
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Decision Tree analysis completed for 5 folds. Results saved to: equal-params-dt-5-folds-Threshold.csv
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Decision Tree analysis completed for 3 folds. Results saved to: equal-params-dt-3-folds-Threshold.csv
Best results for 5-fold on equal combination:
Best Parameters: {'criterion': 'entropy', 'max_depth': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2'}
Best F1 Score: {0.1: {'f1': 0.6611732229123534, 'accuracy': 0.6163742690058479, 'precision': 0.6089466089466089, 'recall': 0.7422222222222222, 'mcc': 0.24063172927077958}, 0.2: {'f1': 0.6611732229123534, 'accuracy': 0.6163742690058479, 'precision': 0.6089466089466089, 'recall': 0.7422222222222222, 'mcc': 0.24063172927077958}, 0.30000000000000004: 

Decision Tree


In [7]:
import os
import time
import zipfile
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    make_scorer, precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef
)
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.random_projection import SparseRandomProjection

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):  # Only consider Python files
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")
    
    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")
    
    return dataPointsList

def flastVectorization(dataPoints, dim=0, eps=0.3):
    """Performs vectorization using CountVectorizer with optional dimensionality reduction."""
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z_full = countVec.fit_transform(dataPoints)
    if eps == 0:
        Z = Z_full
    else:
        if dim <= 0:
            from sklearn.random_projection import johnson_lindenstrauss_min_dim
            dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
        srp = SparseRandomProjection(n_components=dim)
        Z = srp.fit_transform(Z_full)
    return Z

###############################################################################
# Decision Tree with Manual Cross-Validation

def flastThreshold(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, dim, eps, combination_label, params):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)
    
    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")
    
    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorization
    Z = flastVectorization(dataPoints, dim=dim, eps=eps)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define Decision Tree model with given parameters
    dt_model = DecisionTreeClassifier(
        criterion=params.get('criterion', 'entropy'),
        max_depth=params.get('max_depth', None),
        min_samples_split=params.get('min_samples_split', 2),
        min_samples_leaf=params.get('min_samples_leaf', 1),
        max_features=params.get('max_features', None),
        random_state=42
    )

       # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Initialize storage for metrics per fold and threshold
    thresholds = np.linspace(0.1, 0.9, 9)
    metrics_per_fold = {fold+1: {threshold: {'f1': 0, 'accuracy': 0, 'precision': 0, 'recall': 0, 'mcc': 0}
                                 for threshold in thresholds}
                        for fold in range(n_splits)}
    successFold = 0

    for fold, (train_index, test_index) in enumerate(skf.split(Z, dataLabelsList)):
        X_train, X_test = Z[train_index], Z[test_index]
        y_train, y_test = dataLabelsList[train_index], dataLabelsList[test_index]

        if sum(y_train) == 0 or sum(y_test) == 0:
            print(f"Skipping fold {fold+1} due to no positive samples in train or test set")
            continue

        # Train the model
        dt_model.fit(X_train, y_train)

        # Predict probabilities on test set
        y_pred_proba = dt_model.predict_proba(X_test)

        # Calculate metrics for each threshold for this fold
        for threshold in thresholds:
            y_pred = (y_pred_proba[:, 1] >= threshold).astype(int)

            # Calculate metrics for this threshold
            f1 = f1_score(y_test, y_pred, zero_division=1)
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, zero_division=1)
            recall = recall_score(y_test, y_pred, zero_division=1)
            mcc = matthews_corrcoef(y_test, y_pred)

            # Store metrics for the current threshold for this fold
            metrics_per_fold[fold+1][threshold]['f1'] = f1
            metrics_per_fold[fold+1][threshold]['accuracy'] = accuracy
            metrics_per_fold[fold+1][threshold]['precision'] = precision
            metrics_per_fold[fold+1][threshold]['recall'] = recall
            metrics_per_fold[fold+1][threshold]['mcc'] = mcc

        successFold += 1

    if successFold == 0:
        print("No valid folds. Exiting.")
        return params, None

    # Save the results for each threshold and fold
    outFile = f"{combination_label}-params-dt-{n_splits}-folds-Threshold-allKfold.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        # Write the header
        fo.write("fold,threshold,accuracy,precision,recall,f1,mcc\n")
        # Write the data for each fold and each threshold
        for fold in range(1, successFold + 1):
            for threshold in thresholds:
                fo.write(f"{fold},{threshold},{metrics_per_fold[fold][threshold]['accuracy']},{metrics_per_fold[fold][threshold]['precision']},"
                         f"{metrics_per_fold[fold][threshold]['recall']},{metrics_per_fold[fold][threshold]['f1']},{metrics_per_fold[fold][threshold]['mcc']}\n")

    print(f"Decision Tree analysis completed for {successFold} folds. Results saved to: {outFile}")
    return params, metrics_per_fold



if __name__ == "__main__":

    params = {
        "criterion": "entropy",
        "max_depth": 300,
        "min_samples_split": 5,
        "min_samples_leaf": 1,
        "max_features" : 'log2'
    }

    # Parameters setup for the first combination
    flakyZip = "compressedDataset/flaky_files.zip"
    nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
    largerNonFlakyZip = "compressedDataset/all_nonflaky_files.zip"

    # Create separate result directories for equal and larger non-flaky combinations
    outDirEqual = "results/equal_flaky_nonflaky/"
    outDirLarger = "results/larger_nonflaky/"
    os.makedirs(outDirEqual, exist_ok=True)
    os.makedirs(outDirLarger, exist_ok=True)

    # Create separate extract directories for each combination to avoid file confusion
    extractDirEqual = "extracted/equal_flaky_nonflaky/"
    extractDirLarger = "extracted/larger_nonflaky/"
    os.makedirs(extractDirEqual, exist_ok=True)
    os.makedirs(extractDirLarger, exist_ok=True)

    # Perform Decision Tree analysis for the first combination (flaky vs smaller non-flaky)
    print("Starting Decision Tree analysis for flaky vs smaller non-flaky files (47 each)...")
    best_params_5folds_1, best_score_5folds_1 = flastThreshold(
        outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 5, dim=100, eps=0.3, combination_label="equal", params=params)
    best_params_3folds_1, best_score_3folds_1 = flastThreshold(
        outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 3, dim=100, eps=0.3, combination_label="equal", params=params)

    print("Best results for 5-fold on equal combination:")
    print(f"Best Parameters: {best_params_5folds_1}")
    print(f"Best F1 Score: {best_score_5folds_1}")

    print("Best results for 3-fold on equal combination:")
    print(f"Best Parameters: {best_params_3folds_1}")
    print(f"Best F1 Score: {best_score_3folds_1}")

    # Perform Decision Tree analysis for the second combination (flaky vs larger non-flaky)
    print("Starting Decision Tree analysis for flaky vs larger non-flaky files...")
    best_params_5folds_2, best_score_5folds_2 = flastThreshold(
        outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 5, dim=100, eps=0.3, combination_label="larger", params=params)
    best_params_3folds_2, best_score_3folds_2 = flastThreshold(
        outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 3, dim=100, eps=0.3, combination_label="larger", params=params)

    print("Best results for 5-fold on larger non-flaky combination:")
    print(f"Best Parameters: {best_params_5folds_2}")
    print(f"Best F1 Score: {best_score_5folds_2}")

    print("Best results for 3-fold on larger non-flaky combination:")
    print(f"Best Parameters: {best_params_3folds_2}")
    print(f"Best F1 Score: {best_score_3folds_2}")



Starting Decision Tree analysis for flaky vs smaller non-flaky files (47 each)...
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Decision Tree analysis completed for 5 folds. Results saved to: equal-params-dt-5-folds-Threshold-allKfold.csv
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Decision Tree analysis completed for 3 folds. Results saved to: equal-params-dt-3-folds-Threshold-allKfold.csv
Best results for 5-fold on equal combination:
Best Parameters: {'criterion': 'entropy', 'max_depth': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2'}
Best F1 Score: {1: {0.1: {'f1': 0.7619047619047619, 'accuracy': 0.7368421052631579, 'precision': 0.7272727272727273, 'recall': 0.8, 'mcc': 0.47193990372426947}, 0.2: {'f1': 0.7619047619047619, 'accuracy': 0.7368421052631579, 'precision': 0.7272727272727273, 'recall': 0.8, 'mcc': 0.47193990372426947}, 0.30000000000000004: {'f1': 0

Decision Tree analysis completed for 5 folds. Results saved to: larger-params-dt-5-folds-Threshold-allKfold.csv
Number of flaky documents: 47
Number of non-flaky documents: 254
Total number of documents: 301
Decision Tree analysis completed for 3 folds. Results saved to: larger-params-dt-3-folds-Threshold-allKfold.csv
Best results for 5-fold on larger non-flaky combination:
Best Parameters: {'criterion': 'entropy', 'max_depth': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2'}
Best F1 Score: {1: {0.1: {'f1': 0.5882352941176471, 'accuracy': 0.8852459016393442, 'precision': 0.7142857142857143, 'recall': 0.5, 'mcc': 0.5352255958259309}, 0.2: {'f1': 0.5882352941176471, 'accuracy': 0.8852459016393442, 'precision': 0.7142857142857143, 'recall': 0.5, 'mcc': 0.5352255958259309}, 0.30000000000000004: {'f1': 0.5882352941176471, 'accuracy': 0.8852459016393442, 'precision': 0.7142857142857143, 'recall': 0.5, 'mcc': 0.5352255958259309}, 0.4: {'f1': 0.5882352941176471, 'acc

In [11]:
#DT with hyperparameter tune


import os
import time
import zipfile
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef, make_scorer
)
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.random_projection import SparseRandomProjection

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):  # Only consider Python files
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")
    
    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")
    
    return dataPointsList

def flastVectorization(dataPoints, dim=0, eps=0.3):
    """Performs vectorization using CountVectorizer with optional dimensionality reduction."""
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z_full = countVec.fit_transform(dataPoints)
    if eps == 0:
        Z = Z_full
    else:
        if dim <= 0:
            from sklearn.random_projection import johnson_lindenstrauss_min_dim
            dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
        srp = SparseRandomProjection(n_components=dim)
        Z = srp.fit_transform(Z_full)
    return Z

###############################################################################
# Grid Search Decision Tree with Manual Cross-Validation

def flastThreshold(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, dim, eps, combination_label, param_grid):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)
    
    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")
    
    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorization
    Z = flastVectorization(dataPoints, dim=dim, eps=eps)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define StratifiedKFold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Define the Decision Tree model and GridSearchCV
    dt_model = DecisionTreeClassifier(random_state=42)
    grid_search = GridSearchCV(dt_model, param_grid, scoring='f1', cv=skf, n_jobs=-1)

    # Perform grid search
    grid_search.fit(Z, dataLabelsList)

    # Get the best model after grid search
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    print(f"Best hyperparameters: {best_params}")

    # Now perform cross-validation with the best model across thresholds
    thresholds = np.linspace(0.1, 0.9, 9)
    metrics_per_fold = []

    for fold, (train_index, test_index) in enumerate(skf.split(Z, dataLabelsList)):
        X_train, X_test = Z[train_index], Z[test_index]
        y_train, y_test = dataLabelsList[train_index], dataLabelsList[test_index]

        if sum(y_train) == 0 or sum(y_test) == 0:
            print(f"Skipping fold {fold+1} due to no positive samples in train or test set")
            continue

        # Train the best model on this fold
        best_model.fit(X_train, y_train)

        # Predict probabilities on test set
        y_pred_proba = best_model.predict_proba(X_test)

        # Calculate metrics for each threshold
        for threshold in thresholds:
            y_pred = (y_pred_proba[:, 1] >= threshold).astype(int)

            # Calculate metrics for this threshold
            f1 = f1_score(y_test, y_pred, zero_division=1)
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, zero_division=1)
            recall = recall_score(y_test, y_pred, zero_division=1)
            mcc = matthews_corrcoef(y_test, y_pred)

            metrics_per_fold.append({
                'fold': fold + 1,
                'threshold': threshold,
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'mcc': mcc,
                **best_params
            })

    if len(metrics_per_fold) == 0:
        print("No valid folds. Exiting.")
        return best_params, None

    # Save the results for each threshold and fold
    df_results = pd.DataFrame(metrics_per_fold)
    outFile = f"{combination_label}-params-dt-{n_splits}-folds-Threshold-allKfold-Gridsearch.csv"
    df_results.to_csv(os.path.join(outDir, outFile), index=False)

    print(f"Decision Tree analysis with Grid Search completed. Results saved to: {outFile}")
    return best_params, df_results



if __name__ == "__main__":

    param_grid = {
        'criterion': ['gini', 'entropy'],  # Function to measure the quality of a split
        'max_depth': [None, 10, 30, 50, 100, 300, 500],  # Maximum depth of each tree
        'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
        'min_samples_leaf': [1, 2, 5, 10],  # Minimum number of samples required to be at a leaf node
        'max_features': [None, 'sqrt', 'log2'],  # Number of features to consider when looking for the best split
    }

    # Parameters setup for the first combination
    flakyZip = "compressedDataset/flaky_files.zip"
    nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
    largerNonFlakyZip = "compressedDataset/all_nonflaky_files.zip"

    # Create separate result directories for equal and larger non-flaky combinations
    outDirEqual = "results/equal_flaky_nonflaky/"
    outDirLarger = "results/larger_nonflaky/"
    os.makedirs(outDirEqual, exist_ok=True)
    os.makedirs(outDirLarger, exist_ok=True)

    # Create separate extract directories for each combination to avoid file confusion
    extractDirEqual = "extracted/equal_flaky_nonflaky/"
    extractDirLarger = "extracted/larger_nonflaky/"
    os.makedirs(extractDirEqual, exist_ok=True)
    os.makedirs(extractDirLarger, exist_ok=True)

    # Perform Decision Tree analysis for the first combination (flaky vs smaller non-flaky)
    print("Starting Decision Tree analysis for flaky vs smaller non-flaky files (47 each)...")
    best_params_5folds_1, df_results_5folds_1 = flastThreshold(
        outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 5, dim=100, eps=0.3, combination_label="equal", param_grid=param_grid)
    
    print("Best results for 5-fold on equal combination:")
    print(df_results_5folds_1)

    # Perform Decision Tree analysis for the second combination (flaky vs larger non-flaky)
    print("Starting Decision Tree analysis for flaky vs larger non-flaky files...")
    best_params_5folds_2, df_results_5folds_2 = flastThreshold(
        outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 5, dim=100, eps=0.3, combination_label="larger", param_grid=param_grid)
    
    print("Best results for 5-fold on larger combination:")
    print(df_results_5folds_2)




Starting Decision Tree analysis for flaky vs smaller non-flaky files (47 each)...
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Best hyperparameters: {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 2}
Decision Tree analysis with Grid Search completed. Results saved to: equal-params-dt-5-folds-Threshold-allKfold-Gridsearch.csv
Best results for 5-fold on equal combination:
    fold  threshold  accuracy  precision    recall        f1       mcc  \
0      1        0.1  0.789474   0.714286  1.000000  0.833333  0.629941   
1      1        0.2  0.789474   0.714286  1.000000  0.833333  0.629941   
2      1        0.3  0.736842   0.692308  0.900000  0.782609  0.489345   
3      1        0.4  0.736842   0.692308  0.900000  0.782609  0.489345   
4      1        0.5  0.736842   0.692308  0.900000  0.782609  0.489345   
5      1        0.6  0.736842   0.692308  0.900000  0.782609  0.489345   
6

Random Forest

In [4]:
import os
import time
import zipfile
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    make_scorer, precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef
)
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.random_projection import SparseRandomProjection

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):  # Only consider Python files
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")
    
    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")
    
    return dataPointsList

def flastVectorization(dataPoints, dim=0, eps=0.3):
    """Performs vectorization using CountVectorizer with optional dimensionality reduction."""
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z_full = countVec.fit_transform(dataPoints)
    if eps == 0:
        Z = Z_full
    else:
        if dim <= 0:
            from sklearn.random_projection import johnson_lindenstrauss_min_dim
            dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
        srp = SparseRandomProjection(n_components=dim)
        Z = srp.fit_transform(Z_full)
    return Z

###############################################################################
# Random forest with Manual Cross-Validation

def flastThreshold(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, dim, eps, combination_label, params):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)
    
    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")
    
    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorization
    Z = flastVectorization(dataPoints, dim=dim, eps=eps)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define Random Forest model with given parameters
    rf_model = RandomForestClassifier(
        n_estimators=params.get('n_estimators', 100),
        criterion=params.get('criterion', 'entropy'),
        max_depth=params.get('max_depth', None),
        min_samples_split=params.get('min_samples_split', 2),
        min_samples_leaf=params.get('min_samples_leaf', 1),
        max_features=params.get('max_features', 'auto'),
        random_state=42
    )

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Initialize storage for metrics per fold and threshold
    thresholds = np.linspace(0.1, 0.9, 9)
    metrics_per_fold = {fold+1: {threshold: {'f1': 0, 'accuracy': 0, 'precision': 0, 'recall': 0, 'mcc': 0}
                                 for threshold in thresholds}
                        for fold in range(n_splits)}
    successFold = 0

    for fold, (train_index, test_index) in enumerate(skf.split(Z, dataLabelsList)):
        X_train, X_test = Z[train_index], Z[test_index]
        y_train, y_test = dataLabelsList[train_index], dataLabelsList[test_index]

        if sum(y_train) == 0 or sum(y_test) == 0:
            print(f"Skipping fold {fold+1} due to no positive samples in train or test set")
            continue

        # Train the model
        rf_model.fit(X_train, y_train)

        # Predict probabilities on test set
        y_pred_proba = rf_model.predict_proba(X_test)

        # Calculate metrics for each threshold for this fold
        for threshold in thresholds:
            y_pred = (y_pred_proba[:, 1] >= threshold).astype(int)

            # Calculate metrics for this threshold
            f1 = f1_score(y_test, y_pred, zero_division=1)
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, zero_division=1)
            recall = recall_score(y_test, y_pred, zero_division=1)
            mcc = matthews_corrcoef(y_test, y_pred)

            # Store metrics for the current threshold for this fold
            metrics_per_fold[fold+1][threshold]['f1'] = f1
            metrics_per_fold[fold+1][threshold]['accuracy'] = accuracy
            metrics_per_fold[fold+1][threshold]['precision'] = precision
            metrics_per_fold[fold+1][threshold]['recall'] = recall
            metrics_per_fold[fold+1][threshold]['mcc'] = mcc

        successFold += 1

    if successFold == 0:
        print("No valid folds. Exiting.")
        return params, None

    # Save the results for each threshold and fold
    outFile = f"{combination_label}-params-rf-{n_splits}-folds-Threshold-allKfold.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        # Write the header
        fo.write("fold,threshold,accuracy,precision,recall,f1,mcc\n")
        # Write the data for each fold and each threshold
        for fold in range(1, successFold + 1):
            for threshold in thresholds:
                fo.write(f"{fold},{threshold},{metrics_per_fold[fold][threshold]['accuracy']},{metrics_per_fold[fold][threshold]['precision']}," 
                         f"{metrics_per_fold[fold][threshold]['recall']},{metrics_per_fold[fold][threshold]['f1']},{metrics_per_fold[fold][threshold]['mcc']}\n")

    print(f"Random Forest analysis completed for {successFold} folds. Results saved to: {outFile}")
    return params, metrics_per_fold


if __name__ == "__main__":

    params = {
        "n_estimators": 10,
        "criterion": "gini",
        "max_depth": 30,
        "min_samples_split": 2,
        "min_samples_leaf": 2,
        "max_features" : 'log2'
    }

    # Parameters setup for the first combination
    flakyZip = "compressedDataset/flaky_files.zip"
    nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
    largerNonFlakyZip = "compressedDataset/all_nonflaky_files.zip"

    # Create separate result directories for equal and larger non-flaky combinations
    outDirEqual = "results/equal_flaky_nonflaky/"
    outDirLarger = "results/larger_nonflaky/"
    os.makedirs(outDirEqual, exist_ok=True)
    os.makedirs(outDirLarger, exist_ok=True)

    # Create separate extract directories for each combination to avoid file confusion
    extractDirEqual = "extracted/equal_flaky_nonflaky/"
    extractDirLarger = "extracted/larger_nonflaky/"
    os.makedirs(extractDirEqual, exist_ok=True)
    os.makedirs(extractDirLarger, exist_ok=True)

    # Perform Random Forest analysis for the first combination (flaky vs smaller non-flaky)
    print("Starting Random Forest analysis for flaky vs smaller non-flaky files (47 each)...")
    best_params_5folds_1, best_score_5folds_1 = flastThreshold(
        outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 5, dim=100, eps=0.3, combination_label="equal", params=params)
    best_params_3folds_1, best_score_3folds_1 = flastThreshold(
        outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 3, dim=100, eps=0.3, combination_label="equal", params=params)

    print("Best results for 5-fold on equal combination:")
    print(f"Best Parameters: {best_params_5folds_1}")
    print(f"Best F1 Score: {best_score_5folds_1}")

    print("Best results for 3-fold on equal combination:")
    print(f"Best Parameters: {best_params_3folds_1}")
    print(f"Best F1 Score: {best_score_3folds_1}")

    # Perform Random Forest analysis for the second combination (flaky vs larger non-flaky)
    print("Starting Random Forest analysis for flaky vs larger non-flaky files...")
    best_params_5folds_2, best_score_5folds_2 = flastThreshold(
        outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 5, dim=100, eps=0.3, combination_label="larger", params=params)
    best_params_3folds_2, best_score_3folds_2 = flastThreshold(
        outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 3, dim=100, eps=0.3, combination_label="larger", params=params)

    print("Best results for 5-fold on larger non-flaky combination:")
    print(f"Best Parameters: {best_params_5folds_2}")
    print(f"Best F1 Score: {best_score_5folds_2}")

    print("Best results for 3-fold on larger non-flaky combination:")
    print(f"Best Parameters: {best_params_3folds_2}")
    print(f"Best F1 Score: {best_score_3folds_2}")


Starting Random Forest analysis for flaky vs smaller non-flaky files (47 each)...
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Random Forest analysis completed for 5 folds. Results saved to: equal-params-rf-5-folds-Threshold-allKfold.csv
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Random Forest analysis completed for 3 folds. Results saved to: equal-params-rf-3-folds-Threshold-allKfold.csv
Best results for 5-fold on equal combination:
Best Parameters: {'n_estimators': 10, 'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2'}
Best F1 Score: {1: {0.1: {'f1': 0.7142857142857143, 'accuracy': 0.5789473684210527, 'precision': 0.5555555555555556, 'recall': 1.0, 'mcc': 0.24845199749997662}, 0.2: {'f1': 0.7142857142857143, 'accuracy': 0.5789473684210527, 'precision': 0.5555555555555556, 'recall': 1.0, 'mcc': 0.24845199749997662}, 0.30000000000

Random Forest analysis completed for 5 folds. Results saved to: larger-params-rf-5-folds-Threshold-allKfold.csv
Number of flaky documents: 47
Number of non-flaky documents: 254
Total number of documents: 301
Random Forest analysis completed for 3 folds. Results saved to: larger-params-rf-3-folds-Threshold-allKfold.csv
Best results for 5-fold on larger non-flaky combination:
Best Parameters: {'n_estimators': 10, 'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2'}
Best F1 Score: {1: {0.1: {'f1': 0.43478260869565216, 'accuracy': 0.5737704918032787, 'precision': 0.2777777777777778, 'recall': 1.0, 'mcc': 0.36900620230837305}, 0.2: {'f1': 0.5925925925925926, 'accuracy': 0.819672131147541, 'precision': 0.47058823529411764, 'recall': 0.8, 'mcc': 0.5148624666326216}, 0.30000000000000004: {'f1': 0.5555555555555556, 'accuracy': 0.8688524590163934, 'precision': 0.625, 'recall': 0.5, 'mcc': 0.4838541850227607}, 0.4: {'f1': 0.5, 'accuracy': 0.

In [5]:
import os
import time
import zipfile
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef
)
from sklearn.model_selection import StratifiedKFold
from sklearn.random_projection import SparseRandomProjection

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):  # Only consider Python files
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")
    
    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")
    
    return dataPointsList

def flastVectorization(dataPoints, dim=0, eps=0.3):
    """Performs vectorization using CountVectorizer with optional dimensionality reduction."""
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z_full = countVec.fit_transform(dataPoints)
    if eps == 0:
        Z = Z_full
    else:
        if dim <= 0:
            from sklearn.random_projection import johnson_lindenstrauss_min_dim
            dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
        srp = SparseRandomProjection(n_components=dim)
        Z = srp.fit_transform(Z_full)
    return Z

###############################################################################
# XGBoost with Manual Cross-Validation

def flastThreshold(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, dim, eps, combination_label, params):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)
    
    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")
    
    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorization
    Z = flastVectorization(dataPoints, dim=dim, eps=eps)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define XGBoost model with given parameters
    xgb_model = XGBClassifier(
        n_estimators=int(params.get('n_estimators', 100.0)),
        max_depth=int(params.get('max_depth', 5.0)),
        learning_rate=params.get('eta', 0.1),
        subsample=params.get('subsample', 1),
        colsample_bytree=params.get('colsample_bytree', 1),
        objective='binary:logistic',
        eval_metric='logloss',
        use_label_encoder=False,
        random_state=42
    )

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Initialize storage for metrics per fold and threshold
    thresholds = np.linspace(0.1, 0.9, 9)
    metrics_per_fold = {fold+1: {threshold: {'f1': 0, 'accuracy': 0, 'precision': 0, 'recall': 0, 'mcc': 0}
                                 for threshold in thresholds}
                        for fold in range(n_splits)}
    successFold = 0

    for fold, (train_index, test_index) in enumerate(skf.split(Z, dataLabelsList)):
        X_train, X_test = Z[train_index], Z[test_index]
        y_train, y_test = dataLabelsList[train_index], dataLabelsList[test_index]

        if sum(y_train) == 0 or sum(y_test) == 0:
            print(f"Skipping fold {fold+1} due to no positive samples in train or test set")
            continue

        # Train the model
        xgb_model.fit(X_train, y_train)

        # Predict probabilities on test set
        y_pred_proba = xgb_model.predict_proba(X_test)

        # Calculate metrics for each threshold for this fold
        for threshold in thresholds:
            y_pred = (y_pred_proba[:, 1] >= threshold).astype(int)

            # Calculate metrics for this threshold
            f1 = f1_score(y_test, y_pred, zero_division=1)
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, zero_division=1)
            recall = recall_score(y_test, y_pred, zero_division=1)
            mcc = matthews_corrcoef(y_test, y_pred)

            # Store metrics for the current threshold for this fold
            metrics_per_fold[fold+1][threshold]['f1'] = f1
            metrics_per_fold[fold+1][threshold]['accuracy'] = accuracy
            metrics_per_fold[fold+1][threshold]['precision'] = precision
            metrics_per_fold[fold+1][threshold]['recall'] = recall
            metrics_per_fold[fold+1][threshold]['mcc'] = mcc

        successFold += 1

    if successFold == 0:
        print("No valid folds. Exiting.")
        return params, None

    # Save the results for each threshold and fold
    outFile = f"{combination_label}-params-xgb-{n_splits}-folds-Threshold-allKfold.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        # Write the header
        fo.write("fold,threshold,accuracy,precision,recall,f1,mcc\n")
        # Write the data for each fold and each threshold
        for fold in range(1, successFold + 1):
            for threshold in thresholds:
                fo.write(f"{fold},{threshold},{metrics_per_fold[fold][threshold]['accuracy']},{metrics_per_fold[fold][threshold]['precision']}," 
                         f"{metrics_per_fold[fold][threshold]['recall']},{metrics_per_fold[fold][threshold]['f1']},{metrics_per_fold[fold][threshold]['mcc']}\n")

    print(f"XGBoost analysis completed for {successFold} folds. Results saved to: {outFile}")
    return params, metrics_per_fold


if __name__ == "__main__":

    # Update the parameters as per your request
    params = {
        "eta": 0.1,  # This is the learning_rate
        "max_depth": 5.0,
        "n_estimators": 100.0
    }

    # Ensure max_depth and n_estimators are integers
    params['max_depth'] = int(params['max_depth'])
    params['n_estimators'] = int(params['n_estimators'])

    # Parameters setup for the first combination
    flakyZip = "compressedDataset/flaky_files.zip"
    nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
    largerNonFlakyZip = "compressedDataset/all_nonflaky_files.zip"

    # Create separate result directories for equal and larger non-flaky combinations
    outDirEqual = "results/equal_flaky_nonflaky/"
    outDirLarger = "results/larger_nonflaky/"
    os.makedirs(outDirEqual, exist_ok=True)
    os.makedirs(outDirLarger, exist_ok=True)

    # Create separate extract directories for each combination to avoid file confusion
    extractDirEqual = "extracted/equal_flaky_nonflaky/"
    extractDirLarger = "extracted/larger_nonflaky/"
    os.makedirs(extractDirEqual, exist_ok=True)
    os.makedirs(extractDirLarger, exist_ok=True)

    # Perform XGBoost analysis for the first combination (flaky vs smaller non-flaky)
    print("Starting XGBoost analysis for flaky vs smaller non-flaky files (47 each)...")
    best_params_5folds_1, best_score_5folds_1 = flastThreshold(
        outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 5, dim=100, eps=0.3, combination_label="equal", params=params)
    best_params_3folds_1, best_score_3folds_1 = flastThreshold(
        outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 3, dim=100, eps=0.3, combination_label="equal", params=params)

    print("Results for 5-fold on equal combination:")
    print(f"Parameters: {best_params_5folds_1}")
    print(f"Metrics per fold and threshold: {best_score_5folds_1}")

    print("Results for 3-fold on equal combination:")
    print(f"Parameters: {best_params_3folds_1}")
    print(f"Metrics per fold and threshold: {best_score_3folds_1}")

    # Perform XGBoost analysis for the second combination (flaky vs larger non-flaky)
    print("Starting XGBoost analysis for flaky vs larger non-flaky files...")
    best_params_5folds_2, best_score_5folds_2 = flastThreshold(
        outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 5, dim=100, eps=0.3, combination_label="larger", params=params)
    best_params_3folds_2, best_score_3folds_2 = flastThreshold(
        outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 3, dim=100, eps=0.3, combination_label="larger", params=params)

    print("Results for 5-fold on larger non-flaky combination:")
    print(f"Parameters: {best_params_5folds_2}")
    print(f"Metrics per fold and threshold: {best_score_5folds_2}")

    print("Results for 3-fold on larger non-flaky combination:")
    print(f"Parameters: {best_params_3folds_2}")
    print(f"Metrics per fold and threshold: {best_score_3folds_2}")



Starting XGBoost analysis for flaky vs smaller non-flaky files (47 each)...
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



XGBoost analysis completed for 5 folds. Results saved to: equal-params-xgb-5-folds-Threshold-allKfold.csv
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



XGBoost analysis completed for 3 folds. Results saved to: equal-params-xgb-3-folds-Threshold-allKfold.csv
Results for 5-fold on equal combination:
Parameters: {'eta': 0.1, 'max_depth': 5, 'n_estimators': 100}
Metrics per fold and threshold: {1: {0.1: {'f1': 0.7407407407407407, 'accuracy': 0.631578947368421, 'precision': 0.5882352941176471, 'recall': 1.0, 'mcc': 0.3615507630310936}, 0.2: {'f1': 0.72, 'accuracy': 0.631578947368421, 'precision': 0.6, 'recall': 0.9, 'mcc': 0.2857738033247041}, 0.30000000000000004: {'f1': 0.75, 'accuracy': 0.6842105263157895, 'precision': 0.6428571428571429, 'recall': 0.9, 'mcc': 0.3905632887762015}, 0.4: {'f1': 0.782608695652174, 'accuracy': 0.7368421052631579, 'precision': 0.6923076923076923, 'recall': 0.9, 'mcc': 0.4893451639269458}, 0.5: {'f1': 0.7272727272727273, 'accuracy': 0.6842105263157895, 'precision': 0.6666666666666666, 'recall': 0.8, 'mcc': 0.3680349649825889}, 0.6: {'f1': 0.6666666666666666, 'accuracy': 0.6842105263157895, 'precision': 0.75, '

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



XGBoost analysis completed for 5 folds. Results saved to: larger-params-xgb-5-folds-Threshold-allKfold.csv
Number of flaky documents: 47
Number of non-flaky documents: 254
Total number of documents: 301


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



XGBoost analysis completed for 3 folds. Results saved to: larger-params-xgb-3-folds-Threshold-allKfold.csv
Results for 5-fold on larger non-flaky combination:
Parameters: {'eta': 0.1, 'max_depth': 5, 'n_estimators': 100}
Metrics per fold and threshold: {1: {0.1: {'f1': 0.8571428571428571, 'accuracy': 0.9508196721311475, 'precision': 0.8181818181818182, 'recall': 0.9, 'mcc': 0.8288922659741738}, 0.2: {'f1': 0.7368421052631579, 'accuracy': 0.9180327868852459, 'precision': 0.7777777777777778, 'recall': 0.7, 'mcc': 0.6897979292329923}, 0.30000000000000004: {'f1': 0.5882352941176471, 'accuracy': 0.8852459016393442, 'precision': 0.7142857142857143, 'recall': 0.5, 'mcc': 0.5352255958259309}, 0.4: {'f1': 0.5714285714285714, 'accuracy': 0.9016393442622951, 'precision': 1.0, 'recall': 0.4, 'mcc': 0.5982430416161189}, 0.5: {'f1': 0.5714285714285714, 'accuracy': 0.9016393442622951, 'precision': 1.0, 'recall': 0.4, 'mcc': 0.5982430416161189}, 0.6: {'f1': 0.3333333333333333, 'accuracy': 0.8688524590

KNN

In [8]:
import os
import time
import zipfile
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef
)
from sklearn.model_selection import StratifiedKFold
from sklearn.random_projection import SparseRandomProjection

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):  # Only consider Python files
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")
    
    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")
    
    return dataPointsList

def flastVectorization(dataPoints, dim=0, eps=0.3):
    """Performs vectorization using CountVectorizer with optional dimensionality reduction."""
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z_full = countVec.fit_transform(dataPoints)
    if eps == 0:
        Z = Z_full
    else:
        if dim <= 0:
            from sklearn.random_projection import johnson_lindenstrauss_min_dim
            dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
        srp = SparseRandomProjection(n_components=dim)
        Z = srp.fit_transform(Z_full)
    return Z

###############################################################################
# KNN with Manual Cross-Validation

def flastThreshold(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, dim, eps, combination_label, params):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)
    
    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")
    
    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorization
    Z = flastVectorization(dataPoints, dim=dim, eps=eps)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define KNN model with given parameters
    knn_model = KNeighborsClassifier(
        n_neighbors=int(params.get('n_neighbors', 5)),
        metric=params.get('metric', 'cosine'),
        weights=params.get('weights', 'distance')
    )

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Initialize storage for metrics per fold and threshold
    thresholds = np.linspace(0.1, 0.9, 9)
    metrics_per_fold = {fold+1: {threshold: {'f1': 0, 'accuracy': 0, 'precision': 0, 'recall': 0, 'mcc': 0}
                                 for threshold in thresholds}
                        for fold in range(n_splits)}
    successFold = 0

    for fold, (train_index, test_index) in enumerate(skf.split(Z, dataLabelsList)):
        X_train, X_test = Z[train_index], Z[test_index]
        y_train, y_test = dataLabelsList[train_index], dataLabelsList[test_index]

        if sum(y_train) == 0 or sum(y_test) == 0:
            print(f"Skipping fold {fold+1} due to no positive samples in train or test set")
            continue

        # Train the model
        knn_model.fit(X_train, y_train)

        # Predict probabilities on test set
        y_pred_proba = knn_model.predict_proba(X_test)

        # Calculate metrics for each threshold for this fold
        for threshold in thresholds:
            y_pred = (y_pred_proba[:, 1] >= threshold).astype(int)

            # Calculate metrics for this threshold
            f1 = f1_score(y_test, y_pred, zero_division=1)
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, zero_division=1)
            recall = recall_score(y_test, y_pred, zero_division=1)
            mcc = matthews_corrcoef(y_test, y_pred)

            # Store metrics for the current threshold for this fold
            metrics_per_fold[fold+1][threshold]['f1'] = f1
            metrics_per_fold[fold+1][threshold]['accuracy'] = accuracy
            metrics_per_fold[fold+1][threshold]['precision'] = precision
            metrics_per_fold[fold+1][threshold]['recall'] = recall
            metrics_per_fold[fold+1][threshold]['mcc'] = mcc

        successFold += 1

    if successFold == 0:
        print("No valid folds. Exiting.")
        return params, None

    # Save the results for each threshold and fold
    outFile = f"{combination_label}-params-knn-{n_splits}-folds-Threshold-allKfold.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        # Write the header
        fo.write("fold,threshold,accuracy,precision,recall,f1,mcc\n")
        # Write the data for each fold and each threshold
        for fold in range(1, successFold + 1):
            for threshold in thresholds:
                fo.write(f"{fold},{threshold},{metrics_per_fold[fold][threshold]['accuracy']},{metrics_per_fold[fold][threshold]['precision']}," 
                         f"{metrics_per_fold[fold][threshold]['recall']},{metrics_per_fold[fold][threshold]['f1']},{metrics_per_fold[fold][threshold]['mcc']}\n")

    print(f"KNN analysis completed for {successFold} folds. Results saved to: {outFile}")
    return params, metrics_per_fold


if __name__ == "__main__":

    # Update the parameters for KNN with cosine distance and distance-based weights
    params = {
        "n_neighbors": 5,       # Number of neighbors
        "metric": 'cosine',     # Metric to use for distance calculation
        "weights": 'distance'   # Weight points by the inverse of their distance
    }

    # Parameters setup for the first combination
    flakyZip = "compressedDataset/flaky_files.zip"
    nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
    largerNonFlakyZip = "compressedDataset/all_nonflaky_files.zip"

    # Create separate result directories for equal and larger non-flaky combinations
    outDirEqual = "results/equal_flaky_nonflaky/"
    outDirLarger = "results/larger_nonflaky/"
    os.makedirs(outDirEqual, exist_ok=True)
    os.makedirs(outDirLarger, exist_ok=True)

    # Create separate extract directories for each combination to avoid file confusion
    extractDirEqual = "extracted/equal_flaky_nonflaky/"
    extractDirLarger = "extracted/larger_nonflaky/"
    os.makedirs(extractDirEqual, exist_ok=True)
    os.makedirs(extractDirLarger, exist_ok=True)

    # Perform KNN analysis for the first combination (flaky vs smaller non-flaky)
    print("Starting KNN analysis for flaky vs smaller non-flaky files (47 each)...")
    best_params_5folds_1, best_score_5folds_1 = flastThreshold(
        outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 5, dim=100, eps=0.3, combination_label="equal", params=params)
    best_params_3folds_1, best_score_3folds_1 = flastThreshold(
        outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 3, dim=100, eps=0.3, combination_label="equal", params=params)

    print("Best results for 5-fold on equal combination:")
    print(f"Best Parameters: {best_params_5folds_1}")
    print(f"Best F1 Score: {best_score_5folds_1}")

    print("Best results for 3-fold on equal combination:")
    print(f"Best Parameters: {best_params_3folds_1}")
    print(f"Best F1 Score: {best_score_3folds_1}")

    # Perform KNN analysis for the second combination (flaky vs larger non-flaky)
    print("Starting KNN analysis for flaky vs larger non-flaky files...")
    best_params_5folds_2, best_score_5folds_2 = flastThreshold(
        outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 5, dim=100, eps=0.3, combination_label="larger", params=params)
    best_params_3folds_2, best_score_3folds_2 = flastThreshold(
        outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 3, dim=100, eps=0.3, combination_label="larger", params=params)

    print("Best results for 5-fold on larger non-flaky combination:")
    print(f"Best Parameters: {best_params_5folds_2}")
    print(f"Best F1 Score: {best_score_5folds_2}")

    print("Best results for 3-fold on larger non-flaky combination:")
    print(f"Best Parameters: {best_params_3folds_2}")
    print(f"Best F1 Score: {best_score_3folds_2}")


Starting KNN analysis for flaky vs smaller non-flaky files (47 each)...
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
KNN analysis completed for 5 folds. Results saved to: equal-params-knn-5-folds-Threshold-allKfold.csv
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
KNN analysis completed for 3 folds. Results saved to: equal-params-knn-3-folds-Threshold-allKfold.csv
Best results for 5-fold on equal combination:
Best Parameters: {'n_neighbors': 5, 'metric': 'cosine', 'weights': 'distance'}
Best F1 Score: {1: {0.1: {'f1': 0.7142857142857143, 'accuracy': 0.5789473684210527, 'precision': 0.5555555555555556, 'recall': 1.0, 'mcc': 0.24845199749997662}, 0.2: {'f1': 0.6666666666666666, 'accuracy': 0.5789473684210527, 'precision': 0.5714285714285714, 'recall': 0.8, 'mcc': 0.15118578920369088}, 0.30000000000000004: {'f1': 0.6363636363636364, 'accuracy': 0.5789473684210527, 'precision': 0.58333333333333

KNN analysis completed for 5 folds. Results saved to: larger-params-knn-5-folds-Threshold-allKfold.csv
Number of flaky documents: 47
Number of non-flaky documents: 254
Total number of documents: 301
KNN analysis completed for 3 folds. Results saved to: larger-params-knn-3-folds-Threshold-allKfold.csv
Best results for 5-fold on larger non-flaky combination:
Best Parameters: {'n_neighbors': 5, 'metric': 'cosine', 'weights': 'distance'}
Best F1 Score: {1: {0.1: {'f1': 0.6923076923076923, 'accuracy': 0.8688524590163934, 'precision': 0.5625, 'recall': 0.9, 'mcc': 0.6419456570298772}, 0.2: {'f1': 0.7058823529411765, 'accuracy': 0.9180327868852459, 'precision': 0.8571428571428571, 'recall': 0.6, 'mcc': 0.674156495167981}, 0.30000000000000004: {'f1': 0.6666666666666666, 'accuracy': 0.9180327868852459, 'precision': 1.0, 'recall': 0.5, 'mcc': 0.6748015581318281}, 0.4: {'f1': 0.46153846153846156, 'accuracy': 0.8852459016393442, 'precision': 1.0, 'recall': 0.3, 'mcc': 0.51360792775061}, 0.5: {'f1'

SVM

In [13]:
import os
import time
import zipfile
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC  # Using SVM
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef
)
from sklearn.model_selection import StratifiedKFold
from sklearn.random_projection import SparseRandomProjection

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):  # Only consider Python files
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")
    
    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")
    
    return dataPointsList

def flastVectorization(dataPoints, dim=0, eps=0.3):
    """Performs vectorization using CountVectorizer with optional dimensionality reduction."""
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z_full = countVec.fit_transform(dataPoints)
    if eps == 0:
        Z = Z_full
    else:
        if dim <= 0:
            from sklearn.random_projection import johnson_lindenstrauss_min_dim
            dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
        srp = SparseRandomProjection(n_components=dim)
        Z = srp.fit_transform(Z_full)
    return Z

###############################################################################
# SVM with Manual Cross-Validation

def flastThreshold(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, dim, eps, combination_label, params):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)
    
    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")
    
    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorization
    Z = flastVectorization(dataPoints, dim=dim, eps=eps)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define SVM model with the given parameters
    svc_model = SVC(
        C=params.get('C', 1.0),
        kernel=params.get('kernel', 'rbf'),
        degree=params.get('degree', 3),
        gamma=params.get('gamma', 'scale'),
        probability=True,  # Enables probability estimates
        random_state=42
    )

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Initialize storage for metrics per fold and threshold
    thresholds = np.linspace(0.1, 0.9, 9)
    metrics_per_fold = {fold+1: {threshold: {'f1': 0, 'accuracy': 0, 'precision': 0, 'recall': 0, 'mcc': 0}
                                 for threshold in thresholds}
                        for fold in range(n_splits)}
    successFold = 0

    for fold, (train_index, test_index) in enumerate(skf.split(Z, dataLabelsList)):
        X_train, X_test = Z[train_index], Z[test_index]
        y_train, y_test = dataLabelsList[train_index], dataLabelsList[test_index]

        if sum(y_train) == 0 or sum(y_test) == 0:
            print(f"Skipping fold {fold+1} due to no positive samples in train or test set")
            continue

        # Train the model
        svc_model.fit(X_train, y_train)

        # Predict probabilities on test set
        y_pred_proba = svc_model.predict_proba(X_test)

        # Calculate metrics for each threshold for this fold
        for threshold in thresholds:
            y_pred = (y_pred_proba[:, 1] >= threshold).astype(int)

            # Calculate metrics for this threshold
            f1 = f1_score(y_test, y_pred, zero_division=1)
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, zero_division=1)
            recall = recall_score(y_test, y_pred, zero_division=1)
            mcc = matthews_corrcoef(y_test, y_pred)

            # Store metrics for the current threshold for this fold
            metrics_per_fold[fold+1][threshold]['f1'] = f1
            metrics_per_fold[fold+1][threshold]['accuracy'] = accuracy
            metrics_per_fold[fold+1][threshold]['precision'] = precision
            metrics_per_fold[fold+1][threshold]['recall'] = recall
            metrics_per_fold[fold+1][threshold]['mcc'] = mcc

        successFold += 1

    if successFold == 0:
        print("No valid folds. Exiting.")
        return params, None

    # Save the results for each threshold and fold
    outFile = f"{combination_label}-params-svm-{n_splits}-folds-Threshold-allKfold.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        # Write the header
        fo.write("fold,threshold,accuracy,precision,recall,f1,mcc\n")
        # Write the data for each fold and each threshold
        for fold in range(1, successFold + 1):
            for threshold in thresholds:
                fo.write(f"{fold},{threshold},{metrics_per_fold[fold][threshold]['accuracy']},{metrics_per_fold[fold][threshold]['precision']},"
                         f"{metrics_per_fold[fold][threshold]['recall']},{metrics_per_fold[fold][threshold]['f1']},{metrics_per_fold[fold][threshold]['mcc']}\n")

    print(f"SVM analysis completed for {successFold} folds. Results saved to: {outFile}")
    return params, metrics_per_fold


if __name__ == "__main__":

    # Updated SVM parameters
    params = {
        "C": 0.01,  # Lower C for a simpler decision boundary
        "kernel": "linear",  # Linear kernel
        "degree": 3,  # This parameter is ignored for linear kernel
        "gamma": "scale",  # This parameter is ignored for linear kernel
    }

    # Parameters setup for the first combination
    flakyZip = "compressedDataset/flaky_files.zip"
    nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
    largerNonFlakyZip = "compressedDataset/all_nonflaky_files.zip"

    # Create separate result directories for equal and larger non-flaky combinations
    outDirEqual = "results/equal_flaky_nonflaky/"
    outDirLarger = "results/larger_nonflaky/"
    os.makedirs(outDirEqual, exist_ok=True)
    os.makedirs(outDirLarger, exist_ok=True)

    # Create separate extract directories for each combination to avoid file confusion
    extractDirEqual = "extracted/equal_flaky_nonflaky/"
    extractDirLarger = "extracted/larger_nonflaky/"
    os.makedirs(extractDirEqual, exist_ok=True)
    os.makedirs(extractDirLarger, exist_ok=True)

    # Perform SVM analysis for the first combination (flaky vs smaller non-flaky)
    print("Starting SVM analysis for flaky vs smaller non-flaky files (47 each)...")
    best_params_5folds_1, best_score_5folds_1 = flastThreshold(
        outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 5, dim=100, eps=0.3, combination_label="equal", params=params)
    best_params_3folds_1, best_score_3folds_1 = flastThreshold(
        outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 3, dim=100, eps=0.3, combination_label="equal", params=params)

    print("Best results for 5-fold on equal combination:")
    print(f"Best Parameters: {best_params_5folds_1}")
    print(f"Best Metrics: {best_score_5folds_1}")

    print("Best results for 3-fold on equal combination:")
    print(f"Best Parameters: {best_params_3folds_1}")
    print(f"Best Metrics: {best_score_3folds_1}")

    # Perform SVM analysis for the second combination (flaky vs larger non-flaky)
    print("Starting SVM analysis for flaky vs larger non-flaky files...")
    best_params_5folds_2, best_score_5folds_2 = flastThreshold(
        outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 5, dim=100, eps=0.3, combination_label="larger", params=params)
    best_params_3folds_2, best_score_3folds_2 = flastThreshold(
        outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 3, dim=100, eps=0.3, combination_label="larger", params=params)

    print("Best results for 5-fold on larger non-flaky combination:")
    print(f"Best Parameters: {best_params_5folds_2}")
    print(f"Best Metrics: {best_score_5folds_2}")

    print("Best results for 3-fold on larger non-flaky combination:")
    print(f"Best Parameters: {best_params_3folds_2}")
    print(f"Best Metrics: {best_score_3folds_2}")

   


Starting SVM analysis for flaky vs smaller non-flaky files (47 each)...
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
SVM analysis completed for 5 folds. Results saved to: equal-params-svm-5-folds-Threshold-allKfold.csv
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
SVM analysis completed for 3 folds. Results saved to: equal-params-svm-3-folds-Threshold-allKfold.csv
Best results for 5-fold on equal combination:
Best Parameters: {'C': 0.01, 'kernel': 'linear', 'degree': 3, 'gamma': 'scale'}
Best Metrics: {1: {0.1: {'f1': 0.6896551724137931, 'accuracy': 0.5263157894736842, 'precision': 0.5263157894736842, 'recall': 1.0, 'mcc': 0.0}, 0.2: {'f1': 0.6896551724137931, 'accuracy': 0.5263157894736842, 'precision': 0.5263157894736842, 'recall': 1.0, 'mcc': 0.0}, 0.30000000000000004: {'f1': 0.6896551724137931, 'accuracy': 0.5263157894736842, 'precision': 0.5263157894736842, 'recall': 1.0, 'mcc': 0.0}, 

SVM analysis completed for 5 folds. Results saved to: larger-params-svm-5-folds-Threshold-allKfold.csv
Number of flaky documents: 47
Number of non-flaky documents: 254
Total number of documents: 301
SVM analysis completed for 3 folds. Results saved to: larger-params-svm-3-folds-Threshold-allKfold.csv
Best results for 5-fold on larger non-flaky combination:
Best Parameters: {'C': 0.01, 'kernel': 'linear', 'degree': 3, 'gamma': 'scale'}
Best Metrics: {1: {0.1: {'f1': 0.32786885245901637, 'accuracy': 0.32786885245901637, 'precision': 0.19607843137254902, 'recall': 1.0, 'mcc': 0.19607843137254902}, 0.2: {'f1': 0.2857142857142857, 'accuracy': 0.8360655737704918, 'precision': 0.5, 'recall': 0.2, 'mcc': 0.24047024221824384}, 0.30000000000000004: {'f1': 0.3333333333333333, 'accuracy': 0.8688524590163934, 'precision': 1.0, 'recall': 0.2, 'mcc': 0.4157900382791817}, 0.4: {'f1': 0.18181818181818182, 'accuracy': 0.8524590163934426, 'precision': 1.0, 'recall': 0.1, 'mcc': 0.291547594742265}, 0.5: {

NB