In [4]:
# Cell 1

import os
import time
import zipfile
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef
)

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from itertools import product
from sklearn.model_selection import StratifiedKFold



def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")
    
    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")
    
    return dataPointsList

def flastVectorization(dataPoints):
    """Performs vectorization using CountVectorizer."""
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z = countVec.fit_transform(dataPoints)
    return Z

###############################################################################
# Data Extraction and Vectorization

# Paths to your datasets
flakyZip = "compressedDataset/flaky_files.zip"
# You can choose between 'reduced_nonflaky_files.zip' (balanced) or 'all_nonflaky_files.zip' (unbalanced)
nonFlakyZip_equal = "compressedDataset/reduced_nonflaky_files.zip"  # Balanced dataset
nonFlakyZip_larger = "compressedDataset/all_nonflaky_files.zip"     # Unbalanced dataset

# Create directories
outDirEqual = "results/equal_flaky_nonflaky/"
outDirLarger = "results/larger_nonflaky/"
os.makedirs(outDirEqual, exist_ok=True)
os.makedirs(outDirLarger, exist_ok=True)

extractDirEqual = "extracted/equal_flaky_nonflaky/"
extractDirLarger = "extracted/larger_nonflaky/"
os.makedirs(extractDirEqual, exist_ok=True)
os.makedirs(extractDirLarger, exist_ok=True)

# Extract and read data for equal combination
flakyDirEqual = os.path.join(extractDirEqual, 'flaky')
nonFlakyDirEqual = os.path.join(extractDirEqual, 'nonFlaky')
os.makedirs(flakyDirEqual, exist_ok=True)
os.makedirs(nonFlakyDirEqual, exist_ok=True)

extract_zip(flakyZip, flakyDirEqual)
extract_zip(nonFlakyZip_equal, nonFlakyDirEqual)

dataPointsFlakyEqual = getDataPoints(flakyDirEqual)
dataPointsNonFlakyEqual = getDataPoints(nonFlakyDirEqual)
dataPointsEqual = dataPointsFlakyEqual + dataPointsNonFlakyEqual

# Print the number of datasets for equal combination
print(f"Number of flaky documents (equal combination): {len(dataPointsFlakyEqual)}")
print(f"Number of non-flaky documents (equal combination): {len(dataPointsNonFlakyEqual)}")
print(f"Total number of documents (equal combination): {len(dataPointsEqual)}")

dataLabelsListEqual = np.array([1]*len(dataPointsFlakyEqual) + [0]*len(dataPointsNonFlakyEqual))

# Vectorize data
Z_equal = flastVectorization(dataPointsEqual)

print("Shape of vectorized data (equal combination):", Z_equal.shape)

# Extract and read data for larger non-flaky combination
flakyDirLarger = os.path.join(extractDirLarger, 'flaky')
nonFlakyDirLarger = os.path.join(extractDirLarger, 'nonFlaky')
os.makedirs(flakyDirLarger, exist_ok=True)
os.makedirs(nonFlakyDirLarger, exist_ok=True)

extract_zip(flakyZip, flakyDirLarger)
extract_zip(nonFlakyZip_larger, nonFlakyDirLarger)

dataPointsFlakyLarger = getDataPoints(flakyDirLarger)
dataPointsNonFlakyLarger = getDataPoints(nonFlakyDirLarger)
dataPointsLarger = dataPointsFlakyLarger + dataPointsNonFlakyLarger

# Print the number of datasets for larger combination
print(f"Number of flaky documents (larger combination): {len(dataPointsFlakyLarger)}")
print(f"Number of non-flaky documents (larger combination): {len(dataPointsNonFlakyLarger)}")
print(f"Total number of documents (larger combination): {len(dataPointsLarger)}")

dataLabelsListLarger = np.array([1]*len(dataPointsFlakyLarger) + [0]*len(dataPointsNonFlakyLarger))

# Vectorize data
Z_larger = flastVectorization(dataPointsLarger)

print("Shape of vectorized data (larger combination):", Z_larger.shape)

print("Data preprocessing completed.")
def mcc_scorer(estimator, X, y_true):
    """
    Custom scorer function for Matthews Correlation Coefficient.
    """
    y_pred = estimator.predict(X)
    return matthews_corrcoef(y_true, y_pred)

Number of flaky documents (equal combination): 45
Number of non-flaky documents (equal combination): 45
Total number of documents (equal combination): 90
Shape of vectorized data (equal combination): (90, 7563)
Number of flaky documents (larger combination): 45
Number of non-flaky documents (larger combination): 254
Total number of documents (larger combination): 299
Shape of vectorized data (larger combination): (299, 11986)
Data preprocessing completed.


In [5]:

from sklearn.neighbors import KNeighborsClassifier



def flastKNNWithThresholds(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, combination_label, param_grid):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)
    
    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")
    
    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorization without PCA
    Z = flastVectorization(dataPoints)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define StratifiedKFold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Initialize storage for metrics per fold and threshold
    thresholds = np.linspace(0.1, 0.9, 9)
    metrics_per_combination = []

    # Manually iterate over all combinations of hyperparameters
    for params in product(*param_grid.values()):
        # Convert params from tuple to dictionary
        param_dict = dict(zip(param_grid.keys(), params))
        print(f"Training with parameters: {param_dict}")

        # Define the pipeline with PCA and KNN
        pipeline = Pipeline([
            ('pca', PCA(n_components=param_dict['pca__n_components'], random_state=42)),
            ('knn', KNeighborsClassifier(
                n_neighbors=param_dict['n_neighbors'],
                weights=param_dict['weights'],
                metric=param_dict['metric'],
                n_jobs=-1
            )),
        ])

        # Cross-validation
        for fold, (train_index, test_index) in enumerate(skf.split(Z, dataLabelsList)):
            X_train, X_test = Z[train_index], Z[test_index]
            y_train, y_test = dataLabelsList[train_index], dataLabelsList[test_index]

            if sum(y_train) == 0 or sum(y_test) == 0:
                print(f"Skipping fold {fold+1} due to no positive samples in train or test set")
                continue

            # Convert to dense format for PCA
            X_train_dense = X_train.toarray()
            X_test_dense = X_test.toarray()

            # Train the model
            pipeline.fit(X_train_dense, y_train)

            # Predict probabilities on test set
            if hasattr(pipeline.named_steps['knn'], "predict_proba"):
                y_pred_proba = pipeline.predict_proba(X_test_dense)
            else:
                # If predict_proba is not available, use distance-based probabilities
                distances, indices = pipeline.named_steps['knn'].kneighbors(X_test_dense)
                weights = pipeline.named_steps['knn']._get_weights(distances)
                y_pred_proba = np.zeros((X_test_dense.shape[0], 2))
                for i, neighbors in enumerate(indices):
                    neighbor_labels = y_train[neighbors]
                    if weights is None:
                        proba = np.bincount(neighbor_labels, minlength=2) / pipeline.named_steps['knn'].n_neighbors
                    else:
                        proba = np.bincount(neighbor_labels, weights=weights[i], minlength=2) / weights[i].sum()
                    y_pred_proba[i] = proba

            # Calculate metrics for each threshold
            for threshold in thresholds:
                y_pred = (y_pred_proba[:, 1] >= threshold).astype(int)

                # Calculate metrics for this threshold
                f1 = f1_score(y_test, y_pred, zero_division=1)
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, zero_division=1)
                recall = recall_score(y_test, y_pred, zero_division=1)
                mcc = matthews_corrcoef(y_test, y_pred)

                metrics_per_combination.append({
                    'pca__n_components': param_dict['pca__n_components'],
                    'n_neighbors': param_dict['n_neighbors'],
                    'weights': param_dict['weights'],
                    'metric': param_dict['metric'],
                    'fold': fold + 1,
                    'threshold': threshold,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'mcc': mcc
                })

    if len(metrics_per_combination) == 0:
        print("No valid folds. Exiting.")
        return None, None

    # Convert the list of metrics into a DataFrame
    df_results = pd.DataFrame(metrics_per_combination)

    # Save the per-fold results
    outFile = f"{combination_label}-thresholds-knn-results-per-fold.csv"
    df_results.to_csv(os.path.join(outDir, outFile), index=False)

    print(f"Per-fold KNN analysis completed. Results saved to: {outFile}")

    # Compute the average metrics across folds for each combination of hyperparameters and thresholds
    grouped_metrics = df_results.groupby(
        ['pca__n_components', 'n_neighbors', 'weights', 'metric', 'threshold']
    ).agg({
        'accuracy': ['mean', 'std'],
        'precision': ['mean', 'std'],
        'recall': ['mean', 'std'],
        'f1': ['mean', 'std'],
        'mcc': ['mean', 'std']
    }).reset_index()

    # Flatten the MultiIndex columns
    grouped_metrics.columns = [
        '_'.join(col).strip('_') if col[1] else col[0]
        for col in grouped_metrics.columns.values
    ]

    # Save the averaged results
    outFileAvg = f"{combination_label}-thresholds-knn-results-averaged.csv"
    grouped_metrics.to_csv(os.path.join(outDir, outFileAvg), index=False)

    print(f"Averaged results saved to: {outFileAvg}")

    return df_results, grouped_metrics

if __name__ == "__main__":

    #param grid
    param_grid = {
        'pca__n_components': [180,200,220],  
        'n_neighbors': [3, 5, 7, 9, 11, 15, 20],
        'weights': ['uniform', 'distance'],
        'metric': ['cosine', 'euclidean'],
    }

    # Paths to your datasets
    flakyZip = "compressedDataset/flaky_files.zip"   
    nonFlakyZip = "compressedDataset/all_nonflaky_files.zip"  
    
    # Create result and extract directories
    outDir = "results/knn_thresholds/"
    os.makedirs(outDir, exist_ok=True)
    extractDir = "extracted/knn_thresholds/"
    os.makedirs(extractDir, exist_ok=True)

    # Perform KNN analysis with threshold adjustments
    print("Starting KNN analysis with threshold adjustments...")
    df_results, grouped_metrics = flastKNNWithThresholds(
        outDir, flakyZip, nonFlakyZip, extractDir, n_splits=5, combination_label="knn_thresholds", param_grid=param_grid)

    if df_results is not None:
        print("Analysis completed. Per-fold Results:")
        print(df_results.head())

        print("Averaged Results:")
        print(grouped_metrics.head())
    else:
        print("Analysis did not produce any results.")


Starting KNN analysis with threshold adjustments...
Number of flaky documents: 45
Number of non-flaky documents: 254
Total number of documents: 299
Training with parameters: {'pca__n_components': 180, 'n_neighbors': 3, 'weights': 'uniform', 'metric': 'cosine'}
Training with parameters: {'pca__n_components': 180, 'n_neighbors': 3, 'weights': 'uniform', 'metric': 'euclidean'}
Training with parameters: {'pca__n_components': 180, 'n_neighbors': 3, 'weights': 'distance', 'metric': 'cosine'}
Training with parameters: {'pca__n_components': 180, 'n_neighbors': 3, 'weights': 'distance', 'metric': 'euclidean'}
Training with parameters: {'pca__n_components': 180, 'n_neighbors': 5, 'weights': 'uniform', 'metric': 'cosine'}
Training with parameters: {'pca__n_components': 180, 'n_neighbors': 5, 'weights': 'uniform', 'metric': 'euclidean'}
Training with parameters: {'pca__n_components': 180, 'n_neighbors': 5, 'weights': 'distance', 'metric': 'cosine'}
Training with parameters: {'pca__n_components': 1

Training with parameters: {'pca__n_components': 220, 'n_neighbors': 9, 'weights': 'distance', 'metric': 'cosine'}
Training with parameters: {'pca__n_components': 220, 'n_neighbors': 9, 'weights': 'distance', 'metric': 'euclidean'}
Training with parameters: {'pca__n_components': 220, 'n_neighbors': 11, 'weights': 'uniform', 'metric': 'cosine'}
Training with parameters: {'pca__n_components': 220, 'n_neighbors': 11, 'weights': 'uniform', 'metric': 'euclidean'}
Training with parameters: {'pca__n_components': 220, 'n_neighbors': 11, 'weights': 'distance', 'metric': 'cosine'}
Training with parameters: {'pca__n_components': 220, 'n_neighbors': 11, 'weights': 'distance', 'metric': 'euclidean'}
Training with parameters: {'pca__n_components': 220, 'n_neighbors': 15, 'weights': 'uniform', 'metric': 'cosine'}
Training with parameters: {'pca__n_components': 220, 'n_neighbors': 15, 'weights': 'uniform', 'metric': 'euclidean'}
Training with parameters: {'pca__n_components': 220, 'n_neighbors': 15, 'w

## SVM

In [6]:

from sklearn.svm import SVC





def flastSVMWithThresholds(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, combination_label, param_grid):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)
    
    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")
    
    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorization 
    Z = flastVectorization(dataPoints)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define StratifiedKFold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Initialize storage for metrics per fold and threshold
    thresholds = np.linspace(0.1, 0.9, 9)
    metrics_per_combination = []

    # Manually iterate over all combinations of hyperparameters
    for params in product(*param_grid.values()):
        # Convert params from tuple to dictionary
        param_dict = dict(zip(param_grid.keys(), params))
        print(f"Training with parameters: {param_dict}")

        # Define the pipeline
        pipeline = Pipeline([
            ('pca', PCA(n_components=param_dict['pca__n_components'], random_state=42)),
            ('svm', SVC(
                C=param_dict['svm__C'],
                kernel=param_dict['svm__kernel'],
                probability=True,  # Enable probability estimates
                random_state=42
            )),
        ])

        # Cross-validation
        for fold, (train_index, test_index) in enumerate(skf.split(Z, dataLabelsList)):
            X_train, X_test = Z[train_index], Z[test_index]
            y_train, y_test = dataLabelsList[train_index], dataLabelsList[test_index]

            if sum(y_train) == 0 or sum(y_test) == 0:
                print(f"Skipping fold {fold+1} due to no positive samples in train or test set")
                continue

            # Convert to dense format for PCA
            X_train_dense = X_train.toarray()
            X_test_dense = X_test.toarray()

            # Train the model
            pipeline.fit(X_train_dense, y_train)

            # Predict probabilities on test set
            y_pred_proba = pipeline.predict_proba(X_test_dense)

            # Calculate metrics for each threshold
            for threshold in thresholds:
                y_pred = (y_pred_proba[:, 1] >= threshold).astype(int)

                # Calculate metrics for this threshold
                f1 = f1_score(y_test, y_pred, zero_division=1)
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, zero_division=1)
                recall = recall_score(y_test, y_pred, zero_division=1)
                mcc = matthews_corrcoef(y_test, y_pred)

                metrics_per_combination.append({
                    'pca__n_components': param_dict['pca__n_components'],
                    'svm__C': param_dict['svm__C'],
                    'svm__kernel': param_dict['svm__kernel'],
                    'fold': fold + 1,
                    'threshold': threshold,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'mcc': mcc
                })

    if len(metrics_per_combination) == 0:
        print("No valid folds. Exiting.")
        return None, None

    # Convert the list of metrics into a DataFrame
    df_results = pd.DataFrame(metrics_per_combination)

    # Save the per-fold results
    outFile = f"{combination_label}-thresholds-svm-results-per-fold.csv"
    df_results.to_csv(os.path.join(outDir, outFile), index=False)

    print(f"Per-fold SVM analysis completed. Results saved to: {outFile}")

    # Compute the average metrics across folds for each combination of hyperparameters and thresholds
    grouped_metrics = df_results.groupby(
        ['pca__n_components', 'svm__C', 'svm__kernel', 'threshold']
    ).agg({
        'accuracy': ['mean', 'std'],
        'precision': ['mean', 'std'],
        'recall': ['mean', 'std'],
        'f1': ['mean', 'std'],
        'mcc': ['mean', 'std']
    }).reset_index()

    # Flatten the MultiIndex columns
    grouped_metrics.columns = [
        '_'.join(col).strip('_') if col[1] else col[0]
        for col in grouped_metrics.columns.values
    ]

    # Save the averaged results
    outFileAvg = f"{combination_label}-thresholds-svm-results-averaged.csv"
    grouped_metrics.to_csv(os.path.join(outDir, outFileAvg), index=False)

    print(f"Averaged results saved to: {outFileAvg}")

    return df_results, grouped_metrics

if __name__ == "__main__":

    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'pca__n_components': [180,200,220],  
        'svm__C': [0.01, 0.1, 1.0, 10.0, 100.0],  # Regularization parameter
        'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # Kernel types
    }

    # Paths to your datasets
    flakyZip = "compressedDataset/flaky_files.zip"
    nonFlakyZip = "compressedDataset/all_nonflaky_files.zip"  

    # Create result and extract directories
    outDir = "results/svm_thresholds/"
    os.makedirs(outDir, exist_ok=True)
    extractDir = "extracted/svm_thresholds/"
    os.makedirs(extractDir, exist_ok=True)

    # Perform SVM analysis with threshold adjustments
    print("Starting SVM analysis with threshold adjustments...")
    df_results, grouped_metrics = flastSVMWithThresholds(
        outDir, flakyZip, nonFlakyZip, extractDir, n_splits=5, combination_label="svm_thresholds", param_grid=param_grid)

    if df_results is not None:
        print("Analysis completed. Per-fold Results:")
        print(df_results.head())

        print("Averaged Results:")
        print(grouped_metrics.head())
    else:
        print("Analysis did not produce any results.")


Starting SVM analysis with threshold adjustments...
Number of flaky documents: 45
Number of non-flaky documents: 254
Total number of documents: 299
Training with parameters: {'pca__n_components': 180, 'svm__C': 0.01, 'svm__kernel': 'linear'}
Training with parameters: {'pca__n_components': 180, 'svm__C': 0.01, 'svm__kernel': 'rbf'}
Training with parameters: {'pca__n_components': 180, 'svm__C': 0.01, 'svm__kernel': 'poly'}
Training with parameters: {'pca__n_components': 180, 'svm__C': 0.01, 'svm__kernel': 'sigmoid'}
Training with parameters: {'pca__n_components': 180, 'svm__C': 0.1, 'svm__kernel': 'linear'}
Training with parameters: {'pca__n_components': 180, 'svm__C': 0.1, 'svm__kernel': 'rbf'}
Training with parameters: {'pca__n_components': 180, 'svm__C': 0.1, 'svm__kernel': 'poly'}
Training with parameters: {'pca__n_components': 180, 'svm__C': 0.1, 'svm__kernel': 'sigmoid'}
Training with parameters: {'pca__n_components': 180, 'svm__C': 1.0, 'svm__kernel': 'linear'}
Training with param

## NB

In [32]:
import os
import time
import zipfile
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef
)
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from itertools import product


def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")
    
    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")
    
    return dataPointsList

def flastVectorization(dataPoints, dim=0, eps=0.3):
    """Performs vectorization using CountVectorizer with optional dimensionality reduction."""
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z_full = countVec.fit_transform(dataPoints)
    if eps == 0:
        Z = Z_full
    else:
        if dim <= 0:
            from sklearn.random_projection import johnson_lindenstrauss_min_dim
            dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
        from sklearn.random_projection import SparseRandomProjection
        srp = SparseRandomProjection(n_components=dim)
        Z = srp.fit_transform(Z_full)
    return Z


def flastNBWithThresholds(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, dim, eps, combination_label, param_grid):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)
    
    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")
    
    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorization
    Z = flastVectorization(dataPoints, dim=dim, eps=eps)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define StratifiedKFold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Initialize storage for metrics per fold and threshold
    thresholds = np.linspace(0.1, 0.9, 9)
    metrics_per_combination = []

    # Manually iterate over all combinations of hyperparameters
    for params in product(*param_grid.values()):
        # Convert params from tuple to dictionary
        param_dict = dict(zip(param_grid.keys(), params))
        print(f"Training with parameters: {param_dict}")
        
        # Initialize MultinomialNB with current hyperparameter combination
        nb_model = MultinomialNB(
            alpha=param_dict['alpha']
        )
        
        # Cross-validation
        for fold, (train_index, test_index) in enumerate(skf.split(Z, dataLabelsList)):
            X_train, X_test = Z[train_index], Z[test_index]
            y_train, y_test = dataLabelsList[train_index], dataLabelsList[test_index]

            if sum(y_train) == 0 or sum(y_test) == 0:
                print(f"Skipping fold {fold+1} due to no positive samples in train or test set")
                continue

            # Train the model
            nb_model.fit(X_train, y_train)

            # Predict probabilities on test set
            y_pred_proba = nb_model.predict_proba(X_test)

            # Calculate metrics for each threshold
            for threshold in thresholds:
                y_pred = (y_pred_proba[:, 1] >= threshold).astype(int)

                # Calculate metrics for this threshold
                f1 = f1_score(y_test, y_pred, zero_division=1)
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, zero_division=1)
                recall = recall_score(y_test, y_pred, zero_division=1)
                mcc = matthews_corrcoef(y_test, y_pred)

                metrics_per_combination.append({
                    'alpha': param_dict['alpha'],
                    'fold': fold + 1,
                    'threshold': threshold,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'mcc': mcc
                })

    if len(metrics_per_combination) == 0:
        print("No valid folds. Exiting.")
        return None, None

    # Convert the list of metrics into a DataFrame
    df_results = pd.DataFrame(metrics_per_combination)

    # Save the per-fold results
    outFile = f"{combination_label}-thresholds-nb-results-per-fold.csv"
    df_results.to_csv(os.path.join(outDir, outFile), index=False)

    print(f"Per-fold Naive Bayes analysis completed. Results saved to: {outFile}")

    # Compute the average metrics across folds for each combination of hyperparameters and thresholds
    grouped_metrics = df_results.groupby(
        ['alpha', 'threshold']
    ).agg({
        'accuracy': ['mean', 'std'],
        'precision': ['mean', 'std'],
        'recall': ['mean', 'std'],
        'f1': ['mean', 'std'],
        'mcc': ['mean', 'std']
    }).reset_index()

    # Flatten the MultiIndex columns
    grouped_metrics.columns = [
        '_'.join(col).strip('_') if col[1] else col[0]
        for col in grouped_metrics.columns.values
    ]

    # Save the averaged results
    outFileAvg = f"{combination_label}-thresholds-nb-results-averaged.csv"
    grouped_metrics.to_csv(os.path.join(outDir, outFileAvg), index=False)

    print(f"Averaged results saved to: {outFileAvg}")

    return df_results, grouped_metrics

if __name__ == "__main__":

    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'pca__n_components': [0.9, 0.95, 0.99],  
        'alpha': [0.1, 0.5, 1.0],  # Smoothing parameter
    }

    flakyZip = "compressedDataset/flaky_files.zip"

    nonFlakyZip = "compressedDataset/all_nonflaky_files.zip"  # Unbalanced dataset

    outDir = "results/nb_thresholds/"
    os.makedirs(outDir, exist_ok=True)
    extractDir = "extracted/nb_thresholds/"
    os.makedirs(extractDir, exist_ok=True)

    # Perform Naive Bayes analysis with threshold adjustments
    print("Starting NB analysis with threshold adjustments...")
    df_results, grouped_metrics = flastNBWithThresholds(
        outDir, flakyZip, nonFlakyZip, extractDir, n_splits=5, dim=100, eps=0.3,
        combination_label="nb_thresholds", param_grid=param_grid)

    if df_results is not None:
        print("Analysis completed. Per-fold Results:")
        print(df_results.head())

        print("Averaged Results:")
        print(grouped_metrics.head())
    else:
        print("Analysis did not produce any results.")


Starting NB analysis with threshold adjustments...
Number of flaky documents: 45
Number of non-flaky documents: 254
Total number of documents: 299
Training with parameters: {'pca__n_components': 0.9, 'alpha': 0.1}


ValueError: Negative values in data passed to MultinomialNB (input X)

## XGB

In [8]:
import os
import time
import zipfile
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef
)
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from itertools import product
from xgboost import XGBClassifier  # Import XGBoost classifier

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")
    
    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")
    
    return dataPointsList

def flastVectorization(dataPoints):
    """Performs vectorization using CountVectorizer."""
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z = countVec.fit_transform(dataPoints)
    return Z

###############################################################################
# Function to run XGBoost analysis with PCA and threshold adjustments

def flastXGBWithThresholds(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, combination_label, param_grid):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)
    
    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")
    
    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorization
    Z = flastVectorization(dataPoints)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define StratifiedKFold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Initialize storage for metrics per fold and threshold
    thresholds = np.linspace(0.1, 0.9, 9)
    metrics_per_combination = []

    # Manually iterate over all combinations of hyperparameters
    for params in product(*param_grid.values()):
        # Convert params from tuple to dictionary
        param_dict = dict(zip(param_grid.keys(), params))
        print(f"Training with parameters: {param_dict}")
        
        # Define the pipeline with PCA and XGBoost
        pipeline = Pipeline([
            ('pca', PCA(n_components=param_dict['pca__n_components'], random_state=42)),
            ('xgb', XGBClassifier(
                learning_rate=param_dict['xgb__learning_rate'],
                max_depth=param_dict['xgb__max_depth'],
                n_estimators=param_dict['xgb__n_estimators'],
                use_label_encoder=False,
                eval_metric='logloss',
                verbosity=0,
                n_jobs=-1,
                random_state=42
            )),
        ])
        
        # Cross-validation
        for fold, (train_index, test_index) in enumerate(skf.split(Z, dataLabelsList)):
            X_train, X_test = Z[train_index], Z[test_index]
            y_train, y_test = dataLabelsList[train_index], dataLabelsList[test_index]

            if sum(y_train) == 0 or sum(y_test) == 0:
                print(f"Skipping fold {fold+1} due to no positive samples in train or test set")
                continue

            # Convert to dense format for PCA
            X_train_dense = X_train.toarray()
            X_test_dense = X_test.toarray()

            # Train the model
            pipeline.fit(X_train_dense, y_train)

            # Predict probabilities on test set
            y_pred_proba = pipeline.predict_proba(X_test_dense)

            # Calculate metrics for each threshold
            for threshold in thresholds:
                y_pred = (y_pred_proba[:, 1] >= threshold).astype(int)

                # Calculate metrics for this threshold
                f1 = f1_score(y_test, y_pred, zero_division=1)
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, zero_division=1)
                recall = recall_score(y_test, y_pred, zero_division=1)
                mcc = matthews_corrcoef(y_test, y_pred)

                metrics_per_combination.append({
                    'pca__n_components': param_dict['pca__n_components'],
                    'xgb__learning_rate': param_dict['xgb__learning_rate'],
                    'xgb__max_depth': param_dict['xgb__max_depth'],
                    'xgb__n_estimators': param_dict['xgb__n_estimators'],
                    'fold': fold + 1,
                    'threshold': threshold,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'mcc': mcc
                })

    if len(metrics_per_combination) == 0:
        print("No valid folds. Exiting.")
        return None, None

    # Convert the list of metrics into a DataFrame
    df_results = pd.DataFrame(metrics_per_combination)

    # Save the per-fold results
    outFile = f"{combination_label}-thresholds-xgb-results-per-fold.csv"
    df_results.to_csv(os.path.join(outDir, outFile), index=False)

    print(f"Per-fold XGBoost analysis completed. Results saved to: {outFile}")

    # Compute the average metrics across folds for each combination of hyperparameters and thresholds
    grouped_metrics = df_results.groupby(
        ['pca__n_components', 'xgb__learning_rate', 'xgb__max_depth', 'xgb__n_estimators', 'threshold']
    ).agg({
        'accuracy': ['mean', 'std'],
        'precision': ['mean', 'std'],
        'recall': ['mean', 'std'],
        'f1': ['mean', 'std'],
        'mcc': ['mean', 'std']
    }).reset_index()

    # Flatten the MultiIndex columns
    grouped_metrics.columns = [
        '_'.join(col).strip('_') if col[1] else col[0]
        for col in grouped_metrics.columns.values
    ]

    # Save the averaged results
    outFileAvg = f"{combination_label}-thresholds-xgb-results-averaged.csv"
    grouped_metrics.to_csv(os.path.join(outDir, outFileAvg), index=False)

    print(f"Averaged results saved to: {outFileAvg}")

    return df_results, grouped_metrics

if __name__ == "__main__":

    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'pca__n_components': [180, 200, 220],
        'xgb__learning_rate': [0.01, 0.1, 0.3, 0.5],
        'xgb__max_depth': [3, 5, 7, 10],
        'xgb__n_estimators': [50, 100, 200, 300],
    }

    # Paths to your datasets
    flakyZip = "compressedDataset/flaky_files.zip"
    nonFlakyZip = "compressedDataset/all_nonflaky_files.zip"  # Unbalanced dataset

    # Create result and extract directories
    outDir = "results/xgb_thresholds/"
    os.makedirs(outDir, exist_ok=True)
    extractDir = "extracted/xgb_thresholds/"
    os.makedirs(extractDir, exist_ok=True)

    # Perform XGBoost analysis with threshold adjustments
    print("Starting XGBoost analysis with threshold adjustments...")
    df_results, grouped_metrics = flastXGBWithThresholds(
        outDir, flakyZip, nonFlakyZip, extractDir, n_splits=5,
        combination_label="xgb_thresholds", param_grid=param_grid)

    if df_results is not None:
        print("Analysis completed. Per-fold Results:")
        print(df_results.head())

        print("Averaged Results:")
        print(grouped_metrics.head())
    else:
        print("Analysis did not produce any results.")


Starting XGBoost analysis with threshold adjustments...
Number of flaky documents: 45
Number of non-flaky documents: 254
Total number of documents: 299
Training with parameters: {'pca__n_components': 180, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 3, 'xgb__n_estimators': 50}
Training with parameters: {'pca__n_components': 180, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 3, 'xgb__n_estimators': 100}
Training with parameters: {'pca__n_components': 180, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 3, 'xgb__n_estimators': 200}
Training with parameters: {'pca__n_components': 180, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 3, 'xgb__n_estimators': 300}
Training with parameters: {'pca__n_components': 180, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 5, 'xgb__n_estimators': 50}
Training with parameters: {'pca__n_components': 180, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 5, 'xgb__n_estimators': 100}
Training with parameters: {'pca__n_components': 180, 'xgb__learning_rate': 0.01, '

Training with parameters: {'pca__n_components': 200, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 3, 'xgb__n_estimators': 50}
Training with parameters: {'pca__n_components': 200, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 3, 'xgb__n_estimators': 100}
Training with parameters: {'pca__n_components': 200, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 3, 'xgb__n_estimators': 200}
Training with parameters: {'pca__n_components': 200, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 3, 'xgb__n_estimators': 300}
Training with parameters: {'pca__n_components': 200, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 5, 'xgb__n_estimators': 50}
Training with parameters: {'pca__n_components': 200, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 5, 'xgb__n_estimators': 100}
Training with parameters: {'pca__n_components': 200, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 5, 'xgb__n_estimators': 200}
Training with parameters: {'pca__n_components': 200, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 5, 'xgb

Training with parameters: {'pca__n_components': 220, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 3, 'xgb__n_estimators': 100}
Training with parameters: {'pca__n_components': 220, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 3, 'xgb__n_estimators': 200}
Training with parameters: {'pca__n_components': 220, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 3, 'xgb__n_estimators': 300}
Training with parameters: {'pca__n_components': 220, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 5, 'xgb__n_estimators': 50}
Training with parameters: {'pca__n_components': 220, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 5, 'xgb__n_estimators': 100}
Training with parameters: {'pca__n_components': 220, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 5, 'xgb__n_estimators': 200}
Training with parameters: {'pca__n_components': 220, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 5, 'xgb__n_estimators': 300}
Training with parameters: {'pca__n_components': 220, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 7, 'xg

## Random Forest (its gonna take really long)

In [9]:
from sklearn.ensemble import RandomForestClassifier




def flastVectorization(dataPoints):
    """Performs vectorization using CountVectorizer."""
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z = countVec.fit_transform(dataPoints)
    return Z


def flastRFWithThresholds(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, combination_label, param_grid):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)
    
    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")
    
    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorization without PCA
    Z = flastVectorization(dataPoints)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define StratifiedKFold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Initialize storage for metrics per fold and threshold
    thresholds = np.linspace(0.1, 0.9, 9)
    metrics_per_combination = []

    # Manually iterate over all combinations of hyperparameters
    for params in product(*param_grid.values()):
        # Convert params from tuple to dictionary
        param_dict = dict(zip(param_grid.keys(), params))
        print(f"Training with parameters: {param_dict}")

        # Define the pipeline
        pipeline = Pipeline([
            ('pca', PCA(n_components=param_dict['pca__n_components'], random_state=42)),
            ('rf', RandomForestClassifier(
                n_estimators=param_dict['rf__n_estimators'],
                criterion=param_dict['rf__criterion'],
                max_depth=param_dict['rf__max_depth'],
                min_samples_split=param_dict['rf__min_samples_split'],
                min_samples_leaf=param_dict['rf__min_samples_leaf'],
                random_state=42,
                n_jobs=-1
            )),
        ])

        # Cross-validation
        for fold, (train_index, test_index) in enumerate(skf.split(Z, dataLabelsList)):
            X_train, X_test = Z[train_index], Z[test_index]
            y_train, y_test = dataLabelsList[train_index], dataLabelsList[test_index]

            if sum(y_train) == 0 or sum(y_test) == 0:
                print(f"Skipping fold {fold+1} due to no positive samples in train or test set")
                continue

            # Convert to dense format for PCA
            X_train_dense = X_train.toarray()
            X_test_dense = X_test.toarray()

            # Train the model
            pipeline.fit(X_train_dense, y_train)

            # Predict probabilities on test set
            y_pred_proba = pipeline.predict_proba(X_test_dense)

            # Calculate metrics for each threshold
            for threshold in thresholds:
                y_pred = (y_pred_proba[:, 1] >= threshold).astype(int)

                # Calculate metrics for this threshold
                f1 = f1_score(y_test, y_pred, zero_division=1)
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, zero_division=1)
                recall = recall_score(y_test, y_pred, zero_division=1)
                mcc = matthews_corrcoef(y_test, y_pred)

                metrics_per_combination.append({
                    'pca__n_components': param_dict['pca__n_components'],
                    'rf__n_estimators': param_dict['rf__n_estimators'],
                    'rf__criterion': param_dict['rf__criterion'],
                    'rf__max_depth': param_dict['rf__max_depth'],
                    'rf__min_samples_split': param_dict['rf__min_samples_split'],
                    'rf__min_samples_leaf': param_dict['rf__min_samples_leaf'],
                    'fold': fold + 1,
                    'threshold': threshold,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'mcc': mcc
                })

    if len(metrics_per_combination) == 0:
        print("No valid folds. Exiting.")
        return None, None

    # Convert the list of metrics into a DataFrame
    df_results = pd.DataFrame(metrics_per_combination)

    # Save the per-fold results
    outFile = f"{combination_label}-thresholds-rf-results-per-fold.csv"
    df_results.to_csv(os.path.join(outDir, outFile), index=False)

    print(f"Per-fold Random Forest analysis completed. Results saved to: {outFile}")

    # Compute the average metrics across folds for each combination of hyperparameters and thresholds
    grouped_metrics = df_results.groupby(
        ['pca__n_components', 'rf__n_estimators', 'rf__criterion', 'rf__max_depth', 'rf__min_samples_split', 'rf__min_samples_leaf', 'threshold']
    ).agg({
        'accuracy': ['mean', 'std'],
        'precision': ['mean', 'std'],
        'recall': ['mean', 'std'],
        'f1': ['mean', 'std'],
        'mcc': ['mean', 'std']
    }).reset_index()

    # Flatten the MultiIndex columns
    grouped_metrics.columns = [
        '_'.join(col).strip('_') if col[1] else col[0] 
        for col in grouped_metrics.columns.values
    ]

    # Save the averaged results
    outFileAvg = f"{combination_label}-thresholds-rf-results-averaged.csv"
    grouped_metrics.to_csv(os.path.join(outDir, outFileAvg), index=False)

    print(f"Averaged results saved to: {outFileAvg}")

    return df_results, grouped_metrics

if __name__ == "__main__":
    #parameter grid
    param_grid = {
        'pca__n_components': [180,200,220],  
        'rf__n_estimators': [10, 50, 100, 300, 500],  
        'rf__criterion': ['gini', 'entropy'],   
        'rf__max_depth': [10, 30, 50, 100, 300, 500],  
        'rf__min_samples_split': [2, 5],       
        'rf__min_samples_leaf': [1, 2],        
    }

    # Paths to your datasets
    flakyZip = "compressedDataset/flaky_files.zip"
    nonFlakyZip = "compressedDataset/all_nonflaky_files.zip"  

    # Create result and extract directories
    outDir = "results/rf_thresholds/"
    os.makedirs(outDir, exist_ok=True)
    extractDir = "extracted/rf_thresholds/"
    os.makedirs(extractDir, exist_ok=True)

    # Perform Random Forest analysis with threshold adjustments
    print("Starting Random Forest analysis with threshold adjustments...")
    df_results, grouped_metrics = flastRFWithThresholds(
        outDir, flakyZip, nonFlakyZip, extractDir, n_splits=5,
        combination_label="rf_thresholds", param_grid=param_grid)

    if df_results is not None:
        print("Analysis completed. Per-fold Results:")
        print(df_results.head())

        print("Averaged Results:")
        print(grouped_metrics.head())
    else:
        print("Analysis did not produce any results.")


Starting Random Forest analysis with threshold adjustments...
Number of flaky documents: 45
Number of non-flaky documents: 254
Total number of documents: 299
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 10, 'rf__criterion': 'gini', 'rf__max_depth': 10, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 10, 'rf__criterion': 'gini', 'rf__max_depth': 10, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 10, 'rf__criterion': 'gini', 'rf__max_depth': 10, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 10, 'rf__criterion': 'gini', 'rf__max_depth': 10, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 10, 'rf__criterion': 'gini', 'rf__max_depth': 30, 'rf__mi

Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 10, 'rf__criterion': 'entropy', 'rf__max_depth': 500, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 10, 'rf__criterion': 'entropy', 'rf__max_depth': 500, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 10, 'rf__criterion': 'entropy', 'rf__max_depth': 500, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 50, 'rf__criterion': 'gini', 'rf__max_depth': 10, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 50, 'rf__criterion': 'gini', 'rf__max_depth': 10, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 50, 'rf__criterion': 'gin

Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 50, 'rf__criterion': 'entropy', 'rf__max_depth': 300, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 50, 'rf__criterion': 'entropy', 'rf__max_depth': 500, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 50, 'rf__criterion': 'entropy', 'rf__max_depth': 500, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 50, 'rf__criterion': 'entropy', 'rf__max_depth': 500, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 50, 'rf__criterion': 'entropy', 'rf__max_depth': 500, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 100, 'rf__criteri

Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 100, 'rf__criterion': 'entropy', 'rf__max_depth': 300, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 100, 'rf__criterion': 'entropy', 'rf__max_depth': 300, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 100, 'rf__criterion': 'entropy', 'rf__max_depth': 300, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 100, 'rf__criterion': 'entropy', 'rf__max_depth': 500, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 100, 'rf__criterion': 'entropy', 'rf__max_depth': 500, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 100, 'rf__cr

Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 300, 'rf__criterion': 'entropy', 'rf__max_depth': 100, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 300, 'rf__criterion': 'entropy', 'rf__max_depth': 300, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 300, 'rf__criterion': 'entropy', 'rf__max_depth': 300, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 300, 'rf__criterion': 'entropy', 'rf__max_depth': 300, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 300, 'rf__criterion': 'entropy', 'rf__max_depth': 300, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 300, 'rf__cr

Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 500, 'rf__criterion': 'entropy', 'rf__max_depth': 100, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 500, 'rf__criterion': 'entropy', 'rf__max_depth': 100, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 500, 'rf__criterion': 'entropy', 'rf__max_depth': 100, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 500, 'rf__criterion': 'entropy', 'rf__max_depth': 300, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 500, 'rf__criterion': 'entropy', 'rf__max_depth': 300, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 180, 'rf__n_estimators': 500, 'rf__cr

Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 10, 'rf__criterion': 'entropy', 'rf__max_depth': 50, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 10, 'rf__criterion': 'entropy', 'rf__max_depth': 100, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 10, 'rf__criterion': 'entropy', 'rf__max_depth': 100, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 10, 'rf__criterion': 'entropy', 'rf__max_depth': 100, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 10, 'rf__criterion': 'entropy', 'rf__max_depth': 100, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 10, 'rf__criterion

Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 50, 'rf__criterion': 'entropy', 'rf__max_depth': 50, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 50, 'rf__criterion': 'entropy', 'rf__max_depth': 50, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 50, 'rf__criterion': 'entropy', 'rf__max_depth': 50, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 50, 'rf__criterion': 'entropy', 'rf__max_depth': 100, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 50, 'rf__criterion': 'entropy', 'rf__max_depth': 100, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 50, 'rf__criterion':

Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 100, 'rf__criterion': 'entropy', 'rf__max_depth': 30, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 100, 'rf__criterion': 'entropy', 'rf__max_depth': 50, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 100, 'rf__criterion': 'entropy', 'rf__max_depth': 50, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 100, 'rf__criterion': 'entropy', 'rf__max_depth': 50, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 100, 'rf__criterion': 'entropy', 'rf__max_depth': 50, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 100, 'rf__criteri

Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 300, 'rf__criterion': 'entropy', 'rf__max_depth': 30, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 300, 'rf__criterion': 'entropy', 'rf__max_depth': 30, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 300, 'rf__criterion': 'entropy', 'rf__max_depth': 30, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 300, 'rf__criterion': 'entropy', 'rf__max_depth': 50, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 300, 'rf__criterion': 'entropy', 'rf__max_depth': 50, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 300, 'rf__criteri

Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 500, 'rf__criterion': 'entropy', 'rf__max_depth': 10, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 500, 'rf__criterion': 'entropy', 'rf__max_depth': 30, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 500, 'rf__criterion': 'entropy', 'rf__max_depth': 30, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 500, 'rf__criterion': 'entropy', 'rf__max_depth': 30, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 500, 'rf__criterion': 'entropy', 'rf__max_depth': 30, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 200, 'rf__n_estimators': 500, 'rf__criteri

Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 10, 'rf__criterion': 'entropy', 'rf__max_depth': 10, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 10, 'rf__criterion': 'entropy', 'rf__max_depth': 10, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 10, 'rf__criterion': 'entropy', 'rf__max_depth': 10, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 10, 'rf__criterion': 'entropy', 'rf__max_depth': 30, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 10, 'rf__criterion': 'entropy', 'rf__max_depth': 30, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 10, 'rf__criterion': '

Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 50, 'rf__criterion': 'gini', 'rf__max_depth': 500, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 50, 'rf__criterion': 'entropy', 'rf__max_depth': 10, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 50, 'rf__criterion': 'entropy', 'rf__max_depth': 10, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 50, 'rf__criterion': 'entropy', 'rf__max_depth': 10, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 50, 'rf__criterion': 'entropy', 'rf__max_depth': 10, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 50, 'rf__criterion': 'en

Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 100, 'rf__criterion': 'gini', 'rf__max_depth': 500, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 100, 'rf__criterion': 'gini', 'rf__max_depth': 500, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 100, 'rf__criterion': 'gini', 'rf__max_depth': 500, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 100, 'rf__criterion': 'entropy', 'rf__max_depth': 10, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 100, 'rf__criterion': 'entropy', 'rf__max_depth': 10, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 100, 'rf__criterion': '

Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 300, 'rf__criterion': 'gini', 'rf__max_depth': 300, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 300, 'rf__criterion': 'gini', 'rf__max_depth': 500, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 300, 'rf__criterion': 'gini', 'rf__max_depth': 500, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 300, 'rf__criterion': 'gini', 'rf__max_depth': 500, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 300, 'rf__criterion': 'gini', 'rf__max_depth': 500, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 300, 'rf__criterion': 'entr

Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 500, 'rf__criterion': 'gini', 'rf__max_depth': 300, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 500, 'rf__criterion': 'gini', 'rf__max_depth': 300, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 500, 'rf__criterion': 'gini', 'rf__max_depth': 300, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 500, 'rf__criterion': 'gini', 'rf__max_depth': 500, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 500, 'rf__criterion': 'gini', 'rf__max_depth': 500, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2}
Training with parameters: {'pca__n_components': 220, 'rf__n_estimators': 500, 'rf__criterion': 'gini

## Decision Tree (this took longer soooo long)

In [10]:
from sklearn.tree import DecisionTreeClassifier

def flastDTWithThresholds(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, combination_label, param_grid):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)
    
    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")
    
    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorization without PCA
    Z = flastVectorization(dataPoints)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define StratifiedKFold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Initialize storage for metrics per fold and threshold
    thresholds = np.linspace(0.1, 0.9, 9)
    metrics_per_combination = []

    # Manually iterate over all combinations of hyperparameters
    for params in product(*param_grid.values()):
        # Convert params from tuple to dictionary
        param_dict = dict(zip(param_grid.keys(), params))
        print(f"Training with parameters: {param_dict}")

        # Define the pipeline with PCA and Decision Tree
        pipeline = Pipeline([
            ('pca', PCA(n_components=param_dict['pca__n_components'], random_state=42)),
            ('dt', DecisionTreeClassifier(
                criterion=param_dict['dt__criterion'],
                max_depth=param_dict['dt__max_depth'],
                min_samples_split=param_dict['dt__min_samples_split'],
                min_samples_leaf=param_dict['dt__min_samples_leaf'],
                max_features=param_dict['dt__max_features'],
                random_state=42,
            )),
        ])

        # Cross-validation
        for fold, (train_index, test_index) in enumerate(skf.split(Z, dataLabelsList)):
            X_train, X_test = Z[train_index], Z[test_index]
            y_train, y_test = dataLabelsList[train_index], dataLabelsList[test_index]

            if sum(y_train) == 0 or sum(y_test) == 0:
                print(f"Skipping fold {fold+1} due to no positive samples in train or test set")
                continue

            # Convert to dense format for PCA
            X_train_dense = X_train.toarray()
            X_test_dense = X_test.toarray()

            # Train the model
            pipeline.fit(X_train_dense, y_train)

            # Predict probabilities on test set
            if hasattr(pipeline.named_steps['dt'], "predict_proba"):
                y_pred_proba = pipeline.predict_proba(X_test_dense)
            else:
                # If predict_proba is not available, use decision_function
                y_scores = pipeline.decision_function(X_test_dense)
                y_pred_proba = np.vstack([1 - y_scores, y_scores]).T

            # Calculate metrics for each threshold
            for threshold in thresholds:
                y_pred = (y_pred_proba[:, 1] >= threshold).astype(int)

                # Calculate metrics for this threshold
                f1 = f1_score(y_test, y_pred, zero_division=1)
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, zero_division=1)
                recall = recall_score(y_test, y_pred, zero_division=1)
                mcc = matthews_corrcoef(y_test, y_pred)

                metrics_per_combination.append({
                    'pca__n_components': param_dict['pca__n_components'],
                    'dt__criterion': param_dict['dt__criterion'],
                    'dt__max_depth': param_dict['dt__max_depth'],
                    'dt__min_samples_split': param_dict['dt__min_samples_split'],
                    'dt__min_samples_leaf': param_dict['dt__min_samples_leaf'],
                    'dt__max_features': param_dict['dt__max_features'],
                    'fold': fold + 1,
                    'threshold': threshold,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'mcc': mcc
                })

    if len(metrics_per_combination) == 0:
        print("No valid folds. Exiting.")
        return None, None

    # Convert the list of metrics into a DataFrame
    df_results = pd.DataFrame(metrics_per_combination)

    # Save the per-fold results
    outFile = f"{combination_label}-thresholds-dt-results-per-fold.csv"
    df_results.to_csv(os.path.join(outDir, outFile), index=False)

    print(f"Per-fold Decision Tree analysis completed. Results saved to: {outFile}")

    # Compute the average metrics across folds for each combination of hyperparameters and thresholds
    grouped_metrics = df_results.groupby(
        ['pca__n_components', 'dt__criterion', 'dt__max_depth', 'dt__min_samples_split', 'dt__min_samples_leaf', 'dt__max_features', 'threshold']
    ).agg({
        'accuracy': ['mean', 'std'],
        'precision': ['mean', 'std'],
        'recall': ['mean', 'std'],
        'f1': ['mean', 'std'],
        'mcc': ['mean', 'std']
    }).reset_index()

    # Flatten the MultiIndex columns
    grouped_metrics.columns = [
        '_'.join(col).strip('_') if col[1] else col[0] 
        for col in grouped_metrics.columns.values
    ]

    # Save the averaged results
    outFileAvg = f"{combination_label}-thresholds-dt-results-averaged.csv"
    grouped_metrics.to_csv(os.path.join(outDir, outFileAvg), index=False)

    print(f"Averaged results saved to: {outFileAvg}")

    return df_results, grouped_metrics

if __name__ == "__main__":

    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'pca__n_components': [180,200,220],  
        'dt__criterion': ['gini', 'entropy'], 
        'dt__max_depth': [None, 10, 30, 50, 100, 300, 500],  
        'dt__min_samples_split': [2, 5, 10], 
        'dt__min_samples_leaf': [1, 2, 5, 10],  
        'dt__max_features': [None, 'sqrt', 'log2'],  
    }

    # Paths to your datasets
    flakyZip = "compressedDataset/flaky_files.zip"
    nonFlakyZip = "compressedDataset/all_nonflaky_files.zip"  # Unbalanced dataset

    # Create result and extract directories
    outDir = "results/dt_thresholds/"
    os.makedirs(outDir, exist_ok=True)
    extractDir = "extracted/dt_thresholds/"
    os.makedirs(extractDir, exist_ok=True)

    # Perform Decision Tree analysis with threshold adjustments
    print("Starting Decision Tree analysis with threshold adjustments...")
    df_results, grouped_metrics = flastDTWithThresholds(
        outDir, flakyZip, nonFlakyZip, extractDir, n_splits=5,
        combination_label="dt_thresholds", param_grid=param_grid)

    if df_results is not None:
        print("Analysis completed. Per-fold Results:")
        print(df_results.head())

        print("Averaged Results:")
        print(grouped_metrics.head())
    else:
        print("Analysis did not produce any results.")


Starting Decision Tree analysis with threshold adjustments...
Number of flaky documents: 45
Number of non-flaky documents: 254
Total number of documents: 299
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 1, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 1, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 1, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_

Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 10, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 5, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 10, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 10, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 10, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 10, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 10, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 10, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 10, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_dept

Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 30, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 5, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 30, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 5, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 30, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 5, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 30, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 10, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 30, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 10, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth

Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 50, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 2, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 50, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 2, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 50, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 50, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 50, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_

Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 300, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 1, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 300, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 1, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 300, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 300, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 300, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_

Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 500, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 10, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 500, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 10, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 500, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 500, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__max_depth': 500, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'gini', 'dt__ma

Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': None, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 5, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': None, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 5, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': None, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 10, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': None, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 10, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': None, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 10, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 180, 'dt__criter

Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 2, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 2, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'e

Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 50, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 1, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 50, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 1, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 50, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 50, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 50, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entrop

Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 100, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 10, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 100, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 10, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 100, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 100, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 100, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 

Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 300, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 5, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 300, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 5, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 300, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 10, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 300, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 10, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 300, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 10, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion':

Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 500, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 2, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 500, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 2, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 500, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 500, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion': 'entropy', 'dt__max_depth': 500, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 180, 'dt__criterion

Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 10, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 1, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 10, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 1, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 10, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 10, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 10, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth

Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 30, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 10, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 30, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 30, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 30, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 30, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 2, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth'

Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 50, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 10, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 50, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 10, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 50, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 10, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 50, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 1, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 50, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 1, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_de

Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 100, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 100, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 100, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 100, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 10, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 100, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 10, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt_

Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 500, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 500, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 500, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 500, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 5, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_depth': 500, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 5, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'gini', 'dt__max_de

Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': None, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': None, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': None, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': None, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 2, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': None, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 2, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion':

Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 10, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 10, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 10, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 1, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 1, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'ent

Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 30, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 30, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 30, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 30, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 10, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 30, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 10, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'e

Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 100, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 100, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 100, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 100, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 5, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 100, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 5, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'ent

Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 300, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 300, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 300, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 300, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 2, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 300, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 2, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'ent

Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 500, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 10, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 500, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 10, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 500, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 10, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 500, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 1, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 200, 'dt__criterion': 'entropy', 'dt__max_depth': 500, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 1, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 200, 'dt__criterion':

Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 10, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 10, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini',

Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': 30, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': 30, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': 30, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': 30, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 5, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': 30, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 5, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth':

Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': 50, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': 50, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': 50, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 2, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': 50, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 2, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': 50, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 2, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth

Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': 100, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 10, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': 100, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 10, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': 100, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 1, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': 100, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 1, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': 100, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 1, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt_

Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': 300, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': 300, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': 300, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 10, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': 300, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 10, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', 'dt__max_depth': 300, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 10, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'gini', '

Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': None, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': None, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': None, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 5, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': None, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 5, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': None, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 5, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion

Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 2, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 2, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy'

Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 30, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 10, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 30, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 10, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 30, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 10, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 30, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 1, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 30, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 1, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'ent

Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 50, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 50, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 50, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 5, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 50, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 10, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 50, 'dt__min_samples_split': 10, 'dt__min_samples_leaf': 10, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'e

Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 300, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 300, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 300, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 2, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 300, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 5, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 300, 'dt__min_samples_split': 2, 'dt__min_samples_leaf': 5, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'ent

Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 500, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 500, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 500, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 1, 'dt__max_features': 'log2'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 500, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 2, 'dt__max_features': None}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'entropy', 'dt__max_depth': 500, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 2, 'dt__max_features': 'sqrt'}
Training with parameters: {'pca__n_components': 220, 'dt__criterion': 'ent

In [11]:
import os
import pandas as pd

# Define the models with their directories and abbreviations
models = {
    'Decision Tree': {'dir': 'dt_thresholds', 'abbr': 'dt'},
    'SVM': {'dir': 'svm_thresholds', 'abbr': 'svm'},
    'KNN': {'dir': 'knn_thresholds', 'abbr': 'knn'},
    'Random Forest': {'dir': 'rf_thresholds', 'abbr': 'rf'},
    'Naive Bayes': {'dir': 'nb_thresholds', 'abbr': 'nb'},
    'XGBoost': {'dir': 'xgb_thresholds', 'abbr': 'xgb'}
}

# Base directory where the results are stored
base_results_dir = 'results'

# Lists to store the best per-fold and averaged results
best_per_fold_results = []
best_averaged_results = []

# Iterate over each model
for model_name, model_info in models.items():
    model_dir = model_info['dir']
    model_abbr = model_info['abbr']
    # Paths to the per-fold and averaged results
    per_fold_csv = os.path.join(base_results_dir, model_dir, f'{model_dir}-thresholds-{model_abbr}-results-per-fold.csv')
    averaged_csv = os.path.join(base_results_dir, model_dir, f'{model_dir}-thresholds-{model_abbr}-results-averaged.csv')

    # Check if the per-fold results file exists
    if os.path.exists(per_fold_csv):
        # Read the per-fold results
        df_per_fold = pd.read_csv(per_fold_csv)
        if not df_per_fold.empty:
            # Find the row with the highest F1 score
            best_row_per_fold = df_per_fold.loc[df_per_fold['f1'].idxmax()]
            # Add model name to the results
            best_row_per_fold['Model'] = model_name
            # Append to the list
            best_per_fold_results.append(best_row_per_fold)
        else:
            print(f"Per-fold results for {model_name} are empty.")
    else:
        print(f"Per-fold results file for {model_name} does not exist: {per_fold_csv}")

    # Check if the averaged results file exists
    if os.path.exists(averaged_csv):
        # Read the averaged results
        df_averaged = pd.read_csv(averaged_csv)
        if not df_averaged.empty:
            # Find the row with the highest mean F1 score
            # Assuming the columns are named like 'f1_mean' for mean and 'f1_std' for std deviation
            best_row_averaged = df_averaged.loc[df_averaged['f1_mean'].idxmax()]
            # Add model name to the results
            best_row_averaged['Model'] = model_name
            # Append to the list
            best_averaged_results.append(best_row_averaged)
        else:
            print(f"Averaged results for {model_name} are empty.")
    else:
        print(f"Averaged results file for {model_name} does not exist: {averaged_csv}")

# Convert the lists to DataFrames
if best_per_fold_results:
    df_best_per_fold = pd.DataFrame(best_per_fold_results)
    # Reorder columns to have 'Model' first
    cols = ['Model'] + [col for col in df_best_per_fold.columns if col != 'Model']
    df_best_per_fold = df_best_per_fold[cols]
    # Save the best per-fold results to a CSV file
    df_best_per_fold.to_csv('best_per_fold_results.csv', index=False)
    print("Best per-fold results saved to 'best_per_fold_results.csv'")
else:
    print("No per-fold results to save.")

if best_averaged_results:
    df_best_averaged = pd.DataFrame(best_averaged_results)
    # Reorder columns to have 'Model' first
    cols = ['Model'] + [col for col in df_best_averaged.columns if col != 'Model']
    df_best_averaged = df_best_averaged[cols]
    # Save the best averaged results to a CSV file
    df_best_averaged.to_csv('best_averaged_results.csv', index=False)
    print("Best averaged results saved to 'best_averaged_results.csv'")
else:
    print("No averaged results to save.")


Per-fold results file for Naive Bayes does not exist: results\nb_thresholds\nb_thresholds-thresholds-nb-results-per-fold.csv
Averaged results file for Naive Bayes does not exist: results\nb_thresholds\nb_thresholds-thresholds-nb-results-averaged.csv
Best per-fold results saved to 'best_per_fold_results.csv'
Best averaged results saved to 'best_averaged_results.csv'


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_row_per_fold['Model'] = model_name
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_row_per_fold['Model'] = model_name
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_row_averaged['Model'] = model_name
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_row_averaged['Model'] = model_name
A value 