In [1]:
import os
import time
import zipfile
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import os
import time
import math
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.random_projection import SparseRandomProjection
import json


###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):  # Only consider Python files
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")

    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList

###############################################################################
# Data Extraction and Vectorization

# Parameters setup
flakyZip = 'Dataset/flaky_files.zip'
nonFlakyZip = 'Dataset/reduced_nonflaky_files.zip'
imbalanceNonFlakyZip = 'Dataset/nonflaky_files.zip'


#save best hyperparameter
best_hyperparameter = 'results/best_hyperparameter'
os.makedirs(best_hyperparameter,exist_ok=True)

# Create directories
outDirbalance = "results/balance/"
outDirimbalance = "results/imbalance/"
os.makedirs(outDirbalance, exist_ok=True)
os.makedirs(outDirimbalance, exist_ok=True)

extractDirbalance = "extracted/balance/"
extractDirimbalance = "extracted/imbalance/"
os.makedirs(extractDirbalance, exist_ok=True)
os.makedirs(extractDirimbalance, exist_ok=True)

# Extract and read data once for balance combination
flakyDirbalance = os.path.join(extractDirbalance, 'flaky')
nonFlakyDirbalance = os.path.join(extractDirbalance, 'nonFlaky')
os.makedirs(flakyDirbalance, exist_ok=True)
os.makedirs(nonFlakyDirbalance, exist_ok=True)

extract_zip(flakyZip, flakyDirbalance)
extract_zip(nonFlakyZip, nonFlakyDirbalance)

dataPointsFlakybalance = getDataPoints(flakyDirbalance)
dataPointsNonFlakybalance = getDataPoints(nonFlakyDirbalance)
dataPointsbalance = dataPointsFlakybalance + dataPointsNonFlakybalance

# Print the number of datasets for balance combination
print(f"Number of flaky documents (balance combination): {len(dataPointsFlakybalance)}")
print(f"Number of non-flaky documents (balance combination): {len(dataPointsNonFlakybalance)}")
print(f"Total number of documents (balance combination): {len(dataPointsbalance)}")

dataLabelsListbalance = np.array([1]*len(dataPointsFlakybalance) + [0]*len(dataPointsNonFlakybalance))

# Extract and read data once for imbalance non-flaky combination
flakyDirimbalance = os.path.join(extractDirimbalance, 'flaky')
nonFlakyDirimbalance = os.path.join(extractDirimbalance, 'nonFlaky')
os.makedirs(flakyDirimbalance, exist_ok=True)
os.makedirs(nonFlakyDirimbalance, exist_ok=True)

extract_zip(flakyZip, flakyDirimbalance)
extract_zip(imbalanceNonFlakyZip, nonFlakyDirimbalance)

dataPointsFlakyimbalance = getDataPoints(flakyDirimbalance)
dataPointsNonFlakyimbalance = getDataPoints(nonFlakyDirimbalance)
dataPointsimbalance = dataPointsFlakyimbalance + dataPointsNonFlakyimbalance

# Print the number of datasets for imbalance combination
print(f"Number of flaky documents (imbalance combination): {len(dataPointsFlakyimbalance)}")
print(f"Number of non-flaky documents (imbalance combination): {len(dataPointsNonFlakyimbalance)}")
print(f"Total number of documents (imbalance combination): {len(dataPointsimbalance)}")

dataLabelsListimbalance = np.array([1]*len(dataPointsFlakyimbalance) + [0]*len(dataPointsNonFlakyimbalance))


Number of flaky documents (balance combination): 45
Number of non-flaky documents (balance combination): 45
Total number of documents (balance combination): 90
Number of flaky documents (imbalance combination): 45
Number of non-flaky documents (imbalance combination): 243
Total number of documents (imbalance combination): 288


## KNN ##

In [2]:

from sklearn.neighbors import KNeighborsClassifier


# Custom MCC scorer function
def mcc_scorer(estimator, X, y_true):
    """
    Custom scorer function for Matthews Correlation Coefficient.
    """
    y_pred = estimator.predict(X)
    return matthews_corrcoef(y_true, y_pred)

def runKNN(dataPoints, dataLabelsList, outDir, n_splits, combination_label):
    v0 = time.perf_counter()

    # Define the pipeline with CountVectorizer, PCA, and KNN
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(stop_words=None)),
        ('pca', PCA(random_state=42)),
        ('knn', KNeighborsClassifier())
    ])

    # Parameter grid for hyperparameter tuning
    if combination_label == "balance":
        param_grid = {
            'pca__n_components': [50, 60, 65],  # Number of components for PCA
            'knn__n_neighbors': [3, 5, 7, 9],
            'knn__metric': ['cosine', 'euclidean'],
            'knn__weights': ['uniform', 'distance'],
        }
    else:
        param_grid = {
            'pca__n_components': [150, 180, 200, 220],  # Number of components for PCA
            'knn__n_neighbors': [3, 5, 7, 9],
            'knn__metric': ['cosine', 'euclidean'],
            'knn__weights': ['uniform', 'distance'],
        }
        


    # Custom scoring metrics
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': mcc_scorer,  # MCC score custom function
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV with the pipeline
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=skf,
        scoring=scoring,
        refit='f1',  # Optimize the model based on F1 score
        verbose=1,
        return_train_score=True
    )

    # Fit the GridSearchCV on data
    grid_search.fit(dataPoints, dataLabelsList)

    # Get the best parameters and the best score for F1
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_


    # Get the index of the best parameter combination
    best_index = grid_search.best_index_
    # Get the standard deviation of F1 score for the best parameter combination
    std_f1_best = grid_search.cv_results_['std_test_f1'][best_index]

    if combination_label == "imbalance":
        best_params_file = os.path.join(best_hyperparameter,  f"best_params_knn.json")
        with open(best_params_file, 'w') as f:
            json.dump(best_params, f)
        print(f"Best hyperparameters saved to: {best_params_file}")
    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score} (Std Dev: {std_f1_best})")

   

    # Save the results
    outFile = f"knn-results-vanilla.csv"
    os.makedirs(outDir, exist_ok=True)
    with open(os.path.join(outDir, outFile), "w") as fo:
        # Updated CSV header to include standard deviation columns
        fo.write("pca_n_components,n_neighbors,metric,weights,accuracy,std_accuracy,precision,std_precision,recall,std_recall,f1,std_f1,mcc,std_mcc,preparationTime\n")
        for idx, param in enumerate(grid_search.cv_results_['params']):
            accuracy = grid_search.cv_results_['mean_test_accuracy'][idx]
            std_accuracy = grid_search.cv_results_['std_test_accuracy'][idx]
            precision = grid_search.cv_results_['mean_test_precision'][idx]
            std_precision = grid_search.cv_results_['std_test_precision'][idx]
            recall = grid_search.cv_results_['mean_test_recall'][idx]
            std_recall = grid_search.cv_results_['std_test_recall'][idx]
            f1 = grid_search.cv_results_['mean_test_f1'][idx]
            std_f1 = grid_search.cv_results_['std_test_f1'][idx]
            mcc = grid_search.cv_results_['mean_test_mcc'][idx]
            std_mcc = grid_search.cv_results_['std_test_mcc'][idx]
            preparationTime = (time.perf_counter() - v0) / len(dataLabelsList)
            # Write the parameters, mean scores, and standard deviations to the CSV
            fo.write(f"{param['pca__n_components']},{param['knn__n_neighbors']},{param['knn__metric']},{param['knn__weights']},"
                     f"{accuracy},{std_accuracy},{precision},{std_precision},{recall},{std_recall},{f1},{std_f1},{mcc},{std_mcc},{preparationTime}\n")

    print(f"KNN analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    # Return std_f1_best along with other results
    return best_params, best_score, std_f1_best

# Run KNN on balance combination
print("\nStarting KNN analysis for flaky vs smaller non-flaky files (balance combination)...")
best_params_5folds_knn_balance, best_score_5folds_knn_balance, std_f1_best_balance = runKNN(
    dataPointsbalance, dataLabelsListbalance, outDirbalance, 5, "balance"
)

# Display results
print("\nBest results for KNN 5-fold on balance combination:")
print(f"Best Parameters: {best_params_5folds_knn_balance}")
print(f"Best F1 Score: {best_score_5folds_knn_balance} (Std Dev: {std_f1_best_balance})")

# Run KNN on imbalance non-flaky combination
print("\nStarting KNN analysis for flaky vs imbalance non-flaky files (imbalance combination)...")
best_params_5folds_knn_imbalance, best_score_5folds_knn_imbalance, std_f1_best_imbalance = runKNN(
    dataPointsimbalance, dataLabelsListimbalance, outDirimbalance, 5, "imbalance"
)

# Display results
print("\nBest results for KNN 5-fold on imbalance combination:")
print(f"Best Parameters: {best_params_5folds_knn_imbalance}")
print(f"Best F1 Score: {best_score_5folds_knn_imbalance} (Std Dev: {std_f1_best_imbalance})")





Starting KNN analysis for flaky vs smaller non-flaky files (balance combination)...
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'knn__metric': 'euclidean', 'knn__n_neighbors': 7, 'knn__weights': 'distance', 'pca__n_components': 50}
Best F1 Score: 0.6900653594771242 (Std Dev: 0.10649094843383755)
KNN analysis completed for 5-folds. Results saved to: knn-results-vanilla.csv

Best results for KNN 5-fold on balance combination:
Best Parameters: {'knn__metric': 'euclidean', 'knn__n_neighbors': 7, 'knn__weights': 'distance', 'pca__n_components': 50}
Best F1 Score: 0.6900653594771242 (Std Dev: 0.10649094843383755)

Starting KNN analysis for flaky vs imbalance non-flaky files (imbalance combination)...
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Best hyperparameters saved to: results/best_hyperparameter\best_params_knn.json
Best Parameters: {'knn__metric': 'euclidean', 'knn__n_neighbors': 3, 'knn__weights': 'distance', 'pca__n_components':

## SVM

In [3]:
from sklearn.svm import SVC

# Custom MCC scorer function
def mcc_scorer(estimator, X, y_true):
    """
    Custom scorer function for Matthews Correlation Coefficient.
    """
    y_pred = estimator.predict(X)
    return matthews_corrcoef(y_true, y_pred)
def runSVM(dataPoints, dataLabelsList, outDir, n_splits, combination_label):
    v0 = time.perf_counter()

    # Define the pipeline with CountVectorizer, PCA, and SVM
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(stop_words=None)),
        ('pca', PCA(random_state=42)),
        ('svm', SVC(probability=True, random_state=42))  # Enable probability estimates for threshold tuning
    ])

    # Parameter grid for hyperparameter tuning
    if combination_label == "balance":
        param_grid = {
            'pca__n_components': [50, 60, 65],  # Number of principal components
            'svm__C': [0.01, 0.1, 1.0, 10.0, 100.0],  # Regularization parameter
            'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # Kernel types
        }
    else:
        param_grid = {
            'pca__n_components': [150, 180, 200, 220],
            'svm__C': [0.01, 0.1, 1.0, 10.0, 100.0],  # Regularization parameter
            'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # Kernel types
        }

    # Custom scoring metrics
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': mcc_scorer,  # MCC score custom function
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV with the pipeline
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=skf,
        scoring=scoring,
        refit='f1',
        verbose=1,
        return_train_score=True
    )

    # Fit the GridSearchCV on data
    grid_search.fit(dataPoints, dataLabelsList)

    # Get the best parameters and the best score for F1
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    # Get the index of the best parameter combination
    best_index = grid_search.best_index_
    # Get the standard deviation of F1 score for the best parameter combination
    std_f1_best = grid_search.cv_results_['std_test_f1'][best_index]

    if combination_label == "imbalance":
        best_params_file = os.path.join(best_hyperparameter,  f"best_params_svm.json")
        with open(best_params_file, 'w') as f:
            json.dump(best_params, f)
        print(f"Best hyperparameters saved to: {best_params_file}")
    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score} (Std Dev: {std_f1_best})")
  

    # Save the results
    outFile = f"svm-results-vanilla.csv"
    os.makedirs(outDir, exist_ok=True)
    with open(os.path.join(outDir, outFile), "w") as fo:
        # Updated CSV header to include standard deviation columns
        fo.write("pca_n_components,C,kernel,accuracy,std_accuracy,precision,std_precision,recall,std_recall,f1,std_f1,mcc,std_mcc,preparationTime\n")
        for idx, param in enumerate(grid_search.cv_results_['params']):
            accuracy = grid_search.cv_results_['mean_test_accuracy'][idx]
            std_accuracy = grid_search.cv_results_['std_test_accuracy'][idx]
            precision = grid_search.cv_results_['mean_test_precision'][idx]
            std_precision = grid_search.cv_results_['std_test_precision'][idx]
            recall = grid_search.cv_results_['mean_test_recall'][idx]
            std_recall = grid_search.cv_results_['std_test_recall'][idx]
            f1 = grid_search.cv_results_['mean_test_f1'][idx]
            std_f1 = grid_search.cv_results_['std_test_f1'][idx]
            mcc = grid_search.cv_results_['mean_test_mcc'][idx]
            std_mcc = grid_search.cv_results_['std_test_mcc'][idx]
            preparationTime = (time.perf_counter() - v0) / len(dataLabelsList)
            # Write the parameters, mean scores, and standard deviations to the CSV
            fo.write(f"{param['pca__n_components']},{param['svm__C']},{param['svm__kernel']},"
                     f"{accuracy},{std_accuracy},{precision},{std_precision},{recall},{std_recall},"
                     f"{f1},{std_f1},{mcc},{std_mcc},{preparationTime}\n")

    print(f"SVM analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    # Return std_f1_best along with other results
    return best_params, best_score, std_f1_best
# Run SVM on balance combination
print("\nStarting SVM analysis for flaky vs smaller non-flaky files (balance combination)...")
best_params_5folds_svm_balance, best_score_5folds_svm_balance, std_f1_best_balance = runSVM(
    dataPointsbalance, dataLabelsListbalance, outDirbalance, 5, "balance"
)

# Display results
print("\nBest results for SVM 5-fold on balance combination:")
print(f"Best Parameters: {best_params_5folds_svm_balance}")
print(f"Best F1 Score: {best_score_5folds_svm_balance} (Std Dev: {std_f1_best_balance})")

# Run SVM on imbalance non-flaky combination
print("\nStarting SVM analysis for flaky vs imbalance non-flaky files (imbalance combination)...")
best_params_5folds_svm_imbalance, best_score_5folds_svm_imbalance, std_f1_best_imbalance = runSVM(
    dataPointsimbalance, dataLabelsListimbalance, outDirimbalance, 5, "imbalance"
)

# Display results
print("\nBest results for SVM 5-fold on imbalance combination:")
print(f"Best Parameters: {best_params_5folds_svm_imbalance}")
print(f"Best F1 Score: {best_score_5folds_svm_imbalance} (Std Dev: {std_f1_best_imbalance})")



Starting SVM analysis for flaky vs smaller non-flaky files (balance combination)...
Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Parameters: {'pca__n_components': 65, 'svm__C': 100.0, 'svm__kernel': 'rbf'}
Best F1 Score: 0.8239766081871345 (Std Dev: 0.10094645473774816)
SVM analysis completed for 5-folds. Results saved to: svm-results-vanilla.csv

Best results for SVM 5-fold on balance combination:
Best Parameters: {'pca__n_components': 65, 'svm__C': 100.0, 'svm__kernel': 'rbf'}
Best F1 Score: 0.8239766081871345 (Std Dev: 0.10094645473774816)

Starting SVM analysis for flaky vs imbalance non-flaky files (imbalance combination)...
Fitting 5 folds for each of 80 candidates, totalling 400 fits
Best hyperparameters saved to: results/best_hyperparameter\best_params_svm.json
Best Parameters: {'pca__n_components': 220, 'svm__C': 0.01, 'svm__kernel': 'linear'}
Best F1 Score: 0.6914602683178535 (Std Dev: 0.12792567952776687)
SVM analysis completed for 5-folds. Results sav

## XGB

In [None]:

from xgboost import XGBClassifier


# Custom MCC scorer function
def mcc_scorer(estimator, X, y_true):
    """
    Custom scorer function for Matthews Correlation Coefficient.
    """
    y_pred = estimator.predict(X)
    return matthews_corrcoef(y_true, y_pred)

def runXGB(dataPoints, dataLabelsList, outDir, n_splits, combination_label):
    print(f"Starting runXGB with combination_label: {combination_label}")
    v0 = time.perf_counter()

    # Define the pipeline with CountVectorizer, PCA, and XGBoost
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(stop_words=None)),
        ('pca', PCA(random_state=42)),
        ('xgb', XGBClassifier(
            eval_metric='logloss',  
            use_label_encoder=False, 
            random_state=42
        ))
    ])

    # Parameter grid for hyperparameter tuning
    if combination_label == "balance":
        param_grid = {
            'pca__n_components': [50, 60, 65],
            'xgb__n_estimators': [100, 150, 200],
            'xgb__max_depth': [3, 5, 7, 10],
            'xgb__learning_rate': [0.01, 0.1, 0.3, 0.5],
        }
    else:
        param_grid = {
            'pca__n_components': [150, 180, 200, 220],
            'xgb__n_estimators': [100, 150, 200],
            'xgb__max_depth': [3, 5, 7, 10],
            'xgb__learning_rate': [0.01, 0.1, 0.3, 0.5],
        }

    print(f"Parameter grid for {combination_label}: {param_grid}")

    # Custom scoring metrics
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': mcc_scorer,
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV with the pipeline
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=skf,
        scoring=scoring,
        refit='f1',
        verbose=1,
        return_train_score=True
    )

    # Fit the GridSearchCV on data
    grid_search.fit(dataPoints, dataLabelsList)

    # Get the best parameters and the best score for F1
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    # Get the index of the best parameter combination
    best_index = grid_search.best_index_
    # Get the standard deviation of F1 score for the best parameter combination
    std_f1_best = grid_search.cv_results_['std_test_f1'][best_index]

    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score} (Std Dev: {std_f1_best})")
    if combination_label == "imbalance":
        best_params_file = os.path.join(best_hyperparameter,  f"best_params_xgb.json")
        with open(best_params_file, 'w') as f:
            json.dump(best_params, f)
        print(f"Best hyperparameters saved to: {best_params_file}")
    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score} (Std Dev: {std_f1_best})")
    # Save the results
    outFile = f"xgb-results-vanilla.csv"
    os.makedirs(outDir, exist_ok=True)
    with open(os.path.join(outDir, outFile), "w") as fo:
        # Update CSV header to include standard deviation columns
        fo.write("pca_n_components,n_estimators,max_depth,learning_rate,accuracy,std_accuracy,precision,std_precision,recall,std_recall,f1,std_f1,mcc,std_mcc,preparationTime\n")
        for idx, param in enumerate(grid_search.cv_results_['params']):
            accuracy = grid_search.cv_results_['mean_test_accuracy'][idx]
            std_accuracy = grid_search.cv_results_['std_test_accuracy'][idx]
            precision = grid_search.cv_results_['mean_test_precision'][idx]
            std_precision = grid_search.cv_results_['std_test_precision'][idx]
            recall = grid_search.cv_results_['mean_test_recall'][idx]
            std_recall = grid_search.cv_results_['std_test_recall'][idx]
            f1 = grid_search.cv_results_['mean_test_f1'][idx]
            std_f1 = grid_search.cv_results_['std_test_f1'][idx]
            mcc = grid_search.cv_results_['mean_test_mcc'][idx]
            std_mcc = grid_search.cv_results_['std_test_mcc'][idx]
            preparationTime = (time.perf_counter() - v0) / len(dataLabelsList)
            # Write the parameters, mean scores, and standard deviations to the CSV
            fo.write(f"{param['pca__n_components']},{param['xgb__n_estimators']},{param['xgb__max_depth']},{param['xgb__learning_rate']},"
                     f"{accuracy},{std_accuracy},{precision},{std_precision},{recall},{std_recall},{f1},{std_f1},{mcc},{std_mcc},{preparationTime}\n")

    print(f"XGBoost analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    # Return std_f1_best along with other results
    return best_params, best_score, std_f1_best

print("\nStarting XGBoost analysis for flaky vs balance non-flaky files (balance combination)...")
best_params_5folds_xgb_balance, best_score_5folds_xgb_balance, std_f1_best_balance = runXGB(
    dataPointsbalance, dataLabelsListbalance, outDirbalance, 5, "balance"
)
# Display results
print("\nBest results for XGBoost 5-fold on balance combination:")
print(f"Best Parameters: {best_params_5folds_xgb_balance}")
print(f"Best F1 Score: {best_score_5folds_xgb_balance} (Std Dev: {std_f1_best_balance})")

# Run XGBoost on imbalance combination
print("\nStarting XGBoost analysis for flaky vs imbalance non-flaky files (imbalance combination)...")
best_params_5folds_xgb_imbalance, best_score_5folds_xgb_imbalance, std_f1_best_imbalance = runXGB(
    dataPointsimbalance, dataLabelsListimbalance, outDirimbalance, 5, "imbalance"
)

# Display results
print("\nBest results for XGBoost 5-fold on imbalance combination:")
print(f"Best Parameters: {best_params_5folds_xgb_imbalance}")
print(f"Best F1 Score: {best_score_5folds_xgb_imbalance} (Std Dev: {std_f1_best_imbalance})")



Starting XGBoost analysis for flaky vs balance non-flaky files (balance combination)...
Starting runXGB with combination_label: balance
Parameter grid for balance: {'pca__n_components': [50, 60, 65], 'xgb__n_estimators': [100, 150, 200], 'xgb__max_depth': [3, 5, 7, 10], 'xgb__learning_rate': [0.01, 0.1, 0.3, 0.5]}
Fitting 5 folds for each of 144 candidates, totalling 720 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best Parameters: {'pca__n_components': 50, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 5, 'xgb__n_estimators': 200}
Best F1 Score: 0.7701589865541946 (Std Dev: 0.07448215535694924)
Best Parameters: {'pca__n_components': 50, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 5, 'xgb__n_estimators': 200}
Best F1 Score: 0.7701589865541946 (Std Dev: 0.07448215535694924)
XGBoost analysis completed for 5-folds. Results saved to: xgb-results-vanilla.csv

Best results for XGBoost 5-fold on balance combination:
Best Parameters: {'pca__n_components': 50, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 5, 'xgb__n_estimators': 200}
Best F1 Score: 0.7701589865541946 (Std Dev: 0.07448215535694924)

Starting XGBoost analysis for flaky vs imbalance non-flaky files (imbalance combination)...
Starting runXGB with combination_label: imbalance
Parameter grid for imbalance: {'pca__n_components': [150, 180, 200, 220], 'xgb__n_estimators': [100, 150, 200], 'xgb__max_depth': [3, 5, 7, 10], 'xgb__learning_rate': [0.01

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best Parameters: {'pca__n_components': 150, 'xgb__learning_rate': 0.3, 'xgb__max_depth': 5, 'xgb__n_estimators': 150}
Best F1 Score: 0.5822510822510822 (Std Dev: 0.13133374400163644)
Best hyperparameters saved to: results/best_hyperparameter\imbalance_best_params_xgb.json
Best Parameters: {'pca__n_components': 150, 'xgb__learning_rate': 0.3, 'xgb__max_depth': 5, 'xgb__n_estimators': 150}
Best F1 Score: 0.5822510822510822 (Std Dev: 0.13133374400163644)
XGBoost analysis completed for 5-folds. Results saved to: xgb-results-vanilla.csv

Best results for XGBoost 5-fold on imbalance combination:
Best Parameters: {'pca__n_components': 150, 'xgb__learning_rate': 0.3, 'xgb__max_depth': 5, 'xgb__n_estimators': 150}
Best F1 Score: 0.5822510822510822 (Std Dev: 0.13133374400163644)


Parameters: { "use_label_encoder" } are not used.



## Random Forest

In [5]:
from sklearn.ensemble import RandomForestClassifier


###############################################################################
# Custom MCC scorer function
def mcc_scorer(estimator, X, y_true):
    y_pred = estimator.predict(X)
    return matthews_corrcoef(y_true, y_pred)

###############################################################################
# Random Forest

def runRandomForest(dataPoints, dataLabelsList, outDir, n_splits):
    v0 = time.perf_counter()

    # Define scoring metrics
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': mcc_scorer
    }

    # Define a pipeline with CountVectorizer and Random Forest
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(stop_words=None)),  # Vectorizer
        ('rf', RandomForestClassifier(random_state=42))    # Random Forest classifier
    ])

    # Define parameter grid for GridSearchCV
    param_grid = {
        'rf__n_estimators': [50, 100, 200],                 # Number of trees in the forest
        'rf__max_depth': [10, 20, 30],                      # Maximum depth of the tree
        'rf__min_samples_split': [5, 10],                   # Minimum number of samples required to split a node
        'rf__min_samples_leaf': [2, 5],                     # Minimum number of samples required at a leaf node
        'rf__criterion': ['gini', 'entropy'],               # Function to measure the quality of a split
        # 'rf__class_weight': ['balanced']                    # Class weights
    }

    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Setup GridSearchCV with the pipeline and parameter grid
    grid_search = GridSearchCV(
        pipeline, param_grid, cv=skf, scoring=scoring, refit='f1', verbose=1, return_train_score=True
    )

    # Fit the GridSearchCV to the data
    grid_search.fit(dataPoints, dataLabelsList)

    # Retrieve the best parameters and score from cross-validation
    best_params = grid_search.best_params_
    best_f1_cv = grid_search.best_score_
    best_score = grid_search.best_score_

    # Get the index of the best parameter combination
    best_index = grid_search.best_index_

    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score from cross-validation: {best_f1_cv}")

    # Ensure the output directory exists
    os.makedirs(outDir, exist_ok=True)

    # Save best parameters to JSON
    best_params_file = os.path.join(best_hyperparameter, "best_params_rf.json")
    with open(best_params_file, 'w') as f:
        json.dump(best_params, f)
    print(f"Best hyperparameters saved to: {best_params_file}")

    # Extract the cross-validation results
    results = grid_search.cv_results_

    # Extract standard deviations of the best scores
    std_f1_best = results['std_test_f1'][best_index]
    std_mcc_best = results['std_test_mcc'][best_index]

    # Prepare per-fold metrics for CSV along with the parameter combinations
    fold_metrics = []
    for idx in range(len(results['params'])):
        fold_metrics.append({
            'n_estimators': results['params'][idx].get('rf__n_estimators'),
            'max_depth': results['params'][idx].get('rf__max_depth'),
            'min_samples_split': results['params'][idx].get('rf__min_samples_split'),
            'min_samples_leaf': results['params'][idx].get('rf__min_samples_leaf'),
            'criterion': results['params'][idx].get('rf__criterion'),
            # 'class_weight': results['params'][idx].get('rf__class_weight'),
            'accuracy': results.get('mean_test_accuracy', [None])[idx],
            'std_accuracy': results.get('std_test_accuracy', [None])[idx],
            'precision': results.get('mean_test_precision', [None])[idx],
            'std_precision': results.get('std_test_precision', [None])[idx],
            'recall': results.get('mean_test_recall', [None])[idx],
            'std_recall': results.get('std_test_recall', [None])[idx],
            'f1': results.get('mean_test_f1', [None])[idx],
            'std_f1': results.get('std_test_f1', [None])[idx],
            'mcc': results.get('mean_test_mcc', [None])[idx],
            'std_mcc': results.get('std_test_mcc', [None])[idx],
        })

    # Extract final metrics based on the cross-validation results
    final_f1 = results['mean_test_f1'][best_index]
    std_final_f1 = results['std_test_f1'][best_index]
    final_precision = results['mean_test_precision'][best_index]
    std_final_precision = results['std_test_precision'][best_index]
    final_recall = results['mean_test_recall'][best_index]
    std_final_recall = results['std_test_recall'][best_index]
    final_accuracy = results['mean_test_accuracy'][best_index]
    std_final_accuracy = results['std_test_accuracy'][best_index]
    final_mcc = results['mean_test_mcc'][best_index]
    std_final_mcc = results['std_test_mcc'][best_index]

    # Save the results to a CSV file
    outFile = "rf-results-vanilla.csv"
    os.makedirs(outDir, exist_ok=True)
    with open(os.path.join(outDir, outFile), "w") as fo:
        # Update CSV header to include standard deviation columns
        fo.write("n_estimators,max_depth,min_samples_split,min_samples_leaf,criterion,accuracy,std_accuracy,precision,std_precision,recall,std_recall,f1,std_f1,mcc,std_mcc,preparationTime\n")
        for idx, param in enumerate(grid_search.cv_results_['params']):
            n_estimators = param['rf__n_estimators']
            max_depth = param['rf__max_depth']
            min_samples_split = param['rf__min_samples_split']
            min_samples_leaf = param['rf__min_samples_leaf']
            criterion = param['rf__criterion']
            accuracy = results['mean_test_accuracy'][idx]
            std_accuracy = results['std_test_accuracy'][idx]
            precision = results['mean_test_precision'][idx]
            std_precision = results['std_test_precision'][idx]
            recall = results['mean_test_recall'][idx]
            std_recall = results['std_test_recall'][idx]
            f1 = results['mean_test_f1'][idx]
            std_f1 = results['std_test_f1'][idx]
            mcc = results['mean_test_mcc'][idx]
            std_mcc = results['std_test_mcc'][idx]
            preparationTime = (time.perf_counter() - v0) / len(dataLabelsList)
            # Write the parameters, mean scores, and standard deviations to the CSV
            fo.write(f"{n_estimators},{max_depth},{min_samples_split},{min_samples_leaf},{criterion},"
                     f"{accuracy},{std_accuracy},{precision},{std_precision},{recall},{std_recall},{f1},{std_f1},{mcc},{std_mcc},{preparationTime}\n")

    print(f"Random Forest analysis completed for {n_splits}-folds. Results saved to: {outFile}")

    # Return the best parameters and metrics with their standard deviations
    return best_params, final_f1, std_final_f1, final_mcc, std_final_mcc

###############################################################################
# Main Execution for 5-Fold Cross-Validation

outDir = "results"
os.makedirs(outDir, exist_ok=True)

# Run Random Forest on balance combination
print("\nStarting Random Forest analysis for flaky vs smaller non-flaky files (balance combination)...")
best_params_5folds_balance, best_score_5folds_balance, std_f1_balance, balance_mcc_5folds, std_mcc_balance = runRandomForest(
    dataPointsbalance, dataLabelsListbalance, outDirbalance, 5
)

print("\nBest results for Random Forest 5-fold on balance combination:")
print(f"Best Parameters: {best_params_5folds_balance}")
print(f"Best F1 Score: {best_score_5folds_balance} (Std Dev: {std_f1_balance})")
print(f"Final MCC: {balance_mcc_5folds} (Std Dev: {std_mcc_balance})")

# Run Random Forest on imbalance combination
print("\nStarting Random Forest analysis for flaky vs imbalance non-flaky files (imbalance combination)...")
best_params_5folds_imbalance, best_f1_5folds, std_f1_imbalance, final_mcc_5folds, std_mcc_imbalance = runRandomForest(
    dataPointsimbalance, dataLabelsListimbalance, outDirimbalance, 5
)

# Display results
print("\nBest results for Random Forest 5-fold on imbalance combination:")
print(f"Best Parameters: {best_params_5folds_imbalance}")
print(f"Best F1 Score: {best_f1_5folds} (Std Dev: {std_f1_imbalance})")
print(f"Final MCC: {final_mcc_5folds} (Std Dev: {std_mcc_imbalance})")



Starting Random Forest analysis for flaky vs smaller non-flaky files (balance combination)...
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters: {'rf__criterion': 'entropy', 'rf__max_depth': 10, 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 5, 'rf__n_estimators': 200}
Best F1 Score from cross-validation: 0.8908359133126936
Best hyperparameters saved to: results/best_hyperparameter\best_params_rf.json
Random Forest analysis completed for 5-folds. Results saved to: rf-results-vanilla.csv

Best results for Random Forest 5-fold on balance combination:
Best Parameters: {'rf__criterion': 'entropy', 'rf__max_depth': 10, 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 5, 'rf__n_estimators': 200}
Best F1 Score: 0.8908359133126936 (Std Dev: 0.050435935677112964)
Final MCC: 0.7856438407434342 (Std Dev: 0.1001822478018566)

Starting Random Forest analysis for flaky vs imbalance non-flaky files (imbalance combination)...
Fitting 5 folds for each of 72 candidat

## Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier


###############################################################################
# Custom MCC scorer function
def mcc_scorer(estimator, X, y_true):
    y_pred = estimator.predict(X)
    return matthews_corrcoef(y_true, y_pred)

###############################################################################
# Decision Tree

def runDecisionTree(dataPoints, dataLabelsList, outDir, n_splits):
    v0 = time.perf_counter()

    # Define scoring metrics
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': mcc_scorer
    }

    # Define a pipeline and Decision Tree
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(stop_words=None)),  # Vectorizer
        ('dt', DecisionTreeClassifier(random_state=42))    # Decision Tree classifier
    ])

    # Define parameter grid for GridSearchCV
    param_grid = {
        'dt__max_depth': [10, 20, 30],                     # Maximum depth of the tree
        'dt__min_samples_split': [5, 10],                  # Minimum number of samples required to split a node
        'dt__min_samples_leaf': [2, 5],                    # Minimum number of samples required at a leaf node
        'dt__criterion': ['gini', 'entropy'],              # Function to measure the quality of a split
        'dt__max_features': [None, 'sqrt', 'log2'],        # Controls how many features to consider for splits
        # 'dt__class_weight': ['balanced']                   # Class weights
    }

    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Setup GridSearchCV with the pipeline and parameter grid
    grid_search = GridSearchCV(
        pipeline, param_grid, cv=skf, scoring=scoring,
        refit='f1', verbose=1, return_train_score=True
    )

    # Fit the GridSearchCV to the data
    grid_search.fit(dataPoints, dataLabelsList)

    # Retrieve the best parameters and score from cross-validation
    best_params = grid_search.best_params_
    best_f1_cv = grid_search.best_score_

    # Get the index of the best parameter combination
    best_index = grid_search.best_index_

    # Extract the cross-validation results
    results = grid_search.cv_results_

    # Extract standard deviations of the best scores
    std_f1_best = results['std_test_f1'][best_index]
    std_mcc_best = results['std_test_mcc'][best_index]

    # Save best parameters to JSON
    best_params_file = os.path.join(best_hyperparameter, "best_params_dt.json")
    with open(best_params_file, 'w') as f:
        json.dump(best_params, f)
    print(f"Best hyperparameters saved to: {best_params_file}")
    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score from cross-validation: {best_f1_cv} (Std Dev: {std_f1_best})")

    # Prepare per-fold metrics for CSV along with the parameter combinations
    # Extract final metrics based on the cross-validation results
    final_f1 = results['mean_test_f1'][best_index]
    std_final_f1 = results['std_test_f1'][best_index]
    final_precision = results['mean_test_precision'][best_index]
    std_final_precision = results['std_test_precision'][best_index]
    final_recall = results['mean_test_recall'][best_index]
    std_final_recall = results['std_test_recall'][best_index]
    final_accuracy = results['mean_test_accuracy'][best_index]
    std_final_accuracy = results['std_test_accuracy'][best_index]
    final_mcc = results['mean_test_mcc'][best_index]
    std_final_mcc = results['std_test_mcc'][best_index]

    # Save the results to a CSV file
    outFile = f"dt-results-vanilla.csv"
    os.makedirs(outDir, exist_ok=True)
    with open(os.path.join(outDir, outFile), "w") as fo:
        # Update CSV header to include standard deviation columns
        fo.write("max_depth,min_samples_split,min_samples_leaf,criterion,max_features,accuracy,std_accuracy,precision,std_precision,recall,std_recall,f1,std_f1,mcc,std_mcc,preparationTime\n")
        for idx, param in enumerate(grid_search.cv_results_['params']):
            accuracy = grid_search.cv_results_['mean_test_accuracy'][idx]
            std_accuracy = grid_search.cv_results_['std_test_accuracy'][idx]
            precision = grid_search.cv_results_['mean_test_precision'][idx]
            std_precision = grid_search.cv_results_['std_test_precision'][idx]
            recall = grid_search.cv_results_['mean_test_recall'][idx]
            std_recall = grid_search.cv_results_['std_test_recall'][idx]
            f1 = grid_search.cv_results_['mean_test_f1'][idx]
            std_f1 = grid_search.cv_results_['std_test_f1'][idx]
            mcc = grid_search.cv_results_['mean_test_mcc'][idx]
            std_mcc = grid_search.cv_results_['std_test_mcc'][idx]
            preparationTime = (time.perf_counter() - v0) / len(dataLabelsList)
            # Write the parameters, mean scores, and standard deviations to the CSV
            fo.write(f"{param['dt__max_depth']},{param['dt__min_samples_split']},{param['dt__min_samples_leaf']},{param['dt__criterion']},{param['dt__max_features']},"
                     f"{accuracy},{std_accuracy},{precision},{std_precision},{recall},{std_recall},{f1},{std_f1},{mcc},{std_mcc},{preparationTime}\n")

    print(f"Decision Tree analysis completed for {n_splits}-folds. Results saved to: {outFile}")

    # Return the best parameters and metrics with their standard deviations
    return best_params, final_f1, std_final_f1, final_mcc, std_final_mcc

###############################################################################
# Main Execution for 5-Fold Cross-Validation

outDir = "results"
os.makedirs(outDir, exist_ok=True)

# Run Decision Tree on balance combination
print("\nStarting Decision Tree analysis for flaky vs smaller non-flaky files (balance combination)...")
best_params_5folds_balance, best_f1_score_balance, std_f1_balance, best_mcc_balance, std_mcc_balance = runDecisionTree(
    dataPointsbalance, dataLabelsListbalance, outDirbalance, 5
)

print("\nBest results for Decision Tree 5-fold on balance combination:")
print(f"Best Parameters: {best_params_5folds_balance}")
print(f"Best F1 Score: {best_f1_score_balance} (Std Dev: {std_f1_balance})")
print(f"Best MCC Score: {best_mcc_balance} (Std Dev: {std_mcc_balance})")

# Run Decision Tree on imbalance non-flaky combination
print("\nStarting Decision Tree analysis for flaky vs imbalance non-flaky files (imbalance combination)...")
best_params_5folds_imbalance, best_f1_score_imbalance, std_f1_imbalance, best_mcc_imbalance, std_mcc_imbalance = runDecisionTree(
    dataPointsimbalance, dataLabelsListimbalance, outDirimbalance, 5
)

print("\nBest results for Decision Tree 5-fold on imbalance combination:")
print(f"Best Parameters: {best_params_5folds_imbalance}")
print(f"Best F1 Score: {best_f1_score_imbalance} (Std Dev: {std_f1_imbalance})")
print(f"Best MCC Score: {best_mcc_imbalance} (Std Dev: {std_mcc_imbalance})")



Starting Decision Tree analysis for flaky vs smaller non-flaky files (balance combination)...
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best hyperparameters saved to: results/best_hyperparameter\best_params_dt.json
Best Parameters: {'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__max_features': None, 'dt__min_samples_leaf': 5, 'dt__min_samples_split': 5}
Best F1 Score from cross-validation: 0.8829640947288006 (Std Dev: 0.10307657727394873)
Decision Tree analysis completed for 5-folds. Results saved to: dt-results-vanilla.csv

Best results for Decision Tree 5-fold on balance combination:
Best Parameters: {'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__max_features': None, 'dt__min_samples_leaf': 5, 'dt__min_samples_split': 5}
Best F1 Score: 0.8829640947288006 (Std Dev: 0.10307657727394873)
Best MCC Score: 0.8058403455783832 (Std Dev: 0.15642087867118531)

Starting Decision Tree analysis for flaky vs imbalance non-flaky files (imbalance combination)...
F