In [1]:
import os
import time
import zipfile
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")
    
    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")
    
    return dataPointsList

def flastVectorization(dataPoints):
    """Vectorizes the data points using CountVectorizer."""
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z = countVec.fit_transform(dataPoints)
    return Z

## KNN ##

In [3]:
def flastKNNWithGridSearchCV(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, combination_label):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)
    
    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")
    
    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorization
    Z = flastVectorization(dataPoints)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define the KNN model
    knn = KNeighborsClassifier()

    # Expanded parameter grid for hyperparameter tuning
    param_grid = {
        'n_neighbors': [3, 5, 7, 9, 11, 15, 20],
        'metric': ['cosine', 'euclidean'],  # Distance metrics
        'weights': ['uniform', 'distance'],  # Neighbor weighting schemes
    }

    # Custom scoring functions including MCC
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),  
        'recall': make_scorer(recall_score),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score),
        'mcc': make_scorer(matthews_corrcoef)
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV with multiple scoring metrics
    grid_search = GridSearchCV(knn, param_grid, cv=skf, scoring=scoring, refit='f1', verbose=1, return_train_score=True)

    # Fit the GridSearchCV on training data
    grid_search.fit(Z, dataLabelsList)

    # Get the best parameters and the best score for f1
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score}")

    # Save the results
    outFile = f"{combination_label}-params-knn-{n_splits}-folds.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write("n_neighbors,metric,weights,accuracy,precision,recall,f1,mcc,preparationTime\n")
        for idx, param in enumerate(grid_search.cv_results_['params']):
            accuracy = grid_search.cv_results_['mean_test_accuracy'][idx]
            precision = grid_search.cv_results_['mean_test_precision'][idx]
            recall = grid_search.cv_results_['mean_test_recall'][idx]
            f1 = grid_search.cv_results_['mean_test_f1'][idx]
            mcc = grid_search.cv_results_['mean_test_mcc'][idx]
            preparationTime = vecTime / len(dataPoints)  
            fo.write(f"{param['n_neighbors']},{param['metric']},{param['weights']},{accuracy},{precision},{recall},{f1},{mcc},{preparationTime}\n")

    print(f"KNN analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    return best_params, best_score

if __name__ == "__main__":
    # Parameters setup for the first combination
    flakyZip = "compressedDataset/flaky_files.zip"
    nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
    largerNonFlakyZip = "compressedDataset/all_nonflaky_files.zip"

    # Create separate result directories for equal and larger non-flaky combinations
    outDirEqual = "results/equal_flaky_nonflaky/"
    outDirLarger = "results/larger_nonflaky/"
    os.makedirs(outDirEqual, exist_ok=True)
    os.makedirs(outDirLarger, exist_ok=True)

    # Create separate extract directories for each combination to avoid file confusion
    extractDirEqual = "extracted/equal_flaky_nonflaky/"
    extractDirLarger = "extracted/larger_nonflaky/"
    os.makedirs(extractDirEqual, exist_ok=True)
    os.makedirs(extractDirLarger, exist_ok=True)

    # Perform KNN analysis for the first combination (flaky vs smaller non-flaky)
    print("Starting KNN analysis for flaky vs smaller non-flaky files (47 each)...")
    best_params_5folds_1, best_score_5folds_1 = flastKNNWithGridSearchCV(outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 5, "equal")
    best_params_3folds_1, best_score_3folds_1 = flastKNNWithGridSearchCV(outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 3, "equal")

    print("Best results for 5-fold on equal combination:")
    print(f"Best Parameters: {best_params_5folds_1}")
    print(f"Best F1 Score: {best_score_5folds_1}")

    print("Best results for 3-fold on equal combination:")
    print(f"Best Parameters: {best_params_3folds_1}")
    print(f"Best F1 Score: {best_score_3folds_1}")

    # Perform KNN analysis for the second combination (flaky vs larger non-flaky)
    print("Starting KNN analysis for flaky vs larger non-flaky files...")
    best_params_5folds_2, best_score_5folds_2 = flastKNNWithGridSearchCV(outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 5, "larger")
    best_params_3folds_2, best_score_3folds_2 = flastKNNWithGridSearchCV(outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 3, "larger")

    print("Best results for 5-fold on larger non-flaky combination:")
    print(f"Best Parameters: {best_params_5folds_2}")
    print(f"Best F1 Score: {best_score_5folds_2}")

    print("Best results for 3-fold on larger non-flaky combination:")
    print(f"Best Parameters: {best_params_3folds_2}")
    print(f"Best F1 Score: {best_score_3folds_2}")






Starting KNN analysis for flaky vs smaller non-flaky files (47 each)...


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe2 in position 37: invalid continuation byte

## SVM

In [None]:
###############################################################################
# SVM with GridSearchCV

def runSVM(Z, dataLabelsList, outDir, n_splits, combination_label):
    v0 = time.perf_counter()

    # Define the SVM model
    svm = SVC()

    # Parameter grid for hyperparameter tuning
    param_grid = {
        'C': [0.01, 0.1, 1.0, 10.0, 100.0],  # Regularization parameter
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid']  # Kernel types
    }

    # Custom scoring functions (without MCC)
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV with multiple scoring metrics
    grid_search = GridSearchCV(svm, param_grid, cv=skf, scoring=scoring, refit='f1', verbose=1, return_train_score=True)

    # Fit the GridSearchCV on training data
    grid_search.fit(Z, dataLabelsList)

    # Get the best parameters and the best score for f1
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score}")

    # Save the results
    outFile = f"{combination_label}-params-svm-{n_splits}-folds.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write("C,kernel,accuracy,precision,recall,f1,preparationTime\n")
        for idx, param in enumerate(grid_search.cv_results_['params']):
            accuracy = grid_search.cv_results_['mean_test_accuracy'][idx]
            precision = grid_search.cv_results_['mean_test_precision'][idx]
            recall = grid_search.cv_results_['mean_test_recall'][idx]
            f1 = grid_search.cv_results_['mean_test_f1'][idx]
            preparationTime = (time.perf_counter() - v0) / len(dataLabelsList)  
            fo.write(f"{param['C']},{param['kernel']},{accuracy},{precision},{recall},{f1},{preparationTime}\n")

    print(f"SVM analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    return best_params, best_score

## Naive Bayes

In [None]:

###############################################################################
# Naive Bayes with GridSearchCV and Multiple Scoring Metrics

def flastNBWithGridSearchCV(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, combination_label):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)
    
    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")
    
    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorization without Random Projection
    Z = flastVectorization(dataPoints)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define the Naive Bayes model
    nb = MultinomialNB()

    # Parameter grid for hyperparameter tuning
    param_grid = {
        'alpha': [0.001, 0.01, 0.1, 1.0, 10.0], 
    }

    # Custom scoring functions including MCC
    scoring = {
        'precision': make_scorer(precision_score),
        'recall': make_scorer(recall_score),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score),
        'mcc': make_scorer(matthews_corrcoef)
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV with multiple scoring metrics
    grid_search = GridSearchCV(nb, param_grid, cv=skf, scoring=scoring, refit='f1', verbose=1, return_train_score=True)

    # Fit the GridSearchCV on training data
    grid_search.fit(Z, dataLabelsList)

    # Get the best parameters and the best score for f1
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score}")

    # Save the results
    outFile = f"{combination_label}-params-nb-{n_splits}-folds.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write("alpha,accuracy,precision,recall,f1,mcc,preparationTime\n")
        for idx, param in enumerate(grid_search.cv_results_['params']):
            accuracy = grid_search.cv_results_['mean_test_accuracy'][idx]
            precision = grid_search.cv_results_['mean_test_precision'][idx]
            recall = grid_search.cv_results_['mean_test_recall'][idx]
            f1 = grid_search.cv_results_['mean_test_f1'][idx]
            mcc = grid_search.cv_results_['mean_test_mcc'][idx]
            preparationTime = vecTime / len(dataPoints)  
            fo.write(f"{param['alpha']},{accuracy},{precision},{recall},{f1},{mcc},{preparationTime}\n")

    print(f"Naive Bayes analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    return best_params, best_score

if __name__ == "__main__":
    # Parameters setup for the first combination
    flakyZip = "compressedDataset/flaky_files.zip"
    nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
    largerNonFlakyZip = "compressedDataset/all_nonflaky_files.zip"

    # Create separate result directories for equal and larger non-flaky combinations
    outDirEqual = "results/equal_flaky_nonflaky/"
    outDirLarger = "results/larger_nonflaky/"
    os.makedirs(outDirEqual, exist_ok=True)
    os.makedirs(outDirLarger, exist_ok=True)

    # Create separate extract directories for each combination to avoid file confusion
    extractDirEqual = "extracted/equal_flaky_nonflaky/"
    extractDirLarger = "extracted/larger_nonflaky/"
    os.makedirs(extractDirEqual, exist_ok=True)
    os.makedirs(extractDirLarger, exist_ok=True)

    # Perform Naive Bayes analysis for the first combination (flaky vs smaller non-flaky)
    print("Starting Naive Bayes analysis for flaky vs smaller non-flaky files (47 each)...")
    best_params_5folds_1, best_score_5folds_1 = flastNBWithGridSearchCV(outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 5, "equal")
    best_params_3folds_1, best_score_3folds_1 = flastNBWithGridSearchCV(outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 3, "equal")

    print("Best results for 5-fold on equal combination:")
    print(f"Best Parameters: {best_params_5folds_1}")
    print(f"Best F1 Score: {best_score_5folds_1}")

    print("Best results for 3-fold on equal combination:")
    print(f"Best Parameters: {best_params_3folds_1}")
    print(f"Best F1 Score: {best_score_3folds_1}")

    # Perform Naive Bayes analysis for the second combination (flaky vs larger non-flaky)
    print("Starting Naive Bayes analysis for flaky vs larger non-flaky files...")
    best_params_5folds_2, best_score_5folds_2 = flastNBWithGridSearchCV(outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 5, "larger")
    best_params_3folds_2, best_score_3folds_2 = flastNBWithGridSearchCV(outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 3, "larger")

    print("Best results for 5-fold on larger non-flaky combination:")
    print(f"Best Parameters: {best_params_5folds_2}")
    print(f"Best F1 Score: {best_score_5folds_2}")

    print("Best results for 3-fold on larger non-flaky combination:")
    print(f"Best Parameters: {best_params_3folds_2}")
    print(f"Best F1 Score: {best_score_3folds_2}")


## Main function

In [None]:

###############################################################################
# Main code

if __name__ == "__main__":
    # Parameters setup
    flakyZip = "compressedDataset/flaky_files.zip"
    nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
    largerNonFlakyZip = "compressedDataset/all_nonflaky_files.zip"

    # Create directories
    outDirEqual = "results/equal_flaky_nonflaky/"
    outDirLarger = "results/larger_nonflaky/"
    os.makedirs(outDirEqual, exist_ok=True)
    os.makedirs(outDirLarger, exist_ok=True)

    extractDirEqual = "extracted/equal_flaky_nonflaky/"
    extractDirLarger = "extracted/larger_nonflaky/"
    os.makedirs(extractDirEqual, exist_ok=True)
    os.makedirs(extractDirLarger, exist_ok=True)

    # Step 1: Extract and read data once for equal combination
    flakyDirEqual = os.path.join(extractDirEqual, 'flaky')
    nonFlakyDirEqual = os.path.join(extractDirEqual, 'nonFlaky')
    os.makedirs(flakyDirEqual, exist_ok=True)
    os.makedirs(nonFlakyDirEqual, exist_ok=True)

    extract_zip(flakyZip, flakyDirEqual)
    extract_zip(nonFlakyZip, nonFlakyDirEqual)

    dataPointsFlakyEqual = getDataPoints(flakyDirEqual)
    dataPointsNonFlakyEqual = getDataPoints(nonFlakyDirEqual)
    dataPointsEqual = dataPointsFlakyEqual + dataPointsNonFlakyEqual

    dataLabelsListEqual = np.array([1]*len(dataPointsFlakyEqual) + [0]*len(dataPointsNonFlakyEqual))

    # Step 2: Vectorize data once
    Z_equal = flastVectorization(dataPointsEqual)

    # Step 3: Use the preprocessed data for all models
    # KNN on equal combination
    print("Starting KNN analysis for flaky vs smaller non-flaky files (47 each)...")
    best_params_5folds_knn_equal, best_score_5folds_knn_equal = runKNN(Z_equal, dataLabelsListEqual, outDirEqual, 5, "equal")
    best_params_3folds_knn_equal, best_score_3folds_knn_equal = runKNN(Z_equal, dataLabelsListEqual, outDirEqual, 3, "equal")

    # SVM on equal combination
    print("Starting SVM analysis for flaky vs smaller non-flaky files (47 each)...")
    best_params_5folds_svm_equal, best_score_5folds_svm_equal = runSVM(Z_equal, dataLabelsListEqual, outDirEqual, 5, "equal")
    best_params_3folds_svm_equal, best_score_3folds_svm_equal = runSVM(Z_equal, dataLabelsListEqual, outDirEqual, 3, "equal")

    # Step 4: Repeat for larger non-flaky combination
    # Extract and read data once
    flakyDirLarger = os.path.join(extractDirLarger, 'flaky')
    nonFlakyDirLarger = os.path.join(extractDirLarger, 'nonFlaky')
    os.makedirs(flakyDirLarger, exist_ok=True)
    os.makedirs(nonFlakyDirLarger, exist_ok=True)

    extract_zip(flakyZip, flakyDirLarger)
    extract_zip(largerNonFlakyZip, nonFlakyDirLarger)

    dataPointsFlakyLarger = getDataPoints(flakyDirLarger)
    dataPointsNonFlakyLarger = getDataPoints(nonFlakyDirLarger)
    dataPointsLarger = dataPointsFlakyLarger + dataPointsNonFlakyLarger

    dataLabelsListLarger = np.array([1]*len(dataPointsFlakyLarger) + [0]*len(dataPointsNonFlakyLarger))

    # Vectorize data once
    Z_larger = flastVectorization(dataPointsLarger)

    # KNN on larger combination
    print("Starting KNN analysis for flaky vs larger non-flaky files...")
    best_params_5folds_knn_larger, best_score_5folds_knn_larger = runKNN(Z_larger, dataLabelsListLarger, outDirLarger, 5, "larger")
    best_params_3folds_knn_larger, best_score_3folds_knn_larger = runKNN(Z_larger, dataLabelsListLarger, outDirLarger, 3, "larger")

    # SVM on larger combination
    print("Starting SVM analysis for flaky vs larger non-flaky files...")
    best_params_5folds_svm_larger, best_score_5folds_svm_larger = runSVM(Z_larger, dataLabelsListLarger, outDirLarger, 5, "larger")
    best_params_3folds_svm_larger, best_score_3folds_svm_larger = runSVM(Z_larger, dataLabelsListLarger, outDirLarger, 3, "larger")

    # Print best results
    # KNN Equal
    print("Best results for KNN 5-fold on equal combination:")
    print(f"Best Parameters: {best_params_5folds_knn_equal}")
    print(f"Best F1 Score: {best_score_5folds_knn_equal}")

    print("Best results for KNN 3-fold on equal combination:")
    print(f"Best Parameters: {best_params_3folds_knn_equal}")
    print(f"Best F1 Score: {best_score_3folds_knn_equal}")

    # SVM Equal
    print("Best results for SVM 5-fold on equal combination:")
    print(f"Best Parameters: {best_params_5folds_svm_equal}")
    print(f"Best F1 Score: {best_score_5folds_svm_equal}")

    print("Best results for SVM 3-fold on equal combination:")
    print(f"Best Parameters: {best_params_3folds_svm_equal}")
    print(f"Best F1 Score: {best_score_3folds_svm_equal}")

    # KNN Larger
    print("Best results for KNN 5-fold on larger combination:")
    print(f"Best Parameters: {best_params_5folds_knn_larger}")
    print(f"Best F1 Score: {best_score_5folds_knn_larger}")

    print("Best results for KNN 3-fold on larger combination:")
    print(f"Best Parameters: {best_params_3folds_knn_larger}")
    print(f"Best F1 Score: {best_score_3folds_knn_larger}")

    # SVM Larger
    print("Best results for SVM 5-fold on larger combination:")
    print(f"Best Parameters: {best_params_5folds_svm_larger}")
    print(f"Best F1 Score: {best_score_5folds_svm_larger}")

    print("Best results for SVM 3-fold on larger combination:")
    print(f"Best Parameters: {best_params_3folds_svm_larger}")
    print(f"Best F1 Score: {best_score_3folds_svm_larger}")