KNN

In [9]:
import os
import time
import zipfile
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (make_scorer, precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, matthews_corrcoef)
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty or invalid file skipped: {file_path}")

    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList

def plot_confusion_matrix(y_true, y_pred):
    """Plots confusion matrix."""
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-Flaky', 'Flaky'], yticklabels=['Non-Flaky', 'Flaky'])
    plt.title(f'Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

###############################################################################
# Main Function with Pipeline and GridSearchCV for KNN

def flastKNNWithPipeline(outDir, flakyZip, nonFlakyZip, extractDir, n_splits):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)

    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")

    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Create labels: 1 for flaky, 0 for non-flaky
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))

    vecTime = time.perf_counter() - v0

    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': make_scorer(matthews_corrcoef)
    }

    # Define a pipeline with CountVectorizer, SMOTE, and KNN
    pipeline = ImbPipeline([
        ('vectorizer', CountVectorizer(stop_words=None)),  # Include vectorizer in pipeline
        ('smote', SMOTE(random_state=42)),                 # SMOTE for oversampling
        ('knn', KNeighborsClassifier())                    # KNN classifier
    ])

    # Define parameter grid for GridSearchCV
    param_grid = {
        'knn__n_neighbors': [3, 5, 7, 9, 11, 15, 20],            # Number of neighbors for KNN
        'knn__weights': ['uniform', 'distance'],                 # Uniform or distance-based weighting
        'knn__metric': ['euclidean', 'cosine']                   # Distance metrics
    }

    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Setup GridSearchCV with the pipeline and parameter grid
    grid_search = GridSearchCV(pipeline, param_grid, cv=skf, scoring=scoring, refit='f1', verbose=1, return_train_score=True)

    # Fit the GridSearchCV to the data
    grid_search.fit(dataPoints, dataLabelsList)

    # Step 1: Retrieve the best parameters and score from cross-validation
    best_params = grid_search.best_params_
    best_f1_cv = grid_search.best_score_

    print(f"Best Parameters with SMOTE: {best_params}")
    print(f"Best F1 Score from cross-validation: {best_f1_cv}")

    # Step 2: Extract the cross-validation results and print final metrics
    results = grid_search.cv_results_

    # Prepare per-fold metrics for CSV along with the parameter combinations
    fold_metrics = []
    for idx in range(len(results['params'])):
      fold_metrics.append({
        'n_neighbors': results['params'][idx].get('knn__n_neighbors'),
        'weights': results['params'][idx].get('knn__weights'),
        'metric': results['params'][idx].get('knn__metric'),
        'accuracy': results['mean_test_accuracy'][idx],
        'precision': results['mean_test_precision'][idx],
        'recall': results['mean_test_recall'][idx],
        'f1': results['mean_test_f1'][idx],
        'mcc': results['mean_test_mcc'][idx],
        'preparationTime': vecTime / len(dataPoints)  # Calculate the preparation time per document
    })

    # Save fold-wise metrics to CSV
    df_folds = pd.DataFrame(fold_metrics)
    outFile = os.path.join(outDir, f"knn-smote-fold-results-{n_splits}-folds.csv")
    df_folds.to_csv(outFile, index=False)

    print(f"Per-fold metrics saved to: {outFile}")

    # Step 3: Extract final metrics based on the cross-validation results
    final_f1 = results['mean_test_f1'][grid_search.best_index_]
    final_precision = results['mean_test_precision'][grid_search.best_index_]
    final_recall = results['mean_test_recall'][grid_search.best_index_]
    final_accuracy = results['mean_test_accuracy'][grid_search.best_index_]
    final_mcc = results['mean_test_mcc'][grid_search.best_index_]

    # Step 4: Print final metrics (cross-validation averages)
    print("\nFinal Cross-Validation Metrics:")
    print(f"Final Precision: {final_precision}")
    print(f"Final Recall: {final_recall}")
    print(f"Final Accuracy: {final_accuracy}")
    print(f"Final F1 Score: {final_f1}")
    print(f"Final MCC: {final_mcc}")

    # Save the results to a CSV file
    outFile = os.path.join(outDir, f"knn-results-{n_splits}-folds.csv")
    with open(outFile, "w") as f:
        f.write("Accuracy,Precision,Recall,F1,MCC\n")
        f.write(f"{final_accuracy},{final_precision},{final_recall},{final_f1},{final_mcc}\n")

    print(f"KNN analysis completed for {n_splits}-folds with SMOTE. Results saved to: {outFile}")

###############################################################################
# Main Execution for Both 3-Fold and 5-Fold
if __name__ == "__main__":
    # Parameters setup
    flakyZip = "cleaned_flaky_files.zip"
    largerNonFlakyZip = "all_nonflaky_files.zip"

    outDir = "smote-results"
    os.makedirs(outDir, exist_ok=True)
    extractDir = "smote-extracted"
    os.makedirs(extractDir, exist_ok=True)

    # Run KNN with Pipeline and GridSearchCV using 3-fold cross-validation
    print("Starting KNN analysis with SMOTE for 3-fold cross-validation...")
    flastKNNWithPipeline(outDir, flakyZip, largerNonFlakyZip, extractDir, n_splits=3)

    # Run KNN with Pipeline and GridSearchCV using 5-fold cross-validation
    print("Starting KNN analysis with SMOTE for 5-fold cross-validation...")
    flastKNNWithPipeline(outDir, flakyZip, largerNonFlakyZip, extractDir, n_splits=5)


Starting KNN analysis with SMOTE for 3-fold cross-validation...
Number of flaky documents: 47
Number of non-flaky documents: 254
Total number of documents: 301
Fitting 3 folds for each of 28 candidates, totalling 84 fits
Best Parameters with SMOTE: {'knn__metric': 'euclidean', 'knn__n_neighbors': 3, 'knn__weights': 'distance'}
Best F1 Score from cross-validation: 0.6041771094402673
Per-fold metrics saved to: smote-results/knn-smote-fold-results-3-folds.csv

Final Cross-Validation Metrics:
Final Precision: 0.5318077803203661
Final Recall: 0.7041666666666666
Final Accuracy: 0.857062706270627
Final F1 Score: 0.6041771094402673
Final MCC: 0.5281142717308954
KNN analysis completed for 3-folds with SMOTE. Results saved to: smote-results/knn-results-3-folds.csv
Starting KNN analysis with SMOTE for 5-fold cross-validation...
Number of flaky documents: 47
Number of non-flaky documents: 254
Total number of documents: 301
Fitting 5 folds for each of 28 candidates, totalling 140 fits
Best Paramete

SVM

In [14]:
import os
import time
import zipfile
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import (make_scorer, precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, matthews_corrcoef)
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty or invalid file skipped: {file_path}")

    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList

def plot_confusion_matrix(y_true, y_pred):
    """Plots confusion matrix."""
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-Flaky', 'Flaky'], yticklabels=['Non-Flaky', 'Flaky'])
    plt.title(f'Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

###############################################################################
# Main Function with Pipeline and GridSearchCV for SVM

def flastSVMWithPipeline(outDir, flakyZip, nonFlakyZip, extractDir, n_splits):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)

    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")

    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Create labels: 1 for flaky, 0 for non-flaky
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))

    vecTime = time.perf_counter() - v0

    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': make_scorer(matthews_corrcoef)
    }

    # Define a pipeline with CountVectorizer, SMOTE, and SVM
    pipeline = ImbPipeline([
        ('vectorizer', CountVectorizer(stop_words=None)),  # Include vectorizer in pipeline
        ('smote', SMOTE(random_state=42)),                 # SMOTE for oversampling
        ('svm', SVC(probability=True))                     # SVM classifier
    ])

    # Define parameter grid for GridSearchCV
    param_grid = {
        'svm__C': [0.01, 0.1, 1.0, 10.0, 100.0],                          # Regularization parameter
        'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid']               # Kernel types
    }

    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Setup GridSearchCV with the pipeline and parameter grid
    grid_search = GridSearchCV(pipeline, param_grid, cv=skf, scoring=scoring, refit='f1', verbose=1, return_train_score=True)

    # Fit the GridSearchCV to the data
    grid_search.fit(dataPoints, dataLabelsList)

    # Step 1: Retrieve the best parameters and score from cross-validation
    best_params = grid_search.best_params_
    best_f1_cv = grid_search.best_score_

    print(f"Best Parameters with SMOTE: {best_params}")
    print(f"Best F1 Score from cross-validation: {best_f1_cv}")

    # Step 2: Extract the cross-validation results and print final metrics
    results = grid_search.cv_results_

    # Prepare per-fold metrics for CSV along with the parameter combinations
    fold_metrics = []
    for idx in range(len(results['params'])):
      fold_metrics.append({
        'C': results['params'][idx].get('svm__C'),
        'kernel': results['params'][idx].get('svm__kernel'),
        'accuracy': results['mean_test_accuracy'][idx],
        'precision': results['mean_test_precision'][idx],
        'recall': results['mean_test_recall'][idx],
        'f1': results['mean_test_f1'][idx],
        'mcc': results['mean_test_mcc'][idx],
        'preparationTime': vecTime / len(dataPoints)  # Calculate the preparation time per document
    })

    # Save fold-wise metrics to CSV
    df_folds = pd.DataFrame(fold_metrics)
    outFile = os.path.join(outDir, f"svm-smote-fold-results-{n_splits}-folds.csv")
    df_folds.to_csv(outFile, index=False)

    print(f"Per-fold metrics saved to: {outFile}")

    # Step 3: Extract final metrics based on the cross-validation results
    final_f1 = results['mean_test_f1'][grid_search.best_index_]
    final_precision = results['mean_test_precision'][grid_search.best_index_]
    final_recall = results['mean_test_recall'][grid_search.best_index_]
    final_accuracy = results['mean_test_accuracy'][grid_search.best_index_]
    final_mcc = results['mean_test_mcc'][grid_search.best_index_]

    # Step 4: Print final metrics (cross-validation averages)
    print("\nFinal Cross-Validation Metrics:")
    print(f"Final Precision: {final_precision}")
    print(f"Final Recall: {final_recall}")
    print(f"Final Accuracy: {final_accuracy}")
    print(f"Final F1 Score: {final_f1}")
    print(f"Final MCC: {final_mcc}")

    # Save the results to a CSV file
    outFile = os.path.join(outDir, f"svm-results-{n_splits}-folds.csv")
    with open(outFile, "w") as f:
        f.write("Accuracy,Precision,Recall,F1,MCC\n")
        f.write(f"{final_accuracy},{final_precision},{final_recall},{final_f1},{final_mcc}\n")

    print(f"SVM analysis completed for {n_splits}-folds with SMOTE. Results saved to: {outFile}")

###############################################################################
# Main Execution for Both 3-Fold and 5-Fold
if __name__ == "__main__":
    # Parameters setup
    flakyZip = "cleaned_flaky_files.zip"
    largerNonFlakyZip = "all_nonflaky_files.zip"

    outDir = "smote-results"
    os.makedirs(outDir, exist_ok=True)
    extractDir = "smote-extracted"
    os.makedirs(extractDir, exist_ok=True)

    # Run SVM with Pipeline and GridSearchCV using 3-fold cross-validation
    print("Starting SVM analysis with SMOTE for 3-fold cross-validation...")
    flastSVMWithPipeline(outDir, flakyZip, largerNonFlakyZip, extractDir, n_splits=3)

    # Run SVM with Pipeline and GridSearchCV using 5-fold cross-validation
    print("Starting SVM analysis with SMOTE for 5-fold cross-validation...")
    flastSVMWithPipeline(outDir, flakyZip, largerNonFlakyZip, extractDir, n_splits=5)


Starting SVM analysis with SMOTE for 3-fold cross-validation...
Number of flaky documents: 47
Number of non-flaky documents: 254
Total number of documents: 301
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters with SMOTE: {'svm__C': 10.0, 'svm__kernel': 'rbf'}
Best F1 Score from cross-validation: 0.7206861239119303
Per-fold metrics saved to: smote-results/svm-smote-fold-results-3-folds.csv

Final Cross-Validation Metrics:
Final Precision: 0.8680555555555555
Final Recall: 0.6402777777777778
Final Accuracy: 0.9235313531353135
Final F1 Score: 0.7206861239119303
Final MCC: 0.698389106490695
SVM analysis completed for 3-folds with SMOTE. Results saved to: smote-results/svm-results-3-folds.csv
Starting SVM analysis with SMOTE for 5-fold cross-validation...
Number of flaky documents: 47
Number of non-flaky documents: 254
Total number of documents: 301
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters with SMOTE: {'svm__C': 10.0, 'svm__ker

NB

In [13]:
import os
import time
import zipfile
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (make_scorer, precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, matthews_corrcoef)
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty or invalid file skipped: {file_path}")

    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList

def plot_confusion_matrix(y_true, y_pred):
    """Plots confusion matrix."""
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-Flaky', 'Flaky'], yticklabels=['Non-Flaky', 'Flaky'])
    plt.title(f'Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

###############################################################################
# Main Function with Pipeline and GridSearchCV for Naive Bayes

def flastNBWithPipeline(outDir, flakyZip, nonFlakyZip, extractDir, n_splits):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)

    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")

    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Create labels: 1 for flaky, 0 for non-flaky
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))

    vecTime = time.perf_counter() - v0

    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': make_scorer(matthews_corrcoef)
    }

    # Define a pipeline with CountVectorizer, SMOTE, and Naive Bayes
    pipeline = ImbPipeline([
        ('vectorizer', CountVectorizer(stop_words=None)),  # Include vectorizer in pipeline
        ('smote', SMOTE(random_state=42)),                 # SMOTE for oversampling
        ('nb', MultinomialNB())                            # Naive Bayes classifier
    ])

    # Define parameter grid for GridSearchCV
    param_grid = {
        'nb__alpha': [0.001, 0.01, 0.1, 1.0, 10.0]          # Smoothing parameter for Naive Bayes
    }

    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Setup GridSearchCV with the pipeline and parameter grid
    grid_search = GridSearchCV(pipeline, param_grid, cv=skf, scoring=scoring, refit='f1', verbose=1, return_train_score=True)

    # Fit the GridSearchCV to the data
    grid_search.fit(dataPoints, dataLabelsList)

    # Step 1: Retrieve the best parameters and score from cross-validation
    best_params = grid_search.best_params_
    best_f1_cv = grid_search.best_score_

    print(f"Best Parameters with SMOTE: {best_params}")
    print(f"Best F1 Score from cross-validation: {best_f1_cv}")

    # Step 2: Extract the cross-validation results and print final metrics
    results = grid_search.cv_results_

    # Prepare per-fold metrics for CSV along with the parameter combinations
    fold_metrics = []
    for idx in range(len(results['params'])):
      fold_metrics.append({
        'alpha': results['params'][idx].get('nb__alpha'),  # Get the 'alpha' parameter for Naive Bayes
        'accuracy': results['mean_test_accuracy'][idx],
        'precision': results['mean_test_precision'][idx],
        'recall': results['mean_test_recall'][idx],
        'f1': results['mean_test_f1'][idx],
        'mcc': results['mean_test_mcc'][idx],
        'preparationTime': vecTime / len(dataPoints)  # Calculate the preparation time per document
    })

    # Save fold-wise metrics to CSV
    df_folds = pd.DataFrame(fold_metrics)
    outFile = os.path.join(outDir, f"nb-smote-fold-results-{n_splits}-folds.csv")
    df_folds.to_csv(outFile, index=False)

    print(f"Per-fold metrics saved to: {outFile}")

    # Step 3: Extract final metrics based on the cross-validation results
    final_f1 = results['mean_test_f1'][grid_search.best_index_]
    final_precision = results['mean_test_precision'][grid_search.best_index_]
    final_recall = results['mean_test_recall'][grid_search.best_index_]
    final_accuracy = results['mean_test_accuracy'][grid_search.best_index_]
    final_mcc = results['mean_test_mcc'][grid_search.best_index_]

    # Step 4: Print final metrics (cross-validation averages)
    print("\nFinal Cross-Validation Metrics:")
    print(f"Final Precision: {final_precision}")
    print(f"Final Recall: {final_recall}")
    print(f"Final Accuracy: {final_accuracy}")
    print(f"Final F1 Score: {final_f1}")
    print(f"Final MCC: {final_mcc}")

    # Save the results to a CSV file
    outFile = os.path.join(outDir, f"nb-results-{n_splits}-folds.csv")
    with open(outFile, "w") as f:
        f.write("Accuracy,Precision,Recall,F1,MCC\n")
        f.write(f"{final_accuracy},{final_precision},{final_recall},{final_f1},{final_mcc}\n")

    print(f"Naive Bayes analysis completed for {n_splits}-folds with SMOTE. Results saved to: {outFile}")

###############################################################################
# Main Execution for Both 3-Fold and 5-Fold
if __name__ == "__main__":
    # Parameters setup
    flakyZip = "cleaned_flaky_files.zip"
    largerNonFlakyZip = "all_nonflaky_files.zip"

    outDir = "smote-results"
    os.makedirs(outDir, exist_ok=True)
    extractDir = "smote-extracted"
    os.makedirs(extractDir, exist_ok=True)

    # Run Naive Bayes with Pipeline and GridSearchCV using 3-fold cross-validation
    print("Starting Naive Bayes analysis with SMOTE for 3-fold cross-validation...")
    flastNBWithPipeline(outDir, flakyZip, largerNonFlakyZip, extractDir, n_splits=3)

    # Run Naive Bayes with Pipeline and GridSearchCV using 5-fold cross-validation
    print("Starting Naive Bayes analysis with SMOTE for 5-fold cross-validation...")
    flastNBWithPipeline(outDir, flakyZip, largerNonFlakyZip, extractDir, n_splits=5)


Starting Naive Bayes analysis with SMOTE for 3-fold cross-validation...
Number of flaky documents: 47
Number of non-flaky documents: 254
Total number of documents: 301
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Parameters with SMOTE: {'nb__alpha': 1.0}
Best F1 Score from cross-validation: 0.6724137931034483
Per-fold metrics saved to: smote-results/nb-smote-fold-results-3-folds.csv

Final Cross-Validation Metrics:
Final Precision: 0.7402319902319903
Final Recall: 0.6180555555555555
Final Accuracy: 0.9068316831683169
Final F1 Score: 0.6724137931034483
Final MCC: 0.6229824203893163
Naive Bayes analysis completed for 3-folds with SMOTE. Results saved to: smote-results/nb-results-3-folds.csv
Starting Naive Bayes analysis with SMOTE for 5-fold cross-validation...
Number of flaky documents: 47
Number of non-flaky documents: 254
Total number of documents: 301
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Parameters with SMOTE: {'nb__alpha': 0.1}
Best F1

Decision Tree

In [15]:
import os
import time
import zipfile
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (make_scorer, precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, matthews_corrcoef)
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty or invalid file skipped: {file_path}")

    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList

def plot_confusion_matrix(y_true, y_pred):
    """Plots confusion matrix."""
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-Flaky', 'Flaky'], yticklabels=['Non-Flaky', 'Flaky'])
    plt.title(f'Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

###############################################################################
# Main Function with Pipeline and GridSearchCV for Decision Tree

def flastDTWithPipeline(outDir, flakyZip, nonFlakyZip, extractDir, n_splits):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)

    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")

    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Create labels: 1 for flaky, 0 for non-flaky
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))

    vecTime = time.perf_counter() - v0

    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': make_scorer(matthews_corrcoef)
    }

    # Define a pipeline with CountVectorizer, SMOTE, and Decision Tree
    pipeline = ImbPipeline([
        ('vectorizer', CountVectorizer(stop_words=None)),  # Include vectorizer in pipeline
        ('smote', SMOTE(random_state=42)),                 # SMOTE for oversampling
        ('dt', DecisionTreeClassifier(random_state=42))    # Decision Tree classifier
    ])

    # Define parameter grid for GridSearchCV
    param_grid = {
        'dt__criterion': ['gini', 'entropy'],               # Function to measure the quality of a split
        'dt__max_depth': [None, 10, 30, 50, 100],       # Maximum depth of the tree
        'dt__min_samples_split': [2, 5, 10],                    # Minimum number of samples required to split a node
        'dt__min_samples_leaf': [1, 2, 5, 10],                     # Minimum number of samples required to be at a leaf node
        'dt__max_features': [None, 'sqrt', 'log2']          # Number of features to consider when looking for the best split
    }

    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Setup GridSearchCV with the pipeline and parameter grid
    grid_search = GridSearchCV(pipeline, param_grid, cv=skf, scoring=scoring, refit='f1', verbose=1, return_train_score=True)

    # Fit the GridSearchCV to the data
    grid_search.fit(dataPoints, dataLabelsList)

    # Step 1: Retrieve the best parameters and score from cross-validation
    best_params = grid_search.best_params_
    best_f1_cv = grid_search.best_score_

    print(f"Best Parameters with SMOTE: {best_params}")
    print(f"Best F1 Score from cross-validation: {best_f1_cv}")

    # Step 2: Extract the cross-validation results and print final metrics
    results = grid_search.cv_results_

    # Prepare per-fold metrics for CSV along with the parameter combinations
    fold_metrics = []
    for idx in range(len(results['params'])):
      fold_metrics.append({
        'criterion': results['params'][idx].get('dt__criterion'),
        'max_depth': results['params'][idx].get('dt__max_depth'),
        'min_samples_split': results['params'][idx].get('dt__min_samples_split'),
        'min_samples_leaf': results['params'][idx].get('dt__min_samples_leaf'),
        'max_features': results['params'][idx].get('dt__max_features'),
        'accuracy': results['mean_test_accuracy'][idx],
        'precision': results['mean_test_precision'][idx],
        'recall': results['mean_test_recall'][idx],
        'f1': results['mean_test_f1'][idx],
        'mcc': results['mean_test_mcc'][idx],
        'preparationTime': vecTime / len(dataPoints)
    })
    # Save fold-wise metrics to CSV
    df_folds = pd.DataFrame(fold_metrics)
    outFile = os.path.join(outDir, f"dt-smote-fold-results-{n_splits}-folds.csv")
    df_folds.to_csv(outFile, index=False)

    print(f"Per-fold metrics saved to: {outFile}")

    # Step 3: Extract final metrics based on the cross-validation results
    final_f1 = results['mean_test_f1'][grid_search.best_index_]
    final_precision = results['mean_test_precision'][grid_search.best_index_]
    final_recall = results['mean_test_recall'][grid_search.best_index_]
    final_accuracy = results['mean_test_accuracy'][grid_search.best_index_]
    final_mcc = results['mean_test_mcc'][grid_search.best_index_]

    # Step 4: Print final metrics (cross-validation averages)
    print("\nFinal Cross-Validation Metrics:")
    print(f"Final Precision: {final_precision}")
    print(f"Final Recall: {final_recall}")
    print(f"Final Accuracy: {final_accuracy}")
    print(f"Final F1 Score: {final_f1}")
    print(f"Final MCC: {final_mcc}")

    # Save the results to a CSV file
    outFile = os.path.join(outDir, f"dt-results-{n_splits}-folds.csv")
    with open(outFile, "w") as f:
        f.write("Accuracy,Precision,Recall,F1,MCC\n")
        f.write(f"{final_accuracy},{final_precision},{final_recall},{final_f1},{final_mcc}\n")

    print(f"Decision Tree analysis completed for {n_splits}-folds with SMOTE. Results saved to: {outFile}")

###############################################################################
# Main Execution for Both 3-Fold and 5-Fold
if __name__ == "__main__":
    # Parameters setup
    flakyZip = "cleaned_flaky_files.zip"
    largerNonFlakyZip = "all_nonflaky_files.zip"

    outDir = "smote-results"
    os.makedirs(outDir, exist_ok=True)
    extractDir = "smote-extracted"
    os.makedirs(extractDir, exist_ok=True)

    # Run Decision Tree with Pipeline and GridSearchCV using 3-fold cross-validation
    print("Starting Decision Tree analysis with SMOTE for 3-fold cross-validation...")
    flastDTWithPipeline(outDir, flakyZip, largerNonFlakyZip, extractDir, n_splits=3)

    # Run Decision Tree with Pipeline and GridSearchCV using 5-fold cross-validation
    print("Starting Decision Tree analysis with SMOTE for 5-fold cross-validation...")
    flastDTWithPipeline(outDir, flakyZip, largerNonFlakyZip, extractDir, n_splits=5)


Starting Decision Tree analysis with SMOTE for 3-fold cross-validation...
Number of flaky documents: 47
Number of non-flaky documents: 254
Total number of documents: 301
Fitting 3 folds for each of 360 candidates, totalling 1080 fits
Best Parameters with SMOTE: {'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__max_features': None, 'dt__min_samples_leaf': 2, 'dt__min_samples_split': 2}
Best F1 Score from cross-validation: 0.8915925101965146
Per-fold metrics saved to: smote-results/dt-smote-fold-results-3-folds.csv

Final Cross-Validation Metrics:
Final Precision: 0.9166666666666666
Final Recall: 0.8736111111111112
Final Accuracy: 0.9667986798679867
Final F1 Score: 0.8915925101965146
Final MCC: 0.8744409272444104
Decision Tree analysis completed for 3-folds with SMOTE. Results saved to: smote-results/dt-results-3-folds.csv
Starting Decision Tree analysis with SMOTE for 5-fold cross-validation...
Number of flaky documents: 47
Number of non-flaky documents: 254
Total number of document

Random Forest

In [7]:
import os
import time
import zipfile
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (make_scorer, precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, matthews_corrcoef)
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty or invalid file skipped: {file_path}")

    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList

def plot_confusion_matrix(y_true, y_pred):
    """Plots confusion matrix."""
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-Flaky', 'Flaky'], yticklabels=['Non-Flaky', 'Flaky'])
    plt.title(f'Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

###############################################################################
# Main Function with Pipeline and GridSearchCV for Random Forest

def flastRFWithPipeline(outDir, flakyZip, nonFlakyZip, extractDir, n_splits):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)

    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")

    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Create labels: 1 for flaky, 0 for non-flaky
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))

    vecTime = time.perf_counter() - v0

    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': make_scorer(matthews_corrcoef)
    }

    # Define a pipeline with CountVectorizer, SMOTE, and Random Forest
    pipeline = ImbPipeline([
        ('vectorizer', CountVectorizer(stop_words=None)),  # Include vectorizer in pipeline
        ('smote', SMOTE(random_state=42)),                 # SMOTE for oversampling
        ('rf', RandomForestClassifier(random_state=42))    # Random Forest classifier
    ])

    # Define parameter grid for GridSearchCV
    param_grid = {
        'rf__n_estimators': [10, 50, 100],                  # Number of trees in the forest
        'rf__max_depth': [10, 30, 50],                      # Maximum depth of the tree
        'rf__min_samples_split': [2, 5],                    # Minimum number of samples required to split a node
        'rf__min_samples_leaf': [1, 2],                     # Minimum number of samples required to be at a leaf node
        'rf__criterion': ["gini", "entropy"]                # Function to measure the quality of a split
    }

    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Setup GridSearchCV with the pipeline and parameter grid
    grid_search = GridSearchCV(pipeline, param_grid, cv=skf, scoring=scoring, refit='f1', verbose=1, return_train_score=True)

    # Fit the GridSearchCV to the data
    grid_search.fit(dataPoints, dataLabelsList)

    # Step 1: Retrieve the best parameters and score from cross-validation
    best_params = grid_search.best_params_
    best_f1_cv = grid_search.best_score_

    print(f"Best Parameters with SMOTE: {best_params}")
    print(f"Best F1 Score from cross-validation: {best_f1_cv}")

    # Step 2: Extract the cross-validation results and print final metrics
    results = grid_search.cv_results_

    # Prepare per-fold metrics for CSV along with the parameter combination
    fold_metrics = []
    for idx in range(len(results['params'])):
        fold_metrics.append({
            'max_depth': results['params'][idx].get('rf__max_depth'),
            'criterion': results['params'][idx].get('rf__criterion'),
            'min_samples_split': results['params'][idx].get('rf__min_samples_split'),
            'min_samples_leaf': results['params'][idx].get('rf__min_samples_leaf'),
            'n_estimators': results['params'][idx].get('rf__n_estimators'),
            'accuracy': results['mean_test_accuracy'][idx],
            'precision': results['mean_test_precision'][idx],
            'recall': results['mean_test_recall'][idx],
            'f1': results['mean_test_f1'][idx],
            'mcc': results['mean_test_mcc'][idx],
            'preparationTime': vecTime / len(dataPoints)
        })

    # Save fold-wise metrics to CSV
    df_folds = pd.DataFrame(fold_metrics)
    outFile = os.path.join(outDir, f"rf-smote-fold-results-{n_splits}-folds.csv")
    df_folds.to_csv(outFile, index=False)

    print(f"Per-fold metrics saved to: {outFile}")

    # Step 3: Extract final metrics based on the cross-validation results
    final_f1 = results['mean_test_f1'][grid_search.best_index_]
    final_precision = results['mean_test_precision'][grid_search.best_index_]
    final_recall = results['mean_test_recall'][grid_search.best_index_]
    final_accuracy = results['mean_test_accuracy'][grid_search.best_index_]
    final_mcc = results['mean_test_mcc'][grid_search.best_index_]

    # Step 4: Print final metrics (cross-validation averages)
    print("\nFinal Cross-Validation Metrics:")
    print(f"Final Precision: {final_precision}")
    print(f"Final Recall: {final_recall}")
    print(f"Final Accuracy: {final_accuracy}")
    print(f"Final F1 Score: {final_f1}")
    print(f"Final MCC: {final_mcc}")

    # Save the results to a CSV file
    outFile = os.path.join(outDir, f"rf-smote-results-{n_splits}-folds.csv")
    with open(outFile, "w") as f:
        f.write("Accuracy,Precision,Recall,F1,MCC\n")
        f.write(f"{final_accuracy},{final_precision},{final_recall},{final_f1},{final_mcc}\n")

    print(f"Random Forest analysis completed for {n_splits}-folds with SMOTE. Results saved to: {outFile}")

###############################################################################
# Main Execution for Both 3-Fold and 5-Fold
if __name__ == "__main__":
    # Parameters setup
    flakyZip = "cleaned_flaky_files.zip"
    largerNonFlakyZip = "all_nonflaky_files.zip"

    outDir = "smote-results"
    os.makedirs(outDir, exist_ok=True)
    extractDir = "smote-extracted"
    os.makedirs(extractDir, exist_ok=True)

    # Run Random Forest with Pipeline and GridSearchCV using 3-fold cross-validation
    print("Starting Random Forest analysis with SMOTE for 3-fold cross-validation...")
    flastRFWithPipeline(outDir, flakyZip, largerNonFlakyZip, extractDir, n_splits=3)

    # Run Random Forest with Pipeline and GridSearchCV using 5-fold cross-validation
    print("Starting Random Forest analysis with SMOTE for 5-fold cross-validation...")
    flastRFWithPipeline(outDir, flakyZip, largerNonFlakyZip, extractDir, n_splits=5)


Starting Random Forest analysis with SMOTE for 3-fold cross-validation...
Number of flaky documents: 47
Number of non-flaky documents: 254
Total number of documents: 301
Fitting 3 folds for each of 72 candidates, totalling 216 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters with SMOTE: {'rf__criterion': 'entropy', 'rf__max_depth': 30, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 10}
Best F1 Score from cross-validation: 0.8534562211981567
Per-fold metrics saved to: smote-results/rf-smote-fold-results-3-folds.csv

Final Cross-Validation Metrics:
Final Precision: 0.9045177045177045
Final Recall: 0.8083333333333332
Final Accuracy: 0.9568646864686468
Final F1 Score: 0.8534562211981567
Final MCC: 0.8302348933402532
Random Forest analysis completed for 3-folds with SMOTE. Results saved to: smote-results/rf-smote-results-3-folds.csv
Starting Random Forest analysis with SMOTE for 5-fold cross-validation...
Number of flaky documents: 47
Number of non-flaky documents: 254
Total number of documents: 301
Fitting 5 folds for each of 72 candidates, totalling 360 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters with SMOTE: {'rf__criterion': 'gini', 'rf__max_depth': 30, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 50}
Best F1 Score from cross-validation: 0.8325421396628826
Per-fold metrics saved to: smote-results/rf-smote-fold-results-5-folds.csv

Final Cross-Validation Metrics:
Final Precision: 0.8605555555555554
Final Recall: 0.8111111111111111
Final Accuracy: 0.950327868852459
Final F1 Score: 0.8325421396628826
Final MCC: 0.8057316396502678
Random Forest analysis completed for 5-folds with SMOTE. Results saved to: smote-results/rf-smote-results-5-folds.csv


XGB

In [8]:
import os
import time
import zipfile
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import (make_scorer, precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, matthews_corrcoef)
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty or invalid file skipped: {file_path}")

    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList

def plot_confusion_matrix(y_true, y_pred):
    """Plots confusion matrix."""
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-Flaky', 'Flaky'], yticklabels=['Non-Flaky', 'Flaky'])
    plt.title(f'Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

###############################################################################
# Main Function with Pipeline and GridSearchCV for XGBoost

def flastXGBWithPipeline(outDir, flakyZip, nonFlakyZip, extractDir, n_splits):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)

    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")

    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Create labels: 1 for flaky, 0 for non-flaky
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))

    vecTime = time.perf_counter() - v0

    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': make_scorer(matthews_corrcoef)
    }

    # Define a pipeline with CountVectorizer, SMOTE, and XGBoost
    pipeline = ImbPipeline([
        ('vectorizer', CountVectorizer(stop_words=None)),  # Include vectorizer in pipeline
        ('smote', SMOTE(random_state=42)),                 # SMOTE for oversampling
        ('xgb', XGBClassifier(eval_metric='logloss', random_state=42))  # XGBoost classifier
    ])

    # Define parameter grid for GridSearchCV
    param_grid = {
        'xgb__n_estimators': [150, 100, 200, 300],                   # Number of boosting rounds
        'xgb__max_depth': [3, 5, 7, 10],                        # Maximum depth of a tree
        'xgb__learning_rate': [0.01, 0.1, 0.3, 0.5],                 # Learning rate

    }

    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Setup GridSearchCV with the pipeline and parameter grid
    grid_search = GridSearchCV(pipeline, param_grid, cv=skf, scoring=scoring, refit='f1', verbose=1, return_train_score=True)

    # Fit the GridSearchCV to the data
    grid_search.fit(dataPoints, dataLabelsList)

    # Step 1: Retrieve the best parameters and score from cross-validation
    best_params = grid_search.best_params_
    best_f1_cv = grid_search.best_score_

    print(f"Best Parameters with SMOTE: {best_params}")
    print(f"Best F1 Score from cross-validation: {best_f1_cv}")

    # Step 2: Extract the cross-validation results and print final metrics
    results = grid_search.cv_results_

    fold_metrics = []
    for fold_idx in range(n_splits):
        fold_metrics.append({
            'fold': fold_idx + 1,
            'accuracy': results[f'split{fold_idx}_test_accuracy'][grid_search.best_index_],
            'precision': results[f'split{fold_idx}_test_precision'][grid_search.best_index_],
            'recall': results[f'split{fold_idx}_test_recall'][grid_search.best_index_],
            'f1': results[f'split{fold_idx}_test_f1'][grid_search.best_index_],
            'mcc': results[f'split{fold_idx}_test_mcc'][grid_search.best_index_]
        })

    # Save fold-wise metrics to CSV
    df_folds = pd.DataFrame(fold_metrics)
    outFile = os.path.join(outDir, f"xgb-smote-fold-results-{n_splits}-folds.csv")
    df_folds.to_csv(outFile, index=False)

    print(f"Per-fold metrics saved to: {outFile}")

    # Step 3: Extract final metrics based on the cross-validation results
    final_f1 = results['mean_test_f1'][grid_search.best_index_]
    final_precision = results['mean_test_precision'][grid_search.best_index_]
    final_recall = results['mean_test_recall'][grid_search.best_index_]
    final_accuracy = results['mean_test_accuracy'][grid_search.best_index_]
    final_mcc = results['mean_test_mcc'][grid_search.best_index_]

    # Step 4: Print final metrics (cross-validation averages)
    print("\nFinal Cross-Validation Metrics:")
    print(f"Final Precision: {final_precision}")
    print(f"Final Recall: {final_recall}")
    print(f"Final Accuracy: {final_accuracy}")
    print(f"Final F1 Score: {final_f1}")
    print(f"Final MCC: {final_mcc}")

    # Save the results to a CSV file
    outFile = os.path.join(outDir, f"xgb-results-{n_splits}-folds.csv")
    with open(outFile, "w") as f:
        f.write("Accuracy,Precision,Recall,F1,MCC\n")
        f.write(f"{final_accuracy},{final_precision},{final_recall},{final_f1},{final_mcc}\n")

    print(f"XGBoost analysis completed for {n_splits}-folds with SMOTE. Results saved to: {outFile}")

###############################################################################
# Main Execution for Both 3-Fold and 5-Fold
if __name__ == "__main__":
    # Parameters setup
    flakyZip = "cleaned_flaky_files.zip"
    largerNonFlakyZip = "all_nonflaky_files.zip"

    outDir = "smote-results"
    os.makedirs(outDir, exist_ok=True)
    extractDir = "smote-extracted"
    os.makedirs(extractDir, exist_ok=True)

    # Run XGBoost with Pipeline and GridSearchCV using 3-fold cross-validation
    print("Starting XGBoost analysis with SMOTE for 3-fold cross-validation...")
    flastXGBWithPipeline(outDir, flakyZip, largerNonFlakyZip, extractDir, n_splits=3)

    # Run XGBoost with Pipeline and GridSearchCV using 5-fold cross-validation
    print("Starting XGBoost analysis with SMOTE for 5-fold cross-validation...")
    flastXGBWithPipeline(outDir, flakyZip, largerNonFlakyZip, extractDir, n_splits=5)


Starting XGBoost analysis with SMOTE for 3-fold cross-validation...
Number of flaky documents: 47
Number of non-flaky documents: 254
Total number of documents: 301
Fitting 3 folds for each of 64 candidates, totalling 192 fits
Best Parameters with SMOTE: {'xgb__learning_rate': 0.3, 'xgb__max_depth': 5, 'xgb__n_estimators': 150}
Best F1 Score from cross-validation: 0.8956759715380406
Per-fold metrics saved to: smote-results/xgb-smote-fold-results-3-folds.csv

Final Cross-Validation Metrics:
Final Precision: 0.9777777777777779
Final Recall: 0.8319444444444445
Final Accuracy: 0.9700990099009901
Final F1 Score: 0.8956759715380406
Final MCC: 0.8845242548867366
XGBoost analysis completed for 3-folds with SMOTE. Results saved to: smote-results/xgb-results-3-folds.csv
Starting XGBoost analysis with SMOTE for 5-fold cross-validation...
Number of flaky documents: 47
Number of non-flaky documents: 254
Total number of documents: 301
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Best

In [17]:
import os
import pandas as pd

# Function to extract the best results from the CSV files of each model
def extract_best_results(model_name, fold, csv_file):
    """
    Extracts the best result from the CSV file for a model.

    Parameters:
    - model_name: The name of the model (e.g., "XGBoost", "Random Forest", "Decision Tree")
    - fold: Number of folds (e.g., 5 or 3)
    - csv_file: The path to the CSV file containing the model's results

    Returns:
    A dictionary containing the best results for the model and fold.
    """
    if not os.path.exists(csv_file):
        print(f"CSV file for {model_name} ({fold}-fold) does not exist: {csv_file}")
        return None

    # Read the CSV file
    df = pd.read_csv(csv_file)

    # Get the row with the best F1 score
    best_row = df.loc[df['f1'].idxmax()]

    # Collect the best results into a dictionary
    best_results = {
        'Model': model_name,
        'Fold': f"{fold}-fold",
        'Best Accuracy': best_row['accuracy'],
        'Best Precision': best_row['precision'],
        'Best Recall': best_row['recall'],
        'Best F1 Score': best_row['f1'],
        'Best MCC': best_row['mcc'],
        'Best Parameters': best_row.to_dict()  # Including all parameters
    }

    return best_results

# Function to gather and print/save the best results from all SMOTE models
def gather_best_results(models_results_dir, output_file):
    """
    Gathers the best results from all SMOTE models and writes them to a CSV file.

    Parameters:
    - models_results_dir: Directory where the model result CSV files are stored for the SMOTE models.
    - output_file: Path to the output CSV file to store the best results.
    """
    # List of models and their corresponding result files for both 5-fold and 3-fold
    models = {
        'KNN': {'5-fold': 'knn-smote-fold-results-5-folds.csv', '3-fold': 'knn-smote-fold-results-3-folds.csv'},
        'SVM': {'5-fold': 'svm-smote-fold-results-5-folds.csv', '3-fold': 'svm-smote-fold-results-5-folds.csv'},
        'Naive Bayes': {'5-fold': 'nb-smote-fold-results-5-folds.csv', '3-fold': 'nb-smote-fold-results-3-folds.csv'},
        'XGBoost': {'5-fold': 'xgb-smote-fold-results-5-folds.csv', '3-fold': 'xgb-smote-fold-results-3-folds.csv'},
        'Random Forest': {'5-fold': 'rf-smote-fold-results-5-folds.csv', '3-fold': 'rf-smote-fold-results-3-folds.csv'},
        'Decision Tree': {'5-fold': 'dt-smote-fold-results-5-folds.csv', '3-fold': 'dt-smote-fold-results-3-folds.csv'}
    }

    # Initialize an empty list to store the best results from each model and fold
    best_results = []

    # Iterate over each model and its result files for both 5-fold and 3-fold
    for model_name, folds in models.items():
        for fold, csv_file in folds.items():
            full_csv_path = os.path.join(models_results_dir, csv_file)
            best_result = extract_best_results(model_name, fold, full_csv_path)
            if best_result:
                best_results.append(best_result)

    # Convert the list of best results into a DataFrame
    best_results_df = pd.DataFrame(best_results)

    # Save the best results to the output CSV file
    best_results_df.to_csv(output_file, index=False)
    print(f"Best results saved to: {output_file}")

    # Print the best results as a table
    print(f"\nBest Results from All SMOTE Models")
    print(best_results_df.to_string(index=False))

# Example usage
if __name__ == "__main__":
    # Directories where the model result CSV files are stored
    results_dir = '/content/smote-results'

    # Path to the output CSV file where best results will be stored
    output_file = "best_results_smote.csv"

    # Gather and save the best results
    gather_best_results(results_dir, output_file)


Best results saved to: best_results_smote.csv

Best Results from All SMOTE Models
        Model        Fold  Best Accuracy  Best Precision  Best Recall  Best F1 Score  Best MCC                                                                                                                                                                                                                                                                                         Best Parameters
          KNN 5-fold-fold       0.823934        0.468036     0.746667       0.566845  0.492523                                                  {'n_neighbors': 3, 'weights': 'distance', 'metric': 'cosine', 'accuracy': 0.8239344262295081, 'precision': 0.4680361305361306, 'recall': 0.7466666666666667, 'f1': 0.566845388365725, 'mcc': 0.4925226809183992, 'preparationTime': 0.000352934375414}
          KNN 3-fold-fold       0.857063        0.531808     0.704167       0.604177  0.528114                                          