In [36]:
import os
import time
import zipfile
import numpy as np
from sklearn.model_selection import GridSearchCV


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef

from sklearn.model_selection import StratifiedKFold

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):  # Only consider Python files
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")
    
    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")
    
    return dataPointsList

def flastVectorization(dataPoints):
    """Performs vectorization using CountVectorizer."""
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z = countVec.fit_transform(dataPoints)
    return Z

###############################################################################
# Data Extraction and Vectorization

# Parameters setup
flakyZip = "compressedDataset/flaky_files.zip"
nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
largerNonFlakyZip = "compressedDataset/all_nonflaky_files.zip"

# Create directories
outDirEqual = "results/equal_flaky_nonflaky/"
outDirLarger = "results/larger_nonflaky/"
os.makedirs(outDirEqual, exist_ok=True)
os.makedirs(outDirLarger, exist_ok=True)

extractDirEqual = "extracted/equal_flaky_nonflaky/"
extractDirLarger = "extracted/larger_nonflaky/"
os.makedirs(extractDirEqual, exist_ok=True)
os.makedirs(extractDirLarger, exist_ok=True)

# Extract and read data once for equal combination
flakyDirEqual = os.path.join(extractDirEqual, 'flaky')
nonFlakyDirEqual = os.path.join(extractDirEqual, 'nonFlaky')
os.makedirs(flakyDirEqual, exist_ok=True)
os.makedirs(nonFlakyDirEqual, exist_ok=True)

extract_zip(flakyZip, flakyDirEqual)
extract_zip(nonFlakyZip, nonFlakyDirEqual)

dataPointsFlakyEqual = getDataPoints(flakyDirEqual)
dataPointsNonFlakyEqual = getDataPoints(nonFlakyDirEqual)
dataPointsEqual = dataPointsFlakyEqual + dataPointsNonFlakyEqual

# Print the number of datasets for equal combination
print(f"Number of flaky documents (equal combination): {len(dataPointsFlakyEqual)}")
print(f"Number of non-flaky documents (equal combination): {len(dataPointsNonFlakyEqual)}")
print(f"Total number of documents (equal combination): {len(dataPointsEqual)}")

dataLabelsListEqual = np.array([1]*len(dataPointsFlakyEqual) + [0]*len(dataPointsNonFlakyEqual))

# Vectorize data once
Z_equal = flastVectorization(dataPointsEqual)

# Extract and read data once for larger non-flaky combination
flakyDirLarger = os.path.join(extractDirLarger, 'flaky')
nonFlakyDirLarger = os.path.join(extractDirLarger, 'nonFlaky')
os.makedirs(flakyDirLarger, exist_ok=True)
os.makedirs(nonFlakyDirLarger, exist_ok=True)

extract_zip(flakyZip, flakyDirLarger)
extract_zip(largerNonFlakyZip, nonFlakyDirLarger)

dataPointsFlakyLarger = getDataPoints(flakyDirLarger)
dataPointsNonFlakyLarger = getDataPoints(nonFlakyDirLarger)
dataPointsLarger = dataPointsFlakyLarger + dataPointsNonFlakyLarger

# Print the number of datasets for larger combination
print(f"Number of flaky documents (larger combination): {len(dataPointsFlakyLarger)}")
print(f"Number of non-flaky documents (larger combination): {len(dataPointsNonFlakyLarger)}")
print(f"Total number of documents (larger combination): {len(dataPointsLarger)}")

dataLabelsListLarger = np.array([1]*len(dataPointsFlakyLarger) + [0]*len(dataPointsNonFlakyLarger))

Z_larger = flastVectorization(dataPointsLarger)


Number of flaky documents (equal combination): 45
Number of non-flaky documents (equal combination): 45
Total number of documents (equal combination): 90
Number of flaky documents (larger combination): 45
Number of non-flaky documents (larger combination): 254
Total number of documents (larger combination): 299


## KNN ##

In [37]:
from sklearn.neighbors import KNeighborsClassifier

def runKNN(Z, dataLabelsList, outDir, n_splits, combination_label):
    v0 = time.perf_counter()
    print("************SAHPE od DATA:", Z.shape)
 
    
    # Define the KNN model
    knn = KNeighborsClassifier()

    # Parameter grid for hyperparameter tuning
    param_grid = {
        'n_neighbors': [3, 5, 7, 9, 11, 15, 20],
        'metric': ['cosine', 'euclidean'],  # Distance metrics
        'weights': ['uniform', 'distance'],  # Neighbor weighting schemes
    }

    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),  
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': make_scorer(matthews_corrcoef)

    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV with multiple scoring metrics
    grid_search = GridSearchCV(knn, param_grid, cv=skf, scoring=scoring, refit='f1', verbose=1, return_train_score=True)

    # Fit the GridSearchCV on training data
    grid_search.fit(Z, dataLabelsList)

    # Get the best parameters and the best score for f1
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score}")

    # Save the results
    outFile = f"{combination_label}-params-knn-{n_splits}-folds.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write("n_neighbors,metric,weights,accuracy,precision,recall,f1,preparationTime\n")
        for idx, param in enumerate(grid_search.cv_results_['params']):
            accuracy = grid_search.cv_results_['mean_test_accuracy'][idx]
            precision = grid_search.cv_results_['mean_test_precision'][idx]
            recall = grid_search.cv_results_['mean_test_recall'][idx]
            f1 = grid_search.cv_results_['mean_test_f1'][idx]
            mcc = grid_search.cv_results_['mean_test_mcc'][idx]

            
            preparationTime = (time.perf_counter() - v0) / len(dataLabelsList)  
            fo.write(f"{param['n_neighbors']},{param['metric']},{param['weights']},{accuracy},{precision},{recall},{f1},{mcc},{preparationTime}\n")

    print(f"KNN analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    return best_params, best_score

# Run KNN on equal combination
print("\nStarting KNN analysis for flaky vs smaller non-flaky files (equal combination)...")
best_params_5folds_knn_equal, best_score_5folds_knn_equal = runKNN(Z_equal, dataLabelsListEqual, outDirEqual, 5, "equal")
best_params_3folds_knn_equal, best_score_3folds_knn_equal = runKNN(Z_equal, dataLabelsListEqual, outDirEqual, 3, "equal")

# Display results
print("\nBest results for KNN 5-fold on equal combination:")
print(f"Best Parameters: {best_params_5folds_knn_equal}")
print(f"Best F1 Score: {best_score_5folds_knn_equal}")

print("\nBest results for KNN 3-fold on equal combination:")
print(f"Best Parameters: {best_params_3folds_knn_equal}")
print(f"Best F1 Score: {best_score_3folds_knn_equal}")

# Run KNN on larger non-flaky combination
print("\nStarting KNN analysis for flaky vs larger non-flaky files (larger combination)...")
best_params_5folds_knn_larger, best_score_5folds_knn_larger = runKNN(Z_larger, dataLabelsListLarger, outDirLarger, 5, "larger")
best_params_3folds_knn_larger, best_score_3folds_knn_larger = runKNN(Z_larger, dataLabelsListLarger, outDirLarger, 3, "larger")

# Display results
print("\nBest results for KNN 5-fold on larger combination:")
print(f"Best Parameters: {best_params_5folds_knn_larger}")
print(f"Best F1 Score: {best_score_5folds_knn_larger}")

print("\nBest results for KNN 3-fold on larger combination:")
print(f"Best Parameters: {best_params_3folds_knn_larger}")
print(f"Best F1 Score: {best_score_3folds_knn_larger}")



Starting KNN analysis for flaky vs smaller non-flaky files (equal combination)...
************SAHPE od DATA: (90, 7563)
Fitting 5 folds for each of 28 candidates, totalling 140 fits
Best Parameters: {'metric': 'cosine', 'n_neighbors': 15, 'weights': 'distance'}
Best F1 Score: 0.7894117647058824
KNN analysis completed for 5-folds. Results saved to: equal-params-knn-5-folds.csv
************SAHPE od DATA: (90, 7563)
Fitting 3 folds for each of 28 candidates, totalling 84 fits
Best Parameters: {'metric': 'cosine', 'n_neighbors': 5, 'weights': 'distance'}
Best F1 Score: 0.7699905033238367
KNN analysis completed for 3-folds. Results saved to: equal-params-knn-3-folds.csv

Best results for KNN 5-fold on equal combination:
Best Parameters: {'metric': 'cosine', 'n_neighbors': 15, 'weights': 'distance'}
Best F1 Score: 0.7894117647058824

Best results for KNN 3-fold on equal combination:
Best Parameters: {'metric': 'cosine', 'n_neighbors': 5, 'weights': 'distance'}
Best F1 Score: 0.7699905033238

## SVM

## SVM

In [38]:
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, matthews_corrcoef


def runSVM(Z, dataLabelsList, outDir, n_splits, combination_label):
    v0 = time.perf_counter()    
    print("************SAHPE od DATA:", Z.shape)

    # Define the SVM model
    svm = SVC()

    # Parameter grid for hyperparameter tuning
    param_grid = {
        'C': [0.01, 0.1, 1.0, 10.0, 100.0],  # Regularization parameter
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid']  # Kernel types
    }

    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': make_scorer(matthews_corrcoef)

    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV with multiple scoring metrics
    grid_search = GridSearchCV(svm, param_grid, cv=skf, scoring=scoring, refit='f1', verbose=1, return_train_score=True)

    # Fit the GridSearchCV on training data
    grid_search.fit(Z, dataLabelsList)

    # Get the best parameters and the best score for f1
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score}")

    outFile = f"{combination_label}-params-svm-{n_splits}-folds.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write("n_neighbors,metric,weights,accuracy,precision,recall,f1,preparationTime\n")
        for idx, param in enumerate(grid_search.cv_results_['params']):
            accuracy = grid_search.cv_results_['mean_test_accuracy'][idx]
            precision = grid_search.cv_results_['mean_test_precision'][idx]
            recall = grid_search.cv_results_['mean_test_recall'][idx]
            f1 = grid_search.cv_results_['mean_test_f1'][idx]
            mcc = grid_search.cv_results_['mean_test_mcc'][idx]

            
            preparationTime = (time.perf_counter() - v0) / len(dataLabelsList)  
            fo.write(f"{param['C']},{param['kernel']},{accuracy},{precision},{recall},{f1},{mcc},{preparationTime}\n")

    return best_params, best_score

# Run SVM on equal combination
print("\nStarting SVM analysis for flaky vs smaller non-flaky files (equal combination)...")
best_params_5folds_svm_equal, best_score_5folds_svm_equal = runSVM(Z_equal, dataLabelsListEqual, outDirEqual, 5, "equal")
best_params_3folds_svm_equal, best_score_3folds_svm_equal = runSVM(Z_equal, dataLabelsListEqual, outDirEqual, 3, "equal")

# Display results
print("\nBest results for SVM 5-fold on equal combination:")
print(f"Best Parameters: {best_params_5folds_svm_equal}")
print(f"Best F1 Score: {best_score_5folds_svm_equal}")

print("\nBest results for SVM 3-fold on equal combination:")
print(f"Best Parameters: {best_params_3folds_svm_equal}")
print(f"Best F1 Score: {best_score_3folds_svm_equal}")

# Run SVM on larger non-flaky combination
print("\nStarting SVM analysis for flaky vs larger non-flaky files (larger combination)...")
best_params_5folds_svm_larger, best_score_5folds_svm_larger = runSVM(Z_larger, dataLabelsListLarger, outDirLarger, 5, "larger")
best_params_3folds_svm_larger, best_score_3folds_svm_larger = runSVM(Z_larger, dataLabelsListLarger, outDirLarger, 3, "larger")

# Display results
print("\nBest results for SVM 5-fold on larger combination:")
print(f"Best Parameters: {best_params_5folds_svm_larger}")
print(f"Best F1 Score: {best_score_5folds_svm_larger}")

print("\nBest results for SVM 3-fold on larger combination:")
print(f"Best Parameters: {best_params_3folds_svm_larger}")
print(f"Best F1 Score: {best_score_3folds_svm_larger}")



Starting SVM analysis for flaky vs smaller non-flaky files (equal combination)...
************SAHPE od DATA: (90, 7563)
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'C': 100.0, 'kernel': 'rbf'}
Best F1 Score: 0.7834279325765394
************SAHPE od DATA: (90, 7563)
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'C': 100.0, 'kernel': 'rbf'}
Best F1 Score: 0.7422003284072249

Best results for SVM 5-fold on equal combination:
Best Parameters: {'C': 100.0, 'kernel': 'rbf'}
Best F1 Score: 0.7834279325765394

Best results for SVM 3-fold on equal combination:
Best Parameters: {'C': 100.0, 'kernel': 'rbf'}
Best F1 Score: 0.7422003284072249

Starting SVM analysis for flaky vs larger non-flaky files (larger combination)...
************SAHPE od DATA: (299, 11986)
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'C': 0.01, 'kernel': 'linear'}
Best F1 Score: 0.5901648351648352
************SAHPE od DAT

## Naive Bayes

In [40]:
from sklearn.naive_bayes import MultinomialNB


def runNB(Z, dataLabelsList, outDir, n_splits, combination_label):
    v0 = time.perf_counter()
    print("************SAHPE od DATA:", Z.shape)

    # Define the NB model
    nb = MultinomialNB()

    # Parameter grid for hyperparameter tuning
    param_grid = {
        'alpha': [0.001, 0.01, 0.1, 1.0, 10.0], 
    }
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': make_scorer(matthews_corrcoef)

    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV with multiple scoring metrics
    grid_search = GridSearchCV(nb, param_grid, cv=skf, scoring=scoring, refit='f1', verbose=1, return_train_score=True)

    # Fit the GridSearchCV on training data
    grid_search.fit(Z, dataLabelsList)

    # Get the best parameters and the best score for f1
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score}")

    # Save the results
    outFile = f"{combination_label}-params-nb-{n_splits}-folds.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write("alpha,accuracy,precision,recall,f1,preparationTime\n")
        for idx, param in enumerate(grid_search.cv_results_['params']):
            accuracy = grid_search.cv_results_['mean_test_accuracy'][idx]
            precision = grid_search.cv_results_['mean_test_precision'][idx]
            recall = grid_search.cv_results_['mean_test_recall'][idx]
            f1 = grid_search.cv_results_['mean_test_f1'][idx]
            mcc = grid_search.cv_results_['mean_test_mcc'][idx]

            preparationTime = (time.perf_counter() - v0) / len(dataLabelsList)  
            fo.write(f"{param['alpha']},{accuracy},{precision},{recall},{f1},{mcc},{preparationTime}\n")

    print(f"Naive Bayes analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    return best_params, best_score

# Run NB on equal combination
print("\nStarting NB analysis for flaky vs smaller non-flaky files (equal combination)...")
best_params_5folds_nb_equal, best_score_5folds_nb_equal = runNB(Z_equal, dataLabelsListEqual, outDirEqual, 5, "equal")
best_params_3folds_nb_equal, best_score_3folds_nb_equal = runNB(Z_equal, dataLabelsListEqual, outDirEqual, 3, "equal")

# Display results
print("\nBest results for NB 5-fold on equal combination:")
print(f"Best Parameters: {best_params_5folds_nb_equal}")
print(f"Best F1 Score: {best_score_5folds_nb_equal}")

print("\nBest results for NB 3-fold on equal combination:")
print(f"Best Parameters: {best_params_3folds_nb_equal}")
print(f"Best F1 Score: {best_score_3folds_nb_equal}")

# Run NB on larger non-flaky combination
print("\nStarting NB analysis for flaky vs larger non-flaky files (larger combination)...")
best_params_5folds_nb_larger, best_score_5folds_nb_larger = runNB(Z_larger, dataLabelsListLarger, outDirLarger, 5, "larger")
best_params_3folds_nb_larger, best_score_3folds_nb_larger = runNB(Z_larger, dataLabelsListLarger, outDirLarger, 3, "larger")

# Display results
print("\nBest results for NB 5-fold on larger combination:")
print(f"Best Parameters: {best_params_5folds_nb_larger}")
print(f"Best F1 Score: {best_score_5folds_nb_larger}")

print("\nBest results for NB 3-fold on larger combination:")
print(f"Best Parameters: {best_params_3folds_nb_larger}")
print(f"Best F1 Score: {best_score_3folds_nb_larger}")




Starting NB analysis for flaky vs smaller non-flaky files (equal combination)...
************SAHPE od DATA: (90, 7563)
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Parameters: {'alpha': 0.01}
Best F1 Score: 0.7342533936651584
Naive Bayes analysis completed for 5-folds. Results saved to: equal-params-nb-5-folds.csv
************SAHPE od DATA: (90, 7563)
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Parameters: {'alpha': 0.001}
Best F1 Score: 0.7428735632183908
Naive Bayes analysis completed for 3-folds. Results saved to: equal-params-nb-3-folds.csv

Best results for NB 5-fold on equal combination:
Best Parameters: {'alpha': 0.01}
Best F1 Score: 0.7342533936651584

Best results for NB 3-fold on equal combination:
Best Parameters: {'alpha': 0.001}
Best F1 Score: 0.7428735632183908

Starting NB analysis for flaky vs larger non-flaky files (larger combination)...
************SAHPE od DATA: (299, 11986)
Fitting 5 folds for each of 5 candidates, totallin

## XGB

In [41]:
from xgboost import XGBClassifier

def runXGB(Z, dataLabelsList, outDir, n_splits, combination_label):
    v0 = time.perf_counter()
    print("************SAHPE od DATA:", Z.shape)

    # Define XGBoost model
    xgb_model = XGBClassifier(eval_metric="logloss")

    # Define parameter grid for hyperparameter tuning
    param_grid = {
        'eta': [0.01, 0.1, 0.3, 0.5],  # Learning rate
        'max_depth': [3, 5, 7, 10],    # Tree depth
        'n_estimators': [50, 100, 200, 300],  # Number of boosting rounds
    }

    # Custom scoring functions (without MCC)
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),  # Handle undefined precision
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),  # Handle undefined F1 score
        'mcc': make_scorer(matthews_corrcoef)

    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV with multiple scoring metrics
    grid_search = GridSearchCV(
        xgb_model, param_grid, cv=skf, scoring=scoring,
        refit='f1', verbose=1, return_train_score=True
    )

    # Fit the GridSearchCV on training data
    grid_search.fit(Z, dataLabelsList)

    # Get the best parameters and the best score for F1
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score}")

    # Save the results
    outFile = f"{combination_label}-params-xgb-{n_splits}-folds.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write("eta,max_depth,n_estimators,accuracy,precision,recall,f1,preparationTime\n")
        for idx, param in enumerate(grid_search.cv_results_['params']):
            accuracy = grid_search.cv_results_['mean_test_accuracy'][idx]
            precision = grid_search.cv_results_['mean_test_precision'][idx]
            recall = grid_search.cv_results_['mean_test_recall'][idx]
            f1 = grid_search.cv_results_['mean_test_f1'][idx]
            mcc = grid_search.cv_results_['mean_test_mcc'][idx]

            preparationTime = (time.perf_counter() - v0) / len(dataLabelsList)  
            fo.write(f"{param['eta']},{param['max_depth']},{param['n_estimators']},{accuracy},{precision},{recall},{f1},{mcc},{preparationTime}\n")

    print(f"XGBoost analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    return best_params, best_score

# Run XGBoost on equal combination
print("\nStarting XGBoost analysis for flaky vs smaller non-flaky files (equal combination)...")
best_params_5folds_xgb_equal, best_score_5folds_xgb_equal = runXGB(Z_equal, dataLabelsListEqual, outDirEqual, 5, "equal")
best_params_3folds_xgb_equal, best_score_3folds_xgb_equal = runXGB(Z_equal, dataLabelsListEqual, outDirEqual, 3, "equal")

# Display results
print("\nBest results for XGBoost 5-fold on equal combination:")
print(f"Best Parameters: {best_params_5folds_xgb_equal}")
print(f"Best F1 Score: {best_score_5folds_xgb_equal}")

print("\nBest results for XGBoost 3-fold on equal combination:")
print(f"Best Parameters: {best_params_3folds_xgb_equal}")
print(f"Best F1 Score: {best_score_3folds_xgb_equal}")

# Run XGBoost on larger non-flaky combination
print("\nStarting XGBoost analysis for flaky vs larger non-flaky files (larger combination)...")
best_params_5folds_xgb_larger, best_score_5folds_xgb_larger = runXGB(Z_larger, dataLabelsListLarger, outDirLarger, 5, "larger")
best_params_3folds_xgb_larger, best_score_3folds_xgb_larger = runXGB(Z_larger, dataLabelsListLarger, outDirLarger, 3, "larger")

# Display results
print("\nBest results for XGBoost 5-fold on larger combination:")
print(f"Best Parameters: {best_params_5folds_xgb_larger}")
print(f"Best F1 Score: {best_score_5folds_xgb_larger}")

print("\nBest results for XGBoost 3-fold on larger combination:")
print(f"Best Parameters: {best_params_3folds_xgb_larger}")
print(f"Best F1 Score: {best_score_3folds_xgb_larger}")



Starting XGBoost analysis for flaky vs smaller non-flaky files (equal combination)...
************SAHPE od DATA: (90, 7563)
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Best Parameters: {'eta': 0.01, 'max_depth': 3, 'n_estimators': 50}
Best F1 Score: 0.8720367094206104
XGBoost analysis completed for 5-folds. Results saved to: equal-params-xgb-5-folds.csv
************SAHPE od DATA: (90, 7563)
Fitting 3 folds for each of 64 candidates, totalling 192 fits
Best Parameters: {'eta': 0.01, 'max_depth': 3, 'n_estimators': 200}
Best F1 Score: 0.8780172413793105
XGBoost analysis completed for 3-folds. Results saved to: equal-params-xgb-3-folds.csv

Best results for XGBoost 5-fold on equal combination:
Best Parameters: {'eta': 0.01, 'max_depth': 3, 'n_estimators': 50}
Best F1 Score: 0.8720367094206104

Best results for XGBoost 3-fold on equal combination:
Best Parameters: {'eta': 0.01, 'max_depth': 3, 'n_estimators': 200}
Best F1 Score: 0.8780172413793105

Starting XGBoost analy

## Random Forest

In [42]:
from sklearn.ensemble import RandomForestClassifier

def runRF(Z, dataLabelsList, outDir, n_splits, combination_label):
    v0 = time.perf_counter()
    print("************SAHPE od DATA:", Z.shape)

    # Define the Random Forest model
    rf_model = RandomForestClassifier()

    # Define parameter grid for hyperparameter tuning
    param_grid = {
        'n_estimators': [50, 100, 200, 300],  # Number of trees in the forest
        'max_depth': [3, 5, 7, 10, None],  # Maximum depth of the tree
        'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
        'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
    }

    # Custom scoring functions (without MCC)
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': make_scorer(matthews_corrcoef)

    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV with multiple scoring metrics
    grid_search = GridSearchCV(
        rf_model, param_grid, cv=skf, scoring=scoring,
        refit='f1', verbose=1, return_train_score=True
    )

    # Fit the GridSearchCV on training data
    grid_search.fit(Z, dataLabelsList)

    # Get the best parameters and the best score for F1
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score}")

    # Save the results
    outFile = f"{combination_label}-params-rf-{n_splits}-folds.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write("n_estimators,max_depth,min_samples_split,min_samples_leaf,accuracy,precision,recall,f1,preparationTime\n")
        for idx, param in enumerate(grid_search.cv_results_['params']):
            accuracy = grid_search.cv_results_['mean_test_accuracy'][idx]
            precision = grid_search.cv_results_['mean_test_precision'][idx]
            recall = grid_search.cv_results_['mean_test_recall'][idx]
            f1 = grid_search.cv_results_['mean_test_f1'][idx]
            mcc = grid_search.cv_results_['mean_test_mcc'][idx]

            preparationTime = (time.perf_counter() - v0) / len(dataLabelsList)  
            fo.write(f"{param['n_estimators']},{param['max_depth']},{param['min_samples_split']},{param['min_samples_leaf']},{param['criterion']},{accuracy},{precision},{recall},{f1},{mcc},{preparationTime}\n")

    print(f"Random Forest analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    return best_params, best_score

# Run Random Forest on equal combination
print("\nStarting Random Forest analysis for flaky vs smaller non-flaky files (equal combination)...")
best_params_5folds_rf_equal, best_score_5folds_rf_equal = runRF(Z_equal, dataLabelsListEqual, outDirEqual, 5, "equal")
best_params_3folds_rf_equal, best_score_3folds_rf_equal = runRF(Z_equal, dataLabelsListEqual, outDirEqual, 3, "equal")

# Display results
print("\nBest results for Random Forest 5-fold on equal combination:")
print(f"Best Parameters: {best_params_5folds_rf_equal}")
print(f"Best F1 Score: {best_score_5folds_rf_equal}")

print("\nBest results for Random Forest 3-fold on equal combination:")
print(f"Best Parameters: {best_params_3folds_rf_equal}")
print(f"Best F1 Score: {best_score_3folds_rf_equal}")

# Run Random Forest on larger non-flaky combination
print("\nStarting Random Forest analysis for flaky vs larger non-flaky files (larger combination)...")
best_params_5folds_rf_larger, best_score_5folds_rf_larger = runRF(Z_larger, dataLabelsListLarger, outDirLarger, 5, "larger")
best_params_3folds_rf_larger, best_score_3folds_rf_larger = runRF(Z_larger, dataLabelsListLarger, outDirLarger, 3, "larger")

# Display results
print("\nBest results for Random Forest 5-fold on larger combination:")
print(f"Best Parameters: {best_params_5folds_rf_larger}")
print(f"Best F1 Score: {best_score_5folds_rf_larger}")

print("\nBest results for Random Forest 3-fold on larger combination:")
print(f"Best Parameters: {best_params_3folds_rf_larger}")
print(f"Best F1 Score: {best_score_3folds_rf_larger}")



Starting Random Forest analysis for flaky vs smaller non-flaky files (equal combination)...
************SAHPE od DATA: (90, 7563)
Fitting 5 folds for each of 180 candidates, totalling 900 fits
Best Parameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 50}
Best F1 Score: 0.8860484544695071


  _data = np.array(data, dtype=dtype, copy=copy,


KeyError: 'criterion'

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

def runDT(Z, dataLabelsList, outDir, n_splits, combination_label):
    v0 = time.perf_counter()
    print("************SAHPE od DATA:", Z.shape)

    # Define the Decision Tree model
    dt_model = DecisionTreeClassifier()

    # Define parameter grid for hyperparameter tuning
    param_grid = {
        'criterion': ['gini', 'entropy'],  # Function to measure the quality of a split
        'max_depth': [None, 10, 30, 50, 100, 300, 500],  # Maximum depth of each tree
        'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
        'min_samples_leaf': [1, 2, 5, 10],  # Minimum number of samples required to be at a leaf node
        'max_features': [None, 'sqrt', 'log2'],  # Number of features to consider when looking for the best split
    }

    # Custom scoring functions (without MCC)
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),  
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': make_scorer(matthews_corrcoef)

    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV with multiple scoring metrics
    grid_search = GridSearchCV(
        dt_model, param_grid, cv=skf, scoring=scoring,
        refit='f1', verbose=1, return_train_score=True
    )

    # Fit the GridSearchCV on training data
    grid_search.fit(Z, dataLabelsList)

    # Get the best parameters and the best score for F1
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score}")

    # Save the results
    outFile = f"{combination_label}-params-dt-{n_splits}-folds.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write("criterion,max_depth,min_samples_split,min_samples_leaf,max_features,accuracy,precision,recall,f1,preparationTime\n")
        for idx, param in enumerate(grid_search.cv_results_['params']):
            accuracy = grid_search.cv_results_['mean_test_accuracy'][idx]
            precision = grid_search.cv_results_['mean_test_precision'][idx]
            recall = grid_search.cv_results_['mean_test_recall'][idx]
            f1 = grid_search.cv_results_['mean_test_f1'][idx]
            mcc = grid_search.cv_results_['mean_test_mcc'][idx]

            preparationTime = (time.perf_counter() - v0) / len(dataLabelsList)  
            fo.write(f"{param['criterion']},{param['max_depth']},{param['min_samples_split']},{param['min_samples_leaf']},{param['max_features']},{accuracy},{precision},{recall},{f1},{mcc},{preparationTime}\n")

    print(f"Decision Tree analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    return best_params, best_score

# Run Decision Tree on equal combination
print("\nStarting Decision Tree analysis for flaky vs smaller non-flaky files (equal combination)...")
best_params_5folds_dt_equal, best_score_5folds_dt_equal = runDT(Z_equal, dataLabelsListEqual, outDirEqual, 5, "equal")
best_params_3folds_dt_equal, best_score_3folds_dt_equal = runDT(Z_equal, dataLabelsListEqual, outDirEqual, 3, "equal")

# Display results
print("\nBest results for Decision Tree 5-fold on equal combination:")
print(f"Best Parameters: {best_params_5folds_dt_equal}")
print(f"Best F1 Score: {best_score_5folds_dt_equal}")

print("\nBest results for Decision Tree 3-fold on equal combination:")
print(f"Best Parameters: {best_params_3folds_dt_equal}")
print(f"Best F1 Score: {best_score_3folds_dt_equal}")

# Run Decision Tree on larger non-flaky combination
print("\nStarting Decision Tree analysis for flaky vs larger non-flaky files (larger combination)...")
best_params_5folds_dt_larger, best_score_5folds_dt_larger = runDT(Z_larger, dataLabelsListLarger, outDirLarger, 5, "larger")
best_params_3folds_dt_larger, best_score_3folds_dt_larger = runDT(Z_larger, dataLabelsListLarger, outDirLarger, 3, "larger")

# Display results
print("\nBest results for Decision Tree 5-fold on larger combination:")
print(f"Best Parameters: {best_params_5folds_dt_larger}")
print(f"Best F1 Score: {best_score_5folds_dt_larger}")

print("\nBest results for Decision Tree 3-fold on larger combination:")
print(f"Best Parameters: {best_params_3folds_dt_larger}")
print(f"Best F1 Score: {best_score_3folds_dt_larger}")
