In [3]:
import os
import time
import zipfile
import numpy as np
import math


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score,matthews_corrcoef, make_scorer
from sklearn.model_selection import StratifiedKFold



from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):  # Only consider Python files
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")
    
    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")
    
    return dataPointsList

def flastVectorization(dataPoints):
    """Performs vectorization using CountVectorizer."""
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z = countVec.fit_transform(dataPoints)
    return Z



###############################################################################
# Data Extraction and Vectorization

# Parameters setup
flakyZip = "compressedDataset/flaky_files.zip"
nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
largerNonFlakyZip = "compressedDataset/all_nonflaky_files.zip"

# Create directories
outDirEqual = "results/equal_flaky_nonflaky/"
outDirLarger = "results/larger_nonflaky/"
os.makedirs(outDirEqual, exist_ok=True)
os.makedirs(outDirLarger, exist_ok=True)

extractDirEqual = "extracted/equal_flaky_nonflaky/"
extractDirLarger = "extracted/larger_nonflaky/"
os.makedirs(extractDirEqual, exist_ok=True)
os.makedirs(extractDirLarger, exist_ok=True)

# Extract and read data once for equal combination
flakyDirEqual = os.path.join(extractDirEqual, 'flaky')
nonFlakyDirEqual = os.path.join(extractDirEqual, 'nonFlaky')
os.makedirs(flakyDirEqual, exist_ok=True)
os.makedirs(nonFlakyDirEqual, exist_ok=True)

extract_zip(flakyZip, flakyDirEqual)
extract_zip(nonFlakyZip, nonFlakyDirEqual)

dataPointsFlakyEqual = getDataPoints(flakyDirEqual)
dataPointsNonFlakyEqual = getDataPoints(nonFlakyDirEqual)
dataPointsEqual = dataPointsFlakyEqual + dataPointsNonFlakyEqual

# Print the number of datasets for equal combination
print(f"Number of flaky documents (equal combination): {len(dataPointsFlakyEqual)}")
print(f"Number of non-flaky documents (equal combination): {len(dataPointsNonFlakyEqual)}")
print(f"Total number of documents (equal combination): {len(dataPointsEqual)}")

dataLabelsListEqual = np.array([1]*len(dataPointsFlakyEqual) + [0]*len(dataPointsNonFlakyEqual))

# Vectorize data once
Z_equal = flastVectorization(dataPointsEqual)

print("************SAHPE od DATA:", Z_equal.shape)
### After the split, PCA should happen
### MCC -> make sure way score weighted

# Extract and read data once for larger non-flaky combination
flakyDirLarger = os.path.join(extractDirLarger, 'flaky')
nonFlakyDirLarger = os.path.join(extractDirLarger, 'nonFlaky')
os.makedirs(flakyDirLarger, exist_ok=True)
os.makedirs(nonFlakyDirLarger, exist_ok=True)

extract_zip(flakyZip, flakyDirLarger)
extract_zip(largerNonFlakyZip, nonFlakyDirLarger)

dataPointsFlakyLarger = getDataPoints(flakyDirLarger)
dataPointsNonFlakyLarger = getDataPoints(nonFlakyDirLarger)
dataPointsLarger = dataPointsFlakyLarger + dataPointsNonFlakyLarger

# Print the number of datasets for larger combination
print(f"Number of flaky documents (larger combination): {len(dataPointsFlakyLarger)}")
print(f"Number of non-flaky documents (larger combination): {len(dataPointsNonFlakyLarger)}")
print(f"Total number of documents (larger combination): {len(dataPointsLarger)}")

dataLabelsListLarger = np.array([1]*len(dataPointsFlakyLarger) + [0]*len(dataPointsNonFlakyLarger))

Z_larger = flastVectorization(dataPointsLarger)


Number of flaky documents (equal combination): 44
Number of non-flaky documents (equal combination): 43
Total number of documents (equal combination): 87
************SAHPE od DATA: (87, 7506)
Number of flaky documents (larger combination): 44
Number of non-flaky documents (larger combination): 254
Total number of documents (larger combination): 298


## KNN ##

In [6]:
from sklearn.neighbors import KNeighborsClassifier



### training performance -> 
### return performance for trainning?

def mcc_scorer(estimator, X, y_true):
    """
    Custom scorer function for Matthews Correlation Coefficient.
    """
    y_pred = estimator.predict(X)
    return matthews_corrcoef(y_true, y_pred)

def runKNN(Z, dataLabelsList, outDir, n_splits, combination_label):
    v0 = time.perf_counter()
    
    # Define the pipeline with PCA and KNN
    pipeline = Pipeline([
        ('pca', PCA()),  
        ('knn', KNeighborsClassifier())
    ])
    
    
    dataset_length = Z.shape[0]
    
    print ('Data length',dataset_length)
    print([math.floor(i*0.08*dataset_length) for i in range(7,10)])
    
    
    # Parameter grid for hyperparameter tuning
    if combination_label =="equal":
         param_grid = {
        
            'pca__n_components': [50,60,65],  # Variance ratios
            'knn__n_neighbors': [3, 5, 7, 9],
            'knn__metric': ['cosine', 'euclidean'],
            'knn__weights': ['uniform', 'distance'],
            }
        
    else:
    
        param_grid = {
        
            'pca__n_components': [180,200,220],  # Variance ratios
            'knn__n_neighbors': [3, 5, 7, 9],
            'knn__metric': ['cosine', 'euclidean'],
            'knn__weights': ['uniform', 'distance'],
            }
    
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),  
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': mcc_scorer, #MCC score custom function
    }
    print(type(Z))

    
    
    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Perform GridSearchCV with the pipeline
    grid_search = GridSearchCV(
        pipeline, 
        param_grid, 
        cv=skf, 
        scoring=scoring, 
        refit='f1', 
        verbose=1, 
        return_train_score=True
    )
    
    # Fit the GridSearchCV on data
    grid_search.fit(Z, dataLabelsList)
    
    
    # Get the best parameters and the best score for f1
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    
    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score}")
    
    # Save the results
    outFile = f"{combination_label}-params-knn-{n_splits}-folds.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write("pca_n_components,n_neighbors,metric,weights,accuracy,precision,recall,f1,mcc,preparationTime\n")
        for idx, param in enumerate(grid_search.cv_results_['params']):
            accuracy = grid_search.cv_results_['mean_test_accuracy'][idx]
            precision = grid_search.cv_results_['mean_test_precision'][idx]
            recall = grid_search.cv_results_['mean_test_recall'][idx]
            f1 = grid_search.cv_results_['mean_test_f1'][idx]
            mcc = grid_search.cv_results_['mean_test_mcc'][idx]
            preparationTime = (time.perf_counter() - v0) / len(dataLabelsList)  
            fo.write(f"{param['pca__n_components']},{param['knn__n_neighbors']},{param['knn__metric']},{param['knn__weights']},{accuracy},{precision},{recall},{f1},{mcc},{preparationTime}\n")
    
    print(f"KNN analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    return best_params, best_score

# Run KNN on equal combination
print("\nStarting KNN analysis for flaky vs smaller non-flaky files (equal combination)...")
best_params_5folds_knn_equal, best_score_5folds_knn_equal = runKNN(Z_equal, dataLabelsListEqual, outDirEqual, 5, "equal")

# Display results
#print("\nBest results for KNN 5-fold on equal combination:")
#print(f"Best Parameters: {best_params_5folds_knn_equal}")
#print(f"Best F1 Score: {best_score_5folds_knn_equal}")



# Run KNN on larger non-flaky combination
print("\nStarting KNN analysis for flaky vs larger non-flaky files (larger combination)...")
best_params_5folds_knn_larger, best_score_5folds_knn_larger = runKNN(Z_larger, dataLabelsListLarger, outDirLarger, 5, "larger")

# Display results
print("\nBest results for KNN 5-fold on larger combination:")
print(f"Best Parameters: {best_params_5folds_knn_larger}")
print(f"Best F1 Score: {best_score_5folds_knn_larger}")




Starting KNN analysis for flaky vs smaller non-flaky files (equal combination)...
Data length 87
[48, 55, 62]
<class 'scipy.sparse._csr.csr_matrix'>
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'knn__metric': 'cosine', 'knn__n_neighbors': 3, 'knn__weights': 'distance', 'pca__n_components': 50}
Best F1 Score: 0.7242533936651584
KNN analysis completed for 5-folds. Results saved to: equal-params-knn-5-folds.csv

Starting KNN analysis for flaky vs larger non-flaky files (larger combination)...
Data length 298
[166, 190, 214]
<class 'scipy.sparse._csr.csr_matrix'>
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'knn__metric': 'cosine', 'knn__n_neighbors': 3, 'knn__weights': 'distance', 'pca__n_components': 180}
Best F1 Score: 0.5956043956043956
KNN analysis completed for 5-folds. Results saved to: larger-params-knn-5-folds.csv

Best results for KNN 5-fold on larger combination:
Best Parameters: {'knn__metric': 'cosine', 'kn

## SVM

In [10]:
from sklearn.svm import SVC

def mcc_scorer(estimator, X, y_true):
    """
    Custom scorer function for Matthews Correlation Coefficient.
    """
    y_pred = estimator.predict(X)
    return matthews_corrcoef(y_true, y_pred)

def runSVM(Z, dataLabelsList, outDir, n_splits, combination_label):
    v0 = time.perf_counter()
    
    # Define the pipeline with PCA and SVM
    pipeline = Pipeline([
        ('pca', PCA()),
        ('svm', SVC())
    ])
    
    dataset_length = Z.shape[0]
    print('Data length', dataset_length)
    
    if combination_label =="equal":
         param_grid = {
        
            'pca__n_components': [50,60,65],  # Variance ratios
            'svm__C': [0.01, 0.1, 1.0, 10.0, 100.0],  # Regularization parameter
            'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # Kernel types}
         }
             
    else:
        # Parameter grid for hyperparameter tuning
        param_grid = {
            'pca__n_components': [180,200,220],
            'svm__C': [0.01, 0.1, 1.0, 10.0, 100.0],  # Regularization parameter
            'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # Kernel types
        }
    
    # Custom scoring functions including MCC
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': mcc_scorer,  # MCC score custom function
    }
    
    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Perform GridSearchCV with the pipeline
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=skf,
        scoring=scoring,
        refit='f1',
        verbose=1,
        return_train_score=True
    )
    
    # Fit the GridSearchCV on data
    grid_search.fit(Z, dataLabelsList)
    
    # Get the best parameters and the best score for f1
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    
    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score}")
    
    # Save the results
    outFile = f"{combination_label}-params-svm-{n_splits}-folds.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write("pca_n_components,C,kernel,accuracy,precision,recall,f1,mcc,preparationTime\n")
        for idx, param in enumerate(grid_search.cv_results_['params']):
            accuracy = grid_search.cv_results_['mean_test_accuracy'][idx]
            precision = grid_search.cv_results_['mean_test_precision'][idx]
            recall = grid_search.cv_results_['mean_test_recall'][idx]
            f1 = grid_search.cv_results_['mean_test_f1'][idx]
            mcc = grid_search.cv_results_['mean_test_mcc'][idx]
            preparationTime = (time.perf_counter() - v0) / len(dataLabelsList)
            fo.write(f"{param['pca__n_components']},{param['svm__C']},{param['svm__kernel']},{accuracy},{precision},{recall},{f1},{mcc},{preparationTime}\n")
    
    print(f"SVM analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    return best_params, best_score

# Run SVM on equal combination
print("\nStarting SVM analysis for flaky vs smaller non-flaky files (equal combination)...")
best_params_5folds_svm_equal, best_score_5folds_svm_equal = runSVM(Z_equal, dataLabelsListEqual, outDirEqual, 5, "equal")

# Display results
#print("\nBest results for SVM 5-fold on equal combination:")
#print(f"Best Parameters: {best_params_5folds_svm_equal}")
#print(f"Best F1 Score: {best_score_5folds_svm_equal}")


# Run SVM on larger non-flaky combination
print("\nStarting SVM analysis for flaky vs larger non-flaky files (larger combination)...")
best_params_5folds_svm_larger, best_score_5folds_svm_larger = runSVM(Z_larger, dataLabelsListLarger, outDirLarger, 5, "larger")

# Display results
print("\nBest results for SVM 5-fold on larger combination:")
print(f"Best Parameters: {best_params_5folds_svm_larger}")
print(f"Best F1 Score: {best_score_5folds_svm_larger}")





Starting SVM analysis for flaky vs smaller non-flaky files (equal combination)...
Data length 87
Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Parameters: {'pca__n_components': 65, 'svm__C': 0.1, 'svm__kernel': 'linear'}
Best F1 Score: 0.8446324143692564
SVM analysis completed for 5-folds. Results saved to: equal-params-svm-5-folds.csv

Starting SVM analysis for flaky vs larger non-flaky files (larger combination)...
Data length 298
Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Parameters: {'pca__n_components': 220, 'svm__C': 0.1, 'svm__kernel': 'linear'}
Best F1 Score: 0.7073099415204678
SVM analysis completed for 5-folds. Results saved to: larger-params-svm-5-folds.csv

Best results for SVM 5-fold on larger combination:
Best Parameters: {'pca__n_components': 220, 'svm__C': 0.1, 'svm__kernel': 'linear'}
Best F1 Score: 0.7073099415204678


## XGB

In [19]:
from xgboost import XGBClassifier


def mcc_scorer(estimator, X, y_true):
    """
    Custom scorer function for Matthews Correlation Coefficient.
    """
    y_pred = estimator.predict(X)
    return matthews_corrcoef(y_true, y_pred)

def runXGB(Z, dataLabelsList, outDir, n_splits, combination_label):
    v0 = time.perf_counter()

    # Define the pipeline with PCA and XGBoost
    pipeline = Pipeline([
        ('pca', PCA()),
        ('xgb', XGBClassifier(eval_metric="logloss", use_label_encoder=False))
    ])

    dataset_length = Z.shape[0]
    print('Data length', dataset_length)
    print([math.floor(i * 0.08 * dataset_length) for i in range(7, 10)])

    # Parameter grid for hyperparameter tuning
        
    '''
       if combination_label =="equal":
             param_grid = {
        
                'pca__n_components': [50,60,65],  # Variance ratios
                'xgb__learning_rate': [0.01, 0.1, 0.3, 0.5],  # Learning rate
                'xgb__max_depth': [3, 5, 7, 10],              # Tree depth
                'xgb__n_estimators': [50, 100, 200],     # Number of boosting rounds
            }

    else:
        '''
    param_grid = {
           # 'pca__n_components': [180,200,220],
            'xgb__learning_rate': [0.01, 0.1, 0.3, 0.5],  # Learning rate
            'xgb__max_depth': [3, 5, 7, 10],              # Tree depth
            'xgb__n_estimators': [50, 100, 200],     # Number of boosting rounds
        }

    
    
    # Scoring metrics including MCC
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': mcc_scorer,  # MCC score custom function
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV with the pipeline
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=skf,
        scoring=scoring,
        refit='f1',
        verbose=1,
        return_train_score=True
    )

    # Fit the GridSearchCV on data
    grid_search.fit(Z, dataLabelsList)

    # Get the best parameters and the best score for F1
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score}")

    # Save the results
    outFile = f"{combination_label}-params-xgb-{n_splits}-folds.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write("pca_n_components,learning_rate,max_depth,n_estimators,accuracy,precision,recall,f1,mcc,preparationTime\n")
        for idx, param in enumerate(grid_search.cv_results_['params']):
            accuracy = grid_search.cv_results_['mean_test_accuracy'][idx]
            precision = grid_search.cv_results_['mean_test_precision'][idx]
            recall = grid_search.cv_results_['mean_test_recall'][idx]
            f1 = grid_search.cv_results_['mean_test_f1'][idx]
            mcc = grid_search.cv_results_['mean_test_mcc'][idx]
            preparationTime = (time.perf_counter() - v0) / len(dataLabelsList)
            fo.write(f"{param['xgb__learning_rate']},{param['xgb__max_depth']},{param['xgb__n_estimators']},{accuracy},{precision},{recall},{f1},{mcc},{preparationTime}\n")

    print(f"XGBoost analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    return best_params, best_score



# Run XGBoost on larger non-flaky combination
print("\nStarting XGBoost analysis for flaky vs larger non-flaky files (larger combination)...")
best_params_5folds_xgb_larger, best_score_5folds_xgb_larger = runXGB(Z_larger, dataLabelsListLarger, outDirLarger, 5, "larger")
best_params_5folds_xgb_equal, best_score_5folds_xgb_equal = runXGB(Z_equal, dataLabelsListLarger, outDirLarger, 5, "equal")

# Display results
print("\nBest results for XGBoost 5-fold on larger combination:")
print(f"Best Parameters: {best_params_5folds_xgb_larger}")
print(f"Best F1 Score: {best_score_5folds_xgb_larger}")




Starting XGBoost analysis for flaky vs larger non-flaky files (larger combination)...
Data length 298
[166, 190, 214]
Fitting 5 folds for each of 48 candidates, totalling 240 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Best Parameters: {'xgb__learning_rate': 0.3, 'xgb__max_depth': 5, 'xgb__n_estimators': 100}
Best F1 Score: 0.48640248640248646
XGBoost analysis completed for 5-folds. Results saved to: larger-params-xgb-5-folds.csv
Data length 87
[48, 55, 62]


ValueError: Found input variables with inconsistent numbers of samples: [87, 298]

In [12]:
def flastVectorization(dataPoints, dim=0, eps=0.3):
    countVec = CountVectorizer()
    Z_full = countVec.fit_transform(dataPoints)
    if eps == 0:
        Z = Z_full
    else:
        if dim <= 0:
            dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
        srp = SparseRandomProjection(n_components=dim)
        Z = srp.fit_transform(Z_full)
    return Z

## Random Forest

In [8]:

from sklearn.ensemble import RandomForestClassifier

def mcc_scorer(estimator, X, y_true):
    """
    Custom scorer function for Matthews Correlation Coefficient.
    """
    y_pred = estimator.predict(X)
    return matthews_corrcoef(y_true, y_pred)

def runRF(Z, dataLabelsList, outDir, n_splits, combination_label):
    v0 = time.perf_counter()

    # Define the pipeline with only Random Forest (PCA removed)
    pipeline = Pipeline([
        ('rf', RandomForestClassifier())
    ])

    dataset_length = Z.shape[0]
    print('Data length', dataset_length)

    # Parameter grid for hyperparameter tuning (PCA parameters removed)
    param_grid = {
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [10, 20, 30],
    'rf__min_samples_split': [5, 10],
    'rf__min_samples_leaf': [2, 5],
    'rf__criterion': ["gini", "entropy"],
}
    # Custom scoring functions including MCC
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': mcc_scorer  # Use the custom mcc_scorer function
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV with the pipeline
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=skf,
        scoring=scoring,
        refit='f1',
        verbose=1,
        return_train_score=True
    )

    # Fit the GridSearchCV on data
    grid_search.fit(Z, dataLabelsList)

    # Get the best parameters and the best score for F1
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score}")

    # Save the results
    outFile = f"{combination_label}-params-rf-{n_splits}-folds.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write("n_estimators,max_depth,min_samples_split,min_samples_leaf,criterion,accuracy,precision,recall,f1,mcc,preparationTime\n")
        for idx, param in enumerate(grid_search.cv_results_['params']):
            accuracy = grid_search.cv_results_['mean_test_accuracy'][idx]
            precision = grid_search.cv_results_['mean_test_precision'][idx]
            recall = grid_search.cv_results_['mean_test_recall'][idx]
            f1 = grid_search.cv_results_['mean_test_f1'][idx]
            mcc = grid_search.cv_results_['mean_test_mcc'][idx]
            preparationTime = (time.perf_counter() - v0) / len(dataLabelsList)
            fo.write(f"{param['rf__n_estimators']},{param['rf__max_depth']},{param['rf__min_samples_split']},{param['rf__min_samples_leaf']},{param['rf__criterion']},{accuracy},{precision},{recall},{f1},{mcc},{preparationTime}\n")

    print(f"Random Forest analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    return best_params, best_score

# Run Random Forest on larger non-flaky combination
print("\nStarting Random Forest analysis for flaky vs larger non-flaky files (larger combination)...")
best_params_5folds_rf_equal, best_score_5folds_rf_equal = runRF(Z_equal, dataLabelsListEqual, outDirEqual, 5, "equal")
best_params_5folds_rf_larger, best_score_5folds_rf_larger = runRF(Z_larger, dataLabelsListLarger, outDirLarger, 5, "larger")

# Display results
print("\nBest results for Random Forest 5-fold on larger combination:")
print(f"Best Parameters: {best_params_5folds_rf_larger}")
print(f"Best F1 Score: {best_score_5folds_rf_larger}")



Starting Random Forest analysis for flaky vs larger non-flaky files (larger combination)...
Data length 87
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters: {'rf__criterion': 'entropy', 'rf__max_depth': 10, 'rf__min_samples_leaf': 5, 'rf__min_samples_split': 5, 'rf__n_estimators': 200}
Best F1 Score: 0.8711111111111111
Random Forest analysis completed for 5-folds. Results saved to: equal-params-rf-5-folds.csv
Data length 298
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters: {'rf__criterion': 'entropy', 'rf__max_depth': 10, 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 5, 'rf__n_estimators': 100}
Best F1 Score: 0.775183346065699
Random Forest analysis completed for 5-folds. Results saved to: larger-params-rf-5-folds.csv

Best results for Random Forest 5-fold on larger combination:
Best Parameters: {'rf__criterion': 'entropy', 'rf__max_depth': 10, 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 5, 'rf__n_estimators': 10

## Decision Tree

In [9]:
from sklearn.tree import DecisionTreeClassifier

def mcc_scorer(estimator, X, y_true):
    """
    Custom scorer function for Matthews Correlation Coefficient.
    """
    y_pred = estimator.predict(X)
    return matthews_corrcoef(y_true, y_pred)

def runDT(Z, dataLabelsList, outDir, n_splits, combination_label):
    v0 = time.perf_counter()

    # Define the pipeline with only Decision Tree (PCA removed)
    pipeline = Pipeline([
        ('dt', DecisionTreeClassifier())
    ])

    dataset_length = Z.shape[0]
    print('Data length', dataset_length)

    # Parameter grid for hyperparameter tuning (PCA parameters removed)
    param_grid = {
    'dt__criterion': ['gini', 'entropy'],
    'dt__max_depth': [10, 20, 30],
    'dt__min_samples_split': [5, 10],
    'dt__min_samples_leaf': [2, 5],
    'dt__max_features': [None, 'sqrt', 'log2'],  }

    # Scoring metrics including MCC
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': mcc_scorer,  # MCC score custom function
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV with the pipeline
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=skf,
        scoring=scoring,
        refit='f1',
        verbose=1,
        return_train_score=True
    )

    # Fit the GridSearchCV on data
    grid_search.fit(Z, dataLabelsList)

    # Get the best parameters and the best score for F1
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score}")

    # Save the results
    outFile = f"{combination_label}-params-dt-{n_splits}-folds.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write("criterion,max_depth,min_samples_split,min_samples_leaf,max_features,accuracy,precision,recall,f1,mcc,preparationTime\n")
        for idx, param in enumerate(grid_search.cv_results_['params']):
            accuracy = grid_search.cv_results_['mean_test_accuracy'][idx]
            precision = grid_search.cv_results_['mean_test_precision'][idx]
            recall = grid_search.cv_results_['mean_test_recall'][idx]
            f1 = grid_search.cv_results_['mean_test_f1'][idx]
            mcc = grid_search.cv_results_['mean_test_mcc'][idx]
            preparationTime = (time.perf_counter() - v0) / len(dataLabelsList)
            fo.write(f"{param['dt__criterion']},{param['dt__max_depth']},{param['dt__min_samples_split']},{param['dt__min_samples_leaf']},{param['dt__max_features']},{accuracy},{precision},{recall},{f1},{mcc},{preparationTime}\n")

    print(f"Decision Tree analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    return best_params, best_score

# Run Decision Tree on larger non-flaky combination
print("\nStarting Decision Tree analysis for flaky vs larger non-flaky files (larger combination)...")
best_params_5folds_dt_equal, best_score_5folds_dt_equal = runDT(Z_equal, dataLabelsListEqual, outDirEqual, 5, "equal")

best_params_5folds_dt_larger, best_score_5folds_dt_larger = runDT(Z_larger, dataLabelsListLarger, outDirLarger, 5, "larger")

# Display results
print("\nBest results for Decision Tree 5-fold on larger combination:")
print(f"Best Parameters: {best_params_5folds_dt_larger}")
print(f"Best F1 Score: {best_score_5folds_dt_larger}")



Starting Decision Tree analysis for flaky vs larger non-flaky files (larger combination)...
Data length 87
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters: {'dt__criterion': 'gini', 'dt__max_depth': 20, 'dt__max_features': None, 'dt__min_samples_leaf': 2, 'dt__min_samples_split': 10}
Best F1 Score: 0.8397058823529411
Decision Tree analysis completed for 5-folds. Results saved to: equal-params-dt-5-folds.csv
Data length 298
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters: {'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__max_features': None, 'dt__min_samples_leaf': 5, 'dt__min_samples_split': 5}
Best F1 Score: 0.8517408906882592
Decision Tree analysis completed for 5-folds. Results saved to: larger-params-dt-5-folds.csv

Best results for Decision Tree 5-fold on larger combination:
Best Parameters: {'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__max_features': None, 'dt__min_samples_leaf': 5, 'dt__min_samples_split'

In [None]:
import os
import pandas as pd

# Function to extract the best results from the CSV files of each model
def extract_best_results(model_name, combination, fold, csv_file):
    """
    Extracts the best result from the CSV file for a model.

    Parameters:
    - model_name: The name of the model (e.g., "KNN", "SVM")
    - combination: The combination of flaky and non-flaky files (e.g., "equal", "larger")
    - fold: Number of folds (e.g., "5-fold" or "3-fold")
    - csv_file: The path to the CSV file containing the model's results

    Returns:
    A dictionary containing the best results for the model, combination, and fold.
    """
    if not os.path.exists(csv_file):
        print(f"CSV file for {model_name} ({combination}, {fold}) does not exist: {csv_file}")
        return None

    # Read the CSV file
    df = pd.read_csv(csv_file)
    
    if df.empty:
        print(f"CSV file for {model_name} ({combination}, {fold}) is empty: {csv_file}")
        return None
    
    # Get the row with the best F1 score
    best_row = df.loc[df['f1'].idxmax()]
    
    # Extract metrics
    accuracy = best_row['accuracy']
    precision = best_row['precision']
    recall = best_row['recall']
    f1 = best_row['f1']
    mcc = best_row.get('mcc', None)  # Get MCC if available
    
    # Collect parameters (exclude known metric columns)
    metric_columns = ['accuracy', 'precision', 'recall', 'f1', 'mcc', 'preparationTime']
    parameter_columns = [col for col in df.columns if col not in metric_columns]
    parameters = {col: best_row[col] for col in parameter_columns}
    
    # Create a combined model name (e.g., 'equal KNN' or 'not equal KNN')
    combination_label = 'not equal' if combination == 'larger' else 'equal'
    combined_model_name = f"{combination_label} {model_name}"
    
    # Collect the best results into a dictionary
    best_results = {
        'Model': combined_model_name,
        'Fold': fold,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
        'MCC': mcc,
        'Parameters': parameters
    }
    
    return best_results

# Function to gather and print/save the best results from all models and combinations
def gather_best_results(models_results_dirs, output_file):
    """
    Gathers the best results from all models for both combinations and writes them to a CSV file.

    Parameters:
    - models_results_dirs: Dictionary mapping combination names to their result directories.
    - output_file: Path to the output CSV file to store the best results.
    """
    # List of models and their corresponding result files for both 5-fold and 3-fold
    models = {
        'KNN': {'5-fold': 'params-knn-5-folds.csv', '3-fold': 'params-knn-3-folds.csv'},
        'SVM': {'5-fold': 'params-svm-5-folds.csv', '3-fold': 'params-svm-3-folds.csv'},
        'Naive Bayes': {'5-fold': 'params-nb-5-folds.csv', '3-fold': 'params-nb-3-folds.csv'},
        'XGBoost': {'5-fold': 'params-xgb-5-folds.csv', '3-fold': 'params-xgb-3-folds.csv'},
        'Random Forest': {'5-fold': 'params-rf-5-folds.csv', '3-fold': 'params-rf-3-folds.csv'},
        'Decision Tree': {'5-fold': 'params-dt-5-folds.csv', '3-fold': 'params-dt-3-folds.csv'}
    }

    # Initialize an empty list to store the best results from each model, combination, and fold
    best_results = []

    # Iterate over each model, fold, and combination
    for model_name, folds in models.items():
        for fold_label, csv_file in folds.items():
            for combination, results_dir in models_results_dirs.items():
                # Adjust the filename to include the combination prefix (e.g., equal-params-xgb-5-folds.csv)
                full_csv_file = f"{combination}-{csv_file}"
                full_csv_path = os.path.join(results_dir, full_csv_file)
                best_result = extract_best_results(model_name, combination, fold_label, full_csv_path)
                if best_result:
                    best_results.append(best_result)

    if not best_results:
        print(f"No best results found.")
        return

    # Convert the list of best results into a DataFrame
    best_results_df = pd.DataFrame(best_results)
    
    # Reorder columns for clarity
    columns = ['Model', 'Fold', 'Accuracy', 'Precision', 'Recall', 'F1', 'MCC', 'Parameters']
    best_results_df = best_results_df[columns]
    
    # Add sorting helper columns
    # Extract model name (e.g., 'KNN', 'SVM')
    best_results_df['Model_Name'] = best_results_df['Model'].apply(lambda x: x.split(' ', 1)[1])
    # Extract combination order (0 for 'equal', 1 for 'not equal')
    best_results_df['Combination_Order'] = best_results_df['Model'].apply(lambda x: 0 if 'equal' in x else 1)
    # Extract fold number (e.g., 5 or 3)
    best_results_df['Fold_Number'] = best_results_df['Fold'].apply(lambda x: int(x.split('-')[0]))
    
    # Sort the DataFrame
    best_results_df = best_results_df.sort_values(by=['Model_Name', 'Fold_Number', 'Combination_Order'])
    
    # Drop helper columns
    best_results_df = best_results_df.drop(columns=['Model_Name', 'Combination_Order', 'Fold_Number'])
    
    # Save the best results to the output CSV file
    best_results_df.to_csv(output_file, index=False)
    print(f"Combined best results saved to: {output_file}")
    
    # Print the best results as a table
    print("\nCombined Best Results from All Models:")
    print(best_results_df.to_string(index=False))

# Example usage
if __name__ == "__main__":
    # Directories where the model result CSV files are stored for each combination
    models_results_dirs = {
        'equal': 'results/equal_flaky_nonflaky/',
        'larger': 'results/larger_nonflaky/'  # 'larger' will be labeled as 'not equal' in output
    }

    # Path to the output CSV file where best results will be stored
    output_file = "combined_best_results.csv"

    # Gather and save the combined best results
    gather_best_results(models_results_dirs, output_file)

