In [2]:
import os
import time
import zipfile
import numpy as np
import math


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score,matthews_corrcoef, make_scorer
from sklearn.model_selection import StratifiedKFold



from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Unpack every member of the archive ``zip_file`` into ``extract_to``.

    Args:
        zip_file: path (or file-like object) of the zip archive to read.
        extract_to: destination directory handed to ``ZipFile.extractall``.
    """
    archive = zipfile.ZipFile(zip_file, mode='r')
    try:
        archive.extractall(path=extract_to)
    finally:
        # Always release the archive handle, even if extraction fails.
        archive.close()

def getDataPoints(path):
    """Collects the content of all non-empty .py files below the given directory.

    The tree is walked in sorted order (directories and file names), so the
    returned list has the same order on every machine. os.walk otherwise
    yields entries in filesystem order, which would change the order of the
    documents and therefore the shuffled cross-validation folds built from them.

    Args:
        path: root directory to search recursively.

    Returns:
        A list with the whitespace-stripped text of each non-empty .py file.
        The list is empty when ``path`` does not exist or holds no usable file;
        a message is printed in both cases, and for every empty file found.
    """
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        # Sorting in place makes os.walk descend into sub-directories in a
        # deterministic order (allowed because the walk is top-down).
        dirs.sort()
        for dataPointName in sorted(files):
            if not dataPointName.endswith(".py"):  # Only consider Python files
                continue
            file_path = os.path.join(root, dataPointName)
            with open(file_path, encoding="utf-8") as fileIn:
                dp = fileIn.read().strip()
            if dp:  # Ensure the document is not empty
                dataPointsList.append(dp)
            else:
                print(f"Empty file: {file_path}")

    if not dataPointsList:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList




###############################################################################
# Data Extraction and Vectorization

# Input archives
flakyZip = "Dataset/flaky_files.zip"
nonFlakyZip = "Dataset/reduced_nonflaky_files.zip"
imbalanceNonFlakyZip = "Dataset/imbalance_nonflaky_files.zip"

# Where the per-model result CSVs are written
outDirbalance = "results/balance_flaky_nonflaky/"
outDirimbalance = "results/imbalance_nonflaky/"

# Roots under which the archives are unpacked
extractDirbalance = "extracted/balance_flaky_nonflaky/"
extractDirimbalance = "extracted/imbalance_nonflaky/"

# Per-class folders of the balance combination
flakyDirbalance = os.path.join(extractDirbalance, 'flaky')
nonFlakyDirbalance = os.path.join(extractDirbalance, 'nonFlaky')

# Create every directory up front (no error if it already exists)
for _dirToCreate in (
    outDirbalance,
    outDirimbalance,
    extractDirbalance,
    extractDirimbalance,
    flakyDirbalance,
    nonFlakyDirbalance,
):
    os.makedirs(_dirToCreate, exist_ok=True)

# Unpack both archives of the balance combination
for _archive, _target in (
    (flakyZip, flakyDirbalance),
    (nonFlakyZip, nonFlakyDirbalance),
):
    extract_zip(_archive, _target)

# Read the documents once; flaky documents come first in the combined list
dataPointsFlakybalance = getDataPoints(flakyDirbalance)
dataPointsNonFlakybalance = getDataPoints(nonFlakyDirbalance)
dataPointsbalance = dataPointsFlakybalance + dataPointsNonFlakybalance

# Report the size of the balance combination
print(
    f"Number of flaky documents (balance combination): {len(dataPointsFlakybalance)}",
    f"Number of non-flaky documents (balance combination): {len(dataPointsNonFlakybalance)}",
    f"Total number of documents (balance combination): {len(dataPointsbalance)}",
    sep="\n",
)

# Labels aligned with dataPointsbalance: 1 for each flaky document, then 0 for each non-flaky one
dataLabelsListbalance = np.array(
    [1]*len(dataPointsFlakybalance) + [0]*len(dataPointsNonFlakybalance)
)

# Vectorize data once

### PCA should be fitted only after the train/test split
### MCC -> make sure of the way the score is weighted
def mcc_scorer(estimator, X, y_true):
    """
    Scorer callable returning the Matthews Correlation Coefficient.

    Uses the ``(estimator, X, y_true)`` signature so it can be passed directly
    in a ``scoring`` mapping: the estimator predicts on ``X`` and the
    predictions are compared with ``y_true``.
    """
    return matthews_corrcoef(y_true, estimator.predict(X))
# Imbalance combination: unpack the archives and read the documents once
flakyDirimbalance = os.path.join(extractDirimbalance, 'flaky')
nonFlakyDirimbalance = os.path.join(extractDirimbalance, 'nonFlaky')
for _classDir in (flakyDirimbalance, nonFlakyDirimbalance):
    os.makedirs(_classDir, exist_ok=True)

for _archive, _target in (
    (flakyZip, flakyDirimbalance),
    (imbalanceNonFlakyZip, nonFlakyDirimbalance),
):
    extract_zip(_archive, _target)

# Flaky documents come first in the combined list
dataPointsFlakyimbalance = getDataPoints(flakyDirimbalance)
dataPointsNonFlakyimbalance = getDataPoints(nonFlakyDirimbalance)
dataPointsimbalance = dataPointsFlakyimbalance + dataPointsNonFlakyimbalance

# Report the size of the imbalance combination
print(
    f"Number of flaky documents (imbalance combination): {len(dataPointsFlakyimbalance)}",
    f"Number of non-flaky documents (imbalance combination): {len(dataPointsNonFlakyimbalance)}",
    f"Total number of documents (imbalance combination): {len(dataPointsimbalance)}",
    sep="\n",
)


# Labels aligned with dataPointsimbalance: 1 for each flaky document, then 0 for each non-flaky one
dataLabelsListimbalance = np.array(
    [1]*len(dataPointsFlakyimbalance) + [0]*len(dataPointsNonFlakyimbalance)
)



Number of flaky documents (balance combination): 45
Number of non-flaky documents (balance combination): 45
Total number of documents (balance combination): 90
Number of flaky documents (imbalance combination): 45
Number of non-flaky documents (imbalance combination): 243
Total number of documents (imbalance combination): 288


## KNN ##

In [3]:
from sklearn.neighbors import KNeighborsClassifier


def runKNN(dataPoints, dataLabelsList, outDir, n_splits, combination_label):
    """Grid-searches a CountVectorizer -> PCA -> KNN pipeline and saves the scores.

    Args:
        dataPoints: list of raw documents (strings) to classify.
        dataLabelsList: labels aligned with ``dataPoints`` (this notebook
            passes 1 for flaky and 0 for non-flaky documents).
        outDir: existing directory that receives the result CSV.
        n_splits: number of stratified cross-validation folds.
        combination_label: "balance" selects the small PCA grid, any other
            value the large one; it is also the prefix of the CSV file name.

    Returns:
        ``(best_params, best_score)`` of the candidate with the highest mean
        cross-validated F1 score.
    """
    v0 = time.perf_counter()

    # Vectorizer and PCA are pipeline steps, so the grid search fits them
    # together with the classifier on each training split.
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(stop_words=None)),  # Vectorizer to convert text data to numerical
        ('pca', PCA(random_state=42)),
        ('knn', KNeighborsClassifier())
    ])

    dataset_length = len(dataPoints)

    print('Data length', dataset_length)
    # Informational only: floor(i * 0.08 * n) for i = 7..9; not used below.
    print([math.floor(i * 0.08 * dataset_length) for i in range(7, 10)])

    # Only the PCA sizes depend on the combination; the KNN grid is shared.
    if combination_label == "balance":
        pca_components = [50, 60, 65]
    else:
        pca_components = [150, 180, 200, 220]
    param_grid = {
        'pca__n_components': pca_components,
        'knn__n_neighbors': [3, 5, 7, 9],
        'knn__metric': ['cosine', 'euclidean'],
        'knn__weights': ['uniform', 'distance'],
    }

    # zero_division=1 scores an undefined metric as 1.
    # NOTE(review): this counts a fold with no positive predictions as
    # precision 1.0 - confirm that this is intended.
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': mcc_scorer,  # MCC score custom function
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Train scores were requested before but never read, so they are no
    # longer computed; test scores and the selected model are unaffected.
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=skf,
        scoring=scoring,
        refit='f1',
        verbose=1
    )

    # Fit the GridSearchCV on data
    grid_search.fit(dataPoints, dataLabelsList)

    # Per-document wall-clock time of the whole search. Measured once, right
    # after fitting, so every CSV row reports the same value and the time
    # spent printing and writing the file is not included.
    preparationTime = (time.perf_counter() - v0) / len(dataLabelsList)

    # Get the best parameters and the best score for f1
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score}")

    # Save the results: one row per candidate with its mean test scores
    results = grid_search.cv_results_
    outFile = f"{combination_label}-params-knn-{n_splits}-folds.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write("pca_n_components,n_neighbors,metric,weights,accuracy,precision,recall,f1,mcc,preparationTime\n")
        for idx, param in enumerate(results['params']):
            accuracy = results['mean_test_accuracy'][idx]
            precision = results['mean_test_precision'][idx]
            recall = results['mean_test_recall'][idx]
            f1 = results['mean_test_f1'][idx]
            mcc = results['mean_test_mcc'][idx]
            fo.write(f"{param['pca__n_components']},{param['knn__n_neighbors']},{param['knn__metric']},{param['knn__weights']},{accuracy},{precision},{recall},{f1},{mcc},{preparationTime}\n")

    print(f"KNN analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    return best_params, best_score

# KNN, balance combination (5 folds)
print("\nStarting KNN analysis for flaky vs smaller non-flaky files (balance combination)...")
best_params_5folds_knn_balance, best_score_5folds_knn_balance = runKNN(
    dataPoints=dataPointsbalance,
    dataLabelsList=dataLabelsListbalance,
    outDir=outDirbalance,
    n_splits=5,
    combination_label="balance",
)


# Summary of the balance run
print(
    "\nBest results for KNN 5-fold on balance combination:",
    f"Best Parameters: {best_params_5folds_knn_balance}",
    f"Best F1 Score: {best_score_5folds_knn_balance}",
    sep="\n",
)


# KNN, imbalance non-flaky combination (5 folds)
print("\nStarting KNN analysis for flaky vs imbalance non-flaky files (imbalance combination)...")
best_params_5folds_knn_imbalance, best_score_5folds_knn_imbalance = runKNN(
    dataPoints=dataPointsimbalance,
    dataLabelsList=dataLabelsListimbalance,
    outDir=outDirimbalance,
    n_splits=5,
    combination_label="imbalance",
)

# Summary of the imbalance run
print(
    "\nBest results for KNN 5-fold on imbalance combination:",
    f"Best Parameters: {best_params_5folds_knn_imbalance}",
    f"Best F1 Score: {best_score_5folds_knn_imbalance}",
    sep="\n",
)




Starting KNN analysis for flaky vs smaller non-flaky files (balance combination)...
Data length 90
[50, 57, 64]
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'knn__metric': 'euclidean', 'knn__n_neighbors': 7, 'knn__weights': 'distance', 'pca__n_components': 50}
Best F1 Score: 0.6900653594771242
KNN analysis completed for 5-folds. Results saved to: balance-params-knn-5-folds.csv

Best results for KNN 5-fold on balance combination:
Best Parameters: {'knn__metric': 'euclidean', 'knn__n_neighbors': 7, 'knn__weights': 'distance', 'pca__n_components': 50}
Best F1 Score: 0.6900653594771242

Starting KNN analysis for flaky vs imbalance non-flaky files (imbalance combination)...
Data length 288
[161, 184, 207]
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Best Parameters: {'knn__metric': 'euclidean', 'knn__n_neighbors': 3, 'knn__weights': 'distance', 'pca__n_components': 150}
Best F1 Score: 0.4971028971028971
KNN analysis completed for 5-folds.

## SVM

In [4]:
from sklearn.svm import SVC

def runSVM(dataPoints, dataLabelsList, outDir, n_splits, combination_label):
    """Grid-searches a CountVectorizer -> PCA -> SVC pipeline and saves the scores.

    Args:
        dataPoints: list of raw documents (strings) to classify.
        dataLabelsList: labels aligned with ``dataPoints`` (this notebook
            passes 1 for flaky and 0 for non-flaky documents).
        outDir: existing directory that receives the result CSV.
        n_splits: number of stratified cross-validation folds.
        combination_label: "balance" selects the small PCA grid, any other
            value the large one; it is also the prefix of the CSV file name.

    Returns:
        ``(best_params, best_score)`` of the candidate with the highest mean
        cross-validated F1 score.
    """
    v0 = time.perf_counter()

    # Vectorizer and PCA are pipeline steps, so the grid search fits them
    # together with the classifier on each training split.
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(stop_words=None)),
        ('pca', PCA(random_state=42)),
        ('svm', SVC(random_state=42))
    ])

    print('Data length', len(dataPoints))

    # Only the PCA sizes depend on the combination; the SVM grid is shared.
    if combination_label == "balance":
        pca_components = [50, 60, 65]
    else:
        pca_components = [150, 180, 200, 220]
    param_grid = {
        'pca__n_components': pca_components,
        'svm__C': [0.01, 0.1, 1.0, 10.0, 100.0],
        'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    }

    # Custom scoring functions including MCC.
    # zero_division=1 scores an undefined metric as 1.
    # NOTE(review): this counts a fold with no positive predictions as
    # precision 1.0 - confirm that this is intended.
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': mcc_scorer,
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Train scores were requested before but never read, so they are no
    # longer computed; test scores and the selected model are unaffected.
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=skf,
        scoring=scoring,
        refit='f1',
        verbose=1
    )

    # Fit the GridSearchCV on data
    grid_search.fit(dataPoints, dataLabelsList)

    # Per-document wall-clock time of the whole search. Measured once, right
    # after fitting, so every CSV row reports the same value and the time
    # spent printing and writing the file is not included.
    preparationTime = (time.perf_counter() - v0) / len(dataLabelsList)

    # Get the best parameters and the best score for f1
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score}")

    # Save the results: one row per candidate with its mean test scores
    results = grid_search.cv_results_
    outFile = f"{combination_label}-params-svm-{n_splits}-folds.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write("pca_n_components,C,kernel,accuracy,precision,recall,f1,mcc,preparationTime\n")
        for idx, param in enumerate(results['params']):
            accuracy = results['mean_test_accuracy'][idx]
            precision = results['mean_test_precision'][idx]
            recall = results['mean_test_recall'][idx]
            f1 = results['mean_test_f1'][idx]
            mcc = results['mean_test_mcc'][idx]
            fo.write(f"{param['pca__n_components']},{param['svm__C']},{param['svm__kernel']},{accuracy},{precision},{recall},{f1},{mcc},{preparationTime}\n")

    print(f"SVM analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    return best_params, best_score

# SVM, balance combination (5 folds)
print("\nStarting SVM analysis for flaky vs smaller non-flaky files (balance combination)...")
best_params_5folds_svm_balance, best_score_5folds_svm_balance = runSVM(
    dataPoints=dataPointsbalance,
    dataLabelsList=dataLabelsListbalance,
    outDir=outDirbalance,
    n_splits=5,
    combination_label="balance",
)

# Summary of the balance run
print(
    "\nBest results for SVM 5-fold on balance combination:",
    f"Best Parameters: {best_params_5folds_svm_balance}",
    f"Best F1 Score: {best_score_5folds_svm_balance}",
    sep="\n",
)

# SVM, imbalance non-flaky combination (5 folds)
print("\nStarting SVM analysis for flaky vs imbalance non-flaky files (imbalance combination)...")
best_params_5folds_svm_imbalance, best_score_5folds_svm_imbalance = runSVM(
    dataPoints=dataPointsimbalance,
    dataLabelsList=dataLabelsListimbalance,
    outDir=outDirimbalance,
    n_splits=5,
    combination_label="imbalance",
)

# Summary of the imbalance run
print(
    "\nBest results for SVM 5-fold on imbalance combination:",
    f"Best Parameters: {best_params_5folds_svm_imbalance}",
    f"Best F1 Score: {best_score_5folds_svm_imbalance}",
    sep="\n",
)



Starting SVM analysis for flaky vs smaller non-flaky files (balance combination)...
Data length 90
Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Parameters: {'pca__n_components': 65, 'svm__C': 100.0, 'svm__kernel': 'rbf'}
Best F1 Score: 0.8239766081871345
SVM analysis completed for 5-folds. Results saved to: balance-params-svm-5-folds.csv

Best results for SVM 5-fold on balance combination:
Best Parameters: {'pca__n_components': 65, 'svm__C': 100.0, 'svm__kernel': 'rbf'}
Best F1 Score: 0.8239766081871345

Starting SVM analysis for flaky vs imbalance non-flaky files (imbalance combination)...
Data length 288
Fitting 5 folds for each of 80 candidates, totalling 400 fits
Best Parameters: {'pca__n_components': 220, 'svm__C': 0.01, 'svm__kernel': 'linear'}
Best F1 Score: 0.6914602683178535
SVM analysis completed for 5-folds. Results saved to: imbalance-params-svm-5-folds.csv

Best results for SVM 5-fold on imbalance combination:
Best Parameters: {'pca__n_components': 2

## XGB

In [None]:
from xgboost import XGBClassifier
import time
import os
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer

def mcc_scorer(estimator, X, y_true):
    """
    Matthews Correlation Coefficient of ``estimator``'s predictions on ``X``.

    Callable with the ``(estimator, X, y_true)`` scorer signature.
    NOTE(review): an identical ``mcc_scorer`` is defined earlier in this
    notebook; this definition shadows it once the cell runs.
    """
    predictions = estimator.predict(X)
    score = matthews_corrcoef(y_true, predictions)
    return score

def runXGB(dataPoints, dataLabelsList, outDir, n_splits, combination_label):
    """Grid-searches a CountVectorizer -> PCA -> XGBoost pipeline and saves the scores.

    Args:
        dataPoints: list of raw documents (strings) to classify.
        dataLabelsList: labels aligned with ``dataPoints`` (this notebook
            passes 1 for flaky and 0 for non-flaky documents).
        outDir: existing directory that receives the result CSV.
        n_splits: number of stratified cross-validation folds.
        combination_label: "balance" selects the small PCA grid, any other
            value the large one; it is also the prefix of the CSV file name.

    Returns:
        ``(best_params, best_score)`` of the candidate with the highest mean
        cross-validated F1 score.
    """
    v0 = time.perf_counter()

    print(f"Starting runXGB with combination_label: {combination_label}")

    # Vectorizer and PCA are pipeline steps, so the grid search fits them
    # together with the classifier on each training split.
    # random_state is pinned like for the other estimators in this notebook.
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(stop_words=None)),
        ('pca', PCA(random_state=42)),
        ('xgb', XGBClassifier(eval_metric="logloss", random_state=42))
    ])

    # Only the PCA sizes depend on the combination; the XGBoost grid is shared.
    if combination_label == "balance":
        pca_components = [50, 60, 65]
    else:
        pca_components = [150, 180, 200, 220]
    param_grid = {
        'pca__n_components': pca_components,
        'xgb__learning_rate': [0.01, 0.1, 0.3, 0.5],
        'xgb__max_depth': [3, 5, 7, 10],
        'xgb__n_estimators': [50, 100, 200],
    }

    print(f"Parameter grid for {combination_label}: {param_grid}")

    # Scoring metrics including MCC.
    # zero_division=1 scores an undefined metric as 1.
    # NOTE(review): this counts a fold with no positive predictions as
    # precision 1.0 - confirm that this is intended.
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': mcc_scorer,  # MCC score custom function
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Train scores were requested before but never read, so they are no
    # longer computed; test scores and the selected model are unaffected.
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=skf,
        scoring=scoring,
        refit='f1',
        verbose=1
    )

    # Fit the GridSearchCV on data
    grid_search.fit(dataPoints, dataLabelsList)

    # Per-document wall-clock time of the whole search. Measured once, right
    # after fitting, so every CSV row reports the same value and the time
    # spent printing and writing the file is not included.
    preparationTime = (time.perf_counter() - v0) / len(dataLabelsList)

    # Get the best parameters and the best score for F1
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters for {combination_label}: {best_params}")
    print(f"Best F1 Score for {combination_label}: {best_score}")

    # Save the results: one row per candidate with its mean test scores
    results = grid_search.cv_results_
    outFile = f"{combination_label}-params-xgb-{n_splits}-folds.csv"
    output_path = os.path.join(outDir, outFile)
    print(f"Saving results to: {output_path}")

    with open(output_path, "w") as fo:
        fo.write("pca_n_components,learning_rate,max_depth,n_estimators,accuracy,precision,recall,f1,mcc,preparationTime\n")
        for idx, param in enumerate(results['params']):
            accuracy = results['mean_test_accuracy'][idx]
            precision = results['mean_test_precision'][idx]
            recall = results['mean_test_recall'][idx]
            f1 = results['mean_test_f1'][idx]
            mcc = results['mean_test_mcc'][idx]
            fo.write(f"{param['pca__n_components']},{param['xgb__learning_rate']},{param['xgb__max_depth']},{param['xgb__n_estimators']},{accuracy},{precision},{recall},{f1},{mcc},{preparationTime}\n")

    print(f"XGBoost analysis completed for {n_splits}-folds on {combination_label} data. Results saved to: {outFile}")
    return best_params, best_score
# Driver code: run the XGBoost search on both combinations

# XGBoost, imbalance non-flaky combination (5 folds)
print("\nStarting XGBoost analysis for flaky vs imbalance non-flaky files (imbalance combination)...")
best_params_5folds_xgb_imbalance, best_score_5folds_xgb_imbalance = runXGB(
    dataPoints=dataPointsimbalance,
    dataLabelsList=dataLabelsListimbalance,
    outDir=outDirimbalance,
    n_splits=5,
    combination_label="imbalance",
)

# Summary of the imbalance run
print(
    "\nBest results for XGBoost 5-fold on imbalance combination:",
    f"Best Parameters: {best_params_5folds_xgb_imbalance}",
    f"Best F1 Score: {best_score_5folds_xgb_imbalance}",
    sep="\n",
)

# XGBoost, balance combination (5 folds)
print("\nStarting XGBoost analysis for flaky vs smaller non-flaky files (balance combination)...")
best_params_5folds_xgb_balance, best_score_5folds_xgb_balance = runXGB(
    dataPoints=dataPointsbalance,
    dataLabelsList=dataLabelsListbalance,
    outDir=outDirbalance,
    n_splits=5,
    combination_label="balance",
)

# Summary of the balance run
print(
    "\nBest results for XGBoost 5-fold on balance combination:",
    f"Best Parameters: {best_params_5folds_xgb_balance}",
    f"Best F1 Score: {best_score_5folds_xgb_balance}",
    sep="\n",
)



Starting XGBoost analysis for flaky vs imbalance non-flaky files (imbalance combination)...
Starting runXGB with combination_label: imbalance
Parameter grid for imbalance: {'pca__n_components': [150, 180, 200, 220], 'xgb__learning_rate': [0.01, 0.1, 0.3, 0.5], 'xgb__max_depth': [3, 5, 7, 10], 'xgb__n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 192 candidates, totalling 960 fits
