In [3]:
import os
import time
import zipfile
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA
###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Unpack every member of the archive ``zip_file`` into the directory ``extract_to``."""
    archive = zipfile.ZipFile(zip_file, mode='r')
    try:
        archive.extractall(path=extract_to)
    finally:
        # Release the archive handle whether or not extraction succeeded.
        archive.close()

def getDataPoints(path):
    """Collect the stripped text of every non-empty ``.py`` file under ``path``.

    The directory tree is walked recursively. Sub-directories and file names
    are visited in sorted order so that the returned list has the same order
    on every machine: ``os.walk`` on its own yields entries in arbitrary,
    filesystem-dependent order, and the shuffled cross-validation splits used
    later in this notebook depend on the order of the documents.

    Args:
        path: Root directory to search.

    Returns:
        list[str]: One string per non-empty Python file, with leading and
        trailing whitespace removed. The list is empty when ``path`` does not
        exist or contains no usable file; a message is printed in both cases.
    """
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        # Sorting in place controls the order in which os.walk descends.
        dirs.sort()
        for dataPointName in sorted(files):
            if not dataPointName.endswith(".py"):  # Only consider Python files
                continue
            file_path = os.path.join(root, dataPointName)
            with open(file_path, encoding="utf-8") as fileIn:
                dp = fileIn.read().strip()
            if dp:  # Ensure the document is not empty
                dataPointsList.append(dp)
            else:
                print(f"Empty file: {file_path}")

    if not dataPointsList:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList

  
###############################################################################
# Data Extraction and Vectorization

# Parameters setup
flakyZip = "Dataset/flaky_files.zip"
nonFlakyZip = "Dataset/reduced_nonflaky_files.zip"
imbalanceNonFlakyZip = "Dataset/nonflaky_files.zip"

# Create directories
outDirbalance = "results/balance_flaky_nonflaky/"
outDirimbalance = "results/imbalance_nonflaky/"
os.makedirs(outDirbalance, exist_ok=True)
os.makedirs(outDirimbalance, exist_ok=True)

extractDirbalance = "extracted/balance_flaky_nonflaky/"
extractDirimbalance = "extracted/imbalance_nonflaky/"
os.makedirs(extractDirbalance, exist_ok=True)
os.makedirs(extractDirimbalance, exist_ok=True)


def _loadCombination(flakyArchive, nonFlakyArchive, extractDir, label):
    """Extract one flaky/non-flaky archive pair and read its documents.

    Both archives are unpacked below ``extractDir`` (into ``flaky`` and
    ``nonFlaky`` sub-directories), every ``.py`` file is read with
    ``getDataPoints``, and the three document counts are printed.

    NOTE(review): the extraction directories are created with
    ``exist_ok=True`` and never emptied, so files left behind by an earlier
    extraction would be read as well -- confirm this is acceptable.

    Args:
        flakyArchive: Path of the zip file holding the flaky tests.
        nonFlakyArchive: Path of the zip file holding the non-flaky tests.
        extractDir: Directory under which both archives are unpacked.
        label: Combination name used in the printed messages.

    Returns:
        tuple: ``(flakyDir, nonFlakyDir, flakyDocs, nonFlakyDocs, allDocs,
        labels)`` where ``allDocs`` is ``flakyDocs + nonFlakyDocs`` and
        ``labels`` is a NumPy array with 1 for each flaky document followed
        by 0 for each non-flaky document.
    """
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)

    extract_zip(flakyArchive, flakyDir)
    extract_zip(nonFlakyArchive, nonFlakyDir)

    flakyDocs = getDataPoints(flakyDir)
    nonFlakyDocs = getDataPoints(nonFlakyDir)
    allDocs = flakyDocs + nonFlakyDocs

    print(f"Number of flaky documents ({label} combination): {len(flakyDocs)}")
    print(f"Number of non-flaky documents ({label} combination): {len(nonFlakyDocs)}")
    print(f"Total number of documents ({label} combination): {len(allDocs)}")

    # Labels follow the concatenation order of allDocs: flaky first, then non-flaky.
    labels = np.array([1]*len(flakyDocs) + [0]*len(nonFlakyDocs))
    return flakyDir, nonFlakyDir, flakyDocs, nonFlakyDocs, allDocs, labels


# Extract and read data once for balance combination
(flakyDirbalance, nonFlakyDirbalance,
 dataPointsFlakybalance, dataPointsNonFlakybalance,
 dataPointsbalance, dataLabelsListbalance) = _loadCombination(
    flakyZip, nonFlakyZip, extractDirbalance, "balance"
)

# Extract and read data once for imbalance non-flaky combination
(flakyDirimbalance, nonFlakyDirimbalance,
 dataPointsFlakyimbalance, dataPointsNonFlakyimbalance,
 dataPointsimbalance, dataLabelsListimbalance) = _loadCombination(
    flakyZip, imbalanceNonFlakyZip, extractDirimbalance, "imbalance"
)


Number of flaky documents (balance combination): 45
Number of non-flaky documents (balance combination): 45
Total number of documents (balance combination): 90
Number of flaky documents (imbalance combination): 45
Number of non-flaky documents (imbalance combination): 243
Total number of documents (imbalance combination): 288


## Random Forest with SMOTE

In [None]:
import os
import time
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef, make_scorer)
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

from sklearn.feature_extraction.text import CountVectorizer

###############################################################################
# Custom MCC scorer function
def mcc_scorer(estimator, X, y_true):
    """Score ``estimator`` on ``X`` with the Matthews correlation coefficient.

    Uses the ``(estimator, X, y)`` call signature, so it can be passed
    directly as an entry of a ``scoring`` dictionary.
    """
    return matthews_corrcoef(y_true, estimator.predict(X))

###############################################################################
# Random Forest with SMOTE

def runRFWithSMOTE(dataPoints, dataLabelsList, outDir, n_splits):
    """Grid-search a CountVectorizer -> SMOTE -> Random Forest pipeline.

    The whole pipeline is cross-validated with a shuffled, seeded
    ``StratifiedKFold``; the candidate with the best mean F1 is selected
    (``refit='f1'``). Two CSV files are written to ``outDir``:

    * ``rf-smote-fold-results-{n_splits}-folds.csv`` -- one row per
      hyper-parameter candidate with its fold-averaged test metrics.
    * ``rf-results-{n_splits}-folds.csv`` -- the fold-averaged test metrics
      of the best candidate.

    Args:
        dataPoints: Sequence of raw text documents.
        dataLabelsList: Class labels aligned with ``dataPoints``.
        outDir: Directory receiving the CSV files (created if missing).
        n_splits: Number of stratified cross-validation folds.

    Returns:
        tuple: ``(best_params, final_f1, final_mcc)`` -- the selected
        hyper-parameters and the mean cross-validated F1 and MCC of that
        candidate.
    """
    # Metrics evaluated on every validation fold.
    # NOTE(review): zero_division=1 scores an undefined precision/recall/F1
    # as 1 instead of 0, which is the optimistic choice -- confirm intended.
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': mcc_scorer
    }

    # Vectorizer and SMOTE are pipeline steps, so they are refit together
    # with the classifier on every training split of the grid search.
    pipeline = ImbPipeline([
        ('vectorizer', CountVectorizer(stop_words=None)),  # Vectorizer
        ('smote', SMOTE(random_state=42)),                 # SMOTE for oversampling
        ('rf', RandomForestClassifier(random_state=42))    # Random Forest classifier
    ])

    # Define parameter grid for GridSearchCV
    param_grid = {
        'rf__n_estimators': [50, 100, 200],                # Number of trees in the forest
        'rf__max_depth': [10, 20, 30],                     # Maximum depth of the tree
        'rf__min_samples_split': [5, 10],                  # Minimum number of samples required to split a node
        'rf__min_samples_leaf': [2, 5],                    # Minimum number of samples required at a leaf node
        'rf__criterion': ['gini', 'entropy']               # Function to measure the quality of a split
    }

    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Train scores are never read below, so they are not computed.
    grid_search = GridSearchCV(
        pipeline, param_grid, cv=skf, scoring=scoring, refit='f1', verbose=1
    )

    # Fit the GridSearchCV to the data
    grid_search.fit(dataPoints, dataLabelsList)

    # Retrieve the best parameters and score from cross-validation
    best_params = grid_search.best_params_
    best_f1_cv = grid_search.best_score_

    print(f"Best Parameters with SMOTE: {best_params}")
    print(f"Best F1 Score from cross-validation: {best_f1_cv}")

    results = grid_search.cv_results_
    best_idx = grid_search.best_index_

    # One row per candidate: its hyper-parameters followed by the test
    # metrics averaged over the folds (column order matches the CSV header).
    metric_names = ('accuracy', 'precision', 'recall', 'f1', 'mcc')
    fold_metrics = []
    for idx, params in enumerate(results['params']):
        row = {
            'n_estimators': params.get('rf__n_estimators'),
            'max_depth': params.get('rf__max_depth'),
            'min_samples_split': params.get('rf__min_samples_split'),
            'min_samples_leaf': params.get('rf__min_samples_leaf'),
            'criterion': params.get('rf__criterion'),
        }
        for metric in metric_names:
            row[metric] = results[f'mean_test_{metric}'][idx]
        fold_metrics.append(row)

    os.makedirs(outDir, exist_ok=True)

    # Save the per-candidate metrics to CSV
    df_folds = pd.DataFrame(fold_metrics)
    outFile_folds = os.path.join(outDir, f"rf-smote-fold-results-{n_splits}-folds.csv")
    df_folds.to_csv(outFile_folds, index=False)

    print(f"Per-fold metrics saved to: {outFile_folds}")

    # Fold-averaged metrics of the selected candidate
    final_f1 = results['mean_test_f1'][best_idx]
    final_precision = results['mean_test_precision'][best_idx]
    final_recall = results['mean_test_recall'][best_idx]
    final_accuracy = results['mean_test_accuracy'][best_idx]
    final_mcc = results['mean_test_mcc'][best_idx]

    # Print final metrics (cross-validation averages)
    print("\nFinal Cross-Validation Metrics:")
    print(f"Final Precision: {final_precision}")
    print(f"Final Recall: {final_recall}")
    print(f"Final Accuracy: {final_accuracy}")
    print(f"Final F1 Score: {final_f1}")
    print(f"Final MCC: {final_mcc}")

    # Save the results to a CSV file
    outFile_final = os.path.join(outDir, f"rf-results-{n_splits}-folds.csv")
    with open(outFile_final, "w") as f:
        f.write("Accuracy,Precision,Recall,F1,MCC\n")
        f.write(f"{final_accuracy},{final_precision},{final_recall},{final_f1},{final_mcc}\n")

    print(f"Random Forest analysis completed for {n_splits}-folds with SMOTE. Results saved to: {outFile_final}")

    return best_params, final_f1, final_mcc

###############################################################################
# Main Execution for 5-Fold Cross-Validation

# Directory receiving the CSV files of the SMOTE + Random Forest run.
outDir = "smote-results"
os.makedirs(outDir, exist_ok=True)

# Grid-search the SMOTE + Random Forest pipeline on the imbalance dataset
# using 5-fold cross-validation.
print("\nStarting Random Forest analysis with SMOTE for 5-fold cross-validation...")
best_params_5folds, best_f1_5folds, final_mcc_5folds = runRFWithSMOTE(
    dataPoints=dataPointsimbalance,
    dataLabelsList=dataLabelsListimbalance,
    outDir=outDir,
    n_splits=5,
)

# Summarise the selected configuration and its cross-validated scores.
print(
    "\nBest results for Random Forest with SMOTE 5-fold cross-validation:",
    f"Best Parameters: {best_params_5folds}",
    f"Best F1 Score: {best_f1_5folds}",
    f"Final MCC: {final_mcc_5folds}",
    sep="\n",
)



Starting Random Forest analysis with SMOTE for 5-fold cross-validation...
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters with SMOTE and PCA: {'rf__criterion': 'entropy', 'rf__max_depth': 10, 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 5, 'rf__n_estimators': 50}
Best F1 Score from cross-validation: 0.8661764705882353
Per-fold metrics saved to: smote-results\rf-smote-fold-results-5-folds.csv

Final Cross-Validation Metrics:
Final Precision: 0.9464285714285715
Final Recall: 0.7999999999999999
Final Accuracy: 0.9617664851784635
Final F1 Score: 0.8661764705882353
Final MCC: 0.8487345249535231
Random Forest analysis completed for 5-folds with SMOTE. Results saved to: smote-results\rf-results-5-folds.csv

Best results for Random Forest with SMOTE 5-fold cross-validation:
Best Parameters: {'rf__criterion': 'entropy', 'rf__max_depth': 10, 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 5, 'rf__n_estimators': 50}
Best F1 Score: 0.8661764705882353
Final

## Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier 
def runDecisionTree(dataPoints, dataLabelsList, outDir, n_splits, combination_label):
    """Grid-search a CountVectorizer -> Decision Tree pipeline with stratified CV.

    Every hyper-parameter candidate is cross-validated with a shuffled,
    seeded ``StratifiedKFold``; the candidate with the best mean F1 is
    selected (``refit='f1'``). One CSV row per candidate, holding its
    fold-averaged test metrics, is written to
    ``{outDir}/{combination_label}-params-dt-{n_splits}-folds.csv``.

    Args:
        dataPoints: Sequence of raw text documents.
        dataLabelsList: Class labels aligned with ``dataPoints``.
        outDir: Directory receiving the CSV file (created if missing).
        n_splits: Number of stratified cross-validation folds.
        combination_label: Prefix of the CSV file name (e.g. ``"balance"``).

    Returns:
        tuple: ``(best_params, best_score)`` -- the selected hyper-parameters
        and their mean cross-validated F1.
    """
    # Imported locally: the visible notebook cells only import imblearn's
    # Pipeline (as ImbPipeline), so the bare name would otherwise be
    # undefined on a fresh kernel.
    from sklearn.pipeline import Pipeline

    v0 = time.perf_counter()

    # Define the pipeline with Vectorizer and Decision Tree
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(stop_words=None)),
        ('dt', DecisionTreeClassifier(random_state=42))
    ])

    # Define parameter grid for hyperparameter tuning
    param_grid = {
        'dt__criterion': ['gini', 'entropy'],
        'dt__max_depth': [10, 20, 30],
        'dt__min_samples_split': [2, 5, 10],
        'dt__min_samples_leaf': [1, 2, 5, 10],
        'dt__max_features': [None, 'sqrt', 'log2'],
    }

    # Custom scoring functions including MCC
    # NOTE(review): zero_division=1 scores an undefined precision/recall/F1
    # as 1 instead of 0 -- confirm this optimistic choice is intended.
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': mcc_scorer  # Use the custom MCC scorer
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Train scores are never read below, so they are not computed.
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=skf,
        scoring=scoring,
        refit='f1',
        verbose=1
    )

    # Fit the GridSearchCV on data
    grid_search.fit(dataPoints, dataLabelsList)

    # Elapsed seconds of the search per document, measured once so that every
    # CSV row reports the same value instead of one that drifts while the
    # file is being written.
    preparationTime = (time.perf_counter() - v0) / len(dataLabelsList)

    # Get the best parameters and the best score for f1
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score}")

    # Save the results
    os.makedirs(outDir, exist_ok=True)
    outFile = f"{combination_label}-params-dt-{n_splits}-folds.csv"
    outPath = os.path.join(outDir, outFile)
    results = grid_search.cv_results_
    with open(outPath, "w") as fo:
        fo.write("criterion,max_depth,min_samples_split,min_samples_leaf,max_features,accuracy,precision,recall,f1,mcc,preparationTime\n")
        for idx, param in enumerate(results['params']):
            accuracy = results['mean_test_accuracy'][idx]
            precision = results['mean_test_precision'][idx]
            recall = results['mean_test_recall'][idx]
            f1 = results['mean_test_f1'][idx]
            mcc = results['mean_test_mcc'][idx]
            fo.write(f"{param['dt__criterion']},{param['dt__max_depth']},{param['dt__min_samples_split']},{param['dt__min_samples_leaf']},{param['dt__max_features']},{accuracy},{precision},{recall},{f1},{mcc},{preparationTime}\n")

    # Report the full path that was written, not just the file name.
    print(f"Decision Tree analysis completed for {n_splits}-folds. Results saved to: {outPath}")
    return best_params, best_score

# Balance combination: flaky files vs. the reduced non-flaky set.
print("\nStarting Decision Tree analysis for flaky vs smaller non-flaky files (balance combination)...")
best_params_5folds_balance, best_score_5folds_balance = runDecisionTree(
    dataPoints=dataPointsbalance,
    dataLabelsList=dataLabelsListbalance,
    outDir=outDirbalance,
    n_splits=5,
    combination_label="balance",
)

print(
    "\nBest results for Decision Tree 5-fold on balance combination:",
    f"Best Parameters: {best_params_5folds_balance}",
    f"Best F1 Score: {best_score_5folds_balance}",
    sep="\n",
)

# Imbalance combination: flaky files vs. the full non-flaky set.
print("\nStarting Decision Tree analysis for flaky vs imbalance non-flaky files (imbalance combination)...")
best_params_5folds_imbalance, best_score_5folds_imbalance = runDecisionTree(
    dataPoints=dataPointsimbalance,
    dataLabelsList=dataLabelsListimbalance,
    outDir=outDirimbalance,
    n_splits=5,
    combination_label="imbalance",
)

print(
    "\nBest results for Decision Tree 5-fold on imbalance combination:",
    f"Best Parameters: {best_params_5folds_imbalance}",
    f"Best F1 Score: {best_score_5folds_imbalance}",
    sep="\n",
)


Starting Decision Tree analysis for flaky vs smaller non-flaky files (balance combination)...
Fitting 5 folds for each of 216 candidates, totalling 1080 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__max_features': None, 'dt__min_samples_leaf': 5, 'dt__min_samples_split': 2}
Best F1 Score: 0.8829640947288006
Decision Tree analysis completed for 5-folds. Results saved to: balance-params-dt-5-folds.csv

Best results for Decision Tree 5-fold on balance combination:
Best Parameters: {'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__max_features': None, 'dt__min_samples_leaf': 5, 'dt__min_samples_split': 2}
Best F1 Score: 0.8829640947288006

Starting Decision Tree analysis for flaky vs imbalance non-flaky files (imbalance combination)...
Fitting 5 folds for each of 216 candidates, totalling 1080 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__max_features': None, 'dt__min_samples_leaf': 2, 'dt__min_samples_split': 10}
Best F1 Score: 0.8765775401069519
Decision Tree analysis completed for 5-folds. Results saved to: imbalance-params-dt-5-folds.csv

Best results for Decision Tree 5-fold on imbalance combination:
Best Parameters: {'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__max_features': None, 'dt__min_samples_leaf': 2, 'dt__min_samples_split': 10}
Best F1 Score: 0.8765775401069519
