In [1]:
import os
import time
import zipfile
import numpy as np
import random

from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import os
import time
import math



# Set global random seeds
random.seed(42)
np.random.seed(42)

# Set environment variables for deterministic behavior
os.environ['PYTHONHASHSEED'] = '42'
os.environ['OMP_NUM_THREADS'] = '1'  # For OpenMP
os.environ['MKL_NUM_THREADS'] = '1'  # For MKL
os.environ['OPENBLAS_NUM_THREADS'] = '1'  # For OpenBLAS
os.environ['NUMEXPR_NUM_THREADS'] = '1'  # For NumExpr
#os.environ['VECLIB_MAXIMUM_THREADS'] = '1'  # For macOS Accelerate

from sklearn.feature_extraction.text import CountVectorizer

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        dirs.sort()  # Sort directories to ensure consistent traversal
        files.sort()  # Sort files to ensure consistent order
        for dataPointName in files:
            if dataPointName.endswith(".py"):  # Only consider Python files
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")

    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList

###############################################################################
# Data Extraction

# Parameters setup
flakyZip = "Dataset/flaky_files.zip"
nonFlakyZip = "Dataset/nonflaky_files.zip"
extractDir = "smote-extracted"
os.makedirs(extractDir, exist_ok=True)

# Extract the zip files
flakyDir = os.path.join(extractDir, 'flaky')
nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
os.makedirs(flakyDir, exist_ok=True)
os.makedirs(nonFlakyDir, exist_ok=True)

extract_zip(flakyZip, flakyDir)
extract_zip(nonFlakyZip, nonFlakyDir)

dataPointsFlaky = getDataPoints(flakyDir)
dataPointsNonFlaky = getDataPoints(nonFlakyDir)
dataPoints = dataPointsFlaky + dataPointsNonFlaky

print(f"Number of flaky documents: {len(dataPointsFlaky)}")
print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
print(f"Total number of documents: {len(dataPoints)}")

if len(dataPoints) == 0:
    raise ValueError("No documents available for vectorization. Please check the input directories.")

# Create labels: 1 for flaky, 0 for non-flaky
dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))


Number of flaky documents: 45
Number of non-flaky documents: 243
Total number of documents: 288


## KNN

In [2]:
import os
import time
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef)
from sklearn.model_selection import StratifiedKFold
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from itertools import product

def runKNNWith(dataPoints, dataLabelsList, outDir, n_splits):
    # Define parameter grid
    param_grid = {
        'pca__n_components': [150],             # PCA components
        'knn__n_neighbors': [3],                  # Number of neighbors for KNN
        'knn__weights': ['distance'],               # Weighting options
        'knn__metric': ['euclidean']                 # Distance metrics
    }
    
    param_combinations = list(product(*param_grid.values()))
    param_keys = list(param_grid.keys())
    
    # Prepare to store metrics
    metrics_per_combination = []
    
    # Define thresholds to evaluate
    thresholds = np.linspace(0.1, 0.9, 9)  # Thresholds from 0.1 to 0.9
    
    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Loop over each hyperparameter combination
    for params in param_combinations:
        param_dict = dict(zip(param_keys, params))
        
        # Initialize lists to store metrics per threshold
        threshold_metrics_list = []
        
        # For each fold in cross-validation
        for fold_idx, (train_index, test_index) in enumerate(skf.split(dataPoints, dataLabelsList)):
            X_train = [dataPoints[i] for i in train_index]
            X_test = [dataPoints[i] for i in test_index]
            y_train = [dataLabelsList[i] for i in train_index]
            y_test = [dataLabelsList[i] for i in test_index]
            
            # Define a pipeline with CountVectorizer, PCA, and KNN with current params
            pipeline = ImbPipeline([
                ('vectorizer', CountVectorizer(stop_words=None)),
                ('pca', PCA(n_components=param_dict['pca__n_components'], random_state=42)),
                ('knn', KNeighborsClassifier(
                    n_neighbors=param_dict['knn__n_neighbors'],
                    weights=param_dict['knn__weights'],
                    metric=param_dict['knn__metric']))
            ])
            
            # Train the pipeline
            pipeline.fit(X_train, y_train)
            
            # Predict probabilities
            y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
            
            # Iterate over thresholds
            for threshold in thresholds:
                y_pred_threshold = (y_pred_proba >= threshold).astype(int)
                
                # Calculate metrics
                accuracy = accuracy_score(y_test, y_pred_threshold)
                precision = precision_score(y_test, y_pred_threshold, zero_division=1)
                recall = recall_score(y_test, y_pred_threshold, zero_division=1)
                f1 = f1_score(y_test, y_pred_threshold, zero_division=1)
                mcc = matthews_corrcoef(y_test, y_pred_threshold)
                
                # Store fold metrics
                threshold_metrics_list.append({
                    **param_dict,
                    'threshold': threshold,
                    'fold': fold_idx + 1,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'mcc': mcc
                })
        
        # Calculate average metrics and standard deviations over folds for each threshold
        for threshold in thresholds:
            # Filter metrics for the current threshold
            threshold_metrics = [tm for tm in threshold_metrics_list if tm['threshold'] == threshold]
            
            accuracies = [tm['accuracy'] for tm in threshold_metrics]
            precisions = [tm['precision'] for tm in threshold_metrics]
            recalls = [tm['recall'] for tm in threshold_metrics]
            f1s = [tm['f1'] for tm in threshold_metrics]
            mccs = [tm['mcc'] for tm in threshold_metrics]
            
            avg_accuracy = np.mean(accuracies)
            std_accuracy = np.std(accuracies, ddof=1)
            avg_precision = np.mean(precisions)
            std_precision = np.std(precisions, ddof=1)
            avg_recall = np.mean(recalls)
            std_recall = np.std(recalls, ddof=1)
            avg_f1 = np.mean(f1s)
            std_f1 = np.std(f1s, ddof=1)
            avg_mcc = np.mean(mccs)
            std_mcc = np.std(mccs, ddof=1)
            
            # Store the metrics along with parameters and threshold
            metrics_per_combination.append({
                **param_dict,
                'threshold': threshold,
                'accuracy': avg_accuracy,
                'std_accuracy': std_accuracy,
                'precision': avg_precision,
                'std_precision': std_precision,
                'recall': avg_recall,
                'std_recall': std_recall,
                'f1': avg_f1,
                'std_f1': std_f1,
                'mcc': avg_mcc,
                'std_mcc': std_mcc
            })
    
    # Now, find the parameter combination with the best F1 score
    best_result = max(metrics_per_combination, key=lambda x: x['f1'])
    
    # Save the results to CSV
    df_metrics = pd.DataFrame(metrics_per_combination)
    outFile_metrics = os.path.join(outDir, f"knn-threshold-results.csv")
    df_metrics.to_csv(outFile_metrics, index=False)
    
    print(f"Results saved to: {outFile_metrics}")
    
    # Extract the best parameters, threshold, and metrics
    best_params = {key: best_result[key] for key in param_keys}
    best_threshold = best_result['threshold']
    best_f1 = best_result['f1']
    std_f1_best = best_result['std_f1']
    final_mcc = best_result['mcc']
    std_mcc_best = best_result['std_mcc']
    
    print("\nBest Parameters, Threshold, and Metrics:")
    print(f"Best Parameters: {best_params}")
    print(f"Best Threshold: {best_threshold}")
    print(f"Best F1 Score: {best_f1:.4f} (Std Dev: {std_f1_best:.4f})")
    print(f"Final MCC: {final_mcc:.4f} (Std Dev: {std_mcc_best:.4f})")
    
    return best_params, best_threshold, best_f1, std_f1_best, final_mcc, std_mcc_best

# Main Execution for 5-Fold Cross-Validation
outDir = "results"
os.makedirs(outDir, exist_ok=True)

# Run KNN with Threshold adjustment using 5-fold cross-validation
print("\nStarting KNN analysis with PCA and Threshold adjustment for 5-fold cross-validation...")
best_params_5folds, best_threshold_5folds, best_f1_5folds, std_f1_best_5folds, final_mcc_5folds, std_mcc_best_5folds = runKNNWith(
    dataPoints, dataLabelsList, outDir, n_splits=5)

# Display results
print("\nBest results for KNN with PCA and Threshold adjustment 5-fold cross-validation:")
print(f"Best Parameters: {best_params_5folds}")
print(f"Best Threshold: {best_threshold_5folds}")
print(f"Best F1 Score: {best_f1_5folds:.4f} (Std Dev: {std_f1_best_5folds:.4f})")
print(f"Final MCC: {final_mcc_5folds:.4f} (Std Dev: {std_mcc_best_5folds:.4f})")



Starting KNN analysis with PCA and Threshold adjustment for 5-fold cross-validation...
Results saved to: results\knn-threshold-results.csv

Best Parameters, Threshold, and Metrics:
Best Parameters: {'pca__n_components': 150, 'knn__n_neighbors': 3, 'knn__weights': 'distance', 'knn__metric': 'euclidean'}
Best Threshold: 0.4
Best F1 Score: 0.5723 (Std Dev: 0.1597)
Final MCC: 0.5799 (Std Dev: 0.1308)

Best results for KNN with PCA and Threshold adjustment 5-fold cross-validation:
Best Parameters: {'pca__n_components': 150, 'knn__n_neighbors': 3, 'knn__weights': 'distance', 'knn__metric': 'euclidean'}
Best Threshold: 0.4
Best F1 Score: 0.5723 (Std Dev: 0.1597)
Final MCC: 0.5799 (Std Dev: 0.1308)


## SVM

In [None]:
import os
import time
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import (precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef)
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from itertools import product

def runSVMWithSMOTE_PCA(dataPoints, dataLabelsList, outDir, n_splits):
    v0 = time.perf_counter()
    
    # Define parameter grid
    param_grid = {
        'pca__n_components': [220],          # PCA components
        'svm__C': [0.01],                    # Regularization parameter
        'svm__kernel': ['linear']            # Kernel types
    }
    
    param_combinations = list(product(*param_grid.values()))
    param_keys = list(param_grid.keys())
    
    # Prepare to store metrics
    metrics_per_combination = []
    
    # Define thresholds to evaluate
    thresholds = np.linspace(0.1, 0.9, 9)  # Thresholds from 0.1 to 0.9
    
    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Loop over each hyperparameter combination
    for params in param_combinations:
        param_dict = dict(zip(param_keys, params))
        
        # Initialize lists to store metrics per threshold
        threshold_metrics_list = []
        
        # For each fold in cross-validation
        for fold_idx, (train_index, test_index) in enumerate(skf.split(dataPoints, dataLabelsList)):
            X_train = [dataPoints[i] for i in train_index]
            X_test = [dataPoints[i] for i in test_index]
            y_train = [dataLabelsList[i] for i in train_index]
            y_test = [dataLabelsList[i] for i in test_index]
            
            # Define a pipeline with CountVectorizer, PCA and SVM with current params
            pipeline = ImbPipeline([
                ('vectorizer', CountVectorizer(stop_words=None)),
                ('pca', PCA(n_components=param_dict['pca__n_components'], random_state=42)),
                ('svm', SVC(
                    C=param_dict['svm__C'],
                    kernel=param_dict['svm__kernel'],
                    probability=True,
                    random_state=42))
            ])
            
            # Train the pipeline
            pipeline.fit(X_train, y_train)
            
            # Predict probabilities
            y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
            
            # Iterate over thresholds
            for threshold in thresholds:
                y_pred_threshold = (y_pred_proba >= threshold).astype(int)
                
                # Calculate metrics
                accuracy = accuracy_score(y_test, y_pred_threshold)
                precision = precision_score(y_test, y_pred_threshold, zero_division=1)
                recall = recall_score(y_test, y_pred_threshold, zero_division=1)
                f1 = f1_score(y_test, y_pred_threshold, zero_division=1)
                mcc = matthews_corrcoef(y_test, y_pred_threshold)
                
                # Store fold metrics
                threshold_metrics_list.append({
                    **param_dict,
                    'threshold': threshold,
                    'fold': fold_idx + 1,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'mcc': mcc
                })
        
        # Calculate average metrics and standard deviations over folds for each threshold
        for threshold in thresholds:
            # Filter metrics for the current threshold
            threshold_metrics = [tm for tm in threshold_metrics_list if tm['threshold'] == threshold]
            
            accuracies = [tm['accuracy'] for tm in threshold_metrics]
            precisions = [tm['precision'] for tm in threshold_metrics]
            recalls = [tm['recall'] for tm in threshold_metrics]
            f1s = [tm['f1'] for tm in threshold_metrics]
            mccs = [tm['mcc'] for tm in threshold_metrics]
            
            avg_accuracy = np.mean(accuracies)
            std_accuracy = np.std(accuracies, ddof=1)
            avg_precision = np.mean(precisions)
            std_precision = np.std(precisions, ddof=1)
            avg_recall = np.mean(recalls)
            std_recall = np.std(recalls, ddof=1)
            avg_f1 = np.mean(f1s)
            std_f1 = np.std(f1s, ddof=1)
            avg_mcc = np.mean(mccs)
            std_mcc = np.std(mccs, ddof=1)
            
            # Store the metrics along with parameters and threshold
            metrics_per_combination.append({
                **param_dict,
                'threshold': threshold,
                'accuracy': avg_accuracy,
                'std_accuracy': std_accuracy,
                'precision': avg_precision,
                'std_precision': std_precision,
                'recall': avg_recall,
                'std_recall': std_recall,
                'f1': avg_f1,
                'std_f1': std_f1,
                'mcc': avg_mcc,
                'std_mcc': std_mcc
            })
    
    # Now, find the parameter combination with the best F1 score
    best_result = max(metrics_per_combination, key=lambda x: x['f1'])
    
    # Save the results to CSV
    df_metrics = pd.DataFrame(metrics_per_combination)
    outFile_metrics = os.path.join(outDir, f"svm-threshold-results.csv")
    df_metrics.to_csv(outFile_metrics, index=False)
    
    print(f"Results saved to: {outFile_metrics}")
    
    # Extract the best parameters, threshold, and metrics
    best_params = {key: best_result[key] for key in param_keys}
    best_threshold = best_result['threshold']
    best_f1 = best_result['f1']
    std_f1_best = best_result['std_f1']
    final_mcc = best_result['mcc']
    std_mcc_best = best_result['std_mcc']
    
    print("\nBest Parameters, Threshold, and Metrics:")
    print(f"Best Parameters: {best_params}")
    print(f"Best Threshold: {best_threshold}")
    print(f"Best F1 Score: {best_f1:.4f} (Std Dev: {std_f1_best:.4f})")
    print(f"Final MCC: {final_mcc:.4f} (Std Dev: {std_mcc_best:.4f})")
    
    return best_params, best_threshold, best_f1, std_f1_best, final_mcc, std_mcc_best

# Main Execution
outDir = "results"
os.makedirs(outDir, exist_ok=True)

# Run SVM with SMOTE, PCA, and Threshold adjustment using 5-fold cross-validation
print("\nStarting SVM analysis with SMOTE, PCA, and Threshold adjustment for 5-fold cross-validation...")
best_params_5folds, best_threshold_5folds, best_f1_5folds, std_f1_best_5folds, final_mcc_5folds, std_mcc_best_5folds = runSVMWithSMOTE_PCA(
    dataPoints, dataLabelsList, outDir, n_splits=5)

# Display results
print("\nBest results for SVM with SMOTE, PCA, and Threshold adjustment 5-fold cross-validation:")
print(f"Best Parameters: {best_params_5folds}")
print(f"Best Threshold: {best_threshold_5folds}")
print(f"Best F1 Score: {best_f1_5folds:.4f} (Std Dev: {std_f1_best_5folds:.4f})")
print(f"Final MCC: {final_mcc_5folds:.4f} (Std Dev: {std_mcc_best_5folds:.4f})")



Starting SVM analysis with SMOTE, PCA, and Threshold adjustment for 5-fold cross-validation...
Results saved to: results\svm-threshold-results.csv

Best Parameters, Threshold, and Metrics:
Best Parameters: {'pca__n_components': 220, 'svm__C': 0.01, 'svm__kernel': 'linear'}
Best Threshold: 0.2
Best F1 Score: 0.7915 (Std Dev: 0.1280)
Final MCC: 0.7722 (Std Dev: 0.1260)

Best results for SVM with SMOTE, PCA, and Threshold adjustment 5-fold cross-validation:
Best Parameters: {'pca__n_components': 220, 'svm__C': 0.01, 'svm__kernel': 'linear'}
Best Threshold: 0.2
Best F1 Score: 0.7915 (Std Dev: 0.1280)
Final MCC: 0.7722 (Std Dev: 0.1260)


: 

## XGB

In [9]:
import os
import time
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import (precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef)
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from itertools import product

def runXGBWithSMOTE_PCA(dataPoints, dataLabelsList, outDir, n_splits):
    v0 = time.perf_counter()
    
    # Define parameter grid
    param_grid = {
        'pca__n_components': [150],     # PCA components
        'xgb__n_estimators': [150],          # Number of boosting rounds
        'xgb__max_depth': [5],               # Maximum depth of a tree
        'xgb__learning_rate': [ 0.3],   # Learning rate
    }
    
    param_combinations = list(product(*param_grid.values()))
    param_keys = list(param_grid.keys())
    
    # Prepare to store metrics
    metrics_per_combination = []
    
    # Define thresholds to evaluateF
    thresholds = np.linspace(0.1, 0.9, 9)  # Thresholds from 0.1 to 0.9
    
    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Loop over each hyperparameter combination
    for params in param_combinations:
        param_dict = dict(zip(param_keys, params))
        
        # Initialize lists to store metrics per threshold
        threshold_metrics_list = []
        
        # For each fold in cross-validation
        for fold_idx, (train_index, test_index) in enumerate(skf.split(dataPoints, dataLabelsList)):
            X_train = [dataPoints[i] for i in train_index]
            X_test = [dataPoints[i] for i in test_index]
            y_train = [dataLabelsList[i] for i in train_index]
            y_test = [dataLabelsList[i] for i in test_index]
            
            # Define a pipeline with CountVectorizer, PCA, SMOTE, and XGBClassifier with current params
            pipeline = ImbPipeline([
                ('vectorizer', CountVectorizer(stop_words=None)),
                ('pca', PCA(n_components=param_dict['pca__n_components'], random_state=42)),
                ('xgb', XGBClassifier(
                    n_estimators=param_dict['xgb__n_estimators'],
                    max_depth=param_dict['xgb__max_depth'],
                    learning_rate=param_dict['xgb__learning_rate'],
                    eval_metric='logloss',
                    use_label_encoder=False,
                    random_state=42))
            ])
            
            # Train the pipeline
            pipeline.fit(X_train, y_train)
            
            # Predict probabilities
            y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
            
            # Iterate over thresholds
            for threshold in thresholds:
                y_pred_threshold = (y_pred_proba >= threshold).astype(int)
                
                # Calculate metrics
                accuracy = accuracy_score(y_test, y_pred_threshold)
                precision = precision_score(y_test, y_pred_threshold, zero_division=1)
                recall = recall_score(y_test, y_pred_threshold, zero_division=1)
                f1 = f1_score(y_test, y_pred_threshold, zero_division=1)
                mcc = matthews_corrcoef(y_test, y_pred_threshold)
                
                # Store fold metrics
                threshold_metrics_list.append({
                    **param_dict,
                    'threshold': threshold,
                    'fold': fold_idx + 1,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'mcc': mcc
                })
        
        # Calculate average metrics over folds for each threshold
        for threshold in thresholds:
            # Filter metrics for the current threshold
            threshold_metrics = [tm for tm in threshold_metrics_list if tm['threshold'] == threshold]
            
            avg_accuracy = np.mean([tm['accuracy'] for tm in threshold_metrics])
            avg_precision = np.mean([tm['precision'] for tm in threshold_metrics])
            avg_recall = np.mean([tm['recall'] for tm in threshold_metrics])
            avg_f1 = np.mean([tm['f1'] for tm in threshold_metrics])
            avg_mcc = np.mean([tm['mcc'] for tm in threshold_metrics])
            
            # Store the metrics along with parameters and threshold
            metrics_per_combination.append({
                **param_dict,
                'threshold': threshold,
                'accuracy': avg_accuracy,
                'precision': avg_precision,
                'recall': avg_recall,
                'f1': avg_f1,
                'mcc': avg_mcc
            })
    
    # Now, find the parameter combination with the best F1 score
    best_result = max(metrics_per_combination, key=lambda x: x['f1'])
    
    # Save the results to CSV
    df_metrics = pd.DataFrame(metrics_per_combination)
    outFile_metrics = os.path.join(outDir, f"xgb-results.csv")
    df_metrics.to_csv(outFile_metrics, index=False)
    
    print(f"Results saved to: {outFile_metrics}")
    
    # Extract the best parameters, threshold, and metrics
    best_params = {key: best_result[key] for key in param_keys}
    best_threshold = best_result['threshold']
    best_f1 = best_result['f1']
    final_mcc = best_result['mcc']
    
    print("\nBest Parameters, Threshold, and Metrics:")
    print(f"Best Parameters: {best_params}")
    print(f"Best Threshold: {best_threshold}")
    print(f"Best F1 Score: {best_f1}")
    print(f"Final MCC: {final_mcc}")
    
    return best_params, best_threshold, best_f1, final_mcc

# Main Execution for 5-Fold Cross-Validation
outDir = "results"
os.makedirs(outDir, exist_ok=True)

# Run XGBoost with SMOTE, PCA, and Threshold adjustment using 5-fold cross-validation
print("\nStarting XGBoost analysis with SMOTE, PCA, and Threshold adjustment for 5-fold cross-validation...")
best_params_5folds, best_threshold_5folds, best_f1_5folds, final_mcc_5folds = runXGBWithSMOTE_PCA(
    dataPoints, dataLabelsList, outDir, n_splits=5)

# Display results
print("\nBest results for XGBoost with SMOTE, PCA, and Threshold adjustment 5-fold cross-validation:")
print(f"Best Parameters: {best_params_5folds}")
print(f"Best Threshold: {best_threshold_5folds}")
print(f"Best F1 Score: {best_f1_5folds}")
print(f"Final MCC: {final_mcc_5folds}")



Starting XGBoost analysis with SMOTE, PCA, and Threshold adjustment for 5-fold cross-validation...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Results saved to: results\xgb-results.csv

Best Parameters, Threshold, and Metrics:
Best Parameters: {'pca__n_components': 150, 'xgb__n_estimators': 150, 'xgb__max_depth': 5, 'xgb__learning_rate': 0.3}
Best Threshold: 0.1
Best F1 Score: 0.7137106357694594
Final MCC: 0.6657931014619443

Best results for XGBoost with SMOTE, PCA, and Threshold adjustment 5-fold cross-validation:
Best Parameters: {'pca__n_components': 150, 'xgb__n_estimators': 150, 'xgb__max_depth': 5, 'xgb__learning_rate': 0.3}
Best Threshold: 0.1
Best F1 Score: 0.7137106357694594
Final MCC: 0.6657931014619443
