In [1]:


import os
import time
import zipfile
import numpy as np
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.random_projection import SparseRandomProjection

from sklearn.metrics import (precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef, make_scorer) 
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
import warnings

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):  # Only consider Python files
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")

    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList



###############################################################################
# Data Extraction

# Parameters setup
flakyZip = "Dataset/flaky_files.zip"
nonFlakyZip = "Dataset/nonflaky_files.zip"
extractDir = "extracted"
os.makedirs(extractDir, exist_ok=True)

best_hyperparameter = 'results/best_hyperparameter'
os.makedirs(best_hyperparameter,exist_ok=True)
# Extract the zip files
# Extract and read data once for  non-flaky combination
flakyDir = os.path.join(extractDir, 'flaky')
nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
os.makedirs(flakyDir, exist_ok=True)
os.makedirs(nonFlakyDir, exist_ok=True)

extract_zip(flakyZip, flakyDir)
extract_zip(nonFlakyZip, nonFlakyDir)

dataPointsFlaky = getDataPoints(flakyDir)
dataPointsNonFlaky = getDataPoints(nonFlakyDir)
dataPoints = dataPointsFlaky + dataPointsNonFlaky

# Print the number of datasets for  combination
print(f"Number of flaky documents ( combination): {len(dataPointsFlaky)}")
print(f"Number of non-flaky documents ( combination): {len(dataPointsNonFlaky)}")
print(f"Total number of documents ( combination): {len(dataPoints)}")

dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))

warnings.filterwarnings('ignore')

Number of flaky documents ( combination): 45
Number of non-flaky documents ( combination): 243
Total number of documents ( combination): 288


## KNN

In [2]:
import os
import time
import numpy as np
import pandas as pd
import json  # Import json to load best hyperparameters
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef)
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from itertools import product

def KNNwithSMOTEThreshold(dataPoints, dataLabelsList, outDir, n_splits):
    # Define parameter grid

    # Load best hyperparameters from JSON file
    with open(os.path.join(best_hyperparameter, 'best_params_knn.json'), 'r') as f:
        best_params_knn = json.load(f)

    param_grid = {
        'pca__n_components': [best_params_knn['pca__n_components']],             
        'knn__n_neighbors': [best_params_knn['knn__n_neighbors']],                  
        'knn__weights': [best_params_knn['knn__weights']],               
        'knn__metric': [best_params_knn['knn__metric']]                 
    }
    
    param_combinations = list(product(*param_grid.values()))
    param_keys = list(param_grid.keys())
    
    # Prepare to store metrics
    metrics_per_combination = []
    
    # Define thresholds to evaluate
    thresholds = np.linspace(0.1, 0.9, 9)  # Thresholds from 0.1 to 0.9
    
    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Loop over each hyperparameter combination
    for params in param_combinations:
        param_dict = dict(zip(param_keys, params))
        
        # Initialize lists to store metrics per threshold
        threshold_metrics_list = []
        
        # For each fold in cross-validation
        for fold_idx, (train_index, test_index) in enumerate(skf.split(dataPoints, dataLabelsList)):
            X_train = [dataPoints[i] for i in train_index]
            X_test = [dataPoints[i] for i in test_index]
            y_train = [dataLabelsList[i] for i in train_index]
            y_test = [dataLabelsList[i] for i in test_index]
            
            # Define a pipeline with CountVectorizer, PCA, SMOTE, and KNN with current params
            pipeline = ImbPipeline([
                ('vectorizer', CountVectorizer(stop_words=None)),
                ('smote', SMOTE(random_state=42)),
                ('pca', PCA(n_components=param_dict['pca__n_components'], random_state=42)),
                ('knn', KNeighborsClassifier(
                    n_neighbors=param_dict['knn__n_neighbors'],
                    weights=param_dict['knn__weights'],
                    metric=param_dict['knn__metric']))
            ])
            
            # Train the pipeline
            pipeline.fit(X_train, y_train)
            
            # Predict probabilities
            y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
            
            # Iterate over thresholds
            for threshold in thresholds:
                y_pred_threshold = (y_pred_proba >= threshold).astype(int)
                
                # Calculate metrics
                accuracy = accuracy_score(y_test, y_pred_threshold)
                precision = precision_score(y_test, y_pred_threshold, zero_division=1)
                recall = recall_score(y_test, y_pred_threshold, zero_division=1)
                f1 = f1_score(y_test, y_pred_threshold, zero_division=1)
                mcc = matthews_corrcoef(y_test, y_pred_threshold)
                
                # Store fold metrics
                threshold_metrics_list.append({
                    **param_dict,
                    'threshold': threshold,
                    'fold': fold_idx + 1,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'mcc': mcc
                })
        
        # Calculate average metrics and standard deviations over folds for each threshold
        for threshold in thresholds:
            # Filter metrics for the current threshold
            threshold_metrics = [tm for tm in threshold_metrics_list if tm['threshold'] == threshold]
            
            # Extract metrics per fold
            accuracies = [tm['accuracy'] for tm in threshold_metrics]
            precisions = [tm['precision'] for tm in threshold_metrics]
            recalls = [tm['recall'] for tm in threshold_metrics]
            f1s = [tm['f1'] for tm in threshold_metrics]
            mccs = [tm['mcc'] for tm in threshold_metrics]
            
            # Calculate mean and standard deviation
            avg_accuracy = np.mean(accuracies)
            std_accuracy = np.std(accuracies)
            avg_precision = np.mean(precisions)
            std_precision = np.std(precisions)
            avg_recall = np.mean(recalls)
            std_recall = np.std(recalls)
            avg_f1 = np.mean(f1s)
            std_f1 = np.std(f1s)
            avg_mcc = np.mean(mccs)
            std_mcc = np.std(mccs)
            
            # Store the metrics along with parameters and threshold
            metrics_per_combination.append({
                **param_dict,
                'threshold': threshold,
                'accuracy': avg_accuracy,
                'std_accuracy': std_accuracy,
                'precision': avg_precision,
                'std_precision': std_precision,
                'recall': avg_recall,
                'std_recall': std_recall,
                'f1': avg_f1,
                'std_f1': std_f1,
                'mcc': avg_mcc,
                'std_mcc': std_mcc
            })
    
    # Now, find the parameter combination with the best F1 score
    best_result = max(metrics_per_combination, key=lambda x: x['f1'])
    
    # Save the results to CSV
    df_metrics = pd.DataFrame(metrics_per_combination)
    outFile_metrics = os.path.join(outDir, f"knn-smote-threshold-results.csv")
    df_metrics.to_csv(outFile_metrics, index=False)
    
    print(f"Results saved to: {outFile_metrics}")
    
    # Extract the best parameters, threshold, and metrics
    best_params = {key: best_result[key] for key in param_keys}
    best_threshold = best_result['threshold']
    best_f1 = best_result['f1']
    std_f1 = best_result['std_f1']
    final_mcc = best_result['mcc']
    std_mcc = best_result['std_mcc']
    
    print("\nBest Parameters, Threshold, and Metrics:")
    print(f"Best Parameters: {best_params}")
    print(f"Best Threshold: {best_threshold}")
    print(f"Best F1 Score: {best_f1} (Std Dev: {std_f1})")
    print(f"Final MCC: {final_mcc} (Std Dev: {std_mcc})")
    
    return best_params, best_threshold, best_f1, std_f1, final_mcc, std_mcc

# Main Execution for 5-Fold Cross-Validation
outDir = "results/hybrid"
os.makedirs(outDir, exist_ok=True)

# Run KNN with SMOTE, PCA, and Threshold adjustment using 5-fold cross-validation
print("\nStarting KNN analysis with SMOTE, PCA, and Threshold adjustment for 5-fold cross-validation...")
best_params_5folds, best_threshold_5folds, best_f1_5folds, std_f1_5folds, final_mcc_5folds, std_mcc_5folds = KNNwithSMOTEThreshold(
    dataPoints, dataLabelsList, outDir, n_splits=5)

# Display results
print("\nBest results for KNN with SMOTE, PCA, and Threshold adjustment 5-fold cross-validation:")
print(f"Best Parameters: {best_params_5folds}")
print(f"Best Threshold: {best_threshold_5folds}")
print(f"Best F1 Score: {best_f1_5folds} (Std Dev: {std_f1_5folds})")
print(f"Final MCC: {final_mcc_5folds} (Std Dev: {std_mcc_5folds})")



Starting KNN analysis with SMOTE, PCA, and Threshold adjustment for 5-fold cross-validation...
Results saved to: results/hybrid\knn-smote-threshold-results.csv

Best Parameters, Threshold, and Metrics:
Best Parameters: {'pca__n_components': 200, 'knn__n_neighbors': 7, 'knn__weights': 'distance', 'knn__metric': 'cosine'}
Best Threshold: 0.7000000000000001
Best F1 Score: 0.6229509671614935 (Std Dev: 0.1556934194689674)
Final MCC: 0.5801774825249082 (Std Dev: 0.16143406592047685)

Best results for KNN with SMOTE, PCA, and Threshold adjustment 5-fold cross-validation:
Best Parameters: {'pca__n_components': 200, 'knn__n_neighbors': 7, 'knn__weights': 'distance', 'knn__metric': 'cosine'}
Best Threshold: 0.7000000000000001
Best F1 Score: 0.6229509671614935 (Std Dev: 0.1556934194689674)
Final MCC: 0.5801774825249082 (Std Dev: 0.16143406592047685)


## SVM

In [3]:
import os
import time
import numpy as np
import pandas as pd
import json  # Import json to load best hyperparameters
from sklearn.svm import SVC
from sklearn.metrics import (precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef)
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from itertools import product

def SVM_SMOTE_Threshold(dataPoints, dataLabelsList, outDir, n_splits):
    v0 = time.perf_counter()
    
    # Load best hyperparameters from JSON file
    with open(os.path.join(best_hyperparameter, 'best_params_svm.json'), 'r') as f:
        best_params_svm = json.load(f)
    # Define parameter grid
    param_grid = {
        'pca__n_components': [best_params_svm['pca__n_components']],          # PCA components
        'svm__C': [best_params_svm['svm__C']],                    # Regularization parameter
        'svm__kernel': [best_params_svm['svm__kernel']]            # Kernel types
    }
    
    param_combinations = list(product(*param_grid.values()))
    param_keys = list(param_grid.keys())
    
    # Prepare to store metrics
    metrics_per_combination = []
    
    # Define thresholds to evaluate
    thresholds = np.linspace(0.1, 0.9, 9)  # Thresholds from 0.1 to 0.9
    
    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Loop over each hyperparameter combination
    for params in param_combinations:
        param_dict = dict(zip(param_keys, params))
        
        # Initialize lists to store metrics per threshold
        threshold_metrics_list = []
        
        # For each fold in cross-validation
        for fold_idx, (train_index, test_index) in enumerate(skf.split(dataPoints, dataLabelsList)):
            X_train = [dataPoints[i] for i in train_index]
            X_test = [dataPoints[i] for i in test_index]
            y_train = [dataLabelsList[i] for i in train_index]
            y_test = [dataLabelsList[i] for i in test_index]
            
            # Define a pipeline with CountVectorizer, PCA, SMOTE, and SVM with current params
            pipeline = ImbPipeline([
                ('vectorizer', CountVectorizer(stop_words=None)),
                ('smote', SMOTE(random_state=42)),
                ('pca', PCA(n_components=param_dict['pca__n_components'], random_state=42)),
                ('svm', SVC(
                    C=param_dict['svm__C'],
                    kernel=param_dict['svm__kernel'],
                    probability=True,
                    random_state=42))
            ])
            
            # Train the pipeline
            pipeline.fit(X_train, y_train)
            
            # Predict probabilities
            y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
            
            # Iterate over thresholds
            for threshold in thresholds:
                y_pred_threshold = (y_pred_proba >= threshold).astype(int)
                
                # Calculate metrics
                accuracy = accuracy_score(y_test, y_pred_threshold)
                precision = precision_score(y_test, y_pred_threshold, zero_division=1)
                recall = recall_score(y_test, y_pred_threshold, zero_division=1)
                f1 = f1_score(y_test, y_pred_threshold, zero_division=1)
                mcc = matthews_corrcoef(y_test, y_pred_threshold)
                
                # Store fold metrics
                threshold_metrics_list.append({
                    **param_dict,
                    'threshold': threshold,
                    'fold': fold_idx + 1,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'mcc': mcc
                })
        
        # Calculate average metrics and standard deviations over folds for each threshold
        for threshold in thresholds:
            # Filter metrics for the current threshold
            threshold_metrics = [tm for tm in threshold_metrics_list if tm['threshold'] == threshold]
            
            # Extract metrics per fold
            accuracies = [tm['accuracy'] for tm in threshold_metrics]
            precisions = [tm['precision'] for tm in threshold_metrics]
            recalls = [tm['recall'] for tm in threshold_metrics]
            f1s = [tm['f1'] for tm in threshold_metrics]
            mccs = [tm['mcc'] for tm in threshold_metrics]
            
            # Calculate mean and standard deviation
            avg_accuracy = np.mean(accuracies)
            std_accuracy = np.std(accuracies)
            avg_precision = np.mean(precisions)
            std_precision = np.std(precisions)
            avg_recall = np.mean(recalls)
            std_recall = np.std(recalls)
            avg_f1 = np.mean(f1s)
            std_f1 = np.std(f1s)
            avg_mcc = np.mean(mccs)
            std_mcc = np.std(mccs)
            
            # Store the metrics along with parameters and threshold
            metrics_per_combination.append({
                **param_dict,
                'threshold': threshold,
                'accuracy': avg_accuracy,
                'std_accuracy': std_accuracy,
                'precision': avg_precision,
                'std_precision': std_precision,
                'recall': avg_recall,
                'std_recall': std_recall,
                'f1': avg_f1,
                'std_f1': std_f1,
                'mcc': avg_mcc,
                'std_mcc': std_mcc
            })
    
    # Now, find the parameter combination with the best F1 score
    best_result = max(metrics_per_combination, key=lambda x: x['f1'])
    
    # Save the results to CSV
    df_metrics = pd.DataFrame(metrics_per_combination)
    outFile_metrics = os.path.join(outDir, f"svm-smote-threshold-results.csv")
    df_metrics.to_csv(outFile_metrics, index=False)
    
    print(f"Results saved to: {outFile_metrics}")
    
    # Extract the best parameters, threshold, and metrics
    best_params = {key: best_result[key] for key in param_keys}
    best_threshold = best_result['threshold']
    best_f1 = best_result['f1']
    std_f1 = best_result['std_f1']
    final_mcc = best_result['mcc']
    std_mcc = best_result['std_mcc']
    
    print("\nBest Parameters, Threshold, and Metrics:")
    print(f"Best Parameters: {best_params}")
    print(f"Best Threshold: {best_threshold}")
    print(f"Best F1 Score: {best_f1} (Std Dev: {std_f1})")
    print(f"Final MCC: {final_mcc} (Std Dev: {std_mcc})")
    
    return best_params, best_threshold, best_f1, std_f1, final_mcc, std_mcc

# Main Execution
outDir = "results/hybrid"
os.makedirs(outDir, exist_ok=True)

# Run SVM with SMOTE, PCA, and Threshold adjustment using 5-fold cross-validation
print("\nStarting SVM analysis with SMOTE, PCA, and Threshold adjustment for 5-fold cross-validation...")
best_params_5folds, best_threshold_5folds, best_f1_5folds, std_f1_5folds, final_mcc_5folds, std_mcc_5folds = SVM_SMOTE_Threshold(
    dataPoints, dataLabelsList, outDir, n_splits=5)

# Display results
print("\nBest results for SVM with SMOTE, PCA, and Threshold adjustment 5-fold cross-validation:")
print(f"Best Parameters: {best_params_5folds}")
print(f"Best Threshold: {best_threshold_5folds}")
print(f"Best F1 Score: {best_f1_5folds} (Std Dev: {std_f1_5folds})")
print(f"Final MCC: {final_mcc_5folds} (Std Dev: {std_mcc_5folds})")



Starting SVM analysis with SMOTE, PCA, and Threshold adjustment for 5-fold cross-validation...
Results saved to: results/hybrid\svm-smote-threshold-results.csv

Best Parameters, Threshold, and Metrics:
Best Parameters: {'pca__n_components': 180, 'svm__C': 0.01, 'svm__kernel': 'linear'}
Best Threshold: 0.1
Best F1 Score: 0.7191387559808613 (Std Dev: 0.05468294599571738)
Final MCC: 0.6671247721541482 (Std Dev: 0.06638462910356159)

Best results for SVM with SMOTE, PCA, and Threshold adjustment 5-fold cross-validation:
Best Parameters: {'pca__n_components': 180, 'svm__C': 0.01, 'svm__kernel': 'linear'}
Best Threshold: 0.1
Best F1 Score: 0.7191387559808613 (Std Dev: 0.05468294599571738)
Final MCC: 0.6671247721541482 (Std Dev: 0.06638462910356159)


## XGB

In [4]:
import os
import time
import numpy as np
import pandas as pd
import json
from xgboost import XGBClassifier
from sklearn.metrics import (precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef)
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from itertools import product

def runXGBWithSMOTE_PCA(dataPoints, dataLabelsList, outDir, n_splits):
    v0 = time.perf_counter()
    with open(os.path.join(best_hyperparameter, 'best_params_xgb.json'), 'r') as f:
        best_params_xgb = json.load(f)
    # Define parameter grid
    param_grid = {
        'xgb__n_estimators': [best_params_xgb['xgb__n_estimators']],     # Number of boosting rounds
        'xgb__max_depth': [best_params_xgb['xgb__max_depth']],           # Maximum depth of a tree
        'xgb__learning_rate': [best_params_xgb['xgb__learning_rate']],   # Learning rate
    }
    
    param_combinations = list(product(*param_grid.values()))
    param_keys = list(param_grid.keys())
    
    # Prepare to store metrics
    metrics_per_combination = []
    
    # Define thresholds to evaluate
    thresholds = np.linspace(0.1, 0.9, 9)  # Thresholds from 0.1 to 0.9
    
    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Loop over each hyperparameter combination
    for params in param_combinations:
        param_dict = dict(zip(param_keys, params))
        
        # Initialize lists to store metrics per threshold
        threshold_metrics_list = []
        
        # For each fold in cross-validation
        for fold_idx, (train_index, test_index) in enumerate(skf.split(dataPoints, dataLabelsList)):
            X_train = [dataPoints[i] for i in train_index]
            X_test = [dataPoints[i] for i in test_index]
            y_train = [dataLabelsList[i] for i in train_index]
            y_test = [dataLabelsList[i] for i in test_index]
            
            # Define a pipeline with CountVectorizer, PCA, SMOTE, and XGBClassifier with current params
            pipeline = ImbPipeline([
                ('vectorizer', CountVectorizer(stop_words=None)),                                
                ('smote', SMOTE(random_state=42)),
                ('xgb', XGBClassifier(
                    n_estimators=param_dict['xgb__n_estimators'],
                    max_depth=param_dict['xgb__max_depth'],
                    learning_rate=param_dict['xgb__learning_rate'],
                    eval_metric='logloss',
                    use_label_encoder=False,
                    random_state=42))
            ])
            
            # Train the pipeline
            pipeline.fit(X_train, y_train)
            
            # Predict probabilities
            y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
            
            # Iterate over thresholds
            for threshold in thresholds:
                y_pred_threshold = (y_pred_proba >= threshold).astype(int)
                
                # Calculate metrics
                accuracy = accuracy_score(y_test, y_pred_threshold)
                precision = precision_score(y_test, y_pred_threshold, zero_division=1)
                recall = recall_score(y_test, y_pred_threshold, zero_division=1)
                f1 = f1_score(y_test, y_pred_threshold, zero_division=1)
                mcc = matthews_corrcoef(y_test, y_pred_threshold)
                
                # Store fold metrics
                threshold_metrics_list.append({
                    **param_dict,
                    'threshold': threshold,
                    'fold': fold_idx + 1,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'mcc': mcc
                })
        
        # Calculate average metrics and standard deviations over folds for each threshold
        for threshold in thresholds:
            # Filter metrics for the current threshold
            threshold_metrics = [tm for tm in threshold_metrics_list if tm['threshold'] == threshold]
            
            # Extract metrics per fold
            accuracies = [tm['accuracy'] for tm in threshold_metrics]
            precisions = [tm['precision'] for tm in threshold_metrics]
            recalls = [tm['recall'] for tm in threshold_metrics]
            f1s = [tm['f1'] for tm in threshold_metrics]
            mccs = [tm['mcc'] for tm in threshold_metrics]
            
            # Calculate mean and standard deviation
            avg_accuracy = np.mean(accuracies)
            std_accuracy = np.std(accuracies, ddof=1)  # Use ddof=1 for sample standard deviation
            avg_precision = np.mean(precisions)
            std_precision = np.std(precisions, ddof=1)
            avg_recall = np.mean(recalls)
            std_recall = np.std(recalls, ddof=1)
            avg_f1 = np.mean(f1s)
            std_f1 = np.std(f1s, ddof=1)
            avg_mcc = np.mean(mccs)
            std_mcc = np.std(mccs, ddof=1)
            
            # Store the metrics along with parameters and threshold
            metrics_per_combination.append({
                **param_dict,
                'threshold': threshold,
                'accuracy': avg_accuracy,
                'std_accuracy': std_accuracy,
                'precision': avg_precision,
                'std_precision': std_precision,
                'recall': avg_recall,
                'std_recall': std_recall,
                'f1': avg_f1,
                'std_f1': std_f1,
                'mcc': avg_mcc,
                'std_mcc': std_mcc
            })
    
    # Now, find the parameter combination with the best F1 score
    best_result = max(metrics_per_combination, key=lambda x: x['f1'])
    
    # Save the results to CSV
    df_metrics = pd.DataFrame(metrics_per_combination)
    outFile_metrics = os.path.join(outDir, f"xgb-smote-pca-threshold-results.csv")
    df_metrics.to_csv(outFile_metrics, index=False)
    
    print(f"Results saved to: {outFile_metrics}")
    
    # Extract the best parameters, threshold, and metrics
    best_params = {key: best_result[key] for key in param_keys}
    best_threshold = best_result['threshold']
    best_f1 = best_result['f1']
    std_f1 = best_result['std_f1']
    final_mcc = best_result['mcc']
    std_mcc = best_result['std_mcc']
    
    print("\nBest Parameters, Threshold, and Metrics:")
    print(f"Best Parameters: {best_params}")
    print(f"Best Threshold: {best_threshold}")
    print(f"Best F1 Score: {best_f1:.4f} ± {std_f1:.4f}")
    print(f"Final MCC: {final_mcc:.4f} ± {std_mcc:.4f}")
    
    return best_params, best_threshold, best_f1, std_f1, final_mcc, std_mcc

# Main Execution for 5-Fold Cross-Validation
outDir = "results/hybrid"
os.makedirs(outDir, exist_ok=True)

# Run XGBoost with SMOTE, PCA, and Threshold adjustment using 5-fold cross-validation
print("\nStarting XGBoost analysis with SMOTE, PCA, and Threshold adjustment for 5-fold cross-validation...")
best_params_5folds, best_threshold_5folds, best_f1_5folds, std_f1_5folds, final_mcc_5folds, std_mcc_5folds = runXGBWithSMOTE_PCA(
    dataPoints, dataLabelsList, outDir, n_splits=5)

# Display results
print("\nBest results for XGBoost with SMOTE, PCA, and Threshold adjustment 5-fold cross-validation:")
print(f"Best Parameters: {best_params_5folds}")
print(f"Best Threshold: {best_threshold_5folds}")
print(f"Best F1 Score: {best_f1_5folds:.4f} ± {std_f1_5folds:.4f}")
print(f"Final MCC: {final_mcc_5folds:.4f} ± {std_mcc_5folds:.4f}")



Starting XGBoost analysis with SMOTE, PCA, and Threshold adjustment for 5-fold cross-validation...
Results saved to: results/hybrid\xgb-smote-pca-threshold-results.csv

Best Parameters, Threshold, and Metrics:
Best Parameters: {'xgb__n_estimators': 200, 'xgb__max_depth': 3, 'xgb__learning_rate': 0.01}
Best Threshold: 0.5
Best F1 Score: 0.8839 ± 0.1068
Final MCC: 0.8765 ± 0.1051

Best results for XGBoost with SMOTE, PCA, and Threshold adjustment 5-fold cross-validation:
Best Parameters: {'xgb__n_estimators': 200, 'xgb__max_depth': 3, 'xgb__learning_rate': 0.01}
Best Threshold: 0.5
Best F1 Score: 0.8839 ± 0.1068
Final MCC: 0.8765 ± 0.1051


## Random Forest

In [5]:
import os
import time
import numpy as np
import pandas as pd
import json  # Import json to load best hyperparameters
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef)
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.feature_extraction.text import CountVectorizer
from itertools import product

def runRFWithSMOTE(dataPoints, dataLabelsList, outDir, n_splits):
    v0 = time.perf_counter()
    with open(os.path.join("results/best_hyperparameter", 'best_params_rf.json'), 'r') as f:
        best_params_rf = json.load(f)

    # Define parameter grid
    param_grid = {
        'rf__n_estimators': [best_params_rf['rf__n_estimators']],          # Number of trees in the forest
        'rf__max_depth': [best_params_rf['rf__max_depth']],             # Maximum depth of the tree
        'rf__min_samples_split': [best_params_rf['rf__min_samples_split']],      # Minimum number of samples required to split a node
        'rf__min_samples_leaf': [best_params_rf['rf__min_samples_leaf']],       # Minimum number of samples required at a leaf node
        'rf__criterion': [best_params_rf['rf__criterion']]         # Function to measure the quality of a split
    }
    
    param_combinations = list(product(*param_grid.values()))
    param_keys = list(param_grid.keys())
    
    # Prepare to store metrics
    metrics_per_combination = []
    
    # Define thresholds to evaluate
    thresholds = np.linspace(0.1, 0.9, 9)  # Thresholds from 0.1 to 0.9
    
    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Loop over each hyperparameter combination
    for params in param_combinations:
        param_dict = dict(zip(param_keys, params))
        
        # Initialize lists to store metrics per threshold
        threshold_metrics_list = []
        
        # For each fold in cross-validation
        for fold_idx, (train_index, test_index) in enumerate(skf.split(dataPoints, dataLabelsList)):
            X_train = [dataPoints[i] for i in train_index]
            X_test = [dataPoints[i] for i in test_index]
            y_train = [dataLabelsList[i] for i in train_index]
            y_test = [dataLabelsList[i] for i in test_index]
            
            # Define a pipeline with CountVectorizer, SMOTE, and Random Forest with current params
            pipeline = ImbPipeline([
                ('vectorizer', CountVectorizer(stop_words=None)),
                ('smote', SMOTE(random_state=42)),
                ('rf', RandomForestClassifier(
                    n_estimators=param_dict['rf__n_estimators'],
                    max_depth=param_dict['rf__max_depth'],
                    min_samples_split=param_dict['rf__min_samples_split'],
                    min_samples_leaf=param_dict['rf__min_samples_leaf'],
                    criterion=param_dict['rf__criterion'],
                    random_state=42))
            ])
            
            # Train the pipeline
            pipeline.fit(X_train, y_train)
            
            # Predict probabilities
            y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
            
            # Iterate over thresholds
            for threshold in thresholds:
                y_pred_threshold = (y_pred_proba >= threshold).astype(int)
                
                # Calculate metrics
                accuracy = accuracy_score(y_test, y_pred_threshold)
                precision = precision_score(y_test, y_pred_threshold, zero_division=1)
                recall = recall_score(y_test, y_pred_threshold, zero_division=1)
                f1 = f1_score(y_test, y_pred_threshold, zero_division=1)
                mcc = matthews_corrcoef(y_test, y_pred_threshold)
                
                # Store fold metrics
                threshold_metrics_list.append({
                    **param_dict,
                    'threshold': threshold,
                    'fold': fold_idx + 1,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'mcc': mcc
                })
        
        # Calculate average metrics and standard deviations over folds for each threshold
        for threshold in thresholds:
            # Filter metrics for the current threshold
            threshold_metrics = [tm for tm in threshold_metrics_list if tm['threshold'] == threshold]
            
            # Extract metrics per fold
            accuracies = [tm['accuracy'] for tm in threshold_metrics]
            precisions = [tm['precision'] for tm in threshold_metrics]
            recalls = [tm['recall'] for tm in threshold_metrics]
            f1s = [tm['f1'] for tm in threshold_metrics]
            mccs = [tm['mcc'] for tm in threshold_metrics]
            
            # Calculate mean and standard deviation
            avg_accuracy = np.mean(accuracies)
            std_accuracy = np.std(accuracies)
            avg_precision = np.mean(precisions)
            std_precision = np.std(precisions)
            avg_recall = np.mean(recalls)
            std_recall = np.std(recalls)
            avg_f1 = np.mean(f1s)
            std_f1 = np.std(f1s)
            avg_mcc = np.mean(mccs)
            std_mcc = np.std(mccs)
            
            # Store the metrics along with parameters and threshold
            metrics_per_combination.append({
                **param_dict,
                'threshold': threshold,
                'accuracy': avg_accuracy,
                'std_accuracy': std_accuracy,
                'precision': avg_precision,
                'std_precision': std_precision,
                'recall': avg_recall,
                'std_recall': std_recall,
                'f1': avg_f1,
                'std_f1': std_f1,
                'mcc': avg_mcc,
                'std_mcc': std_mcc
            })
    
    # Now, find the parameter combination with the best F1 score
    best_result = max(metrics_per_combination, key=lambda x: x['f1'])
    
    # Save the results to CSV
    df_metrics = pd.DataFrame(metrics_per_combination)
    outFile_metrics = os.path.join(outDir, f"rf-smote-threshold-results.csv")
    df_metrics.to_csv(outFile_metrics, index=False)
    
    print(f"Results saved to: {outFile_metrics}")
    
    # Extract the best parameters, threshold, and metrics
    best_params = {key: best_result[key] for key in param_keys}
    best_threshold = best_result['threshold']
    best_f1 = best_result['f1']
    std_f1 = best_result['std_f1']
    final_mcc = best_result['mcc']
    std_mcc = best_result['std_mcc']
    
    print("\nBest Parameters, Threshold, and Metrics:")
    print(f"Best Parameters: {best_params}")
    print(f"Best Threshold: {best_threshold}")
    print(f"Best F1 Score: {best_f1} (Std Dev: {std_f1})")
    print(f"Final MCC: {final_mcc} (Std Dev: {std_mcc})")
    
    return best_params, best_threshold, best_f1, std_f1, final_mcc, std_mcc

# Main Execution for 5-Fold Cross-Validation
outDir = "results/hybrid"
os.makedirs(outDir, exist_ok=True)

# Run Random Forest with SMOTE and Threshold adjustment using 5-fold cross-validation
print("\nStarting Random Forest analysis with SMOTE and Threshold adjustment for 5-fold cross-validation...")
best_params_5folds, best_threshold_5folds, best_f1_5folds, std_f1_5folds, final_mcc_5folds, std_mcc_5folds = runRFWithSMOTE(
    dataPoints, dataLabelsList, outDir, n_splits=5)

# Display results
print("\nBest results for Random Forest with SMOTE and Threshold adjustment 5-fold cross-validation:")
print(f"Best Parameters: {best_params_5folds}")
print(f"Best Threshold: {best_threshold_5folds}")
print(f"Best F1 Score: {best_f1_5folds} (Std Dev: {std_f1_5folds})")
print(f"Final MCC: {final_mcc_5folds} (Std Dev: {std_mcc_5folds})")



Starting Random Forest analysis with SMOTE and Threshold adjustment for 5-fold cross-validation...
Results saved to: results/hybrid\rf-smote-threshold-results.csv

Best Parameters, Threshold, and Metrics:
Best Parameters: {'rf__n_estimators': 100, 'rf__max_depth': 20, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2, 'rf__criterion': 'entropy'}
Best Threshold: 0.5
Best F1 Score: 0.822015823873409 (Std Dev: 0.04193302466637172)
Final MCC: 0.7898138504372108 (Std Dev: 0.04984274546216977)

Best results for Random Forest with SMOTE and Threshold adjustment 5-fold cross-validation:
Best Parameters: {'rf__n_estimators': 100, 'rf__max_depth': 20, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2, 'rf__criterion': 'entropy'}
Best Threshold: 0.5
Best F1 Score: 0.822015823873409 (Std Dev: 0.04193302466637172)
Final MCC: 0.7898138504372108 (Std Dev: 0.04984274546216977)


## Decision Tree

In [6]:
import os
import time
import numpy as np
import pandas as pd
import json  # Import json to load best hyperparameters
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef)
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.feature_extraction.text import CountVectorizer
from itertools import product

def runDTWithSMOTE(dataPoints, dataLabelsList, outDir, n_splits):
    v0 = time.perf_counter()
    with open(os.path.join("results/best_hyperparameter", 'best_params_dt.json'), 'r') as f:
        best_params_dt = json.load(f)

    # Define parameter grid
    param_grid = {
        'dt__max_depth': [best_params_dt['dt__max_depth']],
        'dt__min_samples_split': [best_params_dt['dt__min_samples_split']],
        'dt__min_samples_leaf': [best_params_dt['dt__min_samples_leaf']],
        'dt__criterion': [best_params_dt['dt__criterion']],
        'dt__max_features': [best_params_dt['dt__max_features']]
    }
    
    param_combinations = list(product(*param_grid.values()))
    param_keys = list(param_grid.keys())
    
    # Prepare to store metrics
    metrics_per_combination = []
    
    # Define thresholds to evaluate
    thresholds = np.linspace(0.1, 0.9, 9)  # Thresholds from 0.1 to 0.9
    
    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Loop over each hyperparameter combination
    for params in param_combinations:
        param_dict = dict(zip(param_keys, params))
        
        # Initialize lists to store metrics per threshold
        threshold_metrics_list = []
        
        # For each fold in cross-validation
        for fold_idx, (train_index, test_index) in enumerate(skf.split(dataPoints, dataLabelsList)):
            X_train = [dataPoints[i] for i in train_index]
            X_test = [dataPoints[i] for i in test_index]
            y_train = [dataLabelsList[i] for i in train_index]
            y_test = [dataLabelsList[i] for i in test_index]
            
            # Define a pipeline with CountVectorizer, SMOTE, and Decision Tree with current params
            pipeline = ImbPipeline([
                ('vectorizer', CountVectorizer(stop_words=None)),
                ('smote', SMOTE(random_state=42)),
                ('dt', DecisionTreeClassifier(
                    max_depth=param_dict['dt__max_depth'],
                    min_samples_split=param_dict['dt__min_samples_split'],
                    min_samples_leaf=param_dict['dt__min_samples_leaf'],
                    criterion=param_dict['dt__criterion'],
                    max_features=param_dict['dt__max_features'],
                    random_state=42))
            ])
            
            # Train the pipeline
            pipeline.fit(X_train, y_train)
            
            # Predict probabilities
            y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
            
            # Iterate over thresholds
            for threshold in thresholds:
                y_pred_threshold = (y_pred_proba >= threshold).astype(int)
                
                # Calculate metrics
                accuracy = accuracy_score(y_test, y_pred_threshold)
                precision = precision_score(y_test, y_pred_threshold, zero_division=1)
                recall = recall_score(y_test, y_pred_threshold, zero_division=1)
                f1 = f1_score(y_test, y_pred_threshold, zero_division=1)
                mcc = matthews_corrcoef(y_test, y_pred_threshold)
                
                # Store fold metrics
                threshold_metrics_list.append({
                    **param_dict,
                    'threshold': threshold,
                    'fold': fold_idx + 1,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'mcc': mcc
                })
        
        # Calculate average metrics and standard deviations over folds for each threshold
        for threshold in thresholds:
            # Filter metrics for the current threshold
            threshold_metrics = [tm for tm in threshold_metrics_list if tm['threshold'] == threshold]
            
            # Extract metrics per fold
            accuracies = [tm['accuracy'] for tm in threshold_metrics]
            precisions = [tm['precision'] for tm in threshold_metrics]
            recalls = [tm['recall'] for tm in threshold_metrics]
            f1s = [tm['f1'] for tm in threshold_metrics]
            mccs = [tm['mcc'] for tm in threshold_metrics]
            
            # Calculate mean and standard deviation
            avg_accuracy = np.mean(accuracies)
            std_accuracy = np.std(accuracies)
            avg_precision = np.mean(precisions)
            std_precision = np.std(precisions)
            avg_recall = np.mean(recalls)
            std_recall = np.std(recalls)
            avg_f1 = np.mean(f1s)
            std_f1 = np.std(f1s)
            avg_mcc = np.mean(mccs)
            std_mcc = np.std(mccs)
            
            # Store the metrics along with parameters and threshold
            metrics_per_combination.append({
                **param_dict,
                'threshold': threshold,
                'accuracy': avg_accuracy,
                'std_accuracy': std_accuracy,
                'precision': avg_precision,
                'std_precision': std_precision,
                'recall': avg_recall,
                'std_recall': std_recall,
                'f1': avg_f1,
                'std_f1': std_f1,
                'mcc': avg_mcc,
                'std_mcc': std_mcc
            })
    
    # Now, find the parameter combination with the best F1 score
    best_result = max(metrics_per_combination, key=lambda x: x['f1'])
    
    # Save the results to CSV
    df_metrics = pd.DataFrame(metrics_per_combination)
    outFile_metrics = os.path.join(outDir, f"dt-smote-threshold-results-{n_splits}-folds.csv")
    df_metrics.to_csv(outFile_metrics, index=False)
    
    print(f"Results saved to: {outFile_metrics}")
    
    # Extract the best parameters, threshold, and metrics
    best_params = {key: best_result[key] for key in param_keys}
    best_threshold = best_result['threshold']
    best_f1 = best_result['f1']
    std_f1 = best_result['std_f1']
    final_mcc = best_result['mcc']
    std_mcc = best_result['std_mcc']
    
    print("\nBest Parameters, Threshold, and Metrics:")
    print(f"Best Parameters: {best_params}")
    print(f"Best Threshold: {best_threshold}")
    print(f"Best F1 Score: {best_f1} (Std Dev: {std_f1})")
    print(f"Final MCC: {final_mcc} (Std Dev: {std_mcc})")
    
    return best_params, best_threshold, best_f1, std_f1, final_mcc, std_mcc

# Main Execution for 5-Fold Cross-Validation
outDir = "results/hybrid"
os.makedirs(outDir, exist_ok=True)

# Run Decision Tree with SMOTE and Threshold adjustment using 5-fold cross-validation
print("\nStarting Decision Tree analysis with SMOTE and Threshold adjustment for 5-fold cross-validation...")
best_params_5folds, best_threshold_5folds, best_f1_5folds, std_f1_5folds, final_mcc_5folds, std_mcc_5folds = runDTWithSMOTE(
    dataPoints, dataLabelsList, outDir, n_splits=5)

# Display results
print("\nBest results for Decision Tree with SMOTE and Threshold adjustment 5-fold cross-validation:")
print(f"Best Parameters: {best_params_5folds}")
print(f"Best Threshold: {best_threshold_5folds}")
print(f"Best F1 Score: {best_f1_5folds} (Std Dev: {std_f1_5folds})")
print(f"Final MCC: {final_mcc_5folds} (Std Dev: {std_mcc_5folds})")



Starting Decision Tree analysis with SMOTE and Threshold adjustment for 5-fold cross-validation...
Results saved to: results/hybrid\dt-smote-threshold-results-5-folds.csv

Best Parameters, Threshold, and Metrics:
Best Parameters: {'dt__max_depth': 10, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 2, 'dt__criterion': 'gini', 'dt__max_features': None}
Best Threshold: 0.1
Best F1 Score: 0.8760130718954249 (Std Dev: 0.09104075452318809)
Final MCC: 0.8639675781078721 (Std Dev: 0.09751197920410426)

Best results for Decision Tree with SMOTE and Threshold adjustment 5-fold cross-validation:
Best Parameters: {'dt__max_depth': 10, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 2, 'dt__criterion': 'gini', 'dt__max_features': None}
Best Threshold: 0.1
Best F1 Score: 0.8760130718954249 (Std Dev: 0.09104075452318809)
Final MCC: 0.8639675781078721 (Std Dev: 0.09751197920410426)
