## Data prep

In [3]:
import os
import time
import zipfile
import numpy as np
import random

from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import os
import time
import math



# Set global random seeds
random.seed(42)
np.random.seed(42)

# Set environment variables for deterministic behavior
os.environ['PYTHONHASHSEED'] = '42'
os.environ['OMP_NUM_THREADS'] = '1'  # For OpenMP
os.environ['MKL_NUM_THREADS'] = '1'  # For MKL
os.environ['OPENBLAS_NUM_THREADS'] = '1'  # For OpenBLAS
os.environ['NUMEXPR_NUM_THREADS'] = '1'  # For NumExpr
#os.environ['VECLIB_MAXIMUM_THREADS'] = '1'  # For macOS Accelerate

from sklearn.feature_extraction.text import CountVectorizer

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        dirs.sort()  # Sort directories to ensure consistent traversal
        files.sort()  # Sort files to ensure consistent order
        for dataPointName in files:
            if dataPointName.endswith(".py"):  # Only consider Python files
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")

    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList



###############################################################################
# Main Execution for 5-Fold Cross-Validation

outDir = "smote-results"
os.makedirs(outDir, exist_ok=True)

# Run KNN with SMOTE and PCA using 5-fold cross-validation
print("\nStarting KNN analysis with SMOTE and PCA for 5-fold cross-validation...")
best_params_5folds, best_f1_5folds, final_mcc_5folds = runKNNWithSMOTE(dataPoints, dataLabelsList, outDir, n_splits=5)

# Display results
print("\nBest results for KNN with SMOTE and PCA 5-fold cross-validation:")
print(f"Best Parameters: {best_params_5folds}")
print(f"Best F1 Score: {best_f1_5folds}")
print(f"Final MCC: {final_mcc_5folds}")


Starting KNN analysis with SMOTE and PCA for 5-fold cross-validation...


NameError: name 'runKNNWithSMOTE' is not defined

## KNN with SMOTE|

In [23]:
import os
import time
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef, make_scorer)
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer

###############################################################################
# Custom MCC scorer function
def mcc_scorer(estimator, X, y_true):
    y_pred = estimator.predict(X)
    return matthews_corrcoef(y_true, y_pred)

###############################################################################
# KNN with SMOTE and PCA

def runKNNWithSMOTE(dataPoints, dataLabelsList, outDir, n_splits):
    # Define scoring metrics
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': mcc_scorer
    }

    # Define a pipeline with PCA, SMOTE, and KNN
    pipeline = ImbPipeline([
        ('vectorizer', CountVectorizer(stop_words=None)),  # Vectorizer
        ('pca', PCA(random_state=42)),                     # PCA for dimensionality reduction
        ('smote', SMOTE(random_state=42)),                 # SMOTE for oversampling
        ('knn', KNeighborsClassifier()),                   # KNN classifier
    ])

    # Define parameter grid for GridSearchCV
    param_grid = {
        'pca__n_components': [150, 180, 200, 220],             # PCA components
        'knn__n_neighbors': [3, 5, 7, 9, 11],     # Number of neighbors for KNN
        'knn__weights': ['uniform', 'distance'],          # Weighting options
        'knn__metric': ['euclidean', 'cosine']            # Distance metrics
    }

    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Setup GridSearchCV with the pipeline and parameter grid
    grid_search = GridSearchCV(
        pipeline, param_grid, cv=skf, scoring=scoring,
        refit='f1', verbose=1, return_train_score=True
    )

    # Record the start time
    start_time = time.perf_counter()

    # Fit the GridSearchCV to the data
    grid_search.fit(dataPoints, dataLabelsList)

    # Record the end time
    end_time = time.perf_counter()
    total_time = end_time - start_time

    # Retrieve the best parameters and score from cross-validation
    best_params = grid_search.best_params_
    best_f1_cv = grid_search.best_score_

    print(f"Best Parameters with SMOTE and PCA: {best_params}")
    print(f"Best MCC Score from cross-validation: {best_f1_cv}")

    # Extract the cross-validation results and print final metrics
    results = grid_search.cv_results_

    # Prepare per-fold metrics for CSV along with the parameter combinations
    fold_metrics = []
    for idx in range(len(results['params'])):
        fold_metrics.append({
            'n_components': results['params'][idx].get('pca__n_components'),
            'n_neighbors': results['params'][idx].get('knn__n_neighbors'),
            'weights': results['params'][idx].get('knn__weights'),
            'metric': results['params'][idx].get('knn__metric'),
            'accuracy': results.get('mean_test_accuracy', [None])[idx],
            'precision': results.get('mean_test_precision', [None])[idx],
            'recall': results.get('mean_test_recall', [None])[idx],
            'f1': results.get('mean_test_f1', [None])[idx],
            'mcc': results.get('mean_test_mcc', [None])[idx],
        })

    # Save fold-wise metrics to CSV
    df_folds = pd.DataFrame(fold_metrics)
    outFile_folds = os.path.join(outDir, f"knn-smote-pca-fold-results-{n_splits}-folds.csv")
    df_folds.to_csv(outFile_folds, index=False)

    print(f"Per-fold metrics saved to: {outFile_folds}")

    # Extract final metrics based on the cross-validation results
    final_f1 = results['mean_test_f1'][grid_search.best_index_]
    final_precision = results['mean_test_precision'][grid_search.best_index_]
    final_recall = results['mean_test_recall'][grid_search.best_index_]
    final_accuracy = results['mean_test_accuracy'][grid_search.best_index_]
    final_mcc = results['mean_test_mcc'][grid_search.best_index_]  # Extract final MCC

    # Print final metrics (cross-validation averages)
    print("\nFinal Cross-Validation Metrics:")
    print(f"Final Precision: {final_precision}")
    print(f"Final Recall: {final_recall}")
    print(f"Final Accuracy: {final_accuracy}")
    print(f"Final F1 Score: {final_f1}")
    print(f"Final MCC: {final_mcc}")

    # Save the final results to a CSV file
    outFile_final = os.path.join(outDir, f"knn-results-{n_splits}-folds.csv")
    with open(outFile_final, "w") as f:
        f.write("Accuracy,Precision,Recall,F1,MCC\n")
        f.write(f"{final_accuracy},{final_precision},{final_recall},{final_f1},{final_mcc}\n")

    print(f"KNN analysis completed for {n_splits}-folds with SMOTE and PCA. Results saved to: {outFile_final}")

    return best_params, final_f1, final_mcc

###############################################################################
# Main Execution for 5-Fold Cross-Validation

outDir = "smote-results"
os.makedirs(outDir, exist_ok=True)

# Run KNN with SMOTE and PCA using 5-fold cross-validation
print("\nStarting KNN analysis with SMOTE and PCA for 5-fold cross-validation...")
best_params_5folds, best_f1_5folds, final_mcc_5folds = runKNNWithSMOTE(dataPoints, dataLabelsList, outDir, n_splits=5)

# Display results
print("\nBest results for KNN with SMOTE and PCA 5-fold cross-validation:")
print(f"Best Parameters: {best_params_5folds}")
print(f"Best F1 Score: {best_f1_5folds}")
print(f"Final MCC: {final_mcc_5folds}")



Starting KNN analysis with SMOTE and PCA for 5-fold cross-validation...
Fitting 5 folds for each of 80 candidates, totalling 400 fits
Best Parameters with SMOTE and PCA: {'knn__metric': 'cosine', 'knn__n_neighbors': 11, 'knn__weights': 'distance', 'pca__n_components': 150}
Best MCC Score from cross-validation: 0.5832507361919127
Per-fold metrics saved to: smote-results\knn-smote-pca-fold-results-5-folds.csv

Final Cross-Validation Metrics:
Final Precision: 0.611111111111111
Final Recall: 0.6
Final Accuracy: 0.8750151240169389
Final F1 Score: 0.5832507361919127
Final MCC: 0.5250008288463672
KNN analysis completed for 5-folds with SMOTE and PCA. Results saved to: smote-results\knn-results-5-folds.csv

Best results for KNN with SMOTE and PCA 5-fold cross-validation:
Best Parameters: {'knn__metric': 'cosine', 'knn__n_neighbors': 11, 'knn__weights': 'distance', 'pca__n_components': 150}
Best F1 Score: 0.5832507361919127
Final MCC: 0.5250008288463672


## SVM

In [3]:
import os
import time
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import (precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef, make_scorer)
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer

###############################################################################
# Custom MCC scorer function
def mcc_scorer(estimator, X, y_true):
    y_pred = estimator.predict(X)
    return matthews_corrcoef(y_true, y_pred)

###############################################################################
# SVM with SMOTE and PCA

def runSVMWithSMOTE(dataPoints, dataLabelsList, outDir, n_splits):
    v0 = time.perf_counter()

    # Define scoring metrics
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': mcc_scorer
    }

    # Define a pipeline with CountVectorizer, PCA, SMOTE, and SVM
    pipeline = ImbPipeline([
        ('vectorizer', CountVectorizer(stop_words=None)),  # Vectorizer
        ('pca', PCA(random_state=42)),                     # Dimensionality reduction
        ('smote', SMOTE(random_state=42)),                 # SMOTE for oversampling
        ('svm', SVC(probability=True))                     # SVM classifier
    ])

    # Define parameter grid for GridSearchCV
    param_grid = {
        'pca__n_components': [150, 180, 200, 220],          # PCA components
        'svm__C': [0.01, 0.1, 1.0, 10.0, 100.0],            # Regularization parameter
        'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid'] # Kernel types
    }

    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Setup GridSearchCV with the pipeline and parameter grid
    grid_search = GridSearchCV(
        pipeline, param_grid, cv=skf, scoring=scoring, refit='f1', verbose=1, return_train_score=True
    )

    # Fit the GridSearchCV to the data
    grid_search.fit(dataPoints, dataLabelsList)

    # Retrieve the best parameters and score from cross-validation
    best_params = grid_search.best_params_
    best_f1_cv = grid_search.best_score_

    print(f"Best Parameters with SMOTE and PCA: {best_params}")
    print(f"Best F1 Score from cross-validation: {best_f1_cv}")

    # Extract the cross-validation results and print final metrics
    results = grid_search.cv_results_

    # Prepare per-fold metrics for CSV along with the parameter combinations
    fold_metrics = []
    for idx in range(len(results['params'])):
        fold_metrics.append({
            'n_components': results['params'][idx].get('pca__n_components'),
            'C': results['params'][idx].get('svm__C'),
            'kernel': results['params'][idx].get('svm__kernel'),
            'accuracy': results.get(f'mean_test_accuracy', [None])[idx],
            'precision': results.get(f'mean_test_precision', [None])[idx],
            'recall': results.get(f'mean_test_recall', [None])[idx],
            'f1': results.get(f'mean_test_f1', [None])[idx],
            'mcc': results.get(f'mean_test_mcc', [None])[idx],
        })

    # Save fold-wise metrics to CSV
    df_folds = pd.DataFrame(fold_metrics)
    outFile_folds = os.path.join(outDir, f"svm-smote-pca-fold-results-{n_splits}-folds.csv")
    df_folds.to_csv(outFile_folds, index=False)

    print(f"Per-fold metrics saved to: {outFile_folds}")

    # Extract final metrics based on the cross-validation results
    final_f1 = results['mean_test_f1'][grid_search.best_index_]
    final_precision = results['mean_test_precision'][grid_search.best_index_]
    final_recall = results['mean_test_recall'][grid_search.best_index_]
    final_accuracy = results['mean_test_accuracy'][grid_search.best_index_]
    final_mcc = results['mean_test_mcc'][grid_search.best_index_]

    # Print final metrics (cross-validation averages)
    print("\nFinal Cross-Validation Metrics:")
    print(f"Final Precision: {final_precision}")
    print(f"Final Recall: {final_recall}")
    print(f"Final Accuracy: {final_accuracy}")
    print(f"Final F1 Score: {final_f1}")
    print(f"Final MCC: {final_mcc}")

    # Save the final results to a CSV file
    outFile_final = os.path.join(outDir, f"svm-results-{n_splits}-folds.csv")
    with open(outFile_final, "w") as f:
        f.write("Accuracy,Precision,Recall,F1,MCC\n")
        f.write(f"{final_accuracy},{final_precision},{final_recall},{final_f1},{final_mcc}\n")

    print(f"SVM analysis completed for {n_splits}-folds with SMOTE and PCA. Results saved to: {outFile_final}")

    return best_params, final_f1, final_mcc

###############################################################################
# Main Execution

outDir = "smote-results"
os.makedirs(outDir, exist_ok=True)

# Run SVM with SMOTE and PCA using 5-fold cross-validation
print("\nStarting SVM analysis with SMOTE and PCA for 5-fold cross-validation...")
best_params_5folds, best_f1_5folds, final_mcc_5folds = runSVMWithSMOTE(dataPoints, dataLabelsList, outDir, n_splits=5)

# Display results
print("\nBest results for SVM with SMOTE and PCA 5-fold cross-validation:")
print(f"Best Parameters: {best_params_5folds}")
print(f"Best F1 Score: {best_f1_5folds}")



Starting SVM analysis with SMOTE and PCA for 5-fold cross-validation...
Fitting 5 folds for each of 80 candidates, totalling 400 fits
Best Parameters with SMOTE and PCA: {'pca__n_components': 220, 'svm__C': 0.01, 'svm__kernel': 'linear'}
Best F1 Score from cross-validation: 0.6914602683178535
Per-fold metrics saved to: smote-results\svm-smote-pca-fold-results-5-folds.csv

Final Cross-Validation Metrics:
Final Precision: 0.8445238095238097
Final Recall: 0.6222222222222222
Final Accuracy: 0.9201451905626135
Final F1 Score: 0.6914602683178535
Final MCC: 0.6720359085671346
SVM analysis completed for 5-folds with SMOTE and PCA. Results saved to: smote-results\svm-results-5-folds.csv

Best results for SVM with SMOTE and PCA 5-fold cross-validation:
Best Parameters: {'pca__n_components': 220, 'svm__C': 0.01, 'svm__kernel': 'linear'}
Best F1 Score: 0.6914602683178535


## XGB

In [43]:
import os
import time
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import (precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef, make_scorer)
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer

###############################################################################
# Custom MCC scorer function
def mcc_scorer(estimator, X, y_true):
    y_pred = estimator.predict(X)
    return matthews_corrcoef(y_true, y_pred)

###############################################################################
# XGBoost with SMOTE, PCA, and CountVectorizer

def runXGBWithSMOTE(dataPoints, dataLabelsList, outDir, n_splits):
    v0 = time.perf_counter()

    # Define scoring metrics
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': mcc_scorer
    }

    # Define a pipeline with CountVectorizer, PCA, SMOTE, and XGBoost
    pipeline = ImbPipeline([
        ('vectorizer', CountVectorizer(stop_words=None)),               # Vectorizer
        ('pca', PCA(random_state=42)),                                                 # PCA for dimensionality reduction
        ('smote', SMOTE(random_state=42)),                              # SMOTE for oversampling
        ('xgb', XGBClassifier(eval_metric='logloss', random_state=42))  # XGBoost classifier
    ])

    # Define parameter grid for GridSearchCV
    param_grid = {
        'pca__n_components': [150, 180, 200, 220],        # PCA components
        'xgb__n_estimators': [100, 150, 200],             # Number of boosting rounds
        'xgb__max_depth': [3, 5, 7, 10],                  # Maximum depth of a tree
        'xgb__learning_rate': [0.01, 0.1, 0.3, 0.5],      # Learning rate
    }

    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Setup GridSearchCV with the pipeline and parameter grid
    grid_search = GridSearchCV(
        pipeline, param_grid, cv=skf, scoring=scoring, refit='f1', verbose=1, return_train_score=True
    )

    # Fit the GridSearchCV to the data
    grid_search.fit(dataPoints, dataLabelsList)

    # Retrieve the best parameters and score from cross-validation
    best_params = grid_search.best_params_
    best_f1_cv = grid_search.best_score_

    print(f"Best Parameters with SMOTE, PCA, and CountVectorizer: {best_params}")
    print(f"Best F1 Score from cross-validation: {best_f1_cv}")

    # Extract the cross-validation results and print final metrics
    results = grid_search.cv_results_

    # Prepare per-fold metrics for CSV along with the parameter combination
    fold_metrics = []
    for idx in range(len(results['params'])):
        fold_metrics.append({
            'n_components': results['params'][idx].get('pca__n_components'),
            'n_estimators': results['params'][idx].get('xgb__n_estimators'),
            'max_depth': results['params'][idx].get('xgb__max_depth'),
            'learning_rate': results['params'][idx].get('xgb__learning_rate'),
            'accuracy': results.get('mean_test_accuracy', [None])[idx],
            'precision': results.get('mean_test_precision', [None])[idx],
            'recall': results.get('mean_test_recall', [None])[idx],
            'f1': results.get('mean_test_f1', [None])[idx],
            'mcc': results.get('mean_test_mcc', [None])[idx],
        })

    # Save fold-wise metrics to CSV
    df_folds = pd.DataFrame(fold_metrics)
    outFile_folds = os.path.join(outDir, f"xgb-smote-pca-fold-results-{n_splits}-folds.csv")
    df_folds.to_csv(outFile_folds, index=False)

    print(f"Per-fold metrics saved to: {outFile_folds}")

    # Extract final metrics based on the cross-validation results
    final_f1 = results['mean_test_f1'][grid_search.best_index_]
    final_precision = results['mean_test_precision'][grid_search.best_index_]
    final_recall = results['mean_test_recall'][grid_search.best_index_]
    final_accuracy = results['mean_test_accuracy'][grid_search.best_index_]
    final_mcc = results['mean_test_mcc'][grid_search.best_index_]

    # Print final metrics (cross-validation averages)
    print("\nFinal Cross-Validation Metrics:")
    print(f"Final Precision: {final_precision}")
    print(f"Final Recall: {final_recall}")
    print(f"Final Accuracy: {final_accuracy}")
    print(f"Final F1 Score: {final_f1}")
    print(f"Final MCC: {final_mcc}")

    # Save the results to a CSV file
    outFile_final = os.path.join(outDir, f"xgb-results-{n_splits}-folds.csv")
    with open(outFile_final, "w") as f:
        f.write("Accuracy,Precision,Recall,F1,MCC\n")
        f.write(f"{final_accuracy},{final_precision},{final_recall},{final_f1},{final_mcc}\n")

    print(f"XGBoost analysis completed for {n_splits}-folds with SMOTE, PCA, and CountVectorizer. Results saved to: {outFile_final}")

    return best_params, final_f1, final_mcc

###############################################################################
# Main Execution for 5-Fold Cross-Validation

outDir = "smote-results"
os.makedirs(outDir, exist_ok=True)

# Run XGBoost with SMOTE, PCA, and CountVectorizer using 5-fold cross-validation
print("\nStarting XGBoost analysis with SMOTE, PCA, and CountVectorizer for 5-fold cross-validation...")
best_params_5folds, best_f1_5folds, final_mcc_5folds = runXGBWithSMOTE(dataPoints, dataLabelsList, outDir, n_splits=5)

# Display results
print("\nBest results for XGBoost with SMOTE, PCA, and CountVectorizer 5-fold cross-validation:")
print(f"Best Parameters: {best_params_5folds}")
print(f"Best F1 Score: {best_f1_5folds}")
print(f"Final MCC: {final_mcc_5folds}")



Starting XGBoost analysis with SMOTE, PCA, and CountVectorizer for 5-fold cross-validation...
Fitting 5 folds for each of 192 candidates, totalling 960 fits
Best Parameters with SMOTE, PCA, and CountVectorizer: {'pca__n_components': 150, 'xgb__learning_rate': 0.5, 'xgb__max_depth': 3, 'xgb__n_estimators': 100}
Best F1 Score from cross-validation: 0.7552324438547349
Per-fold metrics saved to: smote-results/xgb-smote-pca-fold-results-5-folds.csv

Final Cross-Validation Metrics:
Final Precision: 0.8306349206349207
Final Recall: 0.711111111111111
Final Accuracy: 0.9304900181488203
Final F1 Score: 0.7552324438547349
Final MCC: 0.72509493338034
XGBoost analysis completed for 5-folds with SMOTE, PCA, and CountVectorizer. Results saved to: smote-results/xgb-results-5-folds.csv

Best results for XGBoost with SMOTE, PCA, and CountVectorizer 5-fold cross-validation:
Best Parameters: {'pca__n_components': 150, 'xgb__learning_rate': 0.5, 'xgb__max_depth': 3, 'xgb__n_estimators': 100}
Best F1 Score

## Random Forest

In [3]:
import os
import time
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef, make_scorer)
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.feature_extraction.text import CountVectorizer

###############################################################################
# Custom MCC scorer function
def mcc_scorer(estimator, X, y_true):
    y_pred = estimator.predict(X)
    return matthews_corrcoef(y_true, y_pred)

###############################################################################
# Random Forest with SMOTE

def runRFWithSMOTE(dataPoints, dataLabelsList, outDir, n_splits):
    v0 = time.perf_counter()

    # Define scoring metrics
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': mcc_scorer
    }

    # Define a pipeline with SMOTE, and Random Forest
    pipeline = ImbPipeline([
        ('vectorizer', CountVectorizer(stop_words=None)),  # Vectorizer
        ('smote', SMOTE(random_state=42)),                 # SMOTE for oversampling
        ('rf', RandomForestClassifier(random_state=42))    # Random Forest classifier
    ])

    # Define parameter grid for GridSearchCV
    param_grid = {
        'rf__n_estimators': [50, 100, 200],                 # Number of trees in the forest
        'rf__max_depth': [10, 20, 30],                     # Maximum depth of the tree
        'rf__min_samples_split': [5, 10],                   # Minimum number of samples required to split a node
        'rf__min_samples_leaf': [2, 5],                    # Minimum number of samples required at a leaf node
        'rf__criterion': ['gini', 'entropy']               # Function to measure the quality of a split
    }

    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Setup GridSearchCV with the pipeline and parameter grid
    grid_search = GridSearchCV(
        pipeline, param_grid, cv=skf, scoring=scoring, refit='f1', verbose=1, return_train_score=True
    )

    # Fit the GridSearchCV to the data
    grid_search.fit(dataPoints, dataLabelsList)

    # Retrieve the best parameters and score from cross-validation
    best_params = grid_search.best_params_
    best_f1_cv = grid_search.best_score_

    print(f"Best Parameters with SMOTE and PCA: {best_params}")
    print(f"Best F1 Score from cross-validation: {best_f1_cv}")

    # Extract the cross-validation results and print final metrics
    results = grid_search.cv_results_

    # Prepare per-fold metrics for CSV along with the parameter combinations
    fold_metrics = []
    for idx in range(len(results['params'])):
        fold_metrics.append({

            'n_estimators': results['params'][idx].get('rf__n_estimators'),
            'max_depth': results['params'][idx].get('rf__max_depth'),
            'min_samples_split': results['params'][idx].get('rf__min_samples_split'),
            'min_samples_leaf': results['params'][idx].get('rf__min_samples_leaf'),
            'criterion': results['params'][idx].get('rf__criterion'),
            'accuracy': results.get('mean_test_accuracy', [None])[idx],
            'precision': results.get('mean_test_precision', [None])[idx],
            'recall': results.get('mean_test_recall', [None])[idx],
            'f1': results.get('mean_test_f1', [None])[idx],
            'mcc': results.get('mean_test_mcc', [None])[idx],
        })

    # Save fold-wise metrics to CSV
    df_folds = pd.DataFrame(fold_metrics)
    outFile_folds = os.path.join(outDir, f"rf-smote-fold-results-{n_splits}-folds.csv")
    df_folds.to_csv(outFile_folds, index=False)

    print(f"Per-fold metrics saved to: {outFile_folds}")

    # Extract final metrics based on the cross-validation results
    final_f1 = results['mean_test_f1'][grid_search.best_index_]
    final_precision = results['mean_test_precision'][grid_search.best_index_]
    final_recall = results['mean_test_recall'][grid_search.best_index_]
    final_accuracy = results['mean_test_accuracy'][grid_search.best_index_]
    final_mcc = results['mean_test_mcc'][grid_search.best_index_]

    # Print final metrics (cross-validation averages)
    print("\nFinal Cross-Validation Metrics:")
    print(f"Final Precision: {final_precision}")
    print(f"Final Recall: {final_recall}")
    print(f"Final Accuracy: {final_accuracy}")
    print(f"Final F1 Score: {final_f1}")
    print(f"Final MCC: {final_mcc}")

    # Save the results to a CSV file
    outFile_final = os.path.join(outDir, f"rf-results-{n_splits}-folds.csv")
    with open(outFile_final, "w") as f:
        f.write("Accuracy,Precision,Recall,F1,MCC\n")
        f.write(f"{final_accuracy},{final_precision},{final_recall},{final_f1},{final_mcc}\n")

    print(f"Random Forest analysis completed for {n_splits}-folds with SMOTE. Results saved to: {outFile_final}")

    return best_params, final_f1, final_mcc

###############################################################################
# Main Execution for 5-Fold Cross-Validation

outDir = "smote-results"
os.makedirs(outDir, exist_ok=True)

# Run Random Forest with SMOTE and PCA using 5-fold cross-validation
print("\nStarting Random Forest analysis with SMOTE for 5-fold cross-validation...")
best_params_5folds, best_f1_5folds, final_mcc_5folds = runRFWithSMOTE(dataPoints, dataLabelsList, outDir, n_splits=5)

# Display results
print("\nBest results for Random Forest with SMOTE 5-fold cross-validation:")
print(f"Best Parameters: {best_params_5folds}")
print(f"Best F1 Score: {best_f1_5folds}")
print(f"Final MCC: {final_mcc_5folds}")



Starting Random Forest analysis with SMOTE for 5-fold cross-validation...
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters with SMOTE and PCA: {'rf__criterion': 'entropy', 'rf__max_depth': 20, 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 5, 'rf__n_estimators': 100}
Best F1 Score from cross-validation: 0.822015823873409
Per-fold metrics saved to: smote-results\rf-smote-fold-results-5-folds.csv

Final Cross-Validation Metrics:
Final Precision: 0.8238888888888889
Final Recall: 0.8222222222222222
Final Accuracy: 0.9444041137326075
Final F1 Score: 0.822015823873409
Final MCC: 0.7898138504372108
Random Forest analysis completed for 5-folds with SMOTE. Results saved to: smote-results\rf-results-5-folds.csv

Best results for Random Forest with SMOTE 5-fold cross-validation:
Best Parameters: {'rf__criterion': 'entropy', 'rf__max_depth': 20, 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 5, 'rf__n_estimators': 100}
Best F1 Score: 0.822015823873409
Final 

## Decision Tree

In [15]:

import os
import time
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef, make_scorer)
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.feature_extraction.text import CountVectorizer

###############################################################################
# Custom MCC scorer function
def mcc_scorer(estimator, X, y_true):
    y_pred = estimator.predict(X)
    return matthews_corrcoef(y_true, y_pred)

###############################################################################
# Decision Tree with SMOTE and PCA

def runDTWithSMOTE(dataPoints, dataLabelsList, outDir, n_splits):
    v0 = time.perf_counter()

    # Define scoring metrics
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score, zero_division=1),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score, zero_division=1),
        'mcc': mcc_scorer
    }

    # Define a pipeline with SMOTE, and Decision Tree
    pipeline = ImbPipeline([
        ('vectorizer', CountVectorizer(stop_words=None)),  # Vectorizer
        ('smote', SMOTE(random_state=42)),                 # SMOTE for oversampling
        ('dt', DecisionTreeClassifier(random_state=42))    # Decision Tree classifier
    ])

    # Define parameter grid for GridSearchCV
    param_grid = {
        'dt__max_depth': [10, 20, 30],                     # Maximum depth of the tree
        'dt__min_samples_split': [5, 10],                  # Minimum number of samples required to split a node
        'dt__min_samples_leaf': [2, 5],                    # Minimum number of samples required at a leaf node
        'dt__criterion': ['gini', 'entropy'],              # Function to measure the quality of a split
        'dt__max_features': [None, 'sqrt', 'log2']         # Controls how many features to consider for splits
    }

    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Setup GridSearchCV with the pipeline and parameter grid
    grid_search = GridSearchCV(
        pipeline, param_grid, cv=skf, scoring=scoring,
        refit='f1', verbose=1, return_train_score=True
    )

    # Fit the GridSearchCV to the data
    grid_search.fit(dataPoints, dataLabelsList)

    # Retrieve the best parameters and score from cross-validation
    best_params = grid_search.best_params_
    best_f1_cv = grid_search.best_score_

    print(f"Best Parameters with SMOTE and PCA: {best_params}")
    print(f"Best Score from cross-validation: {best_f1_cv}")

    # Extract the cross-validation results and print final metrics
    results = grid_search.cv_results_

    # Prepare per-fold metrics for CSV along with the parameter combinations
    fold_metrics = []
    for idx in range(len(results['params'])):
        fold_metrics.append({
            'n_components': results['params'][idx].get('pca__n_components'),
            'max_depth': results['params'][idx].get('dt__max_depth'),
            'min_samples_split': results['params'][idx].get('dt__min_samples_split'),
            'min_samples_leaf': results['params'][idx].get('dt__min_samples_leaf'),
            'criterion': results['params'][idx].get('dt__criterion'),
            'dt__max_features': results['params'][idx].get('dt__max_features'),
            'accuracy': results.get('mean_test_accuracy', [None])[idx],
            'precision': results.get('mean_test_precision', [None])[idx],
            'recall': results.get('mean_test_recall', [None])[idx],
            'f1': results.get('mean_test_f1', [None])[idx],
            'mcc': results.get('mean_test_mcc', [None])[idx],
        })

    # Save fold-wise metrics to CSV
    df_folds = pd.DataFrame(fold_metrics)
    outFile_folds = os.path.join(outDir, f"dt-smote-pca-fold-results-{n_splits}-folds.csv")
    df_folds.to_csv(outFile_folds, index=False)

    print(f"Per-fold metrics saved to: {outFile_folds}")

    # Extract final metrics based on the cross-validation results
    final_f1 = results['mean_test_f1'][grid_search.best_index_]
    final_precision = results['mean_test_precision'][grid_search.best_index_]
    final_recall = results['mean_test_recall'][grid_search.best_index_]
    final_accuracy = results['mean_test_accuracy'][grid_search.best_index_]
    final_mcc = results['mean_test_mcc'][grid_search.best_index_]

    # Print final metrics (cross-validation averages)
    print("\nFinal Cross-Validation Metrics:")
    print(f"Final Precision: {final_precision}")
    print(f"Final Recall: {final_recall}")
    print(f"Final Accuracy: {final_accuracy}")
    print(f"Final F1 Score: {final_f1}")
    print(f"Final MCC: {final_mcc}")

    # Save the results to a CSV file
    outFile_final = os.path.join(outDir, f"dt-results-{n_splits}-folds.csv")
    with open(outFile_final, "w") as f:
        f.write("Accuracy,Precision,Recall,F1,MCC\n")
        f.write(f"{final_accuracy},{final_precision},{final_recall},{final_f1},{final_mcc}\n")

    print(f"Decision Tree analysis completed for {n_splits}-folds with SMOTE and PCA. Results saved to: {outFile_final}")

    return best_params, final_f1, final_mcc

###############################################################################
# Main Execution for 5-Fold Cross-Validation

outDir = "smote-results"
os.makedirs(outDir, exist_ok=True)

# Run Decision Tree with SMOTE using 5-fold cross-validation
print("\nStarting Decision Tree analysis with SMOTE for 5-fold cross-validation...")
best_params_5folds, best_f1_5folds, final_mcc_5folds = runDTWithSMOTE(dataPoints, dataLabelsList, outDir, n_splits=5)

# Display results
print("\nBest results for Decision Tree with SMOTE 5-fold cross-validation:")
print(f"Best Parameters: {best_params_5folds}")
print(f"Best F1 Score: {best_f1_5folds}")
print(f"Final MCC: {final_mcc_5folds}")



Starting Decision Tree analysis with SMOTE and PCA for 5-fold cross-validation...
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters with SMOTE and PCA: {'dt__criterion': 'gini', 'dt__max_depth': 10, 'dt__max_features': None, 'dt__min_samples_leaf': 2, 'dt__min_samples_split': 5}
Best Score from cross-validation: 0.8495475113122172
Per-fold metrics saved to: smote-results\dt-smote-pca-fold-results-5-folds.csv

Final Cross-Validation Metrics:
Final Precision: 0.9199999999999999
Final Recall: 0.8444444444444443
Final Accuracy: 0.955111917725348
Final F1 Score: 0.8495475113122172
Final MCC: 0.8454298941250029
Decision Tree analysis completed for 5-folds with SMOTE and PCA. Results saved to: smote-results\dt-results-5-folds.csv

Best results for Decision Tree with SMOTE 5-fold cross-validation:
Best Parameters: {'dt__criterion': 'gini', 'dt__max_depth': 10, 'dt__max_features': None, 'dt__min_samples_leaf': 2, 'dt__min_samples_split': 5}
Best F1 Score: 0.84954751

## Decision Tree with Threshold

In [20]:
def runDTWithSMOTE(dataPoints, dataLabelsList, outDir, n_splits):
    v0 = time.perf_counter()
    
    # Define parameter grid
    param_grid = {
        'dt__max_depth': [10],
        'dt__min_samples_split': [5],
        'dt__min_samples_leaf': [2],
        'dt__criterion': ['gini'],
        'dt__max_features': [None]
    }
    
    param_combinations = list(product(*param_grid.values()))
    param_keys = list(param_grid.keys())
    
    # Prepare to store metrics
    metrics_per_combination = []
    
    # Define thresholds to evaluate
    thresholds = np.linspace(0.1, 0.9, 9)  # Thresholds from 0.1 to 0.9
    
    # Setup cross-validation strategy
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Loop over each hyperparameter combination
    for params in param_combinations:
        param_dict = dict(zip(param_keys, params))
        
        # Initialize lists to store metrics per threshold
        threshold_metrics_list = []
        
        # For each fold in cross-validation
        for fold_idx, (train_index, test_index) in enumerate(skf.split(dataPoints, dataLabelsList)):
            X_train = [dataPoints[i] for i in train_index]
            X_test = [dataPoints[i] for i in test_index]
            y_train = [dataLabelsList[i] for i in train_index]
            y_test = [dataLabelsList[i] for i in test_index]
            
            # Define a pipeline with SMOTE and Decision Tree with current params
            pipeline = ImbPipeline([
                ('vectorizer', CountVectorizer(stop_words=None)),
                ('smote', SMOTE(random_state=42)),
                ('dt', DecisionTreeClassifier(
                    max_depth=param_dict['dt__max_depth'],
                    min_samples_split=param_dict['dt__min_samples_split'],
                    min_samples_leaf=param_dict['dt__min_samples_leaf'],
                    criterion=param_dict['dt__criterion'],
                    max_features=param_dict['dt__max_features'],
                    random_state=42))
            ])
            
            # Train the pipeline
            pipeline.fit(X_train, y_train)
            
            # Predict probabilities
            y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
            
            # Iterate over thresholds
            for threshold in thresholds:
                y_pred_threshold = (y_pred_proba >= threshold).astype(int)
                
                # Calculate metrics
                accuracy = accuracy_score(y_test, y_pred_threshold)
                precision = precision_score(y_test, y_pred_threshold, zero_division=1)
                recall = recall_score(y_test, y_pred_threshold, zero_division=1)
                f1 = f1_score(y_test, y_pred_threshold, zero_division=1)
                mcc = matthews_corrcoef(y_test, y_pred_threshold)
                
                # Store fold metrics
                threshold_metrics_list.append({
                    **param_dict,
                    'threshold': threshold,
                    'fold': fold_idx + 1,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'mcc': mcc
                })
        
        # Calculate average metrics over folds for each threshold
        for threshold in thresholds:
            # Filter metrics for the current threshold
            threshold_metrics = [tm for tm in threshold_metrics_list if tm['threshold'] == threshold]
            
            avg_accuracy = np.mean([tm['accuracy'] for tm in threshold_metrics])
            avg_precision = np.mean([tm['precision'] for tm in threshold_metrics])
            avg_recall = np.mean([tm['recall'] for tm in threshold_metrics])
            avg_f1 = np.mean([tm['f1'] for tm in threshold_metrics])
            avg_mcc = np.mean([tm['mcc'] for tm in threshold_metrics])
            
            # Store the metrics along with parameters and threshold
            metrics_per_combination.append({
                **param_dict,
                'threshold': threshold,
                'accuracy': avg_accuracy,
                'precision': avg_precision,
                'recall': avg_recall,
                'f1': avg_f1,
                'mcc': avg_mcc
            })
    
    # Now, find the parameter combination with the best F1 score
    best_result = max(metrics_per_combination, key=lambda x: x['f1'])
    
    # Save the results to CSV
    df_metrics = pd.DataFrame(metrics_per_combination)
    outFile_metrics = os.path.join(outDir, f"dt-smote-threshold-results-{n_splits}-folds.csv")
    df_metrics.to_csv(outFile_metrics, index=False)
    
    print(f"Results saved to: {outFile_metrics}")
    
    # Extract the best parameters, threshold, and metrics
    best_params = {key: best_result[key] for key in param_keys}
    best_threshold = best_result['threshold']
    best_f1 = best_result['f1']
    final_mcc = best_result['mcc']
    
    print("\nBest Parameters, Threshold, and Metrics:")
    print(f"Best Parameters: {best_params}")
    print(f"Best Threshold: {best_threshold}")
    print(f"Best F1 Score: {best_f1}")
    print(f"Final MCC: {final_mcc}")
    
    return best_params, best_threshold, best_f1, final_mcc
# Main Execution for 5-Fold Cross-Validation
outDir = "smote-results-DT-new"
os.makedirs(outDir, exist_ok=True)

# Run Decision Tree with SMOTE and Threshold adjustment using 5-fold cross-validation
print("\nStarting Decision Tree analysis with SMOTE and Threshold adjustment for 5-fold cross-validation...")
best_params_5folds, best_threshold_5folds, best_f1_5folds, final_mcc_5folds = runDTWithSMOTE(
    dataPoints, dataLabelsList, outDir, n_splits=5)

# Display results
print("\nBest results for Decision Tree with SMOTE and Threshold adjustment 5-fold cross-validation:")
print(f"Best Parameters: {best_params_5folds}")
print(f"Best Threshold: {best_threshold_5folds}")
print(f"Best F1 Score: {best_f1_5folds}")
print(f"Final MCC: {final_mcc_5folds}")



Starting Decision Tree analysis with SMOTE and Threshold adjustment for 5-fold cross-validation...
Results saved to: smote-results-DT-new\dt-smote-threshold-results-5-folds.csv

Best Parameters, Threshold, and Metrics:
Best Parameters: {'dt__max_depth': 10, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 2, 'dt__criterion': 'gini', 'dt__max_features': None}
Best Threshold: 0.1
Best F1 Score: 0.8760130718954249
Final MCC: 0.8639675781078721

Best results for Decision Tree with SMOTE and Threshold adjustment 5-fold cross-validation:
Best Parameters: {'dt__max_depth': 10, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 2, 'dt__criterion': 'gini', 'dt__max_features': None}
Best Threshold: 0.1
Best F1 Score: 0.8760130718954249
Final MCC: 0.8639675781078721
