## <font size="6">10-fold cross-validation in training and all cohorts</font>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import os

def process_file(file_path, output_folder):
    # Data loading
    data = pd.read_csv(file_path)

    # Target Variable Isolation
    X = data.iloc[:, 1:]  # Features start from the second column
    y = data.iloc[:, 0]   # The first column is the target variable

    # Initialize 10-fold cross-validation
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

    # Store results for each fold
    auc_scores = []

    # Cross-validation process
    for fold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y.iloc[test_index]
        
        # SMOTE resampling for the training set only
        smote = SMOTE(random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
        
        # Initialize XGBoost model with parameters
        model = XGBClassifier(
            n_estimators=500,
            colsample_bytree=0.8,
            learning_rate=0.01,
            max_depth=7,
            min_child_weight=10,
            reg_alpha=0.1,
            reg_lambda=1.0,
            subsample=0.9,
            gamma=0.1  
        )
        
        # Train the model
        model.fit(X_train_resampled, y_train_resampled)
        
        # Predict probabilities
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        
        # Calculate AUC
        auc = roc_auc_score(y_test, y_pred_proba)
        auc_scores.append({'Fold': fold, 'AUC': auc})

    # Convert results to DataFrame
    results_df = pd.DataFrame(auc_scores)

    # Save to CSV file
    output_path_csv = os.path.join(output_folder, f'AUC_Scores_{os.path.basename(file_path)}.csv')
    results_df.to_csv(output_path_csv, index=False)

    # Extract list of AUC scores
    auc_scores_list = [item['AUC'] for item in auc_scores]

    # Calculate mean AUC and standard deviation
    mean_auc = np.mean(auc_scores_list)
    std_auc = np.std(auc_scores_list)

    # Plot AUC scores
    plt.figure(figsize=(10, 6))
    plt.plot(auc_scores_list, marker='o', linestyle='-', color='b')
    plt.fill_between(range(1, len(auc_scores_list)+1), [mean_auc - std_auc]*len(auc_scores_list), [mean_auc + std_auc]*len(auc_scores_list), color='b', alpha=0.2)
    plt.title(f'10-Fold Cross-Validation AUC Scores of {os.path.basename(file_path)}')
    plt.xlabel('Fold')
    plt.ylabel('AUC Score')
    plt.grid(True)
    plt.savefig(os.path.join(output_folder, f'AUC_Curve_{os.path.basename(file_path)}.svg'))
    plt.close()

# Folder path
folder_path = r'the path of your own data'

# List of file names
file_names = ['All cohorts.csv', 'training.csv']#All features have been normalized

# Output folder path
output_folder = r'C:\Users\Admin\Desktop'

# Process each file
for file_name in file_names:
    file_path = os.path.join(folder_path, file_name)
    process_file(file_path, output_folder)









































## <font size="6">Comparison of combined models and clinical models</font>

In [3]:
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve
from statsmodels.stats.proportion import proportion_confint
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
import os

def process_file(file_path):
    # Load data
    data = pd.read_csv(file_path)

    # Target classification variable and two predictor variables
    y = data['label']
    y_pred1 = data.iloc[:, 1]  # Second column variable
    y_pred2 = data.iloc[:, 2]  # Third column variable

    # Calculate AUC
    auc1 = roc_auc_score(y, y_pred1)
    auc2 = roc_auc_score(y, y_pred2)

    # Calculate sensitivity, specificity, positive predictive value, negative predictive value, and their 95% confidence intervals
    fpr1, tpr1, _ = roc_curve(y, y_pred1)
    fpr2, tpr2, _ = roc_curve(y, y_pred2)

    # Calculate optimal threshold (Youden's J statistic)
    j_statistic1 = tpr1 - fpr1
    j_index1 = np.argmax(j_statistic1)
    threshold1 = fpr1[j_index1]

    j_statistic2 = tpr2 - fpr2
    j_index2 = np.argmax(j_statistic2)
    threshold2 = fpr2[j_index2]

    # Calculate sensitivity, specificity, positive predictive value, negative predictive value
    sensitivity1 = tpr1[j_index1]
    specificity1 = 1 - fpr1[j_index1]
    ppv1 = sensitivity1 / (sensitivity1 + (1 - specificity1) * (1 - threshold1))
    npv1 = specificity1 / (specificity1 + threshold1 * (1 - sensitivity1))

    sensitivity2 = tpr2[j_index2]
    specificity2 = 1 - fpr2[j_index2]
    ppv2 = sensitivity2 / (sensitivity2 + (1 - specificity2) * (1 - threshold2))
    npv2 = specificity2 / (specificity2 + threshold2 * (1 - sensitivity2))

    # Calculate 95% confidence intervals
    conf_int1 = proportion_confint(ppv1, ppv1 + npv1, method='wilson')
    conf_int2 = proportion_confint(ppv2, ppv2 + npv2, method='wilson')

    # DeLong test to compare two AUCs
    def delong_test(auc1, auc2, n1, n2):
        se1 = np.sqrt((auc1 * (1 - auc1) / n1) + (auc2 * (1 - auc2) / n2))
        se2 = np.sqrt((auc2 * (1 - auc2) / n2) + (auc1 * (1 - auc1) / n1))
        z = (auc1 - auc2) / np.sqrt(se1**2 + se2**2)
        p = 2 * (1 - norm.cdf(abs(z)))
        return z, p

    z, p = delong_test(auc1, auc2, len(y), len(y))

    # Print results
    print(f'File: {file_path}')
    print(f'AUC1: {auc1:.3f}, 95% CI: {conf_int1}')
    print(f'AUC2: {auc2:.3f}, 95% CI: {conf_int2}')
    print(f'Sensitivity1: {sensitivity1:.3f}, Specificity1: {specificity1:.3f}, PPV1: {ppv1:.3f}, NPV1: {npv1:.3f}')
    print(f'Sensitivity2: {sensitivity2:.3f}, Specificity2: {specificity2:.3f}, PPV2: {ppv2:.3f}, NPV2: {npv2:.3f}')
    print(f'DeLong Test: Z = {z:.3f}, P = {p:.3f}')
    print('---' * 10)

    # Plot ROC curves
    plt.figure(figsize=(8, 8))
    plt.plot(fpr1, tpr1, label=f'Combined model AUC = {auc1:.3f}')
    plt.plot(fpr2, tpr2, label=f'Clinical model AUC = {auc2:.3f}')
    plt.plot([0, 1], [0, 1], 'k--')  # Plot diagonal line
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curves for {os.path.basename(file_path)}')
    plt.legend(loc='lower right')

    # Save ROC curves to desktop
    desktop_path = os.path.join(os.path.expanduser('~'), 'Desktop')
    roc_plot_path = os.path.join(desktop_path, f'ROC_Curves_{os.path.basename(file_path)}.svg')
    plt.savefig(roc_plot_path)
    plt.close()

    print(f'ROC curves saved to: {roc_plot_path}')

# Folder path
folder_path = r'the path of your own data'

# List of file names
file_names = ['External validation.csv', 'Internal validation.csv', 'Training.csv'] 

# Process each file
for file_name in file_names:
    file_path = os.path.join(folder_path, file_name)
    process_file(file_path)

File: E:\第二次修\Machine learning HCC with MVI\Revised manuscript\Final\Delong test\External validation.csv
AUC1: 0.841, 95% CI: (0.08025553098312749, 0.9031001113478987)
AUC2: 0.653, 95% CI: (0.07268773745536544, 0.9132377184051157)
Sensitivity1: 0.720, Specificity1: 0.873, PPV1: 0.866, NPV1: 0.961
Sensitivity2: 0.780, Specificity2: 0.525, PPV2: 0.758, NPV2: 0.834
DeLong Test: Z = 2.867, P = 0.004
------------------------------
ROC curves saved to: C:\Users\Admin\Desktop\ROC_Curves_External validation.csv.svg
File: E:\第二次修\Machine learning HCC with MVI\Revised manuscript\Final\Delong test\Internal validation.csv
AUC1: 0.862, 95% CI: (0.07460633142706835, 0.9005768856830882)
AUC2: 0.631, 95% CI: (0.07199382184862424, 0.9120450259509878)
Sensitivity1: 0.880, Specificity1: 0.737, PPV1: 0.819, NPV1: 0.959
Sensitivity2: 0.760, Specificity2: 0.561, PPV2: 0.755, NPV2: 0.842
DeLong Test: Z = 2.505, P = 0.012
------------------------------
ROC curves saved to: C:\Users\Admin\Desktop\ROC_Curves_In