In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# This cell only works when the notebook runs on Google Colab (google.colab is not a
# general-purpose package).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix, roc_curve, accuracy_score, mean_absolute_error
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.multioutput import MultiOutputRegressor
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Switch the process working directory so the workbooks below can be opened by bare file name.
# NOTE(review): this path sits at the filesystem root, not under the '/content/drive' mount
# created above - confirm that this is the intended location of the data.
path = "/Apple-Invariant-feature"
os.chdir(path)
# Last expression of the cell: echoes the new working directory as the cell output.
os.getcwd()

In [None]:
# Pyradiomics GLCM feature table.
# Only the GLCM descriptors listed here plus the 'n_class' label are kept; 'n_class' must
# stay the LAST column because later cells split features/target by position.
glcm_feature_columns = [
    'original_glcm_Autocorrelation',
    'original_glcm_ClusterProminence',
    'original_glcm_ClusterShade',
    'original_glcm_Contrast',
    'original_glcm_Correlation',
    'original_glcm_DifferenceAverage',
    'original_glcm_DifferenceEntropy',
    'original_glcm_DifferenceVariance',
    'original_glcm_JointAverage',
    'original_glcm_JointEnergy',
    'original_glcm_JointEntropy',
    'original_glcm_InverseVariance',
    'original_glcm_Imc2',
    'original_glcm_Idm',
    'original_glcm_Id',
    'original_glcm_Idmn',
    'original_glcm_Idn',
    'original_glcm_Imc1',
    'original_glcm_MCC',
    'original_glcm_MaximumProbability',
    'original_glcm_SumAverage',
    'original_glcm_SumEntropy',
    'original_glcm_SumSquares',
    'original_glcm_ClusterTendency',
    'n_class',
]

# Read the first sheet of the workbook and narrow it to the selected columns.
df = pd.read_excel("original_radiomic_features_Apple_Addclass.xlsx", sheet_name=0)
df = df[glcm_feature_columns]

print(df.shape)
print(df.columns)

In [None]:
# Positional split: every column except the last is a feature; the last column is kept
# as a one-column DataFrame (not a Series) for the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1:]
print(type(y))

In [None]:
# Class balance of the full table: absolute counts and share of each 'n_class' value.
counts = df.loc[:, 'n_class'].value_counts()
total_count = counts.sum()
percentage = (counts / total_count) * 100

print("total counts:", total_count )
print("Counts:", counts, sep="\n")
print("\nPercentages:", percentage, sep="\n")

In [None]:
# Hold out 20% of the rows as a test set; the split is stratified on the class column
# and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y.iloc[:, 0],
    random_state=42,
)

# Shapes in the order: train features, train target, test features, test target.
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

In [None]:
# Number of test rows per class; the lookups use the class labels 0 and 1.
test_class_counts = y_test.iloc[:, 0].value_counts()
count1 = test_class_counts[0]
count2 = test_class_counts[1]
print(count1)
print(count2)

# Original GLCM Features

In [None]:
"""
  ada-boosting - stratified sampling
"""

# Depth sweep for the Pyradiomics GLCM features: for every tree depth, fit an AdaBoost
# regressor on each of 5 CV training folds, threshold its output at 0.5 to get a class,
# and average accuracy / mean absolute error over the folds.
# NOTE(review): the "testing" curves score every fold's model on X_test / y_test; the
# held-out fold (X_val1 / y_val1) is created but never used, so the final test set is
# what informs the depth choice.
# NOTE(review): KFold is not stratified - only the train/test split uses `stratify`.
# The names kf, model, train_loss and test_loss remain defined after this cell.
max_depth_range = np.arange(1, 20)  # candidate depths 1..19

# One averaged value per depth.
train_accuracies_ada = []
test_accuracies_ada = []
train_losses_ada = []
test_losses_ada = []

n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

for max_depth in max_depth_range:
    # Per-fold metrics for the current depth.
    train_accuracy = []
    test_accuracy = []
    train_loss = []
    test_loss = []

    for train_index, val_index in kf.split(X_train):
        X_train1, X_val1 = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train1, y_val1 = y_train.iloc[train_index], y_train.iloc[val_index]

        estimator = DecisionTreeRegressor(max_depth=max_depth, min_samples_split=5, criterion='squared_error', random_state=42)
        ada_boost = AdaBoostRegressor(estimator=estimator, n_estimators=40, learning_rate=0.005, loss='square', random_state=42)
        model = MultiOutputRegressor(ada_boost)
        model.fit(X_train1, y_train1)

        # Accuracy is computed on the regression output thresholded at 0.5.
        train_accuracy.append(accuracy_score(y_train1.iloc[:, 0], (model.predict(X_train1)[:, 0] > 0.5).astype(int)))
        test_accuracy.append(accuracy_score(y_test.iloc[:, 0], (model.predict(X_test)[:, 0] > 0.5).astype(int)))
        # "Loss" is the mean absolute error of the raw (unthresholded) output.
        train_loss.append(mean_absolute_error(y_train1, model.predict(X_train1)))
        test_loss.append(mean_absolute_error(y_test, model.predict(X_test)))

    train_accuracies_ada.append(np.mean(train_accuracy))
    test_accuracies_ada.append(np.mean(test_accuracy))
    train_losses_ada.append(np.mean(train_loss))
    test_losses_ada.append(np.mean(test_loss))

# Accuracy versus depth.
plt.plot(max_depth_range, train_accuracies_ada, label='training')
plt.plot(max_depth_range, test_accuracies_ada, label='testing')

# One tick per integer depth.
plt.xticks(np.arange(min(max_depth_range), max(max_depth_range)+1, 1, dtype=int))

plt.xlabel('Depth')
plt.ylabel('Accuracy')
plt.legend()
plt.title('AdaBoosting')
plt.show()

# Mean absolute error versus depth.
plt.plot(max_depth_range, train_losses_ada, label='training')
plt.plot(max_depth_range, test_losses_ada, label='testing')

plt.xticks(np.arange(min(max_depth_range), max(max_depth_range)+1, 1, dtype=int))

plt.xlabel('Depth')
plt.ylabel('Loss')
plt.legend()
plt.title('AdaBoosting')
plt.show()

In [None]:
from sklearn.utils import resample

# Refit AdaBoost at the chosen tree depth on each CV training fold and score every fold's
# model on its own training rows and on the held-out test split.
optimal_depth = 13

train_accuracy_optimal = []
test_accuracy_optimal = []
train_loss_optimal = []
test_loss_optimal = []

for train_index, val_index in kf.split(X_train):
    X_train1_optimal, X_val1_optimal = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train1_optimal, y_val1_optimal = y_train.iloc[train_index], y_train.iloc[val_index]

    estimator = DecisionTreeRegressor(max_depth=optimal_depth, min_samples_split=5, criterion='squared_error', random_state=42)
    ada_boost = AdaBoostRegressor(estimator=estimator, n_estimators=40, learning_rate=0.005, loss='square', random_state=42)
    model_GLCM_optimal = MultiOutputRegressor(ada_boost)
    model_GLCM_optimal.fit(X_train1_optimal, y_train1_optimal)

    # Raw regression outputs of THIS fold's model, computed once and reused below.
    train_scores_optimal = model_GLCM_optimal.predict(X_train1_optimal)
    test_scores_optimal = model_GLCM_optimal.predict(X_test)

    # Class predictions: regression output thresholded at 0.5.
    train_predictions_optimal = (train_scores_optimal[:, 0] > 0.5).astype(int)
    test_predictions_optimal = (test_scores_optimal[:, 0] > 0.5).astype(int)

    train_accuracy_optimal.append(accuracy_score(y_train1_optimal.iloc[:, 0], train_predictions_optimal))
    test_accuracy_optimal.append(accuracy_score(y_test.iloc[:, 0], test_predictions_optimal))
    # The loss must come from the model fitted in this loop, not from the `model`
    # variable left behind by the depth-sweep cell.
    train_loss_optimal.append(mean_absolute_error(y_train1_optimal, train_scores_optimal))
    test_loss_optimal.append(mean_absolute_error(y_test, test_scores_optimal))

# Fold-averaged metrics; the losses are averaged over this cell's own per-fold lists
# (not over the depth sweep's `train_loss` / `test_loss`).
train_accuracy_final = np.mean(train_accuracy_optimal)
test_accuracy_final = np.mean(test_accuracy_optimal)
train_loss_final = np.mean(train_loss_optimal)
test_loss_final = np.mean(test_loss_optimal)

# Compute sensitivity and specificity.
# NOTE: these (and the AUC below) describe the model of the LAST fold only, whereas the
# accuracies above are averages over all folds.
tn, fp, fn, tp = confusion_matrix(y_test.iloc[:, 0], test_predictions_optimal).ravel()
sensitivity_GLCM_optimal = tp / (tp + fn)
specificity_GLCM_optimal = tn / (tn + fp)

# Compute AUC score from the continuous outputs of the last fold's model.
auc_score_GLCM_optimal = roc_auc_score(y_test.iloc[:, 0], test_scores_optimal[:, 0])

In [None]:
# Headline test metrics, rounded to two decimals, one per line.
metric_lines = [
    "Accuracy: {:.2f}".format(test_accuracy_final),
    "Sensitivity: {:.2f}".format(sensitivity_GLCM_optimal),
    "Specificity: {:.2f}".format(specificity_GLCM_optimal),
]
print("\n".join(metric_lines))

In [None]:
# Percentile-bootstrap 95% confidence intervals for accuracy, sensitivity, specificity
# and AUC on the test split.

# Number of bootstrap samples
n_bootstrap_samples = 1000
# Dedicated seeded generator so the reported intervals are reproducible run to run.
bootstrap_rng = np.random.default_rng(42)

y_test_labels = y_test.iloc[:, 0].to_numpy()
# Continuous model outputs: AUC is a ranking metric and must be computed from scores,
# not from the 0/1 predictions (which would collapse the ROC curve to a single point).
test_scores = model_GLCM_optimal.predict(X_test)[:, 0]

sensitivity_values = []
specificity_values = []
auc_values = []
accuracy_values = []

for _ in range(n_bootstrap_samples):
    # Resample the test dataset with replacement
    resampled_indices = bootstrap_rng.integers(0, len(y_test_labels), size=len(y_test_labels))
    y_test_resampled = y_test_labels[resampled_indices]
    test_predictions_resampled = test_predictions_optimal[resampled_indices]
    test_scores_resampled = test_scores[resampled_indices]

    # A resample containing a single class has no defined sensitivity/specificity/AUC
    # (and would break the 4-way unpacking below), so it is skipped.
    if np.unique(y_test_resampled).size < 2:
        continue

    # Calculate accuracy, sensitivity and specificity for the resampled dataset
    tn, fp, fn, tp = confusion_matrix(y_test_resampled, test_predictions_resampled).ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    accuracy = (tp + tn) / (tp + tn + fp + fn)

    # Calculate AUC for the resampled dataset from the continuous scores
    auc = roc_auc_score(y_test_resampled, test_scores_resampled)

    # Store the accuracy, sensitivity, specificity, and AUC values
    sensitivity_values.append(sensitivity)
    specificity_values.append(specificity)
    accuracy_values.append(accuracy)
    auc_values.append(auc)

# Calculate 95% confidence intervals (percentiles) for accuracy, sensitivity, specificity, and AUC
sensitivity_ci = np.percentile(sensitivity_values, [2.5, 97.5])
specificity_ci = np.percentile(specificity_values, [2.5, 97.5])
accuracy_ci = np.percentile(accuracy_values, [2.5, 97.5])
auc_ci = np.percentile(auc_values, [2.5, 97.5])

print("Accuracy 95% CI:", accuracy_ci)
print("Sensitivity 95% CI:", sensitivity_ci)
print("Specificity 95% CI:", specificity_ci)
print("AUC 95% CI:", auc_ci)

In [None]:
import matplotlib.colors as mcolors

# Confusion matrix of the thresholded test predictions, drawn as an annotated heatmap
# (rows: actual class, columns: predicted class).
class_labels = ['Rotten', 'Fresh']
cm = confusion_matrix(y_test.iloc[:, 0], test_predictions_optimal)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=class_labels,
    yticklabels=class_labels,
    annot_kws={"size": 25, "weight": "bold"},
    ax=ax,
)
# Re-apply the tick labels with a larger, bold font.
ax.set_xticklabels(class_labels, fontsize=16, weight='bold')
ax.set_yticklabels(class_labels, fontsize=16, weight='bold')
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# ROC curve built from the continuous test-set outputs of model_GLCM_optimal.
fpr, tpr, thresholds = roc_curve(y_test.iloc[:, 0], model_GLCM_optimal.predict(X_test)[:, 0])

# Plot ROC curve
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC Curve (AUC = {:.2f})'.format(auc_score_GLCM_optimal))
ax.plot([0, 1], [0, 1], 'k--')  # Diagonal line (random classifier)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic Curve')
ax.legend(loc='lower right')
plt.show()

In [None]:
# Predictor Gain Calculation
#
# MultiOutputRegressor.estimators_ holds one FITTED AdaBoostRegressor per target column.
# Each of them exposes `feature_importances_`, which already aggregates the importances of
# every boosted tree. (Refitting the unfitted `.estimator` template would instead measure
# a single, freshly trained tree and ignore the boosted ensemble.)
feature_importances = np.zeros(X_train1_optimal.shape[1])

for boosted_regressor in model_GLCM_optimal.estimators_:
    feature_importances += boosted_regressor.feature_importances_

# Normalize the importances to sum up to 100%
importance_percentage = (feature_importances / feature_importances.sum()) * 100

# Strip the "original_glcm_" prefix; names are taken from the training frame so they are
# guaranteed to line up with the importance vector.
predictor_names = [col.replace('original_glcm_', '') for col in X_train1_optimal.columns]

# Table of predictors ranked by importance (most important first, clean 0..n-1 index).
importance_df = (
    pd.DataFrame({
        'Predictor': predictor_names,
        'Importance (%)': importance_percentage
    })
    .sort_values(by='Importance (%)', ascending=False)
    .reset_index(drop=True)
)

print(importance_df)

In [None]:
# Horizontal bar chart of the ranked importances; the six top-ranked bars are red,
# the remaining ones blue.
colors = ['red' if rank < 6 else 'blue' for rank in range(len(importance_df))]

fig, ax = plt.subplots(figsize=(8, 13))
ax.barh(importance_df['Predictor'], importance_df['Importance (%)'], color=colors)
ax.set_xlabel('Gain', fontsize=20, fontweight='bold')
ax.set_ylabel('Radiomics Metric', fontsize=20, fontweight='bold')
ax.invert_yaxis()  # most important feature at the top
plt.show()

# Haralick GLCM Features

In [None]:
# Haralick-invariant GLCM feature table: first sheet of the workbook, without the
# 'RecordName' column.
df_GLCM_Invariant = (
    pd.read_excel("GLCM_Invariant_Features_Apple_Addclass.xlsx", sheet_name=0)
    .drop(columns='RecordName')
)
print(df_GLCM_Invariant.shape)
print(df_GLCM_Invariant.columns)

In [None]:
# Locate every missing cell (row position, column position) and drop the affected rows.
nan_rows, nan_columns = np.nonzero(df_GLCM_Invariant.isna().to_numpy())
rows_with_nan = df_GLCM_Invariant.index[nan_rows]
df_GLCM_Invariant = df_GLCM_Invariant.drop(index=rows_with_nan)
print(df_GLCM_Invariant.shape)

In [None]:
# Positional split: all columns but the last are features; the last column is the
# target, kept as a one-column DataFrame.
X_GLCM_Invariant, y_GLCM_Invariant = (
    df_GLCM_Invariant.iloc[:, :-1],
    df_GLCM_Invariant.iloc[:, -1:],
)

In [None]:
# Class balance of the Haralick table: absolute counts and share of each 'n_class' value.
counts = df_GLCM_Invariant.loc[:, 'n_class'].value_counts()
total_count = counts.sum()
percentage = (counts / total_count) * 100

print("total counts:", total_count )
print("Counts:", counts, sep="\n")
print("\nPercentages:", percentage, sep="\n")

In [None]:
# Stratified, seeded 80/20 split of the Haralick table.
X_train_Haralick, X_test_Haralick, y_train_Haralick, y_test_Haralick = train_test_split(
    X_GLCM_Invariant,
    y_GLCM_Invariant,
    test_size=0.2,
    stratify=y_GLCM_Invariant.iloc[:, 0],
    random_state=42,
)

# Shapes in the order: train features, train target, test features, test target.
for split_part in (X_train_Haralick, y_train_Haralick, X_test_Haralick, y_test_Haralick):
    print(split_part.shape)

In [None]:
# Number of Haralick test rows per class; the lookups use the class labels 0 and 1.
haralick_test_class_counts = y_test_Haralick.iloc[:, 0].value_counts()
count1 = haralick_test_class_counts[0]
count2 = haralick_test_class_counts[1]
print(count1)
print(count2)

In [None]:
"""
  ada-boosting - stratified sampling
"""

# Depth sweep for the Haralick-invariant features: for every tree depth, fit an AdaBoost
# regressor on each of 5 CV training folds, threshold its output at 0.5 to get a class,
# and average accuracy / mean absolute error over the folds.
# NOTE(review): the "testing" curves score every fold's model on X_test_Haralick /
# y_test_Haralick; the held-out fold (X_val1_* / y_val1_*) is created but never used.
# NOTE(review): KFold is not stratified - only the train/test split uses `stratify`.
# The names kf, train_loss and test_loss remain defined after this cell.
max_depth_range = np.arange(1, 20)  # candidate depths 1..19

# One averaged value per depth.
train_accuracies_ada_GLCM_HaralickInvariant = []
test_accuracies_ada_GLCM_HaralickInvariant = []
train_losses_ada_GLCM_HaralickInvariant = []
test_losses_ada_GLCM_HaralickInvariant = []

n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

for max_depth in max_depth_range:
    # Per-fold metrics for the current depth.
    train_accuracy = []
    test_accuracy = []
    train_loss = []
    test_loss = []
    for train_index, val_index in kf.split(X_train_Haralick):
        X_train1_GLCM_HaralickInvariant, X_val1_GLCM_HaralickInvariant = X_train_Haralick.iloc[train_index], X_train_Haralick.iloc[val_index]
        y_train1_GLCM_HaralickInvariant, y_val1_GLCM_HaralickInvariant = y_train_Haralick.iloc[train_index], y_train_Haralick.iloc[val_index]

        estimator = DecisionTreeRegressor(max_depth=max_depth, min_samples_split=5, criterion='squared_error', random_state=42)
        ada_boost = AdaBoostRegressor(estimator=estimator, n_estimators=100, learning_rate=0.002, loss='square', random_state=42)
        model_GLCM_HaralickInvariant = MultiOutputRegressor(ada_boost)
        model_GLCM_HaralickInvariant.fit(X_train1_GLCM_HaralickInvariant, y_train1_GLCM_HaralickInvariant)

        # Accuracy is computed on the regression output thresholded at 0.5.
        train_accuracy.append(accuracy_score(y_train1_GLCM_HaralickInvariant.iloc[:, 0], (model_GLCM_HaralickInvariant.predict(X_train1_GLCM_HaralickInvariant)[:, 0] > 0.5).astype(int)))
        test_accuracy.append(accuracy_score(y_test_Haralick.iloc[:, 0], (model_GLCM_HaralickInvariant.predict(X_test_Haralick)[:, 0] > 0.5).astype(int)))
        # "Loss" is the mean absolute error of the raw (unthresholded) output.
        train_loss.append(mean_absolute_error(y_train1_GLCM_HaralickInvariant, model_GLCM_HaralickInvariant.predict(X_train1_GLCM_HaralickInvariant)))
        test_loss.append(mean_absolute_error(y_test_Haralick, model_GLCM_HaralickInvariant.predict(X_test_Haralick)))

    train_accuracies_ada_GLCM_HaralickInvariant.append(np.mean(train_accuracy))
    test_accuracies_ada_GLCM_HaralickInvariant.append(np.mean(test_accuracy))
    train_losses_ada_GLCM_HaralickInvariant.append(np.mean(train_loss))
    test_losses_ada_GLCM_HaralickInvariant.append(np.mean(test_loss))

# Accuracy versus depth.
plt.plot(max_depth_range, train_accuracies_ada_GLCM_HaralickInvariant, label='training')
plt.plot(max_depth_range, test_accuracies_ada_GLCM_HaralickInvariant, label='testing')

# One tick per integer depth.
plt.xticks(np.arange(min(max_depth_range), max(max_depth_range)+1, 1, dtype=int))

# Disabled: dashed marker line at a chosen depth.
#optimal_depth = 13
#plt.axvline(x=optimal_depth, color='r', linestyle='--')

plt.xlabel('Depth')
plt.ylabel('Accuracy')
plt.legend()
plt.title('AdaBoosting-Haralick-Invariant')
plt.show()

# Mean absolute error versus depth.
plt.plot(max_depth_range, train_losses_ada_GLCM_HaralickInvariant, label='training')
plt.plot(max_depth_range, test_losses_ada_GLCM_HaralickInvariant, label='testing')

plt.xticks(np.arange(min(max_depth_range), max(max_depth_range)+1, 1, dtype=int))

# Disabled: dashed marker line at a chosen depth.
#plt.axvline(x=optimal_depth, color='r', linestyle='--')

plt.xlabel('Depth')
plt.ylabel('Loss')
plt.legend()
plt.title('AdaBoosting-Haralick-Invariant')
plt.show()

In [None]:
# Refit AdaBoost at the chosen tree depth on each CV training fold of the Haralick
# features and score every fold's model on its training rows and on the test split.
optimal_depth = 11

train_accuracy_optimal = []
test_accuracy_optimal = []
train_loss_optimal = []
test_loss_optimal = []

for train_index, val_index in kf.split(X_train_Haralick):
    X_train1_GLCM_HaralickInvariant_optimal, X_val1_GLCM_HaralickInvariant_optimal = X_train_Haralick.iloc[train_index], X_train_Haralick.iloc[val_index]
    y_train1_GLCM_HaralickInvariant_optimal, y_val1_GLCM_HaralickInvariant_optimal = y_train_Haralick.iloc[train_index], y_train_Haralick.iloc[val_index]

    estimator = DecisionTreeRegressor(max_depth=optimal_depth, min_samples_split=5, criterion='squared_error', random_state=42)
    ada_boost = AdaBoostRegressor(estimator=estimator, n_estimators=100, learning_rate=0.002, loss='square', random_state=42)
    model_GLCM_HaralickInvariant_optimal = MultiOutputRegressor(ada_boost)
    model_GLCM_HaralickInvariant_optimal.fit(X_train1_GLCM_HaralickInvariant_optimal, y_train1_GLCM_HaralickInvariant_optimal)

    # Raw regression outputs of this fold's model, computed once and reused below.
    train_scores_optimal_Haralick = model_GLCM_HaralickInvariant_optimal.predict(X_train1_GLCM_HaralickInvariant_optimal)
    test_scores_optimal_Haralick = model_GLCM_HaralickInvariant_optimal.predict(X_test_Haralick)

    # Class predictions: regression output thresholded at 0.5.
    train_predictions_optimal_Haralick = (train_scores_optimal_Haralick[:, 0] > 0.5).astype(int)
    test_predictions_optimal_Haralick = (test_scores_optimal_Haralick[:, 0] > 0.5).astype(int)

    train_accuracy_optimal.append(accuracy_score(y_train1_GLCM_HaralickInvariant_optimal.iloc[:, 0], train_predictions_optimal_Haralick))
    test_accuracy_optimal.append(accuracy_score(y_test_Haralick.iloc[:, 0], test_predictions_optimal_Haralick))
    train_loss_optimal.append(mean_absolute_error(y_train1_GLCM_HaralickInvariant_optimal, train_scores_optimal_Haralick))
    test_loss_optimal.append(mean_absolute_error(y_test_Haralick, test_scores_optimal_Haralick))

# Fold-averaged metrics; the losses are averaged over this cell's own per-fold lists
# (not over the depth sweep's `train_loss` / `test_loss`).
train_accuracy_final = np.mean(train_accuracy_optimal)
test_accuracy_final = np.mean(test_accuracy_optimal)
train_loss_final = np.mean(train_loss_optimal)
test_loss_final = np.mean(test_loss_optimal)

# Compute sensitivity and specificity.
# NOTE: these (and the AUC below) describe the model of the LAST fold only, whereas the
# accuracies above are averages over all folds.
tn, fp, fn, tp = confusion_matrix(y_test_Haralick.iloc[:, 0], test_predictions_optimal_Haralick).ravel()
sensitivity_GLCM_HaralickInvariant_optimal = tp / (tp + fn)
specificity_GLCM_HaralickInvariant_optimal = tn / (tn + fp)

# Compute AUC score from the continuous outputs of the last fold's model.
auc_score_HaralickInvariant_GLCM_optimal = roc_auc_score(y_test_Haralick.iloc[:, 0], test_scores_optimal_Haralick[:, 0])

In [None]:
# Headline Haralick test metrics, rounded to two decimals, one per line.
metric_lines = [
    "Accuracy: {:.2f}".format(test_accuracy_final),
    "Sensitivity: {:.2f}".format(sensitivity_GLCM_HaralickInvariant_optimal),
    "Specificity: {:.2f}".format(specificity_GLCM_HaralickInvariant_optimal),
]
print("\n".join(metric_lines))

In [None]:
# Percentile-bootstrap 95% confidence intervals for accuracy, sensitivity, specificity
# and AUC on the Haralick test split.
n_bootstrap_samples = 1000
# Dedicated seeded generator so the reported intervals are reproducible run to run.
bootstrap_rng = np.random.default_rng(42)

y_test_Haralick_labels = y_test_Haralick.iloc[:, 0].to_numpy()
# Continuous model outputs: AUC is a ranking metric and must be computed from scores,
# not from the 0/1 predictions (which would collapse the ROC curve to a single point).
test_scores_Haralick = model_GLCM_HaralickInvariant_optimal.predict(X_test_Haralick)[:, 0]

sensitivity_values = []
specificity_values = []
auc_values = []
accuracy_values = []

for _ in range(n_bootstrap_samples):
    # Resample the test dataset with replacement
    resampled_indices = bootstrap_rng.integers(0, len(y_test_Haralick_labels), size=len(y_test_Haralick_labels))
    y_test_GLCM_Invariant_resampled = y_test_Haralick_labels[resampled_indices]
    test_predictions_resampled = test_predictions_optimal_Haralick[resampled_indices]
    test_scores_resampled = test_scores_Haralick[resampled_indices]

    # A resample containing a single class has no defined sensitivity/specificity/AUC
    # (and would break the 4-way unpacking below), so it is skipped.
    if np.unique(y_test_GLCM_Invariant_resampled).size < 2:
        continue

    # Calculate accuracy, sensitivity and specificity for the resampled dataset
    tn, fp, fn, tp = confusion_matrix(y_test_GLCM_Invariant_resampled, test_predictions_resampled).ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    accuracy = (tp + tn) / (tp + tn + fp + fn)

    # Calculate AUC for the resampled dataset from the continuous scores
    auc = roc_auc_score(y_test_GLCM_Invariant_resampled, test_scores_resampled)

    # Store the accuracy, sensitivity, specificity, and AUC values
    sensitivity_values.append(sensitivity)
    specificity_values.append(specificity)
    auc_values.append(auc)
    accuracy_values.append(accuracy)

# Calculate 95% confidence intervals (percentiles) for accuracy, sensitivity, specificity, and AUC
accuracy_ci = np.percentile(accuracy_values, [2.5, 97.5])
sensitivity_ci = np.percentile(sensitivity_values, [2.5, 97.5])
specificity_ci = np.percentile(specificity_values, [2.5, 97.5])
auc_ci = np.percentile(auc_values, [2.5, 97.5])

print("Accuracy 95% CI:", accuracy_ci)
print("Sensitivity 95% CI:", sensitivity_ci)
print("Specificity 95% CI:", specificity_ci)
print("AUC 95% CI:", auc_ci)

In [None]:
# Confusion matrix of the thresholded (0.5) Haralick test predictions, drawn as an
# annotated heatmap (rows: actual class, columns: predicted class).
class_labels = ['Rotten', 'Fresh']
haralick_test_classes = (model_GLCM_HaralickInvariant_optimal.predict(X_test_Haralick)[:, 0] > 0.5).astype(int)
cm = confusion_matrix(y_test_Haralick.iloc[:, 0], haralick_test_classes)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=class_labels,
    yticklabels=class_labels,
    annot_kws={"size": 25, "weight": "bold"},
    ax=ax,
)
# Re-apply the tick labels with a larger, bold font.
ax.set_xticklabels(class_labels, fontsize=16, weight='bold')
ax.set_yticklabels(class_labels, fontsize=16, weight='bold')
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# ROC curve built from the continuous test-set outputs of the Haralick model.
fpr, tpr, thresholds = roc_curve(
    y_test_Haralick.iloc[:, 0],
    model_GLCM_HaralickInvariant_optimal.predict(X_test_Haralick)[:, 0],
)

# Plot ROC curve
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC Curve (AUC = {:.2f})'.format(auc_score_HaralickInvariant_GLCM_optimal))
ax.plot([0, 1], [0, 1], 'k--')  # Diagonal line (random classifier)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic Curve')
ax.legend(loc='lower right')
plt.show()

In [None]:
# Predictor Gain Calculation
#
# MultiOutputRegressor.estimators_ holds one FITTED AdaBoostRegressor per target column.
# Each of them exposes `feature_importances_`, which already aggregates the importances of
# every boosted tree. (Refitting the unfitted `.estimator` template would instead measure
# a single, freshly trained tree and ignore the boosted ensemble.)
feature_importances = np.zeros(X_train1_GLCM_HaralickInvariant_optimal.shape[1])

for boosted_regressor in model_GLCM_HaralickInvariant_optimal.estimators_:
    feature_importances += boosted_regressor.feature_importances_

# Normalize the importances to sum up to 100%
importance_percentage = (feature_importances / feature_importances.sum()) * 100

# Strip the "original_" prefix; names are taken from the training frame so they are
# guaranteed to line up with the importance vector.
predictor_names = [col.replace('original_', '') for col in X_train1_GLCM_HaralickInvariant_optimal.columns]

# Table of predictors ranked by importance (most important first, clean 0..n-1 index).
importance_df = (
    pd.DataFrame({
        'Predictor': predictor_names,
        'Importance (%)': importance_percentage
    })
    .sort_values(by='Importance (%)', ascending=False)
    .reset_index(drop=True)
)

print(importance_df)

In [None]:
# Horizontal bar chart of the ranked importances; the six top-ranked bars are red,
# the remaining ones blue.
colors = ['red' if rank < 6 else 'blue' for rank in range(len(importance_df))]

fig, ax = plt.subplots(figsize=(8, 13))
ax.barh(importance_df['Predictor'], importance_df['Importance (%)'], color=colors)
ax.set_xlabel('Gain', fontsize=20, fontweight='bold')
ax.set_ylabel('Radiomics Metric', fontsize=20, fontweight='bold')
ax.invert_yaxis()  # most important feature at the top
plt.show()

# Invariant GLCM Features

In [None]:
# Empirical-invariant GLCM feature table.
# The 'Unnamed: 0' column is discarded and the remaining columns are put into a fixed
# order with the 'n_class' label LAST (later cells split features/target by position).
empirical_feature_columns = [
    'original_glcm_Autocorrelation',
    'original_glcm_ClusterProminence',
    'original_glcm_ClusterShade',
    'original_glcm_ClusterTendency',
    'original_glcm_Contrast',
    'original_glcm_Correlation',
    'original_glcm_DifferenceAverage',
    'original_glcm_DifferenceEntropy',
    'original_glcm_DifferenceVariance',
    'original_glcm_Id',
    'original_glcm_Idm',
    'original_glcm_Idmn',
    'original_glcm_Idn',
    'original_glcm_Imc1',
    'original_glcm_Imc2',
    'original_glcm_InverseVariance',
    'original_glcm_JointAverage',
    'original_glcm_JointEnergy',
    'original_glcm_JointEntropy',
    'original_glcm_MCC',
    'original_glcm_MaximumProbability',
    'original_glcm_SumAverage',
    'original_glcm_SumEntropy',
    'original_glcm_SumSquares',
    'n_class',
]

df_Empirical_GLCM_Invariant = pd.read_excel("Empirical_Invariant_Feature_Apple_Addclass.xlsx", sheet_name=0)
df_Empirical_GLCM_Invariant = df_Empirical_GLCM_Invariant.drop(columns='Unnamed: 0')
df_Empirical_GLCM_Invariant = df_Empirical_GLCM_Invariant[empirical_feature_columns]

print(df_Empirical_GLCM_Invariant.shape)
print(df_Empirical_GLCM_Invariant.columns)

In [None]:
# Locate every missing cell (row position, column position) and drop the affected rows.
nan_rows, nan_columns = np.nonzero(df_Empirical_GLCM_Invariant.isna().to_numpy())
rows_with_nan = df_Empirical_GLCM_Invariant.index[nan_rows]
df_Empirical_GLCM_Invariant = df_Empirical_GLCM_Invariant.drop(index=rows_with_nan)
print(df_Empirical_GLCM_Invariant.shape)

In [None]:
# Positional split: all columns but the last are features; the last column is the
# target, kept as a one-column DataFrame.
X_GLCM_EmpiricalInvariant, y_GLCM_EmpiricalInvariant = (
    df_Empirical_GLCM_Invariant.iloc[:, :-1],
    df_Empirical_GLCM_Invariant.iloc[:, -1:],
)

In [None]:
# Stratified, seeded 80/20 split of the empirical-invariant table.
(
    X_train_GLCM_Empirical_Invariant,
    X_test_GLCM_Empirical_Invariant,
    y_train_GLCM_Empirical_Invariant,
    y_test_GLCM_Empirical_Invariant,
) = train_test_split(
    X_GLCM_EmpiricalInvariant,
    y_GLCM_EmpiricalInvariant,
    test_size=0.2,
    stratify=y_GLCM_EmpiricalInvariant.iloc[:, 0],
    random_state=42,
)

# Shapes in the order: train features, train target, test features, test target.
for split_part in (
    X_train_GLCM_Empirical_Invariant,
    y_train_GLCM_Empirical_Invariant,
    X_test_GLCM_Empirical_Invariant,
    y_test_GLCM_Empirical_Invariant,
):
    print(split_part.shape)

In [None]:
# Number of empirical-invariant test rows per class; the lookups use the class labels 0 and 1.
empirical_test_class_counts = y_test_GLCM_Empirical_Invariant.iloc[:, 0].value_counts()
count1 = empirical_test_class_counts[0]
count2 = empirical_test_class_counts[1]
print(count1)
print(count2)

In [None]:
"""
  ada-boosting
"""

# Depth sweep for the empirical-invariant features: for every tree depth, fit an AdaBoost
# regressor on each of 5 CV training folds, threshold its output at 0.5 to get a class,
# and average accuracy / mean absolute error over the folds.
# NOTE(review): the "testing" curves score every fold's model on the held-out test split;
# the validation fold (X_val1_* / y_val1_*) is created but never used.
# NOTE(review): KFold is not stratified - only the train/test split uses `stratify`.
# The names kf, train_loss and test_loss remain defined after this cell.
max_depth_range = np.arange(1, 20)  # candidate depths 1..19

# One averaged value per depth.
train_accuracies_ada_GLCM_EmpiricalInvariant = []
test_accuracies_ada_GLCM_EmpiricalInvariant = []
train_losses_ada_GLCM_EmpiricalInvariant = []
test_losses_ada_GLCM_EmpiricalInvariant = []

n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

for max_depth in max_depth_range:
    # Per-fold metrics for the current depth.
    train_accuracy = []
    test_accuracy = []
    train_loss = []
    test_loss = []
    for train_index, val_index in kf.split(X_train_GLCM_Empirical_Invariant):
        X_train1_GLCM_Empirical_Invariant, X_val1_GLCM_Empirical_Invariant = X_train_GLCM_Empirical_Invariant.iloc[train_index], X_train_GLCM_Empirical_Invariant.iloc[val_index]
        y_train1_GLCM_Empirical_Invariant, y_val1_GLCM_Empirical_Invariant = y_train_GLCM_Empirical_Invariant.iloc[train_index], y_train_GLCM_Empirical_Invariant.iloc[val_index]

        # Unlike the two sweeps above, this one uses min_samples_split=2 and 200 boosting rounds.
        estimator = DecisionTreeRegressor(max_depth=max_depth, min_samples_split=2, criterion='squared_error', random_state=42)
        ada_boost = AdaBoostRegressor(estimator=estimator, n_estimators=200, learning_rate=0.002, loss='square', random_state=42)
        model_GLCM_EmpiricalInvariant = MultiOutputRegressor(ada_boost)
        model_GLCM_EmpiricalInvariant.fit(X_train1_GLCM_Empirical_Invariant, y_train1_GLCM_Empirical_Invariant)

        # Accuracy is computed on the regression output thresholded at 0.5.
        train_accuracy.append(accuracy_score(y_train1_GLCM_Empirical_Invariant.iloc[:, 0], (model_GLCM_EmpiricalInvariant.predict(X_train1_GLCM_Empirical_Invariant)[:, 0] > 0.5).astype(int)))
        test_accuracy.append(accuracy_score(y_test_GLCM_Empirical_Invariant.iloc[:, 0], (model_GLCM_EmpiricalInvariant.predict(X_test_GLCM_Empirical_Invariant)[:, 0] > 0.5).astype(int)))
        # "Loss" is the mean absolute error of the raw (unthresholded) output.
        train_loss.append(mean_absolute_error(y_train1_GLCM_Empirical_Invariant, model_GLCM_EmpiricalInvariant.predict(X_train1_GLCM_Empirical_Invariant)))
        test_loss.append(mean_absolute_error(y_test_GLCM_Empirical_Invariant, model_GLCM_EmpiricalInvariant.predict(X_test_GLCM_Empirical_Invariant)))

    train_accuracies_ada_GLCM_EmpiricalInvariant.append(np.mean(train_accuracy))
    test_accuracies_ada_GLCM_EmpiricalInvariant.append(np.mean(test_accuracy))
    train_losses_ada_GLCM_EmpiricalInvariant.append(np.mean(train_loss))
    test_losses_ada_GLCM_EmpiricalInvariant.append(np.mean(test_loss))

# Accuracy versus depth.
plt.plot(max_depth_range, train_accuracies_ada_GLCM_EmpiricalInvariant, label='training')
plt.plot(max_depth_range, test_accuracies_ada_GLCM_EmpiricalInvariant, label='testing')

# One tick per integer depth.
plt.xticks(np.arange(min(max_depth_range), max(max_depth_range)+1, 1, dtype=int))

# Disabled: dashed marker line at a chosen depth.
#optimal_depth = 13
#plt.axvline(x=optimal_depth, color='r', linestyle='--')

plt.xlabel('Depth')
plt.ylabel('Accuracy')
plt.legend()
plt.title('AdaBoosting-Empirical-Invariant')
plt.show()

# Mean absolute error versus depth.
plt.plot(max_depth_range, train_losses_ada_GLCM_EmpiricalInvariant, label='training')
plt.plot(max_depth_range, test_losses_ada_GLCM_EmpiricalInvariant, label='testing')

plt.xticks(np.arange(min(max_depth_range), max(max_depth_range)+1, 1, dtype=int))

# Disabled: dashed marker line at a chosen depth.
#plt.axvline(x=optimal_depth, color='r', linestyle='--')

plt.xlabel('Depth')
plt.ylabel('Loss')
plt.legend()
plt.title('AdaBoosting-Empirical-Invariant')
plt.show()

In [None]:
# Evaluate the Empirical-Invariant AdaBoost model at a fixed tree depth.
# For every K-fold split of the training set a fresh model is fitted on the
# training part of the split and scored on (a) that training part and (b) the
# held-out test set. The validation part of each split is materialised but not
# scored in this cell.
optimal_depth = 13  # presumably picked from the depth-sweep plots - TODO confirm

train_accuracy_optimal = []
test_accuracy_optimal = []
train_loss_optimal = []
test_loss_optimal = []

for train_index, val_index in kf.split(X_train_GLCM_Empirical_Invariant):
  X_train1_GLCM_EmpiricalInvariant_optimal, X_val1_GLCM_EmpiricalInvariant_optimal = X_train_GLCM_Empirical_Invariant.iloc[train_index], X_train_GLCM_Empirical_Invariant.iloc[val_index]
  y_train1_GLCM_EmpiricalInvariant_optimal, y_val1_GLCM_EmpiricalInvariant_optimal = y_train_GLCM_Empirical_Invariant.iloc[train_index], y_train_GLCM_Empirical_Invariant.iloc[val_index]

  estimator = DecisionTreeRegressor(max_depth=optimal_depth, min_samples_split=2, criterion='squared_error', random_state=42)
  ada_boost = AdaBoostRegressor(estimator=estimator, n_estimators=200, learning_rate=0.002, loss='square', random_state=42)
  model_GLCM_EmpiricalInvariant_optimal = MultiOutputRegressor(ada_boost)
  model_GLCM_EmpiricalInvariant_optimal.fit(X_train1_GLCM_EmpiricalInvariant_optimal, y_train1_GLCM_EmpiricalInvariant_optimal)

  # Predict once per split and reuse the raw regression outputs for every metric.
  train_scores_optimal_Empirical = model_GLCM_EmpiricalInvariant_optimal.predict(X_train1_GLCM_EmpiricalInvariant_optimal)
  test_scores_optimal_Empirical = model_GLCM_EmpiricalInvariant_optimal.predict(X_test_GLCM_Empirical_Invariant)

  # Class predictions: first output column thresholded at 0.5.
  train_predictions_optimal_Empirical = (train_scores_optimal_Empirical[:, 0] > 0.5).astype(int)
  test_predictions_optimal_Empirical = (test_scores_optimal_Empirical[:, 0] > 0.5).astype(int)

  train_accuracy_optimal.append(accuracy_score(y_train1_GLCM_EmpiricalInvariant_optimal.iloc[:, 0], train_predictions_optimal_Empirical))
  test_accuracy_optimal.append(accuracy_score(y_test_GLCM_Empirical_Invariant.iloc[:, 0], test_predictions_optimal_Empirical))
  train_loss_optimal.append(mean_absolute_error(y_train1_GLCM_EmpiricalInvariant_optimal, train_scores_optimal_Empirical))
  test_loss_optimal.append(mean_absolute_error(y_test_GLCM_Empirical_Invariant, test_scores_optimal_Empirical))

# Average over the splits. The losses must come from the *_optimal lists filled
# above, not from the train_loss/test_loss lists left over from the depth sweep.
train_accuracy_final = np.mean(train_accuracy_optimal)
test_accuracy_final = np.mean(test_accuracy_optimal)
train_loss_final = np.mean(train_loss_optimal)
test_loss_final = np.mean(test_loss_optimal)

# Compute sensitivity and specificity.
# These (and the AUC below) use the model fitted on the LAST split only, i.e.
# the predictions still bound to test_*_optimal_Empirical after the loop.
tn, fp, fn, tp = confusion_matrix(y_test_GLCM_Empirical_Invariant.iloc[:, 0], test_predictions_optimal_Empirical).ravel()
sensitivity_GLCM_EmpiricalInvariant_optimal = tp / (tp + fn)
specificity_GLCM_EmpiricalInvariant_optimal = tn / (tn + fp)

# Compute AUC score from the continuous outputs (not the thresholded labels).
auc_score_EmpiricalInvariant_GLCM_optimal = roc_auc_score(y_test_GLCM_Empirical_Invariant.iloc[:, 0], test_scores_optimal_Empirical[:, 0])

In [None]:
# Headline test-set metrics for the Empirical-Invariant model, printed with
# two decimals each.
metric_report = (
    ("Accuracy: {:.2f}", test_accuracy_final),
    ("Sensitivity: {:.2f}", sensitivity_GLCM_EmpiricalInvariant_optimal),
    ("Specificity: {:.2f}", specificity_GLCM_EmpiricalInvariant_optimal),
    ("AUC: {:.2f}", auc_score_EmpiricalInvariant_GLCM_optimal),
)
for line_template, metric_value in metric_report:
    print(line_template.format(metric_value))

In [None]:
# Confusion-matrix heatmap of the test set: first output column of the model,
# thresholded at 0.5, against the first label column.
test_scores_for_cm = model_GLCM_EmpiricalInvariant_optimal.predict(X_test_GLCM_Empirical_Invariant)[:, 0]
test_labels_for_cm = (test_scores_for_cm > 0.5).astype(int)
cm = confusion_matrix(y_test_GLCM_Empirical_Invariant.iloc[:, 0], test_labels_for_cm)

# NOTE(review): assumes class 0 is "Rotten" and class 1 is "Fresh" - confirm
# against the n_class encoding.
class_labels = ['Rotten', 'Fresh']
tick_font = {'fontsize': 16, 'weight': 'bold'}

plt.figure(figsize=(8, 6))
# heatmap draws on the current axes and hands them back.
ax = sns.heatmap(
    cm,
    annot=True,
    cmap='Blues',
    fmt='d',
    cbar=False,
    xticklabels=class_labels,
    yticklabels=class_labels,
    annot_kws={"size": 25, "weight": "bold"},
)
ax.set_xticklabels(class_labels, **tick_font)
ax.set_yticklabels(class_labels, **tick_font)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# ROC curve on the test set, built from the continuous first-column outputs of
# the model (no thresholding).
roc_labels = y_test_GLCM_Empirical_Invariant.iloc[:, 0]
roc_scores = model_GLCM_EmpiricalInvariant_optimal.predict(X_test_GLCM_Empirical_Invariant)[:, 0]
fpr, tpr, thresholds = roc_curve(roc_labels, roc_scores)

# Plot ROC curve
roc_legend = 'ROC Curve (AUC = {:.2f})'.format(auc_score_EmpiricalInvariant_GLCM_optimal)
plt.plot(fpr, tpr, label=roc_legend)
# Diagonal line (random classifier)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Bootstrap 95% confidence intervals for accuracy, sensitivity, specificity and
# AUC on the test set: resample the test rows with replacement
# n_bootstrap_samples times and take the 2.5th / 97.5th percentiles.
sensitivity_values = []
specificity_values = []
auc_values = []
accuracy_values = []

# Continuous outputs of the model (first column). AUC is rank-based, so it must
# be computed from these scores rather than from the 0/1 thresholded labels;
# this also matches how the point-estimate AUC is computed.
test_scores_bootstrap_Empirical = model_GLCM_EmpiricalInvariant_optimal.predict(X_test_GLCM_Empirical_Invariant)[:, 0]
n_test_rows = len(y_test_GLCM_Empirical_Invariant)

# Local seeded generator so the intervals are reproducible across runs without
# touching NumPy's global random state.
bootstrap_rng = np.random.default_rng(42)

for _ in range(n_bootstrap_samples):
    # Resample the test dataset with replacement
    resampled_indices = bootstrap_rng.choice(n_test_rows, n_test_rows, replace=True)
    y_test_GLCM_Empirical_Invariant_resampled = y_test_GLCM_Empirical_Invariant.iloc[resampled_indices]
    test_predictions_resampled = test_predictions_optimal_Empirical[resampled_indices]
    test_scores_resampled = test_scores_bootstrap_Empirical[resampled_indices]

    y_true_resampled = y_test_GLCM_Empirical_Invariant_resampled.iloc[:, 0]

    # A resample containing a single class has no 2x2 confusion matrix and an
    # undefined AUC (roc_auc_score raises); skip it instead of crashing.
    if y_true_resampled.nunique() < 2:
        continue

    # Calculate accuracy, sensitivity and specificity for the resampled dataset
    tn, fp, fn, tp = confusion_matrix(y_true_resampled, test_predictions_resampled).ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    accuracy = (tp + tn) / (tp + tn + fp + fn)

    # Calculate AUC for the resampled dataset from the continuous scores
    auc = roc_auc_score(y_true_resampled, test_scores_resampled)

    # Store the accuracy, sensitivity, specificity, and AUC values
    sensitivity_values.append(sensitivity)
    specificity_values.append(specificity)
    auc_values.append(auc)
    accuracy_values.append(accuracy)

# Calculate 95% confidence intervals (percentiles) for accuracy, sensitivity, specificity, and AUC
accuracy_ci = np.percentile(accuracy_values, [2.5, 97.5])
sensitivity_ci = np.percentile(sensitivity_values, [2.5, 97.5])
specificity_ci = np.percentile(specificity_values, [2.5, 97.5])
auc_ci = np.percentile(auc_values, [2.5, 97.5])

print("Accuracy 95% CI:", accuracy_ci)
print("Sensitivity 95% CI:", sensitivity_ci)
print("Specificity 95% CI:", specificity_ci)
print("AUC 95% CI:", auc_ci)

In [None]:
# Predictor Gain Calculation
# Reports, per input feature, its share (in %) of the impurity-based feature
# importance of the fitted AdaBoost model.

feature_importances = np.zeros(X_train1_GLCM_EmpiricalInvariant_optimal.shape[1])

# MultiOutputRegressor.estimators_ holds one fitted AdaBoostRegressor per
# target column. Read each fitted ensemble's own feature_importances_ (the
# importances of its boosted trees combined by AdaBoost) rather than refitting
# the unfitted base-tree template, which would describe a single plain
# decision tree instead of the boosted model.
for fitted_ada_boost in model_GLCM_EmpiricalInvariant_optimal.estimators_:
    feature_importances += fitted_ada_boost.feature_importances_

# Normalize the importances to sum up to 100%
importance_percentage = (feature_importances / feature_importances.sum()) * 100

# Modify the predictor names to remove the "original_glcm_" prefix.
# The last column of the frame is dropped here.
# NOTE(review): assumes these columns are in the same order as the model's
# input features - confirm.
predictor_names = [col.replace('original_glcm_', '') for col in df_Empirical_GLCM_Invariant.columns[:-1]]

# Table of predictors ranked by importance (descending), with a clean 0..n-1 index.
importance_df = (
    pd.DataFrame({
        'Predictor': predictor_names,
        'Importance (%)': importance_percentage
    })
    .sort_values(by='Importance (%)', ascending=False)
    .reset_index(drop=True)
)

print(importance_df)

In [None]:
# Horizontal bar chart of the ranked predictor importances.
# The first six rows of importance_df are drawn in red, the rest in blue.
n_predictors = importance_df.shape[0]
n_highlighted = min(6, n_predictors)
colors = ['red'] * n_highlighted + ['blue'] * (n_predictors - n_highlighted)

axis_font = {'fontsize': 20, 'fontweight': 'bold'}

plt.figure(figsize=(8, 13))
plt.barh(importance_df['Predictor'], importance_df['Importance (%)'], color=colors)
plt.xlabel('Gain', **axis_font)
plt.ylabel('Radiomics Metric', **axis_font)
# Title intentionally left off (kept for reference):
#plt.title('Feature Importances')
# Flip the y-axis so the first row of importance_df appears at the top.
plt.gca().invert_yaxis()
plt.show()

# Comparison

In [None]:
# Side-by-side summary of the three feature sets: sensitivity, specificity and
# AUC from the optimal-depth evaluations, plus the best testing accuracy seen
# across the depth sweep of each model.
max_accuracy_GLCM = max(test_accuracies_ada)
max_accuracy_GLCM_HaralickInvariant = max(test_accuracies_ada_GLCM_HaralickInvariant)
max_accuracy_GLCM_EmpiricalInvariant = max(test_accuracies_ada_GLCM_EmpiricalInvariant)

# Two-level column header: feature-set name over its feature count.
headers = pd.MultiIndex.from_tuples([
    ("Original GLCM", "24 features"),
    ("Haralick Invariant GLCM", "24 features"),
    ("Empirical Invariant GLCM", "24 features")
])

# One list per feature set, ordered like the row index below.
metric_rows = ["Sensitivity", "Specificity", "AUC", "Accuracy"]
metric_columns = [
    [sensitivity_GLCM_optimal, specificity_GLCM_optimal, auc_score_GLCM_optimal, max_accuracy_GLCM],
    [sensitivity_GLCM_HaralickInvariant_optimal, specificity_GLCM_HaralickInvariant_optimal, auc_score_HaralickInvariant_GLCM_optimal, max_accuracy_GLCM_HaralickInvariant],
    [sensitivity_GLCM_EmpiricalInvariant_optimal, specificity_GLCM_EmpiricalInvariant_optimal, auc_score_EmpiricalInvariant_GLCM_optimal, max_accuracy_GLCM_EmpiricalInvariant],
]

# NOTE(review): this rebinds the notebook-level name `df`, which the loading
# cell near the top uses for the feature table; re-running earlier cells after
# this one will see the summary table instead.
df = pd.DataFrame(dict(zip(headers, metric_columns)), index=metric_rows)

# Vertical separator on the right edge of every table cell.
styles = [
    {'selector': 'td',
     'props': [('border-right', 'solid 1px')]
    }
]

# Two-decimal formatting plus the cell borders; set_table_styles returns the
# same Styler, so the calls can be chained.
styled_df = df.style.format("{:.2f}").set_table_styles(styles)
styled_df