#Prediction of the Development of New Coronary Atherosclerotic Plaques with Radiomics



In [49]:
import os
import pandas as pd
import numpy as np
import warnings
from boruta import BorutaPy

warnings.filterwarnings(action='ignore')

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, auc
# from sklearn.metrics import plot_roc_curve
import matplotlib.pyplot as plt
from sklearn.metrics import make_scorer, accuracy_score
from catboost import CatBoostRegressor
import pickle
from colorama import Fore, Back, Style
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from joblib import dump, load
from sklearn.metrics import confusion_matrix
# Shared experiment configuration.
seed = 42          # single random seed reused by the splitter below
nTopFeatures = 15  # presumably the number of top features to keep; unused in the visible cells
nFolds = 10        # number of cross-validation folds

# Shuffled, stratified K-fold splitter; the fixed seed keeps the folds reproducible.
skf = StratifiedKFold(
    n_splits=nFolds,
    shuffle=True,
    random_state=seed,
)
from sklearn import svm
from imblearn.over_sampling import SMOTE

In [10]:
import pandas as pd
import pickle
from boruta import BorutaPy
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
from joblib import dump
from sklearn.model_selection import ParameterGrid

# Reproducibility settings: one seed and the number of cross-validation folds.
seed, nFolds = 42, 10

# Stratified K-fold splitter, shuffled with the fixed seed.
skf = StratifiedKFold(
    n_splits=nFolds,
    shuffle=True,
    random_state=seed,
)


In [2]:
# Boruta feature selection over the reproducible, 'pre_'-prefixed radiomic columns.
important_features = []

# Load the pickled list of reproducible feature names (file opened in binary mode).
# NOTE(review): pickle can execute arbitrary code on load; only read trusted files.
with open('./plaqueDevelopement/reproducible_features.pkl', 'rb') as f:
    reproducible_features = pickle.load(f)

# The training CSV columns are addressed as 'pre_<feature name>'.
features = ['pre_{}'.format(name) for name in reproducible_features]

data = pd.read_csv("paradigm_radiomics_train.csv")
# Keep the columns matching 'pre_original' or 'pre_wavelet', then restrict them
# to the reproducible feature columns built above.
source = data.filter(regex='pre_original|pre_wavelet')[features]
target = data['endpoint_a']

# Shallow random forest used as the base estimator for Boruta.
rf = RandomForestClassifier(
    n_estimators=10,
    max_depth=3,
    random_state=0,
)

# Boruta selector wrapped around the forest, limited to 10 iterations.
boruta = BorutaPy(
    rf,
    n_estimators='auto',
    max_iter=10,
    verbose=0,
    random_state=seed,
)

# Fit the selector on the underlying NumPy arrays.
boruta.fit(source.values, target.values)

# Pair every candidate column with its Boruta rank and save the table to Excel.
feature_ranking = pd.DataFrame({'feature': source.columns, 'rank': boruta.ranking_})
feature_ranking.to_excel('./plaqueDevelopement/bourta_features_reproduce_analysis.xlsx', index=False)

# Only the rank-1 features are carried forward to the next step.
is_top_rank = feature_ranking['rank'] == 1
boruta_features = feature_ranking.loc[is_top_rank, 'feature'].tolist()


# [notebook output residue, presumably the echoed line of a pandas read_csv warning] data = pd.read_csv("paradigm_radiomics_train.csv")


In [3]:
# Fit an XGBoost classifier on the Boruta-selected columns to score each feature.
model = XGBClassifier(
    objective='binary:logistic',
    max_depth=3,
    learning_rate=0.01,
    n_estimators=200,
    random_state=seed,
)
model.fit(source[boruta_features], target)

# Per-feature importance scores exposed by the fitted model.
importances = model.feature_importances_

# Tabulate feature/importance pairs, highest importance first.
feature_importances = pd.DataFrame(
    {'Feature': source[boruta_features].columns, 'Importance': importances}
).sort_values('Importance', ascending=False)

# Write the ranked table to Excel; a later cell reads this file back.
feature_importances.to_excel(
    './plaqueDevelopement/bourta_xgboost_reproduce_analysis.xlsx',
    index=False,
)

In [4]:
# Reload the XGBoost importance table from the Excel file.
df = pd.read_excel(
    './plaqueDevelopement/bourta_xgboost_reproduce_analysis.xlsx',
    names=['Feature', 'Importance'],
    index_col=None,
)
# Keep only the rows whose importance is non-zero.
has_importance = df['Importance'].ne(0)
target_features = df.loc[has_importance]

In [5]:
# Collapse the filtered importance table to a plain list of feature names.
# NOTE: this rebinds target_features from a DataFrame to a list, so the
# statement cannot be run twice in a row.
target_features = target_features['Feature'].tolist()

In [6]:
# Training set: the selected feature columns and the 'endpoint_a' labels.
train_data = pd.read_csv("paradigm_radiomics_train.csv")
X_train, y_train = train_data[target_features], train_data['endpoint_a']

# Test set: read from its own CSV (no splitting happens here), same columns.
test_data = pd.read_csv("paradigm_radiomics_test.csv")
X_test, y_test = test_data[target_features], test_data['endpoint_a']

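# [notebook output residue, presumably echoed lines of pandas read_csv warnings] train_data = pd.read_csv("paradigm_radiomics_train.csv")
# [notebook output residue, presumably echoed lines of pandas read_csv warnings] test_data = pd.read_csv("paradigm_radiomics_test.csv")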


In [7]:
# Hyper-parameter values to try for the XGBoost classifier.
xgb_param_grid = {
    'max_depth': [3, 4, 5, 8, 10, 12, 15],
    'learning_rate': [0.1, 0.5, 0.01, 0.05],
    'n_estimators': [2, 5, 7, 10],
}

# List of (estimator, parameter grid) pairs consumed by the search loop.
classifiers = [(XGBClassifier(random_state=seed), xgb_param_grid)]

In [8]:
# Resample the training set with SMOTE; the test set is left untouched.
# Use the shared `seed` instead of a second hard-coded 42, so that changing the
# seed in one place changes every random step consistently.
smote = SMOTE(random_state=seed)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [11]:

# Exhaustive grid search: for every parameter combination, refit the classifier
# on the SMOTE-resampled training data and keep the model with the highest
# ROC AUC, saving it to disk whenever the best score improves.
#
# NOTE(review): the score is computed on the test set, so the test set is being
# used for model selection and the resulting AUC is optimistically biased.
# Consider selecting with cross-validation (e.g. the `skf` splitter) on the
# training data and scoring the test set only once at the end.
best_score = 0
best_params = None  # parameter combination that produced best_score
for clf, params in classifiers:
    param_grid = ParameterGrid(params)
    for param in param_grid:
        clf.set_params(**param)
        clf.fit(X_train_resampled, y_train_resampled)
        # ROC AUC needs continuous scores: the hard labels from predict()
        # collapse the ROC curve to a single operating point. Column 1 is the
        # positive-class probability (assumes a binary endpoint).
        positive_proba = clf.predict_proba(X_test)[:, 1]
        score = roc_auc_score(y_test, positive_proba)
        if score > best_score:
            best_score = score
            best_params = param
            # Persist right away: clf is refit in place on the next iteration.
            dump(clf, './plaqueDevelopement/XGBClassifier.joblib')