In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Install xlrd package
%pip install xlrd

# Read the training spreadsheet and discard the 'ID' column in one expression,
# so `all_df` is only ever bound to the frame without that column.
all_df = pd.read_excel('../TrainDataset2024.xls', index_col=False).drop('ID', axis=1)
all_df.head()

### Data Imputer

In [None]:
from sklearn.impute import SimpleImputer

# Replace missing values with median of the column.
# SimpleImputer is told that the value 999 marks a missing entry.
imputer = SimpleImputer(strategy="median", missing_values=999)
# fit_transform receives the whole frame, so every column -- including the two
# outcome columns read below -- is imputed; the slice assignment writes the
# result back into all_df.
# NOTE(review): imputing the targets, and fitting on all rows before any
# train/test split, may be unintended -- confirm.
all_df[:] = imputer.fit_transform(all_df)

# classification target
clf_y = all_df['pCR (outcome)']
# regression target
rgr_y = all_df['RelapseFreeSurvival (outcome)']

### Outlier Removal

In [None]:
# Outlier removal approach by:
# Thanaki, Jalaj. Machine Learning Solutions : Expert Techniques to Tackle Complex Machine Learning Problems Using Python, Packt Publishing, Limited, 2018. 
# ProQuest Ebook Central, Available at: http://ebookcentral.proquest.com/lib/nottingham/detail.action?docID=5379696.

# Outlier detection using the following methods:
# 1. Percentile based outlier detection
# 2. MAD (median absolute deviation) based outlier detection
# 3. Standard deviation based outlier detection

""" 
    Flag the data points that fall outside the central percentile range (2.5 to 97.5 by default).
"""
def percentile_based_outlier(data, threshold=95):
    """Flag values lying outside the central ``threshold`` percent of ``data``.

    Args:
        data: Array-like of numbers that supports element-wise comparison
            (for example a NumPy array or a pandas Series).
        threshold: Width, in percent, of the central band treated as normal.
            The default of 95 keeps the 2.5th to 97.5th percentile range.

    Returns:
        Boolean mask, same shape as ``data``; True marks a value strictly
        below the lower percentile or strictly above the upper one.
    """
    # Split the excluded percentage evenly between the two tails.
    tail = (100 - threshold) / 2.0
    lower, upper = np.percentile(data, [tail, 100 - tail])
    below = data < lower
    above = data > upper
    return below | above

"""
    Flag the data points whose modified Z-score (based on the median absolute deviation) exceeds a threshold (3.5 by default).
"""
def mad_based_outlier(points, threshold=3.5):
    """Flag values whose modified Z-score exceeds ``threshold``.

    The modified Z-score is ``0.6745 * (x - median) / MAD``, where MAD is the
    median absolute deviation from the median. The computation is fully
    vectorised instead of looping over the elements in Python.

    Args:
        points: Array-like of numbers. One-dimensional input is treated as a
            single column and reshaped to ``(n, 1)``.
        threshold: Absolute modified Z-score above which a value is flagged.

    Returns:
        Boolean NumPy array; for one-dimensional input its shape is ``(n, 1)``.
        True marks an outlier.
    """
    values = np.array(points)
    if values.ndim == 1:
        values = values[:, None]

    center = np.median(values)
    deviations = values - center
    mad = np.median(np.abs(deviations))

    # The small constant keeps the division finite when the MAD is zero
    # (e.g. when more than half of the values are identical).
    modified_z_scores = 0.6745 * deviations / (mad + 1e-6)
    return np.abs(modified_z_scores) > threshold

"""
    Flag the data points that lie more than a threshold number of standard deviations (3 by default) from the mean.
"""
def std_div(data, threshold=3):
    """Flag values lying more than ``threshold`` standard deviations from the mean.

    Args:
        data: One-dimensional container exposing ``.std()`` and ``.mean()``
            (NumPy array or pandas Series). The standard deviation is
            whatever ``data.std()`` returns, so a pandas Series uses the
            sample estimate and a NumPy array the population estimate.
        threshold: Number of standard deviations beyond which a value is
            flagged.

    Returns:
        List of Python bools, one per element of ``data``; True marks an
        outlier.
    """
    if len(data) == 0:
        return []

    std = data.std()
    mean = data.mean()

    # A constant column (std == 0) or an undefined std (NaN) has no spread to
    # measure against; report no outliers instead of dividing by zero.
    if not std > 0:
        return [False] * len(data)

    z_scores = np.abs(np.asarray(data) - mean) / std
    # tolist() keeps the original return type: a list of plain Python bools.
    return (z_scores > threshold).tolist()

"""
    Perform an outlier voting system to determine if a data point is an outlier. 
    If two of the three methods agree that a data point is an outlier, then it is removed.
"""
def outlierVote(data):
    """Combine the three outlier detectors with a majority vote.

    Args:
        data: One-dimensional numeric column passed unchanged to
            ``percentile_based_outlier``, ``mad_based_outlier`` and
            ``std_div``.

    Returns:
        List of bools, one per element; True when at least two of the three
        detectors flagged that element.
    """
    ballots = zip(
        percentile_based_outlier(data),
        mad_based_outlier(data),
        std_div(data),
    )
    # A point counts as an outlier unless two or more detectors voted False.
    return [ballot.count(False) < 2 for ballot in ballots]

def removeOutliers(data):
    """Clip voted outliers in every column of ``data`` to the inlier range.

    For each column, values flagged by ``outlierVote`` are replaced in place:
    a flagged value above the largest non-outlier value becomes that maximum,
    any other flagged value becomes the smallest non-outlier value.

    The outliers are computed from ``data`` itself (not from a module-level
    frame), and rows are addressed with a boolean mask, so the function works
    for any DataFrame regardless of its index labels.

    Args:
        data: pandas DataFrame of numeric columns. Modified in place.

    Returns:
        None.
    """
    for column in data.columns:
        values = data[column]
        is_outlier = np.asarray(outlierVote(values), dtype=bool)

        # Nothing to clip, or no inliers left to define the clipping range:
        # leave the column untouched rather than writing NaN into it.
        if not is_outlier.any() or is_outlier.all():
            continue

        inliers = values[~is_outlier]
        non_outlier_max = inliers.max()
        non_outlier_min = inliers.min()

        # Replace outliers with the maximum or minimum non-outlier value.
        flagged = values[is_outlier]
        data.loc[is_outlier, column] = np.where(
            flagged > non_outlier_max, non_outlier_max, non_outlier_min
        )

# Clip outliers in place on the full frame, then keep everything except the
# two outcome columns as the feature frame X.
removeOutliers(all_df)
outcome_columns = ['pCR (outcome)', 'RelapseFreeSurvival (outcome)']
X = all_df.drop(columns=outcome_columns)

### Data Normalisation/Standardisation

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise features: learn each column's mean and variance from X, then
# rescale X to zero mean and unit variance.
scaler = StandardScaler()
scaler.fit(X)
Xs = scaler.transform(X)

### Feature Selection and Dimensionality Reduction

In [None]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

### Feature Selection and Dimensionality Reduction strategy:
#
# 1. Select the mandatory features ER, HER2 and Gene
# 2. Select the MRI features and apply LDA to them
# 3. Select the top 3 features of the remaining features using Random Forest

# Work from a freshly transformed copy of the standardised features rather
# than reading `Xs`: this cell rebinds `Xs` at the end, so reading it here
# would make a second run of the cell operate on the already-reduced matrix.
Xs_scaled = scaler.transform(X)

# Select the mandatory features
mandatory_features = ['ER', 'HER2', 'Gene']
# Column positions in X. NOTE(review): assumed to correspond to the names
# listed above -- confirm against X.columns.
mandatory_features_indices = [1, 3, 10]
features_required = Xs_scaled[:, mandatory_features_indices]

### Dimensionality Reduction

# Select the MRI features (every column from position 11 onwards)
mri_indices = list(range(11, Xs_scaled.shape[1]))
mri = Xs_scaled[:, mri_indices]

# Apply LDA to MRI features, projecting them onto a single component.
# NOTE(review): LDA and the random forest below are fit with the labels of
# every row, including rows that train_test_split later holds out as the test
# set, so the reported test scores may be optimistic -- confirm this is intended.
lda = LinearDiscriminantAnalysis(n_components=1)
Xs_lda = lda.fit_transform(mri, clf_y)

### Feature Selection

# Remove the MRI features from the feature set; the remaining columns keep
# their original positions because the MRI block is at the end.
non_mri_features = np.delete(Xs_scaled, mri_indices, axis=1)

# Feature selection using Random Forest
rnd_clf = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
rnd_clf.fit(non_mri_features, clf_y)

# Get the feature importances
importances = rnd_clf.feature_importances_
# Get the indices of the features sorted by importance (most important first)
selected_indices = np.argsort(importances)[::-1]

# Plot the feature importances; tick labels are the column positions
n_non_mri = non_mri_features.shape[1]
plt.title('Feature Importances')
plt.bar(range(n_non_mri), importances[selected_indices], align='center')
plt.xticks(range(n_non_mri), selected_indices)
plt.xlim([-1, n_non_mri])
plt.xlabel('Feature index')
plt.ylabel('Importance')
plt.show()

# Select the top 3 features that are not mandatory features [ER, HER2, Gene]
top_features_indices = [i for i in selected_indices if i not in mandatory_features_indices][:3]
top_features = non_mri_features[:, top_features_indices]

# Combine selected features and LDA transformed feature
Xs = np.hstack((features_required, top_features, Xs_lda))

selected_features_indices = sorted(mandatory_features_indices + [int(i) for i in top_features_indices])
print('Selected Features:', selected_features_indices)


### Split dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set, stratified on the class label so
# both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    Xs,
    clf_y,
    test_size=0.2,
    random_state=42,
    stratify=clf_y,
)

### SVM for Classification

In [None]:
# `mean` and `std` come from NumPy's public namespace; the private
# `numpy._core` package only exists in NumPy >= 2.0.
from numpy import mean, std
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Nested cross-validation: the inner loop tunes the hyperparameters, the outer
# loop scores the whole tuning procedure.
# TODO: the source this code was adapted from is not recorded here.

# configure the cross-validation procedure for the inner loop
cv_inner = KFold(n_splits=5, shuffle=True, random_state=42)

# define the classifier
classifier = SVC(random_state=42)

# define search space of hyperparameters
space = {
    'C': [0.1, 1, 10],
    'gamma': [0.01, 0.1],
    'kernel': ["rbf", "linear", "poly"],
}

# define GridSearch to search for the best hyperparameters
search = GridSearchCV(classifier, space, scoring='accuracy', n_jobs=1, cv=cv_inner, refit=True)

# configure the cross-validation procedure for the outer loop
cv_outer = KFold(n_splits=5, shuffle=True, random_state=2)

# execute the nested cross-validation
scores = cross_val_score(search, X_train, y_train, scoring='accuracy', cv=cv_outer, n_jobs=-1)

# fit the search on the whole training split to get the best model
search.fit(X_train, y_train)
model = search.best_estimator_

# report performance and best model configuration
print('Nested CV training Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
print('Best Model:', model)



### Testing model

In [None]:
from sklearn.metrics import classification_report

# Score the tuned SVM (`model`) on the held-out test split.
y_pred = model.predict(X_test)
test_accuracy = model.score(X_test, y_test)
# Per-class precision, recall, f1-score and support, followed by accuracy.
print(classification_report(y_test, y_pred))
print('Test Accuracy: %.3f' % test_accuracy)

In [None]:
from numpy import mean, std
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier

# Inner folds are used by the grid search to choose hyperparameters; outer
# folds score the whole tuning procedure (nested cross-validation).
cv_inner = KFold(n_splits=5, shuffle=True, random_state=42)
cv_outer = KFold(n_splits=5, shuffle=True, random_state=42)

# Base estimator and the hyperparameter grid to explore.
model = RandomForestClassifier(random_state=42)
space = {
    'n_estimators': [10, 100, 500],
    'max_features': [2, 4, 6],
}
search = GridSearchCV(
    model,
    space,
    scoring='accuracy',
    n_jobs=1,
    cv=cv_inner,
    refit=True,
)

# Nested cross-validation estimate of the tuned model's accuracy.
scores = cross_val_score(
    search, X_train, y_train, scoring='accuracy', cv=cv_outer, n_jobs=-1
)

# Refit the search on the full training split and keep the winning forest.
search.fit(X_train, y_train)
rf_model = search.best_estimator_

# Report mean and standard deviation of the outer-fold accuracies.
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
print('Best Model:', rf_model)

In [None]:
from sklearn.metrics import classification_report

# Score the tuned random forest on the held-out test split.
y_pred = rf_model.predict(X_test)
test_accuracy = rf_model.score(X_test, y_test)
# Per-class precision, recall, f1-score and support, followed by accuracy.
print(classification_report(y_test, y_pred))
print('Test Accuracy: %.3f' % test_accuracy)

In [None]:
from sklearn.linear_model import LogisticRegression

# Inner folds are used by the grid search to choose hyperparameters; outer
# folds score the whole tuning procedure (nested cross-validation).
cv_inner = KFold(n_splits=5, shuffle=True, random_state=42)
cv_outer = KFold(n_splits=5, shuffle=True, random_state=42)

# Base estimator and the regularisation strengths to explore.
model = LogisticRegression(random_state=42)
space = {'C': [0.1, 1, 10, 100]}
search = GridSearchCV(
    model,
    space,
    scoring='accuracy',
    n_jobs=1,
    cv=cv_inner,
    refit=True,
)

# Nested cross-validation estimate of the tuned model's accuracy.
scores = cross_val_score(
    search, X_train, y_train, scoring='accuracy', cv=cv_outer, n_jobs=-1
)

# Refit the search on the full training split and keep the winning model.
search.fit(X_train, y_train)
logistic_model = search.best_estimator_

# Report mean and standard deviation of the outer-fold accuracies.
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
print('Best Model:', logistic_model)

In [None]:
from sklearn.metrics import classification_report

# Score the tuned logistic regression on the held-out test split.
y_pred = logistic_model.predict(X_test)
test_accuracy = logistic_model.score(X_test, y_test)
# Per-class precision, recall, f1-score and support, followed by accuracy.
print(classification_report(y_test, y_pred))
print('Test Accuracy: %.3f' % test_accuracy)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Inner folds are used by the grid search to choose hyperparameters; outer
# folds score the whole tuning procedure (nested cross-validation).
cv_inner = KFold(n_splits=5, shuffle=True, random_state=42)
cv_outer = KFold(n_splits=5, shuffle=True, random_state=42)

# Base estimator and the hyperparameter grid to explore.
# NOTE(review): 'minkowski' with the estimator's default power parameter looks
# equivalent to 'euclidean', so that grid entry is probably redundant -- confirm.
model = KNeighborsClassifier(weights='uniform')
space = {
    'n_neighbors': [1, 3, 5, 7, 10, 15, 20],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
}
search = GridSearchCV(
    model,
    space,
    scoring='accuracy',
    n_jobs=1,
    cv=cv_inner,
    refit=True,
)

# Nested cross-validation estimate of the tuned model's accuracy.
scores = cross_val_score(
    search, X_train, y_train, scoring='accuracy', cv=cv_outer, n_jobs=-1
)

# Refit the search on the full training split and keep the winning model.
search.fit(X_train, y_train)
knn_model = search.best_estimator_

# Report mean and standard deviation of the outer-fold accuracies.
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
print('Best Model:', knn_model)

In [None]:
from sklearn.metrics import classification_report

# Score the tuned k-nearest-neighbours model on the held-out test split.
y_pred = knn_model.predict(X_test)
test_accuracy = knn_model.score(X_test, y_test)
# Per-class precision, recall, f1-score and support, followed by accuracy.
print(classification_report(y_test, y_pred))
print('Test Accuracy: %.3f' % test_accuracy)

In [15]:
from sklearn.neural_network import MLPClassifier

# Inner folds are used by the grid search to choose hyperparameters; outer
# folds score the whole tuning procedure (nested cross-validation).
cv_inner = KFold(n_splits=5, shuffle=True, random_state=42)
cv_outer = KFold(n_splits=5, shuffle=True, random_state=42)

# Base estimator and the hyperparameter grid to explore.
model = MLPClassifier(random_state=42, max_iter=1000)
space = {
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'hidden_layer_sizes': [(10,), (50,), (100,), (200,)],
    'solver': ['lbfgs', 'sgd', 'adam'],
}
search = GridSearchCV(
    model,
    space,
    scoring='accuracy',
    n_jobs=1,
    cv=cv_inner,
    refit=True,
)

# Nested cross-validation estimate of the tuned model's accuracy.
scores = cross_val_score(
    search, X_train, y_train, scoring='accuracy', cv=cv_outer, n_jobs=-1
)

# Refit the search on the full training split and keep the winning network.
search.fit(X_train, y_train)
mlp_model = search.best_estimator_

# Report mean and standard deviation of the outer-fold accuracies.
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
print('Best Model:', mlp_model)



Accuracy: 0.863 (0.049)
Best Model: MLPClassifier(activation='logistic', hidden_layer_sizes=(10,), max_iter=1000,
              random_state=42)




In [None]:
from sklearn.metrics import classification_report

# Score the tuned multi-layer perceptron on the held-out test split.
y_pred = mlp_model.predict(X_test)
test_accuracy = mlp_model.score(X_test, y_test)
# Per-class precision, recall, f1-score and support, followed by accuracy.
print(classification_report(y_test, y_pred))
print('Test Accuracy: %.3f' % test_accuracy)