In [24]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

all_df = pd.read_csv('TrainDataset2024.csv', index_col=False)
all_df.drop('ID', axis=1, inplace=True)
all_df.head()

Unnamed: 0,pCR (outcome),RelapseFreeSurvival (outcome),Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,1,144.0,41.0,0,0,0,1,3,3,1,...,0.517172,0.375126,3.325332,0.002314,3880771.5,473.464852,0.000768,0.182615,0.030508,0.000758
1,0,142.0,39.0,1,1,0,0,3,3,1,...,0.444391,0.444391,3.032144,0.005612,2372009.744,59.45971,0.004383,0.032012,0.001006,0.003685
2,1,135.0,31.0,0,0,0,1,2,1,1,...,0.534549,0.534549,2.485848,0.006752,1540027.421,33.935384,0.007584,0.024062,0.000529,0.006447
3,0,12.0,35.0,0,0,0,1,3,3,1,...,0.506185,0.506185,2.606255,0.003755,6936740.794,46.859265,0.005424,0.013707,0.000178,0.004543
4,0,109.0,61.0,1,0,0,0,2,1,1,...,0.462282,0.462282,2.809279,0.006521,1265399.054,39.621023,0.006585,0.034148,0.001083,0.005626


In [25]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median", missing_values=999)
all_df[:] = imputer.fit_transform(all_df)

# classification target
clf_y = all_df['pCR (outcome)']
# regression target
rgr_y = all_df['RelapseFreeSurvival (outcome)']

### Outlier Removal

In [42]:
def percentile_based_outlier(data, threshold=95):
    diff = (100 - threshold) / 2.0
    minval, maxval = np.percentile(data, [diff, 100 - diff])
    return (data < minval) | (data > maxval)

def mad_based_outlier(points, threshold=3.5):
    points = np.array(points)
    if len(points.shape) == 1:
        points = points[:, None]
    median_y = np.median(points)
    median_absolute_deviation_y = np.median([np.abs(y - median_y) for y in points])
    # Small constant added to avoid division by zero
    modified_z_scores = [0.6745 * (y - median_y) / (median_absolute_deviation_y + 1e-6) for y in points]

    return np.abs(modified_z_scores) > threshold

def std_div(data, threshold=3):
    std = data.std()
    mean = data.mean()
    isOutlier = []
    for val in data:
        if abs(val - mean)/std > threshold:
            isOutlier.append(True)
        else:
            isOutlier.append(False)
    return isOutlier

def outlierVote(data):
    x = percentile_based_outlier(data)
    y = mad_based_outlier(data)
    z = std_div(data)
    temp = list(zip(x, y, z))
    final = []
    for i in range(len(temp)):
        if temp[i].count(False) >= 2:
            final.append(False)
        else:
            final.append(True)
    return final

def plotOutliers(x):
    fig, axes = plt.subplots(nrows=4)
    for ax, func in zip(axes, [percentile_based_outlier, mad_based_outlier, std_div, outlierVote]):
        sns.distplot(x, ax=ax, rug=True, hist=False)
        outliers = func(x)
        ax.plot(outliers, np.zeros_like(outliers), 'ro', clip_on=False)

    kwargs = dict(y=0.95, x=0.05, ha='left', va='top', size=20)
    axes[0].set_title('Percentile-based Outliers', **kwargs)
    axes[1].set_title('MAD-based Outliers', **kwargs)
    axes[2].set_title('STD-based Outliers', **kwargs)
    axes[3].set_title('Majority vote', **kwargs)
    fig.suptitle('Comparing Outlier Tests with n={}'.format(len(x)), size=20)
    fig = plt.gcf()
    fig.set_size_inches(20, 15)

def removeOutliers(data):
    # Remove outliers from the dataframe
    for column in data.columns:
        outliers = outlierVote(all_df[column])
        # Calculate Non-Outlier Maximum using the outliers list
        non_outlier_max = all_df.loc[~np.array(outliers), column].max()
        # Replace outliers with the maximum non-outlier value
        data.loc[outliers, column] = non_outlier_max

removeOutliers(all_df)
# print(all_df.head())

# Assign features to X
X = all_df.drop(['pCR (outcome)', 'RelapseFreeSurvival (outcome)'], axis=1)

### Feature Selection

In [43]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, chi2
from sklearn.ensemble import RandomForestClassifier
import numpy as np

scaler = StandardScaler()
Xs = scaler.fit_transform(X)

pca = PCA(n_components=2)

# USING PCA for only the MRI data
Xs_pca = pca.fit_transform(Xs[:, 11:])
Xs = np.hstack((Xs[:, :11], Xs_pca))

from sklearn.feature_selection import SelectKBest, f_classif


mandatory_features = ['ER', 'HER2', 'Gene']
mandatory_features_indices = [X.columns.get_loc(feature) for feature in mandatory_features]

#feature selection to pick top k features using ANOVA F-statistic
k = 10 
selector = SelectKBest(score_func=f_classif, k=k)
X_selected = selector.fit_transform(Xs, clf_y) 

selected_indices = selector.get_support(indices=True)
selected_feature_names = [X.columns[i] for i in selected_indices]

#union of mandatory and selected features, so mandatory features are included
final_selected_indices = list(set(selected_indices).union(mandatory_features_indices))
final_selected_feature_names = [X.columns[i] for i in final_selected_indices]

X_final = X.iloc[:, final_selected_indices]






### KNN & LDA KNN 

In [46]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

#Basic K-Nearest Neighbor
def knn_func(X, y):
    # Define the KNN classifier
    knn = KNeighborsClassifier()

    param_grid = {
        'n_neighbors': range(1, 20),
        'weights': ['uniform', 'distance'],  
        'metric': ['euclidean', 'manhattan']  
    }


    grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_final, clf_y)

    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    # print("Best Parameters:", best_params)
    # print("Best Cross-Validation Accuracy:", best_score)

    best_knn = KNeighborsClassifier(**best_params)
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(best_knn, X, y, cv=kfold, scoring='accuracy')

    print("KNN cross validation score with best parameters:", cv_scores)
    print("KNN mean cross validation score:", cv_scores.mean())
    print("KNN standard deviation score:", cv_scores.std())



#Use techniques like Linear Discriminant Analysis (LDA), which reduces dimensions while maximizing class separability. (dimentionality is further reduced)
def lda_knn(X, y):
    n_classes = len(np.unique(y))
    n_components = min(X.shape[1], n_classes - 1)  # Ensure n_components is valid
    lda = LDA(n_components=n_components)
    X_lda = lda.fit_transform(X, y)

    knn_lda = KNeighborsClassifier(n_neighbors=16, metric='manhattan', weights='uniform')
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(knn_lda, X_lda, y, cv=kfold, scoring='accuracy')
    print("LDA cross validation scores:", cv_scores)
    print("LDA mean accuracy:", cv_scores.mean())
    print("LDA standard deviation score:", cv_scores.std())


In [48]:
#uncomment to run k-nearest neigbor
# knn_func(X_final,clf_y)

#uncomment to run LDA KNN 
lda_knn(X_final,clf_y)

LDA cross validation scores: [0.8375 0.8    0.775  0.8375 0.85  ]
LDA mean accuracy: 0.82
LDA standard deviation score: 0.02806243040080455
