In [86]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

train_df = pd.read_excel('../TrainDataset2024.xls', sheet_name='Sheet1')
test_df = pd.read_excel('../TestDatasetExample.xls', sheet_name='Sheet1')
test_df.head()

Unnamed: 0,ID,Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,LNStatus,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,TRG002728,56.881588,0,0,0,1,3,3,999,0,...,0.194591,0.194591,2.846439,0.001281,4168474.0,131.044541,0.002335,0.109755,0.013383,0.002051
1,TRG002649,60.0,0,0,1,0,2,1,1,0,...,0.309999,0.309996,2.975317,0.007253,173658.5,23.967478,0.011285,0.05589,0.003163,0.009553
2,TRG002628,58.234086,0,0,0,1,3,3,1,1,...,0.328377,0.328377,3.785966,0.003185,3607821.0,223.279556,0.001334,0.101628,0.010844,0.001194


In [87]:
Y_train = train_df['pCR (outcome)']
X_train = train_df.drop(['ID','pCR (outcome)','RelapseFreeSurvival (outcome)'],axis=1)
X_train.head()

X_test = test_df.drop(['ID'], axis=1)

In [88]:
#outliers replaced with median of the column
import numpy as np
import pandas as pd

def replace_outliers_with_median(data, threshold=2):
    data_cleaned = data.copy()
    
    for col in data.columns:
        if data[col].dtype in [np.float64, np.int64]:
            Q1 = data[col].quantile(0.25)
            Q3 = data[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - threshold * IQR
            upper_bound = Q3 + threshold * IQR
            
            median_value = data[col].median()
            data_cleaned[col] = data_cleaned[col].apply(
                lambda x: median_value if x < lower_bound or x > upper_bound else x
            )
    
    return data_cleaned

X_train_cleaned = replace_outliers_with_median(X_train)
X_test_cleaned = replace_outliers_with_median(X_test)

# Handle missing values
X_train_cleaned = X_train_cleaned.apply(lambda col: col.where(col != 999, col.median()), axis=0)
X_test_cleaned = X_test_cleaned.apply(lambda col: col.where(col != 999, col.median()), axis=0)
mandatory_features = ['ER', 'HER2', 'Gene']


In [89]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(random_state=1)
rf_classifier.fit(X_train_cleaned, Y_train)
feature_importances = rf_classifier.feature_importances_

importance_threshold = 0.01 
selected_features = X_train_cleaned.columns[feature_importances > importance_threshold]

selected_features = list(set(selected_features) | set(mandatory_features))

X_train_selected = X_train_cleaned[selected_features]
X_test_selected = X_test_cleaned[selected_features]


In [90]:
#normalization and PCA
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

pca = PCA(n_components=0.85)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [91]:
# Decision Tree Classifier with K-fold cross-validation
dt_classifier = DecisionTreeClassifier(random_state=1)
k_fold_scores = cross_val_score(dt_classifier, X_train_pca, Y_train, cv=2, scoring='accuracy')

print(f"K-fold cross-validation scores: {k_fold_scores}")
print(f"Mean K-fold accuracy: {k_fold_scores.mean():.4f}")

# Train on the full training set and make predictions on the test dataset
dt_classifier.fit(X_train_pca, Y_train)
test_predictions = dt_classifier.predict(X_test_pca)


K-fold cross-validation scores: [0.73  0.685]
Mean K-fold accuracy: 0.7075
