In [89]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt


# Load the training table from CSV and remove the 'ID' column in one step
all_df = pd.read_csv('../TrainDataset2024.csv', index_col=False).drop(columns='ID')
all_df.head()

Unnamed: 0,pCR (outcome),RelapseFreeSurvival (outcome),Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,1,144.0,41.0,0,0,0,1,3,3,1,...,0.517172,0.375126,3.325332,0.002314,3880771.5,473.464852,0.000768,0.182615,0.030508,0.000758
1,0,142.0,39.0,1,1,0,0,3,3,1,...,0.444391,0.444391,3.032144,0.005612,2372009.744,59.45971,0.004383,0.032012,0.001006,0.003685
2,1,135.0,31.0,0,0,0,1,2,1,1,...,0.534549,0.534549,2.485848,0.006752,1540027.421,33.935384,0.007584,0.024062,0.000529,0.006447
3,0,12.0,35.0,0,0,0,1,3,3,1,...,0.506185,0.506185,2.606255,0.003755,6936740.794,46.859265,0.005424,0.013707,0.000178,0.004543
4,0,109.0,61.0,1,0,0,0,2,1,1,...,0.462282,0.462282,2.809279,0.006521,1265399.054,39.621023,0.006585,0.034148,0.001083,0.005626


In [90]:
from sklearn.impute import SimpleImputer

# Replace every entry equal to the sentinel 999 with the median of its column.
# The imputer is fitted on the whole frame, so the two outcome columns below
# are imputed together with the features.
# NOTE(review): imputing the outcome columns fabricates target values, and
# fitting on all rows happens before any train/test split -- confirm intended.
imputer = SimpleImputer(strategy="median", missing_values=999)
# Slice-assignment writes the imputed values back into the existing frame
all_df[:] = imputer.fit_transform(all_df)

# classification target
clf_y = all_df['pCR (outcome)']
# regression target
rgr_y = all_df['RelapseFreeSurvival (outcome)']

### Outlier Removal

In [None]:
from utils import outlierVote

# Feature matrix: every column of all_df except the two outcome columns
X = all_df.drop(columns=['pCR (outcome)', 'RelapseFreeSurvival (outcome)'])

In [None]:
# Data segmentation: purely positional split of the feature columns.
# The first 11 columns form X_non_mri, everything after them forms X_mri.
X_non_mri = X.iloc[:, :11]
X_mri = X.iloc[:, 11:]

# Column positions (within X) of the features listed as mandatory
mandatory_features = ['ER', 'HER2', 'Gene']
mandatory_features_indices = list(map(X.columns.get_loc, mandatory_features))

### Feature Selection

In [None]:
# Manifold Learning
from sklearn.manifold import TSNE, Isomap
import matplotlib.pyplot as plt 

# Embed the MRI columns into three t-SNE dimensions (fixed random_state)
tsne = TSNE(n_components=3, random_state=42)
X_mri_tsne = tsne.fit_transform(X_mri)

# Join the non-MRI columns with the t-SNE embedding, column-wise
X_tsne = np.column_stack((X_non_mri, X_mri_tsne))
X_tsne.shape


(400, 14)

In [None]:
# MANIFOLD training
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Hold out 20% of the rows (fixed random_state) to evaluate the t-SNE features
Xs_manifold_train, Xs_manifold_test, y_manifold_train, y_manifold_test = train_test_split(X_tsne, rgr_y, test_size=0.2, random_state=42)

# RBF-kernel support vector regressor on the regression target
# NOTE(review): the inputs are not standardised before the RBF kernel is
# applied -- confirm this is intended.
svr = SVR(C=6, gamma=0.1, kernel='rbf')
svr.fit(Xs_manifold_train, y_manifold_train)

y_pred = svr.predict(Xs_manifold_test)

# Both metrics come from sklearn.metrics (already imported in this cell)
# rather than mixing a hand-rolled MAE with a library MSE.
mae = mean_absolute_error(y_manifold_test, y_pred)
mse = mean_squared_error(y_manifold_test, y_pred)

print('Test MAE:', mae)
print('Test MSE:', mse)


Test MAE: 21.45403876425396
Test MSE: 808.7440814650305


In [None]:
# check MRI data normality
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

def check_normality(data, alpha=0.05):
    """
    Run normality diagnostics on every column of ``data``.

    For each column the missing values are dropped, then the Shapiro-Wilk and
    D'Agostino-Pearson tests are run and the sample skewness and kurtosis
    (scipy defaults) are computed.

    A column that cannot be tested meaningfully -- fewer observations than the
    test requires, or a single repeated value -- gets NaN for that test's
    statistic and p-value and ``is_normal = False``, instead of raising or
    being reported as normal.

    Parameters:
    data: pandas DataFrame or numpy array (an array is wrapped in a DataFrame,
          so its columns are keyed by integer position)
    alpha: significance level; a test reports ``is_normal`` when p > alpha

    Returns:
    dict: maps each column to
          {'shapiro_test': {'statistic', 'p_value', 'is_normal'},
           'agostino_test': {'statistic', 'p_value', 'is_normal'},
           'skewness', 'kurtosis'}; ``is_normal`` is a plain Python bool
    """
    shapiro_min_obs = 3   # scipy.stats.shapiro needs at least 3 observations
    agostino_min_obs = 8  # scipy.stats.normaltest needs at least 8 observations
    nan = float('nan')

    # Convert to DataFrame if numpy array
    if isinstance(data, np.ndarray):
        data = pd.DataFrame(data)

    results = {}

    for column in data.columns:
        feature_data = data[column].dropna()
        n_obs = len(feature_data)
        # A constant column has no spread: the tests are degenerate on it
        has_spread = n_obs > 0 and feature_data.nunique() > 1

        # 1. Shapiro-Wilk test
        if has_spread and n_obs >= shapiro_min_obs:
            shapiro_stat, shapiro_p = stats.shapiro(feature_data)
        else:
            shapiro_stat = shapiro_p = nan

        # 2. D'Agostino-Pearson test
        if has_spread and n_obs >= agostino_min_obs:
            agostino_stat, agostino_p = stats.normaltest(feature_data)
        else:
            agostino_stat = agostino_p = nan

        # 3. Basic statistics
        if n_obs > 0:
            skewness = stats.skew(feature_data)
            kurtosis = stats.kurtosis(feature_data)
        else:
            skewness = kurtosis = nan

        # Store results; a NaN p-value compares False, so it is never "normal"
        results[column] = {
            'shapiro_test': {
                'statistic': shapiro_stat,
                'p_value': shapiro_p,
                'is_normal': bool(shapiro_p > alpha)
            },
            'agostino_test': {
                'statistic': agostino_stat,
                'p_value': agostino_p,
                'is_normal': bool(agostino_p > alpha)
            },
            'skewness': skewness,
            'kurtosis': kurtosis
        }

    return results

def print_normality_summary(results):
    """
    Write a plain-text report of the normality diagnostics to stdout.

    Each feature gets its own section listing the p-value and verdict of both
    tests, followed by skewness and kurtosis, all numbers with four decimals.

    Parameters:
    results: dict in the layout returned by check_normality
    """
    rule = "-" * 80
    # (section heading, key inside the per-feature dict), in print order
    test_sections = (
        ("  Shapiro-Wilk Test:", 'shapiro_test'),
        ("  D'Agostino-Pearson Test:", 'agostino_test'),
    )

    print("\nNormality Test Summary:")
    print(rule)

    for feature_name in results:
        feature_report = results[feature_name]
        print(f"\nFeature: {feature_name}")

        for heading, key in test_sections:
            outcome = feature_report[key]
            print(heading)
            print(f"    - p-value: {outcome['p_value']:.4f}")
            print(f"    - Normal: {outcome['is_normal']}")

        print("  Distribution Characteristics:")
        print(f"    - Skewness: {feature_report['skewness']:.4f}")
        print(f"    - Kurtosis: {feature_report['kurtosis']:.4f}")
        print(rule)

def plot_normality_checks(data, figsize=(15, 5)):
    """
    Create visual checks for normality: a histogram with KDE and a box plot.

    One two-panel figure is drawn per column, after dropping that column's
    missing values. Each figure is closed once shown so that looping over
    many columns does not accumulate open figures.

    Parameters:
    data: pandas DataFrame or numpy array (an array is wrapped in a DataFrame)
    figsize: tuple for figure size
    """
    if isinstance(data, np.ndarray):
        data = pd.DataFrame(data)

    for column in data.columns:
        feature_data = data[column].dropna()

        fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=figsize)
        fig.suptitle(f'Normality Checks for {column}')

        # 1. Histogram with KDE
        sns.histplot(feature_data, kde=True, ax=hist_ax)
        hist_ax.set_title('Histogram with KDE')

        # 2. Box plot
        sns.boxplot(y=feature_data, ax=box_ax)
        box_ax.set_title('Box Plot')

        fig.tight_layout()
        plt.show()
        # Release the figure; otherwise every column leaves one open figure
        plt.close(fig)

In [None]:
from pprint import pprint

# MRI-derived features: every column of X from position 11 onwards
mri_data = X.iloc[:, 11:]

normality_results = check_normality(mri_data)

# Print the summary for the first five features only, to keep the output short
cols = list(normality_results.keys())
printRes = {col: normality_results[col] for col in cols[:5]}

print_normality_summary(printRes)



Normality Test Summary:
--------------------------------------------------------------------------------

Feature: original_shape_Elongation
  Shapiro-Wilk Test:
    - p-value: 0.0000
    - Normal: False
  D'Agostino-Pearson Test:
    - p-value: 0.0000
    - Normal: False
  Distribution Characteristics:
    - Skewness: -0.6211
    - Kurtosis: -0.2687
--------------------------------------------------------------------------------

Feature: original_shape_Flatness
  Shapiro-Wilk Test:
    - p-value: 0.0000
    - Normal: False
  D'Agostino-Pearson Test:
    - p-value: 0.0000
    - Normal: False
  Distribution Characteristics:
    - Skewness: -0.1681
    - Kurtosis: -0.8080
--------------------------------------------------------------------------------

Feature: original_shape_LeastAxisLength
  Shapiro-Wilk Test:
    - p-value: 0.0000
    - Normal: False
  D'Agostino-Pearson Test:
    - p-value: 0.0000
    - Normal: False
  Distribution Characteristics:
    - Skewness: 0.9035
    - Kurt

In [None]:
'''
Find MRI columns that follow a normal distribution 
'''

## keep only normal data and run PCA
def get_normal_cols(normality_results):
  """Return the columns that at least one of the two normality tests accepts.

  Parameters:
  normality_results: dict mapping a column to its test results; each value
    carries 'shapiro_test' and 'agostino_test' entries with an 'is_normal' flag

  Returns:
  list: column keys, in the iteration order of normality_results
  """
  return [
    name
    for name, outcome in normality_results.items()
    if outcome['shapiro_test']['is_normal'] or outcome['agostino_test']['is_normal']
  ]
# Names of the MRI columns selected from the normality results computed earlier
normal_cols = get_normal_cols(normality_results)

In [None]:
'''
Run PCA on MRI data
'''

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Restrict the MRI block to the columns selected as normally distributed
normal_mri_data = mri_data[normal_cols]

# Standardise those columns before the projection
scaler = StandardScaler()
normal_mri_data_scaled = scaler.fit_transform(normal_mri_data)

# Project the standardised columns onto a single principal component
pca = PCA(n_components=1)
X_pca = pca.fit_transform(normal_mri_data_scaled)



In [None]:
# t-SNE embedding of the normally distributed MRI columns only
# (re-fits the `tsne` estimator created in an earlier cell)
X_mri_normal = X_mri.loc[:, normal_cols]
X_mri_normal_tsne = tsne.fit_transform(X_mri_normal)

# Column-wise join: non-MRI columns, the PCA component, then the t-SNE embedding
X_pca_df = np.column_stack((X_non_mri, X_pca, X_mri_normal_tsne))
X_pca_df.shape

(400, 15)

In [None]:
 
# Hold out 20% of the rows (fixed random_state) to evaluate the PCA + t-SNE features
(
    Xs_selected_pca_train,
    Xs_selected_pca_test,
    y_selected_pca_train,
    y_selected_pca_test,
) = train_test_split(X_pca_df, rgr_y, test_size=0.2, random_state=42)

# RBF-kernel support vector regressor; fit() returns the fitted estimator
svr = SVR(kernel='rbf', C=6, gamma=0.1).fit(Xs_selected_pca_train, y_selected_pca_train)
y_pred = svr.predict(Xs_selected_pca_test)

# Held-out error metrics
mae = mean_absolute_error(y_selected_pca_test, y_pred)
mse = mean_squared_error(y_selected_pca_test, y_pred)

print('Test MAE:', mae)
print('Test MSE:', mse)

Test MAE: 21.776510264442464
Test MSE: 790.5575022110554


In [None]:
# TODO:
# - Feature selection
#   - try a wrapper method
#
# - Try more things with imputation
#   - try random fill of missing values
#   - try dropping all missing values
#   - try probabilistic imputation


