In [1]:
pip install pandas scikit-learn



In [5]:
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import make_scorer, accuracy_score, mean_absolute_error

# Load the training data
data = pd.read_excel('TrainDataset2024.xls')

# Separate features and targets
X = data.drop(['ID', 'pCR (outcome)', 'RelapseFreeSurvival (outcome)'], axis=1)
y_pCR = data['pCR (outcome)']
y_RFS = data['RelapseFreeSurvival (outcome)']

# Handle missing data using Simple Imputer
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Normalize the numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Define important clinical features
important_features = ['ER', 'HER2', 'Gene']
clinical_features = X.columns.tolist()
X_combined = pd.DataFrame(X_scaled, columns=clinical_features)

# Set up k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define and evaluate MLP for PCR
mlp_pCR = MLPClassifier(random_state=42, max_iter=1000)
cv_scores_pCR = cross_val_score(mlp_pCR, X_combined, y_pCR, cv=kf, scoring='balanced_accuracy')
print(f'Balanced Accuracy for PCR (cross-validated): {cv_scores_pCR.mean()} ± {cv_scores_pCR.std()}')

# Define and evaluate MLP for RFS
mlp_RFS = MLPRegressor(random_state=42, max_iter=1000)
cv_scores_RFS = cross_val_score(mlp_RFS, X_combined, y_RFS, cv=kf, scoring=make_scorer(mean_absolute_error, greater_is_better=False))
print(f'Mean Absolute Error for RFS (cross-validated): {-cv_scores_RFS.mean()} ± {-cv_scores_RFS.std()}')

# Final Testing Code
# Load test dataset
test_data = pd.read_excel('TestDatasetExample.xls')

# Handle missing data in the test dataset
test_imputed = imputer.transform(test_data.drop('ID', axis=1))
test_scaled = scaler.transform(test_imputed)

# Prepare test data for predictions
test_combined = pd.DataFrame(test_scaled, columns=clinical_features)

# Fit the models on the entire dataset
mlp_pCR.fit(X_combined, y_pCR)
mlp_RFS.fit(X_combined, y_RFS)

# Make predictions on the test set
test_predictions_pCR = mlp_pCR.predict(test_combined)
test_predictions_RFS = mlp_RFS.predict(test_combined)

# Save predictions to CSV
pd.DataFrame({
    'ID': test_data['ID'],
    'Predicted pCR': test_predictions_pCR,
    'Predicted RFS': test_predictions_RFS
}).to_csv('Predictions.csv', index=False)



Balanced Accuracy for PCR (cross-validated): 0.39719743469214647 ± 0.03035958958791769




Mean Absolute Error for RFS (cross-validated): 28.03367017793473 ± -6.885706187843169


