<a href="https://colab.research.google.com/github/tyron-raza/Integrative-Prediction-for-Cancer-Patients/blob/master/Integrative_Prediction_for_Cancer_Patients.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub

# Fetch the most recent published version of the Kaggle dataset; `path` is the
# local directory that kagglehub reports for the downloaded files.
path = kagglehub.dataset_download(
    "omenkj/chemotherapy-regimens-based-on-patient-data"
)

print(f"Path to dataset files: {path}")

Using Colab cache for faster access to the 'chemotherapy-regimens-based-on-patient-data' dataset.
Path to dataset files: /kaggle/input/chemotherapy-regimens-based-on-patient-data


In [None]:
import os
import pandas as pd

# Use the dataset directory returned by kagglehub.dataset_download() in the
# download cell above rather than hard-coding the Colab mount point, so the
# notebook also runs where kagglehub stores the files somewhere else.

# List files to confirm
print("Files in dataset folder:", os.listdir(path))

# Load the correct CSV file
csv_file = os.path.join(path, "chemotherapy_patient_data.csv")
df = pd.read_csv(csv_file)

# Preview only the first rows instead of rendering the whole frame.
df.head()


Files in dataset folder: ['chemotherapy_patient_data.csv', 'README.md']


Unnamed: 0,Patient_ID,Age,Sex,BMI,Smoking_Status,Cancer_Type,Genetic_Mutation,Tumor_Stage,Tumor_Size,Metastasis_Status,Chemotherapy_Regimen,Dosage (mg/m²),Cycles_Completed,Nausea_Severity,Neutropenia,Tumor_Response,Overall_Survival_Months
0,P00001,68,Male,31.5,Former,Breast,BRCA1,II,8.8,No,FOLFOX,352.3,5,1,Yes,Stable,88
1,P00002,81,Female,25.8,Former,Lung,KRAS,I,5.4,Yes,CHOP,374.3,6,3,No,Stable,17
2,P00003,58,Male,22.3,Former,Lymphoma,BRCA1,II,3.3,No,ABVD,83.1,5,4,Yes,Partial,59
3,P00004,44,Male,33.6,Never,Lymphoma,EGFR,IV,6.0,No,FOLFOX,58.7,5,3,No,Partial,47
4,P00005,72,Male,23.7,Never,Breast,TP53,III,5.7,No,FOLFOX,429.2,6,2,Yes,Progressive,88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52316,P52317,78,Female,29.3,Current,Breast,KRAS,III,1.9,No,ABVD,304.1,4,3,No,Partial,54
52317,P52318,60,Female,22.3,Current,Breast,KRAS,I,7.5,No,Gemcitabine,222.7,3,5,No,Stable,118
52318,P52319,73,Male,31.3,Never,Lymphoma,BRCA1,I,2.5,No,,247.0,1,1,No,Partial,37
52319,P52320,44,Male,32.6,Never,Leukemia,TP53,I,8.4,No,CHOP,245.4,4,5,No,Progressive,22


In [None]:
import pandas as pd

# Save CSV in the current directory
output_file = "chemotherapy_patient_data_copy.csv"
df.to_csv(output_file, index=False)

print("CSV saved as:", output_file)

# Download file to your computer. The google.colab helper only exists on
# Colab; on any other kernel the copy stays on disk and the cell still runs.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping browser download.")
else:
    files.download(output_file)




CSV saved as: chemotherapy_patient_data_copy.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#Research Objectives ------------->

#Integrative Prediction of Chemotherapy Regimens and Overall Survival in Cancer Patients Using XGBoost and Cox Regression


#1. Based on the patient’s profile (age, sex, BMI, smoking status, cancer type, genetic mutation, tumor stage/size, metastasis), predict the recommended chemotherapy regimen, dosage (as a range), and number of cycles completed using XGBoost.
#2. Rank the predictions based on some metric.

#WORK
# Find more suitable models
# Encode Overall Survival Months.

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import randint, uniform

def _fit_with_early_stopping(preprocessor, estimator, step_name,
                             X_train, y_train, X_val, y_val):
    """Fit a ``preprocessor -> estimator`` pipeline with XGBoost early stopping.

    Pipeline fit parameters are handed to the final step unchanged, so the
    validation frame must be encoded with the same (train-fitted) preprocessor
    before XGBoost can use it as ``eval_set``.

    Args:
        preprocessor: Unfitted transformer used as the 'preprocessor' step.
        estimator: Unfitted XGBoost estimator configured with
            ``early_stopping_rounds``.
        step_name: Name of the final pipeline step ('classifier'/'regressor').
        X_train, y_train: Training features and target.
        X_val, y_val: Validation features and target monitored for stopping.

    Returns:
        The fitted ``Pipeline``.
    """
    model = Pipeline([
        ('preprocessor', preprocessor),
        (step_name, estimator)
    ])

    # Fit on the training rows only, then encode the validation rows. The
    # pipeline refits the same preprocessor on the same rows below, so the
    # encoding of the training matrix and of eval_set stay aligned.
    X_val_encoded = preprocessor.fit(X_train).transform(X_val)

    model.fit(X_train, y_train,
              **{f"{step_name}__eval_set": [(X_val_encoded, y_val)],
                 f"{step_name}__verbose": False})
    return model


def train_models(data):
    """Train the regimen classifier and the dosage/cycles/survival regressors.

    Each model is a ``Pipeline`` with a 'preprocessor' step and an XGBoost
    'classifier' or 'regressor' step. Every pipeline owns its own preprocessor
    and estimator instances, so fitting one model never overwrites another.
    Validation metrics are printed for each model; note that the same 20%
    split is used both for early stopping and for these printed metrics.

    Relies on names defined elsewhere in the notebook: ``patient_features``,
    ``numerical_features``, ``categorical_features``, the ``*_target`` column
    names, and the sklearn/XGBoost classes used below.

    Args:
        data: DataFrame holding the feature and target columns, or None.
            NOTE(review): assumes the regimen column has no missing values
            by the time it gets here - confirm against the caller.

    Returns:
        Tuple ``(regimen_model, dosage_model, cycles_model, survival_model,
        regimen_encoder)``; five ``None`` values when ``data`` is None.
    """
    if data is None:
        return None, None, None, None, None

    print("\nTraining optimized models...")

    # Features including regimen where needed
    dosage_features = patient_features + [chemo_regimen_target]
    cycles_features = patient_features + [chemo_regimen_target]
    survival_features = patient_features + [chemo_regimen_target, dosage_target, cycles_target]

    def make_preprocessor(numeric_cols, categorical_cols):
        """Build a fresh impute+scale / one-hot ColumnTransformer."""
        return ColumnTransformer([
            ('num', Pipeline([('imputer', SimpleImputer(strategy='median')),
                              ('scaler', StandardScaler())]), numeric_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ])

    def make_regressor():
        """Build a fresh XGBRegressor so no two pipelines share a model."""
        return XGBRegressor(
            objective='reg:squarederror',
            n_estimators=2000,
            learning_rate=0.05,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            tree_method="hist",
            # Constructor argument: fit() no longer accepts it in XGBoost >= 2.0.
            early_stopping_rounds=50,
            # Row/column subsampling is random; seed it for reproducible runs.
            random_state=42
        )

    regimen_aware_categoricals = categorical_features + [chemo_regimen_target]

    # ---------------------
    # Regimen classifier
    # ---------------------
    regimen_encoder = LabelEncoder()
    y_regimen = regimen_encoder.fit_transform(data[chemo_regimen_target])
    X_train, X_val, y_train, y_val = train_test_split(
        data[patient_features], y_regimen, test_size=0.2, stratify=y_regimen, random_state=42
    )

    clf = XGBClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        n_estimators=2000,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method="hist",
        early_stopping_rounds=50,
        random_state=42
    )

    regimen_model = _fit_with_early_stopping(
        make_preprocessor(numerical_features, categorical_features),
        clf, 'classifier', X_train, y_train, X_val, y_val
    )

    y_val_pred = regimen_model.predict(X_val)
    print(f"Regimen Accuracy: {accuracy_score(y_val, y_val_pred):.3f}, "
          f"F1-macro: {f1_score(y_val, y_val_pred, average='macro'):.3f}")

    # ---------------------
    # Dosage regressor
    # ---------------------
    y_dosage = data[dosage_target]
    X_train, X_val, y_train, y_val = train_test_split(
        data[dosage_features], y_dosage, test_size=0.2, random_state=42
    )

    dosage_model = _fit_with_early_stopping(
        make_preprocessor(numerical_features, regimen_aware_categoricals),
        make_regressor(), 'regressor', X_train, y_train, X_val, y_val
    )

    y_val_pred = dosage_model.predict(X_val)
    print(f"Dosage R2: {r2_score(y_val, y_val_pred):.3f}, MAE: {mean_absolute_error(y_val, y_val_pred):.2f}")

    # ---------------------
    # Cycles regressor
    # ---------------------
    y_cycles = data[cycles_target]
    X_train, X_val, y_train, y_val = train_test_split(
        data[cycles_features], y_cycles, test_size=0.2, random_state=42
    )

    cycles_model = _fit_with_early_stopping(
        make_preprocessor(numerical_features, regimen_aware_categoricals),
        make_regressor(), 'regressor', X_train, y_train, X_val, y_val
    )

    y_val_pred = cycles_model.predict(X_val)
    print(f"Cycles R2: {r2_score(y_val, y_val_pred):.3f}, MAE: {mean_absolute_error(y_val, y_val_pred):.2f}")

    # ---------------------
    # Survival regressor
    # ---------------------
    y_survival = data[survival_target]
    X_train, X_val, y_train, y_val = train_test_split(
        data[survival_features], y_survival, test_size=0.2, random_state=42
    )

    survival_model = _fit_with_early_stopping(
        make_preprocessor(numerical_features + [dosage_target, cycles_target],
                          regimen_aware_categoricals),
        make_regressor(), 'regressor', X_train, y_train, X_val, y_val
    )

    y_val_pred = survival_model.predict(X_val)
    print(f"Survival R2: {r2_score(y_val, y_val_pred):.3f}, MAE: {mean_absolute_error(y_val, y_val_pred):.2f}")

    return regimen_model, dosage_model, cycles_model, survival_model, regimen_encoder


In [None]:
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    r2_score, mean_absolute_error, mean_squared_error
)
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def _report_regression_split(title, model, X, y, test_size, random_state):
    """Split ``X``/``y``, score ``model`` on both parts and print the metrics.

    Prints train/test R² and MAE plus the test RMSE under a ``=== title ===``
    header.

    Returns:
        Tuple ``(y_test, y_test_pred)`` for follow-up plots.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    print(f"\n=== {title} ===")
    print(f"Train R²: {r2_score(y_train, y_train_pred):.3f}, Train MAE: {mean_absolute_error(y_train, y_train_pred):.2f}")
    print(f"Test  R²: {r2_score(y_test, y_test_pred):.3f}, Test  MAE: {mean_absolute_error(y_test, y_test_pred):.2f}")
    print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred)):.2f}")

    return y_test, y_test_pred


def evaluate_models(data, regimen_model, dosage_model, cycles_model, survival_model, regimen_encoder, test_size=0.2, random_state=42):
    """Print train/test metrics for the four fitted models and save two plots.

    The models are NOT refitted here; ``data`` is only re-split and the given
    models are scored on each part. With the default ``test_size`` and
    ``random_state`` the split arguments match the ones ``train_models`` uses
    for its validation split, so on the same ``data`` the "test" rows are the
    rows already used for early stopping rather than unseen data.

    Side effects: writes ``confusion_matrix_regimen.png`` and
    ``residuals_survival.png`` to the working directory.

    Args:
        data: DataFrame containing the feature and target columns.
        regimen_model, dosage_model, cycles_model, survival_model: Fitted
            estimators exposing ``predict``.
        regimen_encoder: Fitted label encoder for the regimen target.
        test_size: Fraction of rows assigned to the test part.
        random_state: Seed for the splits.
    """
    print("\n🔍 Evaluating models with proper train/test split...")

    # -------------------------
    # 1. Regimen Classification
    # -------------------------
    X_regimen = data[patient_features]
    y_regimen = regimen_encoder.transform(data[chemo_regimen_target])

    X_train, X_test, y_train, y_test = train_test_split(
        X_regimen, y_regimen,
        test_size=test_size, stratify=y_regimen, random_state=random_state
    )

    # Evaluate on train/test
    y_train_pred = regimen_model.predict(X_train)
    y_test_pred = regimen_model.predict(X_test)

    # Pin the label set so the report rows and matrix axes always line up with
    # regimen_encoder.classes_, even if a class is missing from this split.
    class_ids = np.arange(len(regimen_encoder.classes_))

    print("\n=== Regimen Classification ===")
    print(f"Train Accuracy: {accuracy_score(y_train, y_train_pred):.3f}")
    print(f"Test Accuracy : {accuracy_score(y_test, y_test_pred):.3f}")
    print("\nClassification Report (Test):\n", classification_report(
        y_test, y_test_pred, labels=class_ids, target_names=regimen_encoder.classes_
    ))

    # Confusion matrix
    cm = confusion_matrix(y_test, y_test_pred, labels=class_ids)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=regimen_encoder.classes_,
                yticklabels=regimen_encoder.classes_)
    plt.title("Confusion Matrix - Regimen Classifier")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.tight_layout()
    plt.savefig("confusion_matrix_regimen.png", dpi=200)
    plt.close()

    # -------------------------
    # 2. Dosage Regression
    # -------------------------
    regimen_aware_features = patient_features + [chemo_regimen_target]
    _report_regression_split(
        "Dosage Prediction", dosage_model,
        data[regimen_aware_features], data[dosage_target],
        test_size, random_state
    )

    # -------------------------
    # 3. Cycles Regression
    # -------------------------
    _report_regression_split(
        "Cycles Prediction", cycles_model,
        data[regimen_aware_features], data[cycles_target],
        test_size, random_state
    )

    # -------------------------
    # 4. Survival Regression
    # -------------------------
    X_survival = data[patient_features + [chemo_regimen_target, dosage_target, cycles_target]]
    y_test, y_test_pred = _report_regression_split(
        "Survival Prediction", survival_model,
        X_survival, data[survival_target],
        test_size, random_state
    )

    # Residuals plot
    residuals = y_test - y_test_pred
    plt.figure(figsize=(6,4))
    sns.histplot(residuals, bins=30, kde=True)
    plt.title("Residuals Distribution - Survival Regressor (Test)")
    plt.xlabel("True - Predicted")
    plt.tight_layout()
    plt.savefig("residuals_survival.png", dpi=200)
    plt.close()


Statistical Visualizations


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, auc
import numpy as np

# Paper-quality figure defaults: seaborn whitegrid theme with enlarged fonts,
# then a 200-dpi figure resolution (applied after the theme call).
sns.set_theme(style="whitegrid", font_scale=1.2)
plt.rc("figure", dpi=200)


def visualize_dataset(data):
    """Save four exploratory plots of the dataset as PNG files.

    Writes ``age_distribution.png``, ``regimen_distribution.png``,
    ``survival_distribution.png`` and ``tumor_vs_survival.png`` to the working
    directory. Reads the ``Age``, ``Chemotherapy_Regimen``, ``Tumor_Size`` and
    ``Tumor_Stage`` columns plus the column named by ``survival_target``.
    """
    print("\n📊 Generating dataset visualizations...")

    def _finish(fig, filename):
        """Tighten the layout, write the figure to disk and release it."""
        fig.tight_layout()
        fig.savefig(filename)
        plt.close(fig)

    # Age histogram
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(data["Age"], bins=30, kde=True, color="steelblue", ax=ax)
    ax.set_title("Age Distribution of Patients")
    ax.set_xlabel("Age")
    ax.set_ylabel("Count")
    _finish(fig, "age_distribution.png")

    # Regimen counts, most frequent first
    regimen_order = data["Chemotherapy_Regimen"].value_counts().index
    fig, ax = plt.subplots(figsize=(7, 4))
    sns.countplot(y="Chemotherapy_Regimen", data=data, order=regimen_order, ax=ax)
    ax.set_title("Distribution of Chemotherapy Regimens")
    ax.set_xlabel("Number of Patients")
    ax.set_ylabel("Regimen")
    _finish(fig, "regimen_distribution.png")

    # Overall-survival histogram
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(data[survival_target], bins=30, kde=True, color="darkred", ax=ax)
    ax.set_title("Distribution of Overall Survival (Months)")
    ax.set_xlabel("Survival (Months)")
    ax.set_ylabel("Count")
    _finish(fig, "survival_distribution.png")

    # Tumor size against survival, coloured by stage
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(x="Tumor_Size", y=survival_target, hue="Tumor_Stage",
                    data=data, alpha=0.7, ax=ax)
    ax.set_title("Tumor Size vs. Survival")
    ax.set_xlabel("Tumor Size (cm)")
    ax.set_ylabel("Overall Survival (Months)")
    _finish(fig, "tumor_vs_survival.png")


def _transformed_feature_names(pipeline, n_features):
    """Return one label per column fed to the pipeline's final estimator.

    Names are taken from the fitted 'preprocessor' step when it can report
    them; otherwise (or when the count does not match ``n_features``) generic
    string labels ``f0, f1, ...`` are returned so plots still get categorical
    labels.
    """
    names = None
    preprocessor = pipeline.named_steps.get('preprocessor')
    if preprocessor is not None and hasattr(preprocessor, "get_feature_names_out"):
        try:
            names = list(preprocessor.get_feature_names_out())
        except (AttributeError, ValueError):
            # Raised when the step is unfitted or a sub-transformer cannot
            # report names; fall back to generic labels below.
            names = None
    if names is None or len(names) != n_features:
        names = [f"f{i}" for i in range(n_features)]
    return names


def visualize_model_performance(data, regimen_model, dosage_model, cycles_model, survival_model, regimen_encoder):
    """Generate performance visualizations for all trained models.

    All predictions are made on the full ``data`` passed in (no split is made
    here), so the plots include any rows the models were trained on.

    Saved files: ``confusion_matrix_regimen.png``, ``roc_regimen.png``,
    ``<label>_scatter.png`` / ``<label>_residuals.png`` for the three
    regressors, and ``feature_importance_<model>.png`` per model.

    Args:
        data: DataFrame containing the feature and target columns.
        regimen_model: Fitted pipeline with 'preprocessor' and 'classifier' steps.
        dosage_model, cycles_model, survival_model: Fitted pipelines with
            'preprocessor' and 'regressor' steps.
        regimen_encoder: Fitted label encoder for the regimen target.
    """

    print("\n📈 Generating model performance visualizations...")

    # ----------------- Classification: Regimen -----------------
    X_regimen = data[patient_features]
    y_true = regimen_encoder.transform(data[chemo_regimen_target])
    y_pred = regimen_model.predict(X_regimen)

    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=regimen_encoder.classes_,
                yticklabels=regimen_encoder.classes_)
    plt.title("Confusion Matrix - Regimen Classifier")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.tight_layout()
    plt.savefig("confusion_matrix_regimen.png")
    plt.close()

    # ROC Curve (One-vs-Rest)
    if hasattr(regimen_model.named_steps['classifier'], "predict_proba"):
        y_proba = regimen_model.predict_proba(X_regimen)
        plt.figure(figsize=(7, 5))
        for i, cls in enumerate(regimen_encoder.classes_):
            # Binary "this class vs. the rest" curve from column i of the probabilities.
            fpr, tpr, _ = roc_curve(y_true == i, y_proba[:, i])
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, label=f"{cls} (AUC={roc_auc:.2f})")
        plt.plot([0, 1], [0, 1], "k--")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("ROC Curves - Regimen Classifier")
        plt.legend()
        plt.tight_layout()
        plt.savefig("roc_regimen.png")
        plt.close()

    # ----------------- Regression Models -----------------
    def regression_plots(X, y_true, model, label):
        """Helper to create scatter + residual plots for regressors."""
        y_pred = model.predict(X)

        # Scatter Plot
        plt.figure(figsize=(6, 5))
        sns.scatterplot(x=y_true, y=y_pred, alpha=0.6, edgecolor=None)
        plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--')
        plt.xlabel("True Values")
        plt.ylabel("Predicted Values")
        plt.title(f"{label} - Predicted vs True")
        plt.tight_layout()
        plt.savefig(f"{label.lower().replace(' ', '_')}_scatter.png")
        plt.close()

        # Residual Plot
        residuals = y_true - y_pred
        plt.figure(figsize=(6, 4))
        sns.histplot(residuals, bins=30, kde=True, color="darkorange")
        plt.title(f"{label} - Residual Distribution")
        plt.xlabel("Residual (True - Predicted)")
        plt.tight_layout()
        plt.savefig(f"{label.lower().replace(' ', '_')}_residuals.png")
        plt.close()

    # Dosage Regression
    regression_plots(data[patient_features + [chemo_regimen_target]], data[dosage_target], dosage_model, "Dosage Prediction")

    # Cycles Regression
    regression_plots(data[patient_features + [chemo_regimen_target]], data[cycles_target], cycles_model, "Cycles Prediction")

    # Survival Regression
    regression_plots(data[patient_features + [chemo_regimen_target, dosage_target, cycles_target]],
                     data[survival_target], survival_model, "Survival Prediction")

    # ----------------- Feature Importance -----------------
    # The final estimators are fitted on the preprocessor's output, so the
    # column labels have to come from the pipeline's preprocessor rather than
    # from the estimator itself.
    for name, pipeline, step in [("Regimen", regimen_model, 'classifier'),
                                 ("Dosage", dosage_model, 'regressor'),
                                 ("Cycles", cycles_model, 'regressor'),
                                 ("Survival", survival_model, 'regressor')]:
        model = pipeline.named_steps[step]

        if hasattr(model, "feature_importances_"):
            importances = model.feature_importances_
            feature_names = _transformed_feature_names(pipeline, len(importances))

            importance_df = pd.DataFrame({"Feature": feature_names, "Importance": importances})
            importance_df = importance_df.sort_values("Importance", ascending=False).head(15)

            plt.figure(figsize=(8, 5))
            sns.barplot(x="Importance", y="Feature", data=importance_df, palette="viridis")
            plt.title(f"Top Features - {name} Model")
            plt.tight_layout()
            plt.savefig(f"feature_importance_{name.lower()}.png")
            plt.close()
