In [None]:
import kagglehub

# Download the latest version of the dataset through kagglehub.
# The value returned by dataset_download() is kept in `path` and printed
# below as the location of the dataset files.
path = kagglehub.dataset_download("omenkj/chemotherapy-regimens-based-on-patient-data")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'chemotherapy-regimens-based-on-patient-data' dataset.
Path to dataset files: /kaggle/input/chemotherapy-regimens-based-on-patient-data


In [None]:
import os
import pandas as pd

# Path to the dataset folder.
# NOTE(review): this rebinds `path` to a hard-coded literal, replacing whatever
# value it held before (e.g. the one returned by kagglehub.dataset_download in
# the first cell). The literal equals the path that cell printed.
path = "/kaggle/input/chemotherapy-regimens-based-on-patient-data"

# List the folder contents to confirm the expected CSV is present
print("Files in dataset folder:", os.listdir(path))

# Load the correct CSV file
csv_file = os.path.join(path, "chemotherapy_patient_data.csv")
df = pd.read_csv(csv_file)

# A bare last expression makes the notebook render the frame as the cell
# output (rich display), so no print() call is needed here.

df

Files in dataset folder: ['chemotherapy_patient_data.csv', 'README.md', '.nfs000000003f1bbe7d00000071']


Unnamed: 0,Patient_ID,Age,Sex,BMI,Smoking_Status,Cancer_Type,Genetic_Mutation,Tumor_Stage,Tumor_Size,Metastasis_Status,Chemotherapy_Regimen,Dosage (mg/m²),Cycles_Completed,Nausea_Severity,Neutropenia,Tumor_Response,Overall_Survival_Months
0,P00001,68,Male,31.5,Former,Breast,BRCA1,II,8.8,No,FOLFOX,352.3,5,1,Yes,Stable,88
1,P00002,81,Female,25.8,Former,Lung,KRAS,I,5.4,Yes,CHOP,374.3,6,3,No,Stable,17
2,P00003,58,Male,22.3,Former,Lymphoma,BRCA1,II,3.3,No,ABVD,83.1,5,4,Yes,Partial,59
3,P00004,44,Male,33.6,Never,Lymphoma,EGFR,IV,6.0,No,FOLFOX,58.7,5,3,No,Partial,47
4,P00005,72,Male,23.7,Never,Breast,TP53,III,5.7,No,FOLFOX,429.2,6,2,Yes,Progressive,88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52316,P52317,78,Female,29.3,Current,Breast,KRAS,III,1.9,No,ABVD,304.1,4,3,No,Partial,54
52317,P52318,60,Female,22.3,Current,Breast,KRAS,I,7.5,No,Gemcitabine,222.7,3,5,No,Stable,118
52318,P52319,73,Male,31.3,Never,Lymphoma,BRCA1,I,2.5,No,,247.0,1,1,No,Partial,37
52319,P52320,44,Male,32.6,Never,Leukemia,TP53,I,8.4,No,CHOP,245.4,4,5,No,Progressive,22


In [None]:
#Research Objectives ------------->

#Integrative Prediction of Chemotherapy Regimens and Overall Survival in Cancer Patients Using XGBoost and Cox Regression


#1. Based on the patient’s profile (age, sex, BMI, smoking status, cancer type, genetic mutation, tumor stage/size, metastasis), predict the recommended chemotherapy regimen, dosage (as a range), and number of cycles completed using XGBoost.
#2. Rank the predicted regimens by a chosen metric (the code below ranks by predicted overall survival in months).

#WORK
# Find more suitable models
# Encode Overall Survival Months.

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier, XGBRegressor
import warnings

# Suppress XGBoost warnings for cleaner output
warnings.filterwarnings('ignore', category=UserWarning, module='xgboost')

def load_and_preprocess_data(csv_path=None):
    """
    Load the chemotherapy patient CSV and drop incomplete rows.

    Args:
        csv_path: Optional path to the CSV file. When omitted, the file
            "chemotherapy_patient_data.csv" inside the Kaggle input folder
            "/kaggle/input/chemotherapy-regimens-based-on-patient-data" is
            read, which is what the function did before it took a parameter.

    Returns:
        A DataFrame holding only the rows that have no missing value in any
        column (the original row index is kept), or None when the file is
        missing or cannot be read. Failures are reported with print() and
        are not raised to the caller.
    """
    # Resolve the path before entering the try block so the error message
    # below can always refer to it.
    if csv_path is None:
        # Path to the dataset folder on Kaggle
        path = "/kaggle/input/chemotherapy-regimens-based-on-patient-data"
        csv_file = os.path.join(path, "chemotherapy_patient_data.csv")
    else:
        csv_file = csv_path

    try:
        df = pd.read_csv(csv_file)
        print("Dataset loaded successfully.")
        print("\nDataset Head:")
        print(df.head())

        # Drop any rows with missing values to simplify. dropna() returns a
        # new frame, so nothing is mutated in place.
        complete_rows = df.dropna()

        # Downstream code expects a column named 'Overall_Survival_Months'
        # for ranking; update the target constants if your dataset differs.

        return complete_rows

    except FileNotFoundError:
        print(f"Error: The file '{csv_file}' was not found. Please ensure the file path is correct.")
        return None
    except Exception as e:
        # Deliberate best-effort: any other read/parse problem is reported
        # and signalled to the caller through the None return value.
        print(f"An error occurred while loading the dataset: {e}")
        return None

# --- Column names (features / targets) and the shared preprocessing step ---

# Columns that describe the patient and serve as model inputs.
# NOTE: these names are assumed; adjust them if the CSV header differs.
patient_features = [
    'Age',
    'Sex',
    'BMI',
    'Smoking_Status',
    'Cancer_Type',
    'Genetic_Mutation',
    'Tumor_Stage',
    'Tumor_Size',
    'Metastasis_Status',
]

# Columns the models are trained to predict.
chemo_regimen_target = 'Chemotherapy_Regimen'
dosage_target = 'Dosage (mg/m²)'
cycles_target = 'Cycles_Completed'
survival_target = 'Overall_Survival_Months'  # assumed column, used for ranking

# Split of the patient features by how they are preprocessed.
numerical_features = ['Age', 'BMI', 'Tumor_Size']
categorical_features = [
    'Sex',
    'Smoking_Status',
    'Cancer_Type',
    'Genetic_Mutation',
    'Tumor_Stage',
    'Metastasis_Status',
]

# Numeric columns are forwarded untouched; categorical columns are one-hot
# encoded, and categories not seen during fit are ignored at predict time.
# Any column not listed above is passed through as-is.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='passthrough',
)

# 3. Model Training
def _make_xgb_pipeline(numeric_columns, categorical_columns, step_name, estimator):
    """Build an unfitted Pipeline that owns its preprocessing step and ends in `estimator`."""
    column_prep = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', list(numeric_columns)),
            ('cat', OneHotEncoder(handle_unknown='ignore'), list(categorical_columns)),
        ],
        # Drop unlisted columns so stray string columns never reach XGBoost.
        remainder='drop',
    )
    return Pipeline(steps=[('preprocessor', column_prep), (step_name, estimator)])


def train_models(data, random_state=42):
    """
    Train the four XGBoost pipelines used for regimen recommendation.

    Every pipeline gets its own preprocessing step, so fitting one model can
    never re-fit a transformer that another model relies on.

    Models and their inputs:
      * regimen model  - classifier over `patient_features`.
      * dosage model   - regressor over `patient_features` + regimen.
      * cycles model   - regressor over `patient_features` + regimen.
      * survival model - regressor over `patient_features` + regimen +
                         dosage + cycles.

    The dosage, cycles and survival models take the regimen as an input so
    that their predictions can differ between candidate regimens; without it
    a per-regimen ranking would yield the same numbers for every regimen.

    Args:
        data: DataFrame containing the feature and target columns named by
            the module-level constants, with no missing values in them, or
            None.
        random_state: Seed handed to every XGBoost estimator.

    Returns:
        Tuple (regimen_model, dosage_model, cycles_model, survival_model,
        regimen_encoder). All five entries are None when `data` is None.
    """
    if data is None:
        return None, None, None, None, None

    print("\nTraining models...")

    # Regimen classifier. XGBoost needs integer class labels, hence the
    # LabelEncoder on the target column.
    regimen_encoder = LabelEncoder()
    y_regimen = regimen_encoder.fit_transform(data[chemo_regimen_target])
    regimen_model = _make_xgb_pipeline(
        numerical_features,
        categorical_features,
        'classifier',
        XGBClassifier(objective='multi:softmax', eval_metric='mlogloss', random_state=random_state),
    )
    regimen_model.fit(data[patient_features], y_regimen)
    print("Regimen model trained.")

    # Inputs shared by the treatment-dependent models: patient profile + regimen.
    treatment_features = patient_features + [chemo_regimen_target]
    treatment_categoricals = categorical_features + [chemo_regimen_target]
    X_treatment = data[treatment_features]

    # Dosage (regression)
    dosage_model = _make_xgb_pipeline(
        numerical_features,
        treatment_categoricals,
        'regressor',
        XGBRegressor(objective='reg:squarederror', random_state=random_state),
    )
    dosage_model.fit(X_treatment, data[dosage_target])
    print("Dosage model trained.")

    # Cycles completed (regression)
    cycles_model = _make_xgb_pipeline(
        numerical_features,
        treatment_categoricals,
        'regressor',
        XGBRegressor(objective='reg:squarederror', random_state=random_state),
    )
    cycles_model.fit(X_treatment, data[cycles_target])
    print("Cycles model trained.")

    # Overall survival (regression): also sees the dosage and cycle count.
    survival_features = treatment_features + [dosage_target, cycles_target]
    survival_model = _make_xgb_pipeline(
        numerical_features + [dosage_target, cycles_target],
        treatment_categoricals,
        'regressor',
        XGBRegressor(objective='reg:squarederror', random_state=random_state),
    )
    survival_model.fit(data[survival_features], data[survival_target])
    print("Survival model trained.")

    return regimen_model, dosage_model, cycles_model, survival_model, regimen_encoder

# 4. Prediction and Ranking Logic
def predict_and_rank_regimens(patient_profile, regimen_model, dosage_model, cycles_model, survival_model, regimen_encoder, top_n=5, all_regimens=None):
    """
    Predict outcomes for each candidate regimen and rank them by survival.

    For every regimen the patient profile is combined with that regimen, the
    dosage and cycle count are predicted, and those predictions are fed to
    the survival model. All candidates are scored in one batch.

    Args:
        patient_profile: Mapping with one value per name in `patient_features`.
        regimen_model: Accepted for interface compatibility; not used here.
        dosage_model: Fitted pipeline from train_models() (expects the
            patient features plus the regimen column).
        cycles_model: Fitted pipeline from train_models() (same inputs as
            the dosage model).
        survival_model: Fitted pipeline from train_models() (expects patient
            features, regimen, dosage and cycles).
        regimen_encoder: Accepted for interface compatibility; not used here.
        top_n: Number of top-ranked entries to return.
        all_regimens: Candidate regimen names. When None, a built-in default
            list is used and a warning is printed. Names the models never saw
            during training are one-hot encoded as all zeros, so their
            predictions carry no regimen-specific signal.

    Returns:
        List of at most `top_n` dicts with the keys 'Regimen',
        'Predicted_Dosage', 'Predicted_Cycles' and
        'Predicted_Survival_Months', sorted by predicted survival, highest
        first. Empty when there are no candidate regimens.
    """
    if all_regimens is None:
        print("Warning: No list of regimens provided. Using a default set.")
        all_regimens = ['FOLFOX', 'CHOP', 'ABVD', 'Gemcitabine', 'Paclitaxel', 'Carboplatin']

    candidate_regimens = list(all_regimens)
    if not candidate_regimens:
        return []

    # One row per candidate: identical patient data, different regimen.
    candidates = pd.DataFrame([patient_profile] * len(candidate_regimens))
    candidates[chemo_regimen_target] = candidate_regimens

    # Predict the dosage and cycles for each regimen.
    treatment_inputs = candidates[patient_features + [chemo_regimen_target]]
    candidates[dosage_target] = dosage_model.predict(treatment_inputs)
    candidates[cycles_target] = cycles_model.predict(treatment_inputs)

    # Predict overall survival from the profile, regimen and predicted treatment.
    survival_inputs = candidates[patient_features + [chemo_regimen_target, dosage_target, cycles_target]]
    predicted_survival = survival_model.predict(survival_inputs)

    # Convert to plain Python numbers so the results print and serialise cleanly.
    predictions = [
        {
            'Regimen': regimen,
            'Predicted_Dosage': round(float(dosage), 1),
            'Predicted_Cycles': max(0, int(np.round(cycles))),
            'Predicted_Survival_Months': max(0.0, round(float(months), 1)),
        }
        for regimen, dosage, cycles, months in zip(
            candidate_regimens,
            candidates[dosage_target],
            candidates[cycles_target],
            predicted_survival,
        )
    ]

    # Rank the predictions by Overall_Survival_Months in descending order
    ranked_predictions = sorted(predictions, key=lambda x: x['Predicted_Survival_Months'], reverse=True)

    return ranked_predictions[:top_n]

if __name__ == "__main__":
    # --- Main Execution ---
    # End-to-end demo: load the data, train the models, then rank regimens
    # for one sample patient and print the result.

    # 1. Load the real dataset
    data_df = load_and_preprocess_data()

    if data_df is None:
        # Stop with SystemExit instead of calling exit(): exit() is an
        # interactive helper that may end the whole notebook kernel session,
        # while SystemExit only stops this cell.
        raise SystemExit("Dataset could not be loaded; aborting.")

    # Get the unique list of regimens from the dataset
    all_possible_regimens = data_df['Chemotherapy_Regimen'].unique().tolist()

    # 2. Train the models
    regimen_model, dosage_model, cycles_model, survival_model, regimen_encoder = train_models(data_df)

    print("\n" + "-" * 30)

    # 3. Create a sample patient profile to get recommendations
    # This profile should have the same features as the training data
    sample_patient = {
        'Age': 55,
        'Sex': 'Female',
        'BMI': 28.5,
        'Smoking_Status': 'Former',
        'Cancer_Type': 'Breast',
        'Genetic_Mutation': 'BRCA1',
        'Tumor_Stage': 'II',
        'Tumor_Size': 4.2,
        'Metastasis_Status': 'No'
    }

    print("Patient Profile:")
    for k, v in sample_patient.items():
        print(f"  {k}: {v}")

    print("-" * 30)

    # 4. Predict and rank the top 5 regimens for this patient.
    # The models are passed by keyword: the positional call used before
    # handed cycles_model and dosage_model over in swapped positions, so the
    # reported dosage came from the cycles model and vice versa.
    top_regimens = predict_and_rank_regimens(
        sample_patient,
        regimen_model=regimen_model,
        dosage_model=dosage_model,
        cycles_model=cycles_model,
        survival_model=survival_model,
        regimen_encoder=regimen_encoder,
        top_n=5,
        all_regimens=all_possible_regimens,
    )

    print("Predicted and Ranked Chemotherapy Regimens:")
    for rank, p in enumerate(top_regimens):
        print(f"\nRank {rank + 1}: {p['Regimen']}")
        print(f"  Predicted Dosage: {p['Predicted_Dosage']} mg/m²")
        print(f"  Predicted Cycles: {p['Predicted_Cycles']}")
        print(f"  Predicted Overall Survival: {p['Predicted_Survival_Months']} months")



Dataset loaded successfully.

Dataset Head:
  Patient_ID  Age     Sex   BMI Smoking_Status Cancer_Type Genetic_Mutation  \
0     P00001   68    Male  31.5         Former      Breast            BRCA1   
1     P00002   81  Female  25.8         Former        Lung             KRAS   
2     P00003   58    Male  22.3         Former    Lymphoma            BRCA1   
3     P00004   44    Male  33.6          Never    Lymphoma             EGFR   
4     P00005   72    Male  23.7          Never      Breast             TP53   

  Tumor_Stage  Tumor_Size Metastasis_Status Chemotherapy_Regimen  \
0          II         8.8                No               FOLFOX   
1           I         5.4               Yes                 CHOP   
2          II         3.3                No                 ABVD   
3          IV         6.0                No               FOLFOX   
4         III         5.7                No               FOLFOX   

   Dosage (mg/m²)  Cycles_Completed  Nausea_Severity Neutropenia  \
0   