In [1]:
# Cancer Detection Prediction System
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# 1. Load and Prepare Data
print("Loading dataset...")
try:
    df = pd.read_csv('cancer_detection.csv')
    print("Dataset loaded successfully!")
    print(f"Shape: {df.shape}")
    print("\nFirst 5 rows:")
    print(df.head())
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# 2. Data Preprocessing
print("\nPreprocessing data...")

# Convert Gender to numeric (M=0, F=1)
df['Gender'] = df['Gender'].map({'M': 0, 'F': 1})

# Fill NA values for numeric columns only
numeric_cols = ['Age', 'Tumor_Size_cm', 'WBC_Count', 'CA_125', 'PSA', 'ALP']
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# For Stage, fill with 'NA' for Healthy patients
df['Stage'].fillna('NA', inplace=True)

# Convert categorical variables to numeric codes
cancer_types = ['Lung', 'Ovarian', 'Breast', 'Colorectal', 'Prostate', 'Pancreatic', 'Healthy']
df['Cancer_Type'] = pd.Categorical(df['Cancer_Type'], categories=cancer_types).codes
df['Stage'] = pd.Categorical(df['Stage']).codes
df['Diagnosis'] = pd.Categorical(df['Diagnosis']).codes

# 3. Prepare Features and Targets
features = ['Age', 'Gender', 'Tumor_Size_cm', 'WBC_Count', 'CA_125', 'PSA', 'ALP']
X = df[features]
y = df[['Cancer_Type', 'Stage', 'Diagnosis']]

# 4. Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nData split:")
print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

# 5. Train Models
print("\nTraining models...")
models = {
    'Cancer_Type': RandomForestClassifier(n_estimators=150, random_state=42),
    'Stage': RandomForestClassifier(n_estimators=150, random_state=42),
    'Diagnosis': RandomForestClassifier(n_estimators=150, random_state=42)
}

for target in models.keys():
    print(f"Training {target} model...")
    models[target].fit(X_train, y_train[target])

# 6. Evaluate Models
print("\nModel Evaluation:")
for target in models.keys():
    y_pred = models[target].predict(X_test)
    print(f"\n{target} Model:")
    print(f"Accuracy: {accuracy_score(y_test[target], y_pred):.2f}")
    print("Classification Report:")
    print(classification_report(y_test[target], y_pred))

# 7. Prediction Function
def predict_cancer(age, gender, tumor_size, wbc_count, ca_125, psa, alp):
    """
    Predicts cancer type, stage and diagnosis
    
    Parameters:
    age (int): Patient age
    gender (str): 'M' or 'F'
    tumor_size (float): Tumor size in cm
    wbc_count (float): White blood cell count
    ca_125 (float): CA-125 level
    psa (float): PSA level
    alp (float): ALP level
    
    Returns:
    dict: Dictionary with predictions
    """
    try:
        # Prepare input
        gender_encoded = 0 if gender == 'M' else 1
        features = [[age, gender_encoded, tumor_size, wbc_count, ca_125, psa, alp]]
        
        # Make predictions
        predictions = {}
        for target in models.keys():
            pred = models[target].predict(features)[0]
            predictions[target] = int(pred)
        
        # Map numeric predictions back to cancer types and stages
        predictions['Cancer_Type'] = cancer_types[predictions['Cancer_Type']]
        predictions['Stage'] = ['I', 'II', 'III', 'IV', 'NA'][predictions['Stage']]
        predictions['Diagnosis'] = 'Positive' if predictions['Diagnosis'] == 1 else 'Negative'
        
        return predictions
    except Exception as e:
        return {'error': f"Prediction failed: {str(e)}"}

# 8. Example Usage
print("\nExample Prediction:")
sample_prediction = predict_cancer(62, 'M', 4.8, 14.9, 38.7, 1.8, 230)
print(sample_prediction)

# 9. Save Models (Optional)
print("\nSaving models...")
for target in models.keys():
    joblib.dump(models[target], f'{target}_model.joblib')
print("Models saved successfully!")

# To load models later:
# models = {target: joblib.load(f'{target}_model.joblib') for target in ['Cancer_Type', 'Stage', 'Diagnosis']}


Loading dataset...
Dataset loaded successfully!
Shape: (101, 11)

First 5 rows:
  Patient_ID   Age Gender Cancer_Type Stage  Tumor_Size_cm  WBC_Count  CA_125  \
0     PT-001  62.0      M        Lung   III            4.8       14.9    38.7   
1     PT-002  45.0      F     Ovarian    II            3.5       10.8   720.3   
2     PT-003  58.0      F      Breast     I            1.7        8.2    27.4   
3     PT-004  71.0      M  Colorectal    IV            7.2       17.6    41.5   
4     PT-005  53.0      M    Prostate   III            4.1       11.2    24.1   

    PSA    ALP Diagnosis  
0   1.8  230.0  Positive  
1   0.0  108.0  Positive  
2   0.0   92.0  Positive  
3   1.6  490.0  Positive  
4  42.5  295.0  Positive  

Preprocessing data...

Data split:
Training samples: 80
Testing samples: 21

Training models...
Training Cancer_Type model...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Stage'].fillna('NA', inplace=True)


Training Stage model...
Training Diagnosis model...

Model Evaluation:

Cancer_Type Model:
Accuracy: 0.90
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.80      0.89         5
           1       1.00      1.00      1.00         2
           2       1.00      0.67      0.80         3
           3       0.60      1.00      0.75         3
           4       1.00      1.00      1.00         4
           6       1.00      1.00      1.00         4

    accuracy                           0.90        21
   macro avg       0.93      0.91      0.91        21
weighted avg       0.94      0.90      0.91        21


Stage Model:
Accuracy: 0.95
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       0.86      1.00      0.92         6
           2       1.00      0.80      0.89         5
           3       1.00      1.00      1.00         4
     



Models saved successfully!
