# Medical Diagnostic Model Training
This notebook trains a RandomForestClassifier to predict diseases based on patient data.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [None]:
# Read the sample patient records from CSV into a DataFrame
data_path = '../data/sample_patient_data.csv'
df = pd.read_csv(data_path)

# Confirm the load and report the (rows, columns) dimensions
print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")

# Preview the first rows (the last expression is rendered as the cell output)
df.head()

In [None]:
# Quick data-quality overview: nulls per column, column dtypes, label balance

# Number of null entries in each column
print("Missing values:")
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Storage dtype pandas inferred for each column
print("\nData types:")
column_types = df.dtypes
print(column_types)

# How many rows carry each diagnosis label
print("\nUnique diagnoses:")
diagnosis_counts = df['diagnosis'].value_counts()
print(diagnosis_counts)

In [None]:
# Preprocess the data
# Convert gender to numeric: Male = 0, Female = 1.
# Labels are matched case-insensitively after trimming surrounding whitespace.
# Any other value (including a missing one) is rejected with an explicit error
# rather than being silently encoded as Female, which is what a bare
# `0 if x == 'Male' else 1` fallback does.
gender_codes = {'male': 0, 'female': 1}
normalized_gender = df['gender'].astype(str).str.strip().str.lower()
encoded_gender = normalized_gender.map(gender_codes)

# Values absent from gender_codes come back as NaN from .map()
unrecognized = encoded_gender.isna()
if unrecognized.any():
    bad_values = df.loc[unrecognized, 'gender'].unique().tolist()
    raise ValueError(
        f"Unrecognized gender value(s) {bad_values}; expected 'Male' or 'Female'"
    )
df['gender_numeric'] = encoded_gender.astype(int)

# Select features for training
# We'll use: age, gender_numeric, bp, glucose, heart_rate
feature_columns = ['age', 'gender_numeric', 'bp', 'glucose', 'heart_rate']
X = df[feature_columns]
y = df['diagnosis']

print("Features shape:", X.shape)
print("Target shape:", y.shape)

# Preview the feature matrix (rendered as the cell output)
X.head()

In [None]:
# Split data into training and testing sets
# A stratified split is only possible when
#   (a) every class has at least 2 samples, and
#   (b) both the train and the test partition have at least one slot per class.
# train_test_split raises ValueError if either condition is violated, so fall
# back to a plain random split in those cases.
test_fraction = 0.2
class_counts = y.value_counts()
min_class_count = class_counts.min()
n_classes = len(class_counts)

# Partition sizes, using the rounding scikit-learn applies to a float
# test_size (ceil for the test part, remainder for the train part).
n_test = int(np.ceil(test_fraction * len(y)))
n_train = len(y) - n_test

stratify_target = y
if min_class_count < 2:
    print(f"Warning: Some classes have only {min_class_count} sample(s)")
    print("Using random split instead of stratified split")
    stratify_target = None
elif min(n_train, n_test) < n_classes:
    print(
        f"Warning: {n_classes} classes cannot all be represented in a "
        f"train/test split of {n_train}/{n_test} samples"
    )
    print("Using random split instead of stratified split")
    stratify_target = None

# stratify=None is the default and yields an ordinary shuffled split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_fraction, random_state=42, stratify=stratify_target
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

In [None]:
# Build the random-forest classifier and fit it on the training split.
# Hyperparameters are collected in one mapping so they are easy to scan.
forest_params = {
    'n_estimators': 100,  # number of trees in the forest
    'max_depth': 10,      # cap on the depth of each tree
    'random_state': 42,   # fixed seed for reproducible training
    'n_jobs': -1,         # -1 asks scikit-learn to use all available processors
}
model = RandomForestClassifier(**forest_params)

print("Training the model...")
model.fit(X_train, y_train)
print("Model training completed!")

In [None]:
# Predict a diagnosis for every row of the held-out test set
y_pred = model.predict(X_test)

# Accuracy = fraction of test rows whose prediction equals the true label;
# reported below as a percentage with two decimals
accuracy = accuracy_score(y_test, y_pred)
accuracy_percent = accuracy * 100
print(f"Model Accuracy: {accuracy_percent:.2f}%")

In [None]:
# Text summary of the main per-class classification metrics on the test set
print("Classification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# Counts of true vs. predicted labels on the test set
# (rows are true classes, columns are predicted classes)
print("Confusion Matrix:")
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)

In [None]:
# Rank the input features by the importance the fitted forest assigns to them
importance_table = pd.DataFrame({
    'feature': feature_columns,
    'importance': model.feature_importances_,
})
# Most important feature first
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("Feature Importance:")
print(feature_importance)

In [None]:
# Persist the fitted model to disk with joblib.
# The path is relative, so the file lands in the current working directory.
model_filename = 'diagnostic_model.pkl'
joblib.dump(value=model, filename=model_filename)
print(f"Model saved successfully as {model_filename}")

In [None]:
# Round-trip check: reload the saved model and push one prediction through it.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
loaded_model = joblib.load(model_filename)
print("Model loaded successfully!")

# One hand-made patient: age=45, gender=Male(0), bp=140, glucose=110, heart_rate=85
sample_data = [[45, 0, 140, 110, 85]]
# Wrap the row in a DataFrame that carries the training column names
sample_df = pd.DataFrame(sample_data, columns=feature_columns)
prediction = loaded_model.predict(sample_df)

first_patient = sample_data[0]
predicted_label = prediction[0]
print(f"\nSample prediction for patient with features {first_patient}:")
print(f"Predicted diagnosis: {predicted_label}")