<a href="https://colab.research.google.com/github/titan-spyer/diseases_prediction/blob/main/Disease_predication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import kagglehub
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
def train_diabetes():
    print("\n--- Training Diabetes Model ---")
    path = kagglehub.dataset_download("uciml/pima-indians-diabetes-database")
    print("Dataset path:", path)
    df = pd.read_csv(os.path.join(path, 'diabetes.csv'))

    X = df.drop('Outcome', axis=1)
    y = df['Outcome']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = RandomForestClassifier(random_state=42)
    model.fit(X_train_scaled, y_train)

    print("Accuracy:", accuracy_score(y_test, model.predict(X_test_scaled)))

    joblib.dump(model, "diabetes_model.pkl")
    joblib.dump(scaler, "diabetes_scaler.pkl")
    print("Saved diabetes_model.pkl and diabetes_scaler.pkl")

In [3]:
def train_heart():
    print("\n--- Training Heart Disease Model ---")
    path = kagglehub.dataset_download("johnsmith88/heart-disease-dataset")
    print("Dataset path:", path)

    # Check file name, usually heart.csv
    files = os.listdir(path)
    csv_file = [f for f in files if f.endswith('.csv')][0]
    df = pd.read_csv(os.path.join(path, csv_file))

    # Target column is often 'target' or 'heart_disease'
    # In johnsmith88/heart-disease-dataset it is 'target'
    X = df.drop('target', axis=1)
    y = df['target']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = RandomForestClassifier(random_state=42)
    model.fit(X_train_scaled, y_train)

    print("Accuracy:", accuracy_score(y_test, model.predict(X_test_scaled)))

    joblib.dump(model, "heart_model.pkl")
    joblib.dump(scaler, "heart_scaler.pkl")
    print("Saved heart_model.pkl and heart_scaler.pkl")

In [4]:
def train_breast_cancer():
    print("\n--- Training Breast Cancer Model ---")
    path = kagglehub.dataset_download("uciml/breast-cancer-wisconsin-data")
    print("Dataset path:", path)

    files = os.listdir(path)
    csv_file = [f for f in files if f.endswith('.csv')][0]
    df = pd.read_csv(os.path.join(path, csv_file))

    # Drop unnecessary columns
    if 'id' in df.columns:
        df = df.drop('id', axis=1)
    if 'Unnamed: 32' in df.columns:
        df = df.drop('Unnamed: 32', axis=1)

    # User requested specific features potentially, but for now we train on all standard numeric features
    # 'diagnosis' is target (M/B). Map to 1/0
    df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

    X = df.drop('diagnosis', axis=1)
    y = df['diagnosis']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = RandomForestClassifier(random_state=42)
    model.fit(X_train_scaled, y_train)

    print("Accuracy:", accuracy_score(y_test, model.predict(X_test_scaled)))

    joblib.dump(model, "breast_cancer_model.pkl")
    joblib.dump(scaler, "breast_cancer_scaler.pkl")
    print("Saved breast_cancer_model.pkl and breast_cancer_scaler.pkl")

if __name__ == "__main__":
    train_diabetes()
    train_heart()
    train_breast_cancer()



--- Training Diabetes Model ---
Using Colab cache for faster access to the 'pima-indians-diabetes-database' dataset.
Dataset path: /kaggle/input/pima-indians-diabetes-database
Accuracy: 0.7597402597402597
Saved diabetes_model.pkl and diabetes_scaler.pkl

--- Training Heart Disease Model ---
Using Colab cache for faster access to the 'heart-disease-dataset' dataset.
Dataset path: /kaggle/input/heart-disease-dataset
Accuracy: 1.0
Saved heart_model.pkl and heart_scaler.pkl

--- Training Breast Cancer Model ---
Using Colab cache for faster access to the 'breast-cancer-wisconsin-data' dataset.
Dataset path: /kaggle/input/breast-cancer-wisconsin-data
Accuracy: 0.9736842105263158
Saved breast_cancer_model.pkl and breast_cancer_scaler.pkl
