In [1]:
# Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
import streamlit as st

# Define Data Paths
data_paths = {
    "Parkinson's": "C:/Users/Gebruiker/Downloads/parkinsons.xlsx",
    "Kidney Disease": "C:/Users/Gebruiker/Downloads/kidney_disease.xlsx",
    "Liver Disease": "C:/Users/Gebruiker/Downloads/indian_liver_patient.xlsx"
}

# Define Model and Scaler Paths
model_paths = {
    "Parkinson's": r"C:\Users\Gebruiker\parkinsons_model.pkl",
    "Kidney Disease": r"C:\Users\Gebruiker\kidney_model.pkl",
    "Liver Disease": r"C:\Users\Gebruiker\liver_model.pkl"
}


scaler_paths = {
    "Parkinson's": "C:/Users/Gebruiker/parkinsons_scaler.pkl",
    "Kidney Disease": "C:/Users/Gebruiker/kidney_scaler.pkl",
    "Liver Disease": "C:/Users/Gebruiker/liver_scaler.pkl"
}

# Define Features for Each Disease
features_count = {"Parkinson's": 22, "Kidney Disease": 25, "Liver Disease": 10}

# Preprocessing and Training Function
def preprocess_and_train(data_path, target_column, n_features, model_path, scaler_path):
    try:
        # Load data
        data = pd.read_excel(data_path)

        # Handle missing values
        data.fillna(data.median(numeric_only=True), inplace=True)

        # Encode categorical data
        for col in data.select_dtypes(include=['object']).columns:
            encoder = LabelEncoder()
            data[col] = encoder.fit_transform(data[col])

        # Split features and target
        X = data.iloc[:, :-1]  # Select all columns except the last as features
        y = data.iloc[:, target_column]  # Select the target column

        # Select only the required number of features
        X = X.iloc[:, :n_features]

        # Scale features
        scaler = MinMaxScaler()
        X_scaled = scaler.fit_transform(X)

        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

        # Train model
        model = RandomForestClassifier(random_state=42)
        model.fit(X_train, y_train)

        # Evaluate model
        y_pred = model.predict(X_test)
        print(f"Model Evaluation for {data_path.split('/')[-1].split('.')[0]}:\n")
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print("Classification Report:\n", classification_report(y_test, y_pred))
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
        print("ROC-AUC:", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

        # Save model and scaler
        joblib.dump(model, model_path)
        joblib.dump(scaler, scaler_path)

    except Exception as e:
        print(f"Error processing {data_path}: {e}")

# Train Models for Each Disease
preprocess_and_train(
    data_path=data_paths["Parkinson's"],
    target_column=-1,  # Target is the last column
    n_features=22,
    model_path=model_paths["Parkinson's"],
    scaler_path=scaler_paths["Parkinson's"]
)

preprocess_and_train(
    data_path=data_paths["Kidney Disease"],
    target_column=-1,  # Target is the last column
    n_features=25,
    model_path=model_paths["Kidney Disease"],
    scaler_path=scaler_paths["Kidney Disease"]
)

preprocess_and_train(
    data_path=data_paths["Liver Disease"],
    target_column=-1,  # Target is the last column
    n_features=10,
    model_path=model_paths["Liver Disease"],
    scaler_path=scaler_paths["Liver Disease"]
)

# Streamlit Application
st.title("Multiple Disease Prediction System")
st.sidebar.header("Input Features")
disease = st.sidebar.selectbox("Select Disease to Predict", list(data_paths.keys()))

# Input features based on the disease
inputs = [st.sidebar.number_input(f"Feature {i+1}", key=f"{disease}_feature_{i}") for i in range(features_count[disease])]

# Convert to numpy array
input_values = np.array(inputs).reshape(1, -1)

# Load models and scalers
models = {key: joblib.load(path) for key, path in model_paths.items()}
scalers = {key: joblib.load(path) for key, path in scaler_paths.items()}

# Scale inputs
input_values_scaled = scalers[disease].transform(input_values)

# Predict and visualize results
if st.button("Predict"):
    model = models[disease]
    prediction_proba = model.predict_proba(input_values_scaled)[0]
    prediction = model.predict(input_values_scaled)[0]

    st.write(f"Prediction for {disease}: {'Positive' if prediction == 1 else 'Negative'}")
    st.write("Prediction Probabilities:")
    st.write({f"Class {i}": f"{prob:.2%}" for i, prob in enumerate(prediction_proba)})

    # Visualization
    st.subheader("Prediction Probabilities Visualization")
    sns.barplot(x=[f"Class {i}" for i in range(len(prediction_proba))], y=prediction_proba)
    plt.xlabel("Classes")
    plt.ylabel("Probability")
    st.pyplot(plt)


Error processing C:/Users/Gebruiker/Downloads/parkinsons.xlsx: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.
Error processing C:/Users/Gebruiker/Downloads/kidney_disease.xlsx: Encoders require their input argument must be uniformly strings or numbers. Got ['float', 'int', 'str']
Model Evaluation for indian_liver_patient:

Accuracy: 0.7350427350427351
Classification Report:
               precision    recall  f1-score   support

           1       0.80      0.86      0.83        87
           2       0.48      0.37      0.42        30

    accuracy                           0.74       117
   macro avg       0.64      0.61      0.62       117
weighted avg       0.72      0.74      0.72       117

Confusion Matrix:
 [[75 12]
 [19 11]]
ROC-AUC: 0.8078544061302682


2025-01-21 14:46:21.307 
  command:

    streamlit run C:\Users\Gebruiker\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-01-21 14:46:21.309 Session state does not function when running a script without `streamlit run`
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
