In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC 
from sklearn.metrics import accuracy_score
import streamlit as st

# Load the data
data = pd.read_csv('support2.csv', na_values=[' ', "-"]).fillna(0)

# Preprocessing: Encode categorical features
# Encoding Gender as a binary variable
label_encoder = LabelEncoder()
data['sex'] = label_encoder.fit_transform(data['sex'].astype(str))

# Encoding Symptoms, Causes, and Medicine using one-hot encoding
onehot_encoder = OneHotEncoder()
symptoms_encoded = onehot_encoder.fit_transform(data[['sfdm2']].astype(str)).toarray()
causes_encoded = onehot_encoder.fit_transform(data[['dzclass']].astype(str)).toarray()
medicine_encoded = onehot_encoder.fit_transform(data[['glucose']].astype(str)).toarray()

# Combining encoded columns with the rest of the data
encoded_data = pd.concat([
    data[['sex']],
    pd.DataFrame(symptoms_encoded, index=data.index),
    pd.DataFrame(causes_encoded, index=data.index),
    pd.DataFrame(medicine_encoded, index=data.index)
], axis=1)

# Convert feature names to strings
encoded_data.columns = encoded_data.columns.astype(str)

# Encoding the target variable (Disease)
data['dzgroup'] = label_encoder.fit_transform(data['dzgroup'].astype(str))

# Splitting the data into features (X) and target (y)
X = input_data
y = data['dzgroup']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Building and training the Decision Tree model
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

#sv = SVC()
#sv.fit(X_train, y_train)
# Making predictions on the test set
y_pred = clf.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
st.title('Medical Disease Prediction')

# Sidebar navigation
st.sidebar.title('Navigation')
prediction_type = st.sidebar.selectbox('Select a disease to predict', ['Diabetes', 'Heart Disease', 'Parkinson\'s Disease'])

# Diabetes prediction
if prediction_type == 'Diabetes':
    st.header('Diabetes Prediction')
    
    # User input
    age = st.number_input('Age', min_value=18, max_value=100, step=1)
    glucose = st.number_input('Glucose Level', min_value=0, max_value=500, step=1)
    bmi = st.number_input('BMI', min_value=10.0, max_value=50.0, step=0.1)
    
    # Make prediction
    input_data = [[age, glucose, bmi]]
    prediction = clf.predict(input_data)[0]
    
    if prediction == 0:
        st.write('The patient is not likely to have diabetes.')
    else:
        st.write('The patient is likely to have diabetes.')



0.6886326194398682




ValueError: X has 3 features, but DecisionTreeClassifier is expecting 450 features as input.