In [38]:
import pandas as pd

# Read the disease/symptom table; the path is resolved relative to the
# current working directory.
file_path = 'disease_dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top five rows of the loaded frame
data.head(n=5)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [39]:
# Combine the per-slot symptom columns into one cleaned list per row.
#
# The whole transformation is guarded so that re-running this cell is a no-op
# once 'Symptoms' exists; without the guard a second run would treat the
# already-combined column as raw input and fail part-way through.
if 'Symptoms' not in data.columns:
    # Drop any entirely empty columns
    data = data.dropna(axis=1, how='all')

    # Every column except the label is a symptom slot. Selecting by name
    # avoids depending on 'Disease' being the first column.
    symptom_columns = [col for col in data.columns if col != 'Disease']

    # Single pass per row: skip missing slots, trim surrounding whitespace,
    # and drop entries that are blank after trimming.
    symptoms_per_row = [
        [value.strip() for value in row if pd.notna(value) and value.strip()]
        for row in data[symptom_columns].values.tolist()
    ]

    # Take an explicit copy of the label column before adding 'Symptoms', so
    # the assignment does not write into a slice of the previous frame
    # (this is what triggers pandas' SettingWithCopyWarning).
    data = data[['Disease']].copy()
    data['Symptoms'] = symptoms_per_row

# Display the cleaned data
data.head()

Unnamed: 0,Disease,Symptoms
0,Fungal infection,"[itching, skin_rash, nodal_skin_eruptions, dis..."
1,Fungal infection,"[skin_rash, nodal_skin_eruptions, dischromic _..."
2,Fungal infection,"[itching, nodal_skin_eruptions, dischromic _pa..."
3,Fungal infection,"[itching, skin_rash, dischromic _patches]"
4,Fungal infection,"[itching, skin_rash, nodal_skin_eruptions]"


In [40]:
from sklearn.preprocessing import MultiLabelBinarizer

# Use MultiLabelBinarizer to turn each row's symptom list into binary
# indicator features, one column per distinct symptom.
mlb = MultiLabelBinarizer()
symptom_features = mlb.fit_transform(data['Symptoms'])

# Build the feature frame on the same index as `data`. The label column is
# attached by index alignment, so reusing the index keeps every row paired
# with its own label even when `data` does not carry a default 0..n-1 index.
symptom_df = pd.DataFrame(
    symptom_features,
    columns=mlb.classes_,
    index=data.index,
)
symptom_df['Disease'] = data['Disease']

# Display the processed data
symptom_df.head()

Unnamed: 0,abdominal_pain,abnormal_menstruation,acidity,acute_liver_failure,altered_sensorium,anxiety,back_pain,belly_pain,blackheads,bladder_discomfort,...,watering_from_eyes,weakness_in_limbs,weakness_of_one_body_side,weight_gain,weight_loss,yellow_crust_ooze,yellow_urine,yellowing_of_eyes,yellowish_skin,Disease
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection


In [41]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Split data into features (X) and labels (y)
X = symptom_df.drop(columns='Disease')
y = symptom_df['Disease']

# Hold out 20% of the rows for evaluation. stratify=y keeps each disease's
# share of rows the same in the training and test partitions, so no class is
# under-represented in either side of the split.
# NOTE(review): stratification needs at least two rows per disease - confirm
# this holds for the dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Initialize and train the Random Forest classifier (seeded for repeatability)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the held-out test set
y_pred = model.predict(X_test)

# Calculate and display accuracy.
# NOTE(review): a perfect score here would be worth double-checking, e.g. by
# confirming that identical rows do not appear in both the train and test
# partitions.
accuracy = accuracy_score(y_test, y_pred)
accuracy

1.0

In [42]:
import joblib

# Serialize the fitted classifier to disk with joblib, then report the
# destination file name.
model_filename = 'disease_prediction_model.pkl'
joblib.dump(value=model, filename=model_filename)
print(f"Model saved to {model_filename}")

Model saved to disease_prediction_model.pkl


In [43]:
# Write the feature column names of X, in column order, to a JSON file.
import json

# Column labels of the feature frame as a plain Python list
symptoms_list = X.columns.tolist()

# Serialize to a JSON string first, then write it out in one call
with open("symptoms_list.json", "w") as f:
    f.write(json.dumps(symptoms_list))
