In [4]:


import kagglehub
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import os


# Download the Kaggle dataset; the returned value is the local directory holding its files.
dataset_path = kagglehub.dataset_download("itachi9604/disease-symptom-description-dataset")
print("Path to dataset files:", dataset_path)

# List the downloaded files so the expected CSVs can be checked by eye.
print("Files in the dataset directory:")
for entry_name in os.listdir(dataset_path):
    print(entry_name)

# Load the main disease/symptom table and preview its first rows.
csv_file = os.path.join(dataset_path, "dataset.csv")
df = pd.read_csv(csv_file)
print("Dataset sample:", df.head(), sep="\n")

# Column names drive the symptom-column selection further down.
print("Column names:", df.columns)


# Encode disease names as integer class ids; `le` is kept so ids can be decoded again.
le = LabelEncoder()
le.fit(df['Disease'])
df['disease_label'] = le.transform(df['Disease'])

# Select every column whose name contains "Symptom_" (Symptom_1, Symptom_2, ...).
symptom_cols = [name for name in df.columns if 'Symptom_' in name]

# Join each row's symptoms into one comma-separated string; missing cells become
# empty fields so the join never sees NaN.
symptom_table = df[symptom_cols].fillna('')
df['Symptoms_combined'] = symptom_table.agg(','.join, axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['Symptoms_combined'],
    df['disease_label'],
    test_size=0.2,
    random_state=42,
)


# Turn each symptom string into bag-of-words counts.
#
# CountVectorizer's default token pattern (r"(?u)\b\w\w+\b") treats "_" as a word
# character, so dataset symptoms such as "skin_rash" or "joint_pain" would each become a
# single token. Free-text input like "skin rash" / "joint pain" could then never match the
# learned vocabulary. The pattern below also splits on underscores (still requiring at
# least two characters per token), so training data and user input share the same
# word-level vocabulary.
vectorizer = CountVectorizer(token_pattern=r"(?u)[^\W_]{2,}")
X_train_vec = vectorizer.fit_transform(X_train)
# Reuse the vocabulary learned on the training split; never refit on test data.
X_test_vec = vectorizer.transform(X_test)

# Logistic regression over the token counts; max_iter is raised above scikit-learn's
# default of 100.
clf = LogisticRegression(max_iter=500)
clf.fit(X_train_vec, y_train)

# Report accuracy on the held-out split.
y_pred = clf.predict(X_test_vec)
print("Test Accuracy:", accuracy_score(y_test, y_pred))


_DESCRIPTION_FALLBACK = "No description available."
_description_lookup = None  # filled on first use: {normalized disease name: description}


def _normalize_disease_name(name):
    """Return `name` case-folded with leading, trailing and repeated whitespace collapsed."""
    return " ".join(str(name).split()).casefold()


def _get_description_lookup():
    """Read symptom_Description.csv once and index its descriptions by normalized name."""
    global _description_lookup
    if _description_lookup is None:
        description_df = pd.read_csv(os.path.join(dataset_path, 'symptom_Description.csv'))
        lookup = {}
        for name, text in zip(description_df['Disease'], description_df['Description']):
            # setdefault keeps the first row when a name appears more than once.
            lookup.setdefault(_normalize_disease_name(name), text)
        _description_lookup = lookup
    return _description_lookup


def predict_disease(symptom_text):
    """Predict a disease from free-text symptoms and look up its description.

    Parameters
    ----------
    symptom_text : str
        Symptoms as one string, e.g. "fever, headache, cough".

    Returns
    -------
    tuple
        ``(disease_name, description)``. ``disease_name`` is the label decoded by the
        fitted ``le``. ``description`` comes from symptom_Description.csv, or is
        "No description available." when that file has no row for the disease.

    Notes
    -----
    Relies on the module-level ``vectorizer``, ``clf``, ``le`` and ``dataset_path``.
    The description file is read once and cached instead of on every call. Names are
    compared ignoring case and stray whitespace, so a small formatting difference
    between the two CSV files no longer raises ``IndexError``.
    """
    vec = vectorizer.transform([symptom_text])
    pred_label = clf.predict(vec)[0]
    disease_name = le.inverse_transform([pred_label])[0]

    description = _get_description_lookup().get(
        _normalize_disease_name(disease_name), _DESCRIPTION_FALLBACK
    )
    return disease_name, description


# Single worked example, printed as a labelled report.
symptom_input = "fever, headache, cough"
disease, desc = predict_disease(symptom_input)
print(
    "\n--- Prediction ---",
    f"Input Symptoms: {symptom_input}",
    f"Predicted Disease: {disease}",
    f"Description: {desc}",
    sep="\n",
)

# A few more inputs, run through the same predictor one after another.
examples = [
    "fatigue, nausea, yellow skin",
    "joint pain, swelling, rash",
]

for s in examples:
    disease, desc = predict_disease(s)
    print(
        f"\nSymptoms: {s}",
        f"Predicted Disease: {disease}",
        f"Description: {desc}",
        sep="\n",
    )

Using Colab cache for faster access to the 'disease-symptom-description-dataset' dataset.
Path to dataset files: /kaggle/input/disease-symptom-description-dataset
Files in the dataset directory:
symptom_Description.csv
Symptom-severity.csv
symptom_precaution.csv
dataset.csv
Dataset sample:
            Disease   Symptom_1              Symptom_2              Symptom_3  \
0  Fungal infection     itching              skin_rash   nodal_skin_eruptions   
1  Fungal infection   skin_rash   nodal_skin_eruptions    dischromic _patches   
2  Fungal infection     itching   nodal_skin_eruptions    dischromic _patches   
3  Fungal infection     itching              skin_rash    dischromic _patches   
4  Fungal infection     itching              skin_rash   nodal_skin_eruptions   

              Symptom_4 Symptom_5 Symptom_6 Symptom_7 Symptom_8 Symptom_9  \
0   dischromic _patches       NaN       NaN       NaN       NaN       NaN   
1                   NaN       NaN       NaN       NaN       NaN     