In [1]:
!pip install pandas scikit-learn openpyxl



In [2]:
import pandas as pd
df = pd.read_excel("cleaned_combined_disease_data (1) (1).xlsx")
df.head()

Unnamed: 0,disease,symptoms,precautions,specialist
0,(vertigo) Paroymsal Positional Vertigo,"vomiting,headache,nausea,spinningmovements,los...","lie down,avoid sudden change in body,avoid abr...",General Physician
1,AIDS,"musclewasting,patchesinthroat,highfever,extram...","avoid open cuts,wear ppe if possible,consult d...",General Physician
2,Acne,"skinrash,pusfilledpimples,blackheads,scurring","bath twice,avoid fatty spicy food,drink plenty...",Dermatologist
3,Alcoholic hepatitis,"vomiting,yellowishskin,abdominalpain,swellingo...","stop alcohol consumption,consult doctor,medica...",Hepatologist
4,Allergy,"continuoussneezing,shivering,chills,wateringfr...","apply calamine,cover area with bandage,use ice...",Allergist


In [3]:
df['symptoms'] = df['symptoms'].apply(lambda x: x.lower().replace(" ", "").split(','))

KeyError: 'symptoms'

In [4]:
print(df.columns)

Index(['disease',
       '                                                                                             symptoms',
       'precautions', 'specialist'],
      dtype='object')


In [5]:
df.columns = df.columns.str.strip().str.lower()

In [6]:
df['symptoms'] = df['symptoms'].apply(lambda x: x.lower().replace(" ", "").split(','))

In [7]:
df['precautions'] = df['precautions'].apply(lambda x: x.lower().split(','))

In [8]:
df.head()

Unnamed: 0,disease,symptoms,precautions,specialist
0,(vertigo) Paroymsal Positional Vertigo,"[vomiting, headache, nausea, spinningmovements...","[lie down, avoid sudden change in body, avoid ...",General Physician
1,AIDS,"[musclewasting, patchesinthroat, highfever, ex...","[avoid open cuts, wear ppe if possible, consul...",General Physician
2,Acne,"[skinrash, pusfilledpimples, blackheads, scurr...","[bath twice, avoid fatty spicy food, drink ple...",Dermatologist
3,Alcoholic hepatitis,"[vomiting, yellowishskin, abdominalpain, swell...","[stop alcohol consumption, consult doctor, med...",Hepatologist
4,Allergy,"[continuoussneezing, shivering, chills, wateri...","[apply calamine, cover area with bandage, use ...",Allergist


In [9]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
X = mlb.fit_transform(df['symptoms'])  # input: encoded symptoms
y = df['disease']                      # output: disease name


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)

print("✅ Model Trained")
print("🎯 Accuracy on Test Data:", model.score(X_test, y_test))


✅ Model Trained
🎯 Accuracy on Test Data: 0.0


In [11]:
def predict_disease(input_symptoms):
    input_symptoms = [sym.lower().replace(" ", "") for sym in input_symptoms]

    if len(input_symptoms) < 3:
        return "⚠ Please enter at least 3 symptoms."

    encoded = mlb.transform([input_symptoms])
    predicted = model.predict(encoded)[0]

    row = df[df['disease'] == predicted].iloc[0]

    return {
        "Predicted Disease": predicted,
        "Precautions": row['precautions'],
        "Specialist": row['specialist']
    }

# 🔍 Example
predict_disease(['fever', 'itchy rash', 'fatigue'])




{'Predicted Disease': 'Sneeze',
 'Precautions': ['avoid allergens', 'consult doctor'],
 'Specialist': 'General Physician'}

In [12]:
predict_disease(['fever'])

'⚠ Please enter at least 3 symptoms.'

In [13]:
predict_disease(['fever','headache','breathlessness'])

{'Predicted Disease': 'Viral Fever',
 'Precautions': ['rest', 'hydration', 'paracetamol'],
 'Specialist': 'General Physician'}

In [14]:
print("Total diseases:", df['disease'].nunique())
print("Total rows:", len(df))

Total diseases: 47
Total rows: 47


In [15]:
# Duplicate the data 3 times
df_aug = pd.concat([df]*3, ignore_index=True)

# Shuffle the dataset
df_aug = df_aug.sample(frac=1, random_state=42).reset_index(drop=True)


In [16]:
# Re-encode symptoms
X = mlb.fit_transform(df_aug['symptoms'])
y = df_aug['disease']

# Use stratify now that we have repeats
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Accuracy
print("🎯 New Accuracy:", model.score(X_test, y_test))


ValueError: The test_size = 29 should be greater or equal to the number of classes = 47

In [17]:
df_aug = pd.concat([df]*4, ignore_index=True)
df_aug = df_aug.sample(frac=1, random_state=42).reset_index(drop=True)

# Encode + train-test split
X = mlb.fit_transform(df_aug['symptoms'])
y = df_aug['disease']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)
print("🎯 Accuracy:", model.score(X_test, y_test))


ValueError: The test_size = 38 should be greater or equal to the number of classes = 47

In [18]:
df_aug['disease'].value_counts()

disease
Typhoid                                    4
Hepatitis B                                4
Gastroenteritis                            4
Drug Reaction                              4
Heart Attack                               4
Dimorphic hemmorhoids(piles)               4
Viral Fever                                4
Impetigo                                   4
Fungal infection                           4
Arthritis                                  4
Diarrhea                                   4
Chicken pox                                4
Headache                                   4
GERD                                       4
Jaundice                                   4
Hypertension                               4
Hepatitis C                                4
Measles                                    4
Allergy                                    4
Peptic ulcer diseae                        4
Hepatitis E                                4
Acne                                       4
Ch

In [19]:
# Encode input features
X = mlb.fit_transform(df_aug['symptoms'])
y = df_aug['disease']

# Split with stratification
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Train RandomForest model
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Check Accuracy
print("🎯 Accuracy on Test Set:", model.score(X_test, y_test))


ValueError: The test_size = 38 should be greater or equal to the number of classes = 47

In [20]:
# Ensure each disease has 4 rows using group duplication
df_expanded = pd.concat([df[df['disease'] == disease]] * 4 for disease in df['disease'].unique())
df_expanded = pd.concat(df_expanded, ignore_index=True)

# Shuffle
df_expanded = df_expanded.sample(frac=1, random_state=42).reset_index(drop=True)


TypeError: cannot concatenate object of type '<class 'list'>'; only Series and DataFrame objs are valid

In [21]:
df_expanded = pd.concat(
    [df[df['disease'] == disease].copy().reset_index(drop=True)] * 4
    for disease in df['disease'].unique()
).reset_index(drop=True)


TypeError: cannot concatenate object of type '<class 'list'>'; only Series and DataFrame objs are valid

In [22]:
dfs = []

for disease in df['disease'].unique():
    group = df[df['disease'] == disease]
    dfs.append(pd.concat([group]*4, ignore_index=True))  # Repeat each disease 4 times

df_expanded = pd.concat(dfs, ignore_index=True)
df_expanded = df_expanded.sample(frac=1, random_state=42).reset_index(drop=True)


In [23]:
X = mlb.fit_transform(df_expanded['symptoms'])
y = df_expanded['disease']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)

print("🎯 Accuracy:", model.score(X_test, y_test))


ValueError: The test_size = 38 should be greater or equal to the number of classes = 47

In [24]:
dfs = []

for disease in df['disease'].unique():
    group = df[df['disease'] == disease]
    dfs.append(pd.concat([group]*5, ignore_index=True))  # Repeat 5 times now

df_expanded = pd.concat(dfs, ignore_index=True)
df_expanded = df_expanded.sample(frac=1, random_state=42).reset_index(drop=True)

# Double check
print("✅ Total rows:", len(df_expanded))
print("✅ Unique diseases:", df_expanded['disease'].nunique())


✅ Total rows: 235
✅ Unique diseases: 47


In [25]:
X = mlb.fit_transform(df_expanded['symptoms'])
y = df_expanded['disease']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

model = RandomForestClassifier()
model.fit(X_train, y_train)

print("🎯 Accuracy:", model.score(X_test, y_test))


🎯 Accuracy: 1.0


In [26]:
def predict_disease(symptom_list):
    # Preprocess input symptoms
    input_symptoms = [s.strip().lower().replace(" ", "") for s in symptom_list]

    # Check minimum 3 symptoms
    if len(input_symptoms) < 3:
        return "⚠️ Please provide at least 3 symptoms."

    # Transform input using same MultiLabelBinarizer
    encoded_input = mlb.transform([input_symptoms])
    
    # Predict
    prediction = model.predict(encoded_input)[0]

    # Fetch precautions & specialist
    info = df[df['disease'] == prediction].iloc[0]
    return {
        "🦠 Predicted Disease": prediction,
        "💊 Precautions": info['precautions'].split(';'),
        "👨‍⚕️ Doctor Type": info['specialist']
    }


In [27]:
predict_disease(['vomiting', 'headache', 'nausea'])

AttributeError: 'list' object has no attribute 'split'

In [28]:
def predict_disease(symptom_list):
    # Preprocess input symptoms
    input_symptoms = [s.strip().lower().replace(" ", "") for s in symptom_list]

    # Check minimum 3 symptoms
    if len(input_symptoms) < 3:
        return "⚠️ Please provide at least 3 symptoms."

    # Transform input
    encoded_input = mlb.transform([input_symptoms])

    # Predict
    prediction = model.predict(encoded_input)[0]

    # Get one matching row from expanded dataframe
    info = df_expanded[df_expanded['disease'] == prediction].iloc[0]

    # Split precautions safely (check if it's string)
    precautions = info['precautions']
    if isinstance(precautions, str):
        precautions = precautions.split(';')
    
    return {
        "🦠 Predicted Disease": prediction,
        "💊 Precautions": precautions,
        "👨‍⚕️ Doctor Type": info['specialist']
    }


In [29]:
predict_disease(['vomiting', 'headache', 'nausea'])

{'🦠 Predicted Disease': 'Headache',
 '💊 Precautions': ['rest', 'analgesics'],
 '👨\u200d⚕️ Doctor Type': 'Neurologist'}

In [30]:
predict_disease(['fever', 'headache', 'nausea'])

{'🦠 Predicted Disease': 'Viral Fever',
 '💊 Precautions': ['rest', 'hydration', 'paracetamol'],
 '👨\u200d⚕️ Doctor Type': 'General Physician'}

In [31]:
predict_disease(['fever', 'bodypain', 'breadthlessness'])



{'🦠 Predicted Disease': 'Viral Fever',
 '💊 Precautions': ['rest', 'hydration', 'paracetamol'],
 '👨\u200d⚕️ Doctor Type': 'General Physician'}

In [32]:
predict_disease(['fever', 'bodypain', 'breathlessness'])

{'🦠 Predicted Disease': 'Viral Fever',
 '💊 Precautions': ['rest', 'hydration', 'paracetamol'],
 '👨\u200d⚕️ Doctor Type': 'General Physician'}

In [33]:
# Get all known symptoms from training
known_symptoms = mlb.classes_.tolist()

def clean_symptoms(symptoms):
    return [
        s.strip().lower().replace(" ", "")
        for s in symptoms if s.strip().lower().replace(" ", "") in known_symptoms
    ]

In [34]:
def predict_disease(symptom_list):
    cleaned = clean_symptoms(symptom_list)

    if len(cleaned) < 3:
        return "⚠️ Please enter at least 3 valid known symptoms."

    encoded_input = mlb.transform([cleaned])
    prediction = model.predict(encoded_input)[0]

    info = df_expanded[df_expanded['disease'] == prediction].iloc[0]
    precautions = info['precautions']
    if isinstance(precautions, str):
        precautions = precautions.split(';')
    
    return {
        "🦠 Predicted Disease": prediction,
        "💊 Precautions": precautions,
        "👨‍⚕️ Doctor Type": info['specialist']
    }


In [35]:
predict_disease(['fever', 'bodypain', 'breadthlessness'])

'⚠️ Please enter at least 3 valid known symptoms.'

In [36]:
predict_disease(['fever', 'bodypain', 'breathlessness'])

{'🦠 Predicted Disease': 'Viral Fever',
 '💊 Precautions': ['rest', 'hydration', 'paracetamol'],
 '👨\u200d⚕️ Doctor Type': 'General Physician'}