# **Disease Prediction and Precautions**

In [None]:
# Importing all the necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Importing scikit-learn modules
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [8]:
symptoms = pd.read_csv("../data/DiseaseAndSymptoms.csv")
precautions = pd.read_csv("../data/Disease precaution.csv")
symptoms.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [9]:
precautions.head()

Unnamed: 0,Disease,Precaution_1,Precaution_2,Precaution_3,Precaution_4
0,Drug Reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,Malaria,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out
2,Allergy,apply calamine,cover area with bandage,,use ice to compress itching
3,Hypothyroidism,reduce stress,exercise,eat healthy,get proper sleep
4,Psoriasis,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths


## **Cleaning the Files first**

In [10]:
# Checking for the null values in the symptoms
symptoms.isnull().sum()

Disease          0
Symptom_1        0
Symptom_2        0
Symptom_3        0
Symptom_4      348
Symptom_5     1206
Symptom_6     1986
Symptom_7     2652
Symptom_8     2976
Symptom_9     3228
Symptom_10    3408
Symptom_11    3726
Symptom_12    4176
Symptom_13    4416
Symptom_14    4614
Symptom_15    4680
Symptom_16    4728
Symptom_17    4848
dtype: int64

In [11]:
print(symptoms["Disease"].unique())

['Fungal infection' 'Allergy' 'GERD' 'Chronic cholestasis' 'Drug Reaction'
 'Peptic ulcer diseae' 'AIDS' 'Diabetes ' 'Gastroenteritis'
 'Bronchial Asthma' 'Hypertension ' 'Migraine' 'Cervical spondylosis'
 'Paralysis (brain hemorrhage)' 'Jaundice' 'Malaria' 'Chicken pox'
 'Dengue' 'Typhoid' 'hepatitis A' 'Hepatitis B' 'Hepatitis C'
 'Hepatitis D' 'Hepatitis E' 'Alcoholic hepatitis' 'Tuberculosis'
 'Common Cold' 'Pneumonia' 'Dimorphic hemmorhoids(piles)' 'Heart attack'
 'Varicose veins' 'Hypothyroidism' 'Hyperthyroidism' 'Hypoglycemia'
 'Osteoarthristis' 'Arthritis' '(vertigo) Paroymsal  Positional Vertigo'
 'Acne' 'Urinary tract infection' 'Psoriasis' 'Impetigo']


# Task
Handle missing values in the dataset "Disease_Prediction.xlsx".

## Handle missing values

### Subtask:
Address the missing values in the symptom columns.


**Reasoning**:
Fill the missing values in the symptom columns with 'None' as a placeholder and then verify that the missing values are handled.



In [12]:
symptom_cols = [col for col in symptoms.columns if 'Symptom_' in col]
symptoms[symptom_cols] = symptoms[symptom_cols].fillna('None')
print(symptoms[symptom_cols].isnull().sum())

Symptom_1     0
Symptom_2     0
Symptom_3     0
Symptom_4     0
Symptom_5     0
Symptom_6     0
Symptom_7     0
Symptom_8     0
Symptom_9     0
Symptom_10    0
Symptom_11    0
Symptom_12    0
Symptom_13    0
Symptom_14    0
Symptom_15    0
Symptom_16    0
Symptom_17    0
dtype: int64


## Data preprocessing

### Subtask:
Prepare the data for model training, which may include encoding categorical features and splitting the data.


**Reasoning**:
Separate features and target, apply one-hot encoding to symptom columns, and split the data into training and testing sets with stratification.



In [None]:
# Separate features and target
X = symptoms.drop('Disease', axis=1)
y = symptoms['Disease']

# Apply one-hot encoding to symptom columns
X_encoded = pd.get_dummies(X, prefix=symptom_cols)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42, stratify=y)

print("Shape of original features:", X.shape)
print("Shape of encoded features:", X_encoded.shape)
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of original features: (4920, 17)
Shape of encoded features: (4920, 408)
Shape of X_train: (3936, 408)
Shape of X_test: (984, 408)
Shape of y_train: (3936,)
Shape of y_test: (984,)


## Model selection and training

### Subtask:
Choose a suitable model for disease prediction and train it on the preprocessed data.


**Reasoning**:
Import and train a RandomForestClassifier model on the training data.



In [None]:
# Instantiate the model
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

## Model evaluation

### Subtask:
Evaluate the performance of the trained model.


**Reasoning**:
Evaluate the performance of the trained model by calculating the accuracy score on the test set.



In [None]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy score
print(f"Accuracy of the model: {accuracy}")

Accuracy of the model: 1.0


## Prediction

### Subtask:
Use the trained model to predict diseases based on given symptoms.


**Reasoning**:
Create a DataFrame for the input symptoms, ensure it has the same columns as X_encoded, and then use the trained model to make a prediction.



In [16]:
# Create a list of input symptoms. Replace with the actual symptoms you want to predict on.
input_symptoms = ['itching', 'skin_rash', 'nodal_skin_eruptions', 'dischromic _patches']

# Create a dictionary to hold the symptom data
input_data = {}
for col in X_encoded.columns:
    # Extract the base symptom name from the column name
    base_symptom = '_'.join(col.split('_')[1:])

    # Check if the base symptom is in the input symptoms list and set the value accordingly
    if base_symptom in input_symptoms:
        input_data[col] = [True]
    else:
        input_data[col] = [False]

# Create a DataFrame from the input data
input_df = pd.DataFrame(input_data)

# Ensure the columns are in the same order as X_encoded
input_df = input_df[X_encoded.columns]

# Predict the disease
predicted_disease = model.predict(input_df)

# Print the predicted disease
print(f"Based on the symptoms, the predicted disease is: {predicted_disease[0]}")

Based on the symptoms, the predicted disease is: Common Cold


## Integrate precautions

### Subtask:
Combine the disease predictions with the precaution data to provide relevant precautions.


**Reasoning**:
Access the predicted disease, filter the precautions dataframe to find the row with the matching disease, extract the precaution columns for that disease, store them in a list, and print the predicted disease and precautions.



In [17]:
# Access the predicted disease from the previous step
# predicted_disease is already available from the previous step

# Filter the precautions DataFrame
disease_precautions = precautions[precautions['Disease'] == predicted_disease[0]]

# Extract the precaution columns
precaution_cols = [col for col in precautions.columns if 'Precaution_' in col]
precautions_list = disease_precautions[precaution_cols].values.tolist()[0]

# Remove NaN values from the precautions list
precautions_list = [p for p in precautions_list if pd.notna(p)]

# Print the predicted disease and precautions
print(f"\nPrecautions for {predicted_disease[0]}:")
for i, precaution in enumerate(precautions_list):
    print(f"Precaution {i+1}: {precaution}")


Precautions for Common Cold:
Precaution 1: drink vitamin c rich drinks
Precaution 2: take vapour
Precaution 3: avoid cold food
Precaution 4: keep fever in check
