# We have to clean the data before we can use it in our model
So I will import the symptom_precaution data and clean it.

In [1]:
# Importing the libraries of python for data manipulation:
import pandas as pd
# NOTE(review): numpy, seaborn and matplotlib are not used by any cell visible
# in this notebook excerpt — confirm whether they are still needed.
import numpy as np
import warnings
# NOTE(review): with no category/module filter this silences every warning for
# the rest of the session, including pandas deprecation notices; consider
# narrowing it to the specific warning that was noisy.
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt

## Now reading the file symptom_precaution.csv

In [2]:
# Load the raw disease -> precautions table from disk, then preview the
# first five rows as the cell output.
data = pd.read_csv(filepath_or_buffer='../data/symptom_precaution.csv')
data.head(n=5)

Unnamed: 0,Disease,Precaution_1,Precaution_2,Precaution_3,Precaution_4
0,Drug Reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,Malaria,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out
2,Allergy,apply calamine,cover area with bandage,,use ice to compress itching
3,Hypothyroidism,reduce stress,exercise,eat healthy,get proper sleep
4,Psoriasis,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths


## Checking a random sample of rows in the dataset

In [3]:
# Draw five random rows for a quick sanity check. The seed is fixed so the
# same rows are shown on every Restart & Run All, keeping the notebook's
# displayed output reproducible.
data.sample(5, random_state=42)

Unnamed: 0,Disease,Precaution_1,Precaution_2,Precaution_3,Precaution_4
34,Dengue,drink papaya leaf juice,avoid fatty spicy food,keep mosquitos away,keep hydrated
16,Dimorphic hemmorhoids(piles),avoid fatty spicy food,consume witch hazel,warm bath with epsom salt,consume alovera juice
39,Gastroenteritis,stop eating solid food for while,try taking small sips of water,rest,ease back into eating
40,Tuberculosis,cover mouth,consult doctor,medication,rest
22,Varicose veins,lie down flat and raise the leg high,use oinments,use vein compression,dont stand still for long


# Conclusion:
All the data is categorical

In [4]:
# Print a summary of the frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Disease       41 non-null     object
 1   Precaution_1  41 non-null     object
 2   Precaution_2  41 non-null     object
 3   Precaution_3  40 non-null     object
 4   Precaution_4  40 non-null     object
dtypes: object(5)
memory usage: 1.7+ KB


## Printing all the precautions for each disease

In [5]:
# Bare expression: the DataFrame itself is rendered as the cell output.
data

Unnamed: 0,Disease,Precaution_1,Precaution_2,Precaution_3,Precaution_4
0,Drug Reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,Malaria,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out
2,Allergy,apply calamine,cover area with bandage,,use ice to compress itching
3,Hypothyroidism,reduce stress,exercise,eat healthy,get proper sleep
4,Psoriasis,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths
5,GERD,avoid fatty spicy food,avoid lying down after eating,maintain healthy weight,exercise
6,Chronic cholestasis,cold baths,anti itch medicine,consult doctor,eat healthy
7,hepatitis A,Consult nearest hospital,wash hands through,avoid fatty spicy food,medication
8,Osteoarthristis,acetaminophen,consult nearest hospital,follow up,salt baths
9,(vertigo) Paroymsal Positional Vertigo,lie down,avoid sudden change in body,avoid abrupt head movment,relax


## Checking if there are any null values:

In [6]:
# Count the missing values in each column.
data.isnull().sum()

Disease         0
Precaution_1    0
Precaution_2    0
Precaution_3    1
Precaution_4    1
dtype: int64

# Conclusion:
There is one null value in each of the Precaution_3 and Precaution_4 columns.

### Now we have to impute the null values in the Precaution_3 and Precaution_4 columns using the mode, as the data is categorical

In [7]:
# Show the first rows again before cleaning (presumably as a "before" view
# to compare against the cleaned frame further down).
data.head()

Unnamed: 0,Disease,Precaution_1,Precaution_2,Precaution_3,Precaution_4
0,Drug Reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,Malaria,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out
2,Allergy,apply calamine,cover area with bandage,,use ice to compress itching
3,Hypothyroidism,reduce stress,exercise,eat healthy,get proper sleep
4,Psoriasis,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths


In [None]:
# Normalise the text in every column, then impute the missing values in
# Precaution_3 and Precaution_4 with each column's mode.

# Clean the 'Disease' column (trim surrounding whitespace, lowercase).
data["Disease"] = data["Disease"].str.strip().str.lower()

# Clean all precaution columns (remove extra spaces, lowercase).
# The .str accessor leaves missing entries as missing, so there is no need to
# cast to str first and then turn the resulting "nan" strings back into nulls
# (a round trip that would also erase a genuine value spelled "nan").
for col in data.columns[1:]:
    data[col] = data[col].str.strip().str.lower()

# Impute missing values in Precaution_3 and Precaution_4 with their mode.
for col in ["Precaution_3", "Precaution_4"]:
    modes = data[col].mode()
    if modes.empty:
        # Every value in the column is missing, so there is no mode to fill
        # with; leave the column untouched instead of raising on modes[0].
        continue
    # mode() returns its values sorted, so ties resolve to the first one;
    # .iloc[0] reads it by position rather than relying on the index label.
    data[col] = data[col].fillna(modes.iloc[0])

# Preview result
data.head()


Unnamed: 0,Disease,Precaution_1,Precaution_2,Precaution_3,Precaution_4
0,drug reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,malaria,consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out
2,allergy,apply calamine,cover area with bandage,consult doctor,use ice to compress itching
3,hypothyroidism,reduce stress,exercise,eat healthy,get proper sleep
4,psoriasis,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths


In [11]:
# Re-count missing values per column to check the result of the imputation.
data.isnull().sum()

Disease         0
Precaution_1    0
Precaution_2    0
Precaution_3    0
Precaution_4    0
dtype: int64

## Now all the null values have been imputed

In [14]:
# Persist the cleaned frame as a new CSV; index=False keeps the row index
# out of the written file.
data.to_csv(path_or_buf="../data/Cleaned_Symptom_precautions.csv", index=False)