In [1]:
#importing libraries and setting up NLTK
import os
import re

import joblib
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder

# Fetch the NLTK stop-word corpus that stopwords.words() reads later on.
nltk.download("stopwords")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rahom\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Read the raw healthcare records into `df` and preview the first five rows
df = pd.read_csv(filepath_or_buffer="../data/Healthcare.csv")
df.head(n=5)


Unnamed: 0,Patient_ID,Age,Gender,Symptoms,Symptom_Count,Disease
0,1,29,Male,"fever, back pain, shortness of breath",3,Allergy
1,2,76,Female,"insomnia, back pain, weight loss",3,Thyroid Disorder
2,3,78,Male,"sore throat, vomiting, diarrhea",3,Influenza
3,4,58,Other,"blurred vision, depression, weight loss, muscl...",4,Stroke
4,5,55,Female,"swelling, appetite loss, nausea",3,Heart Disease


In [3]:
# Turn each disease name into an integer class id stored in the "label" column
label_encoder = LabelEncoder()
label_encoder.fit(df["Disease"])
df["label"] = label_encoder.transform(df["Disease"])

# Preview the name -> id mapping for the first rows
df.loc[:, ["Disease", "label"]].head()


Unnamed: 0,Disease,label
0,Allergy,0
1,Thyroid Disorder,27
2,Influenza,19
3,Stroke,26
4,Heart Disease,16


In [4]:
# Core NLP cleaning pipeline: shared resources read by clean_text
ps = PorterStemmer()
stop_words = {*stopwords.words("english")}  # set -> constant-time membership checks

def clean_text(text):
    """Normalize a raw symptom string into space-separated stemmed tokens.

    Steps: lowercase, turn every non-letter character into a space, split on
    whitespace, drop tokens found in the module-level ``stop_words`` set, and
    stem each remaining token with the module-level stemmer ``ps``.

    Args:
        text: Raw symptom text. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as empty text.

    Returns:
        str: The cleaned tokens joined by single spaces, or ``""`` when the
        input is missing or contains no usable tokens.
    """
    # A missing value must not be stringified into the bogus token "none"/"nan".
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return ""
    text = str(text).lower()
    # Replace (rather than delete) digits and punctuation so that words joined
    # only by punctuation stay separate: "fever,cough" -> "fever cough".
    text = re.sub(r"[^a-z\s]", " ", text)
    # split() collapses the extra whitespace introduced by the substitution.
    return " ".join(ps.stem(w) for w in text.split() if w not in stop_words)

# Clean every row's symptom text, then preview raw vs. cleaned side by side
df["clean_symptoms"] = df["Symptoms"].apply(func=clean_text)
df.loc[:, ["Symptoms", "clean_symptoms"]].head()


Unnamed: 0,Symptoms,clean_symptoms
0,"fever, back pain, shortness of breath",fever back pain short breath
1,"insomnia, back pain, weight loss",insomnia back pain weight loss
2,"sore throat, vomiting, diarrhea",sore throat vomit diarrhea
3,"blurred vision, depression, weight loss, muscl...",blur vision depress weight loss muscl pain
4,"swelling, appetite loss, nausea",swell appetit loss nausea


In [5]:
# Persist the fitted label encoder for later reuse.
# Create the output folder first so the dump does not fail on a fresh
# checkout where ../data/processed does not exist yet.
os.makedirs("../data/processed", exist_ok=True)
joblib.dump(label_encoder, "../data/processed/label_encoder.pkl")


['../data/processed/label_encoder.pkl']

In [6]:
# Keep the cleaned text, the demographic columns and the encoded label in an
# independent copy, so later changes to `df` do not affect what was saved.
df_full = df[['clean_symptoms', 'Age', 'Gender', 'Symptom_Count', 'label']].copy()

# to_csv does not create missing parent directories, so make sure the
# processed folder exists before writing (no-op when it is already there).
os.makedirs("../data/processed", exist_ok=True)
df_full.to_csv("../data/processed/clean_dataset_full.csv", index=False)

df_full.head()


Unnamed: 0,clean_symptoms,Age,Gender,Symptom_Count,label
0,fever back pain short breath,29,Male,3,0
1,insomnia back pain weight loss,76,Female,3,27
2,sore throat vomit diarrhea,78,Male,3,19
3,blur vision depress weight loss muscl pain,58,Other,4,26
4,swell appetit loss nausea,55,Female,3,16
