In [None]:
# Import libraries and set up NLTK
import os
import re

import joblib
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder

# Fetch the stopword corpus only when it is not already installed locally,
# so re-running the notebook does not require network access every time.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")


In [None]:
# Read the raw healthcare CSV into a DataFrame and preview the first rows
df = pd.read_csv(filepath_or_buffer="../data/Healthcare.csv")
df.head(n=5)


In [None]:
# Fit an encoder on the disease names, then store the encoded ids in "label"
label_encoder = LabelEncoder().fit(df["Disease"])
df["label"] = label_encoder.transform(df["Disease"])

df.loc[:, ["Disease", "label"]].head()


In [None]:
# Core NLP cleaning pipeline: shared Porter stemmer and English stopword set
ps = PorterStemmer()
stop_words = {*stopwords.words("english")}

def clean_text(text, stopword_set=None, stemmer=None):
    """Normalize one free-text entry into space-separated, stemmed tokens.

    Steps: lowercase, replace every non-letter character with a space,
    drop stopwords, then stem the remaining words.

    Parameters
    ----------
    text : any
        Raw cell value; it is converted with ``str()``. Missing values
        (None / NaN / pd.NA) yield "" instead of the literal tokens
        "none" / "nan".
    stopword_set : collection of str, optional
        Words to discard. Defaults to the module-level ``stop_words``.
    stemmer : object with a ``stem(word)`` method, optional
        Defaults to the module-level ``ps``.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces ("" when nothing survives).
    """
    # str(NaN) is "nan", which would otherwise survive as a bogus token.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ""

    active_stopwords = stop_words if stopword_set is None else stopword_set
    active_stemmer = ps if stemmer is None else stemmer

    lowered = str(text).lower()
    # Substitute a space (not "") so that words separated only by punctuation
    # or digits, e.g. "fever,cough" or "chest_pain", remain separate tokens.
    letters_only = re.sub(r"[^a-zA-Z\s]", " ", lowered)
    return " ".join(
        active_stemmer.stem(word)
        for word in letters_only.split()
        if word not in active_stopwords
    )

# Clean every symptom entry, then show raw and cleaned text side by side
df["clean_symptoms"] = df["Symptoms"].apply(func=clean_text)
df.loc[:, ["Symptoms", "clean_symptoms"]].head(n=5)


In [None]:
# Save the label encoder for later use.
# Create the output folder first: joblib.dump does not create missing directories.
os.makedirs("../data/processed", exist_ok=True)
joblib.dump(label_encoder, "../data/processed/label_encoder.pkl")


In [None]:
# Keep all relevant columns and save the cleaned dataset to the processed folder.
# Create the output folder first: to_csv raises if the directory is missing.
os.makedirs("../data/processed", exist_ok=True)
df_full = df[['clean_symptoms', 'Age', 'Gender', 'Symptom_Count', 'label']].copy()
df_full.to_csv("../data/processed/clean_dataset_full.csv", index=False)

df_full.head()
