In [0]:
from pyspark.sql.functions import col, when

# Load the raw healthcare table by its fully qualified catalog.schema.table
# name, then preview the first rows and the column types.
df_raw = spark.table(
    "final_exam_cluster.default.raw_healthcare"
)
df_raw.show(n=5)
df_raw.printSchema()


+----------+---+------+--------------------+-------------+----------------+
|Patient_ID|Age|Gender|            Symptoms|Symptom_Count|         Disease|
+----------+---+------+--------------------+-------------+----------------+
|         1| 29|  Male|fever, back pain,...|            3|         Allergy|
|         2| 76|Female|insomnia, back pa...|            3|Thyroid Disorder|
|         3| 78|  Male|sore throat, vomi...|            3|       Influenza|
|         4| 58| Other|blurred vision, d...|            4|          Stroke|
|         5| 55|Female|swelling, appetit...|            3|   Heart Disease|
+----------+---+------+--------------------+-------------+----------------+
only showing top 5 rows
root
 |-- Patient_ID: long (nullable = true)
 |-- Age: long (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Symptoms: string (nullable = true)
 |-- Symptom_Count: long (nullable = true)
 |-- Disease: string (nullable = true)



In [0]:
from pyspark.sql.functions import col, when

# Build df_clean from df_raw: cast the numeric columns to int, then derive the
# categorical columns `age_group` and `severity_level`.

# 1) Ensure numeric types
# NOTE: the raw schema names this column `Symptom_Count`. Referring to it as
# `Symptom_count` relies on case-insensitive column resolution, and the
# replaced column comes out with this new casing (visible in the displayed
# result). The name is kept as-is so the saved table schema does not change.
df_clean = (
    df_raw
    .withColumn("Age", col("Age").cast("int"))
    .withColumn("Symptom_count", col("Symptom_count").cast("int"))
)

# 2) Create age_group from Age
# Every bucket is an explicit condition and there is no `.otherwise(...)`:
# a null Age matches no branch and yields a null age_group instead of being
# silently labelled "Senior".
df_clean = df_clean.withColumn(
    "age_group",
    when(col("Age") < 18, "Child")
    .when(col("Age") < 40, "Adult")
    .when(col("Age") < 65, "Middle-aged")
    .when(col("Age") >= 65, "Senior")
)

# 3) Create severity_level from Symptom_count
# Same null handling as above: a null Symptom_count yields a null
# severity_level instead of being silently labelled "Severe".
df_clean = df_clean.withColumn(
    "severity_level",
    when(col("Symptom_count") <= 2, "Mild")
    .when(col("Symptom_count") <= 5, "Moderate")
    .when(col("Symptom_count") > 5, "Severe")
)

df_clean.show(5)
df_clean.printSchema()

+----------+---+------+--------------------+-------------+----------------+-----------+--------------+
|Patient_ID|Age|Gender|            Symptoms|Symptom_count|         Disease|  age_group|severity_level|
+----------+---+------+--------------------+-------------+----------------+-----------+--------------+
|         1| 29|  Male|fever, back pain,...|            3|         Allergy|      Adult|      Moderate|
|         2| 76|Female|insomnia, back pa...|            3|Thyroid Disorder|     Senior|      Moderate|
|         3| 78|  Male|sore throat, vomi...|            3|       Influenza|     Senior|      Moderate|
|         4| 58| Other|blurred vision, d...|            4|          Stroke|Middle-aged|      Moderate|
|         5| 55|Female|swelling, appetit...|            3|   Heart Disease|Middle-aged|      Moderate|
+----------+---+------+--------------------+-------------+----------------+-----------+--------------+
only showing top 5 rows
root
 |-- Patient_ID: long (nullable = true)
 |--

In [0]:
# Persist the cleaned frame as a managed table, replacing any existing
# contents of the target table.
(
    df_clean.write
    .mode("overwrite")
    .saveAsTable("final_exam_cluster.default.clean_healthcare")
)