# Silver: deaths
Cleans and transforms `himalaya.bronze.deaths` into `himalaya.silver.deaths`.

## Transformations
- Cast `date` to DateType
- Standardise `nationality` 
- Consolidate `cause_of_death` categories
- Drop null `nationality` rows if not found in other datasets

## Table Overview

In [0]:
import pyspark.sql.functions as F
from datetime import date

In [0]:
# Load the raw bronze deaths table; every later cell transforms this DataFrame.
bronze_table = "himalaya.bronze.deaths"
df = spark.table(bronze_table)

In [0]:
# Print the first five rows as a quick look at the raw data.
df.show(n=5)

> ## Date

In [0]:
# Replace the raw `date` column with its parsed date value, then print the
# schema to confirm the new column type.
parsed_date = F.to_date(F.col("date"))
df = df.withColumn("date", parsed_date)
df.printSchema()

In [0]:
# Preview the first five values of the converted `date` column.
date_only = df.select("date")
date_only.show(n=5)

> ## Name Consistency

In [0]:
# Normalise names: trim surrounding whitespace first, then apply initcap so
# each word starts with a capital letter.
trimmed_name = F.trim(F.col("name"))
df = df.withColumn("name", F.initcap(trimmed_name))

# Preview the first five normalised names.
df.select("name").show(n=5)

> ## Nationality

In [0]:
# Remove the `nationality` column from the DataFrame.
# NOTE(review): the notebook header describes standardising `nationality`,
# but this cell drops the column entirely - confirm which is intended.
df = df.drop("nationality")

# Show three rows to confirm the column is gone.
sample_rows = df.limit(3)
display(sample_rows)

> ## Cause of Death

In [0]:
# List every distinct raw cause of death, sorted, to inform the category rules.
distinct_causes = df.select("cause_of_death").distinct()
display(distinct_causes.orderBy("cause_of_death"))

In [0]:
# Ordered (regex, category) rules for consolidating free-text causes of death.
# The first matching pattern wins, so the order of the rules matters.
# Every pattern is case-insensitive via the inline (?i) flag.
cause_rules = [
    ("(?i)avalanche", "Avalanche"),
    ("(?i)serac|rock|stone", "Falling Ice/Rock"),
    # Fix: the original alternative was the misspelling "splipped", so the
    # correctly spelled "slipped" never matched. "sp?lipped" accepts both.
    ("(?i)fall|fell|crevasse|sp?lipped", "Fall"),
    ("(?i)edema|oedema|HACE|HAPE|cerebral|pulmonary", "Edema"),
    ("(?i)altitude|mountain sickness", "Altitude Sickness"),
    ("(?i)exhaustion", "Exhaustion"),
    ("(?i)exposure|hypothermia|frostbite|cold|froze|freezing", "Hypothermia"),
    ("(?i)heart|cardiac|stroke|brain|hemorrhage", "Medical/Cardiac"),
    ("(?i)storm|weather|lightning|wind", "Weather"),
    ("(?i)disappear", "Disappeared"),
    ("(?i)unknown|unclear|unspecified", "Unknown"),
]

# Build one chained when(...) expression from the rules, in order.
cause = F.col("cause_of_death")
first_pattern, first_label = cause_rules[0]
category = F.when(cause.rlike(first_pattern), first_label)
for pattern, label in cause_rules[1:]:
    category = category.when(cause.rlike(pattern), label)

# Null causes are labelled "Unknown"; any other unmatched text becomes "Other".
category = category.when(cause.isNull(), "Unknown").otherwise("Other")

df = df.withColumn("cause_of_death_category", category)

# Show each raw cause next to its assigned category for a manual sanity check.
display(df.select("cause_of_death", "cause_of_death_category").distinct())

In [0]:
# Compute `cause_of_death_category` from the raw text and then drop the raw
# `cause_of_death` column, leaving only the consolidated category.
#
# Ordered (regex, category) rules: the first matching pattern wins, so the
# order matters. Every pattern is case-insensitive via the inline (?i) flag.
cause_rules = [
    ("(?i)avalanche", "Avalanche"),
    ("(?i)serac|rock|stone", "Falling Ice/Rock"),
    # Fix: the original alternative was the misspelling "splipped", so the
    # correctly spelled "slipped" never matched. "sp?lipped" accepts both.
    ("(?i)fall|fell|crevasse|sp?lipped", "Fall"),
    ("(?i)edema|oedema|HACE|HAPE|cerebral|pulmonary", "Edema"),
    ("(?i)altitude|mountain sickness", "Altitude Sickness"),
    ("(?i)exhaustion", "Exhaustion"),
    ("(?i)exposure|hypothermia|frostbite|cold|froze|freezing", "Hypothermia"),
    ("(?i)heart|cardiac|stroke|brain|hemorrhage", "Medical/Cardiac"),
    ("(?i)storm|weather|lightning|wind", "Weather"),
    ("(?i)disappear", "Disappeared"),
    ("(?i)unknown|unclear|unspecified", "Unknown"),
]

# Build one chained when(...) expression from the rules, in order.
cause = F.col("cause_of_death")
first_pattern, first_label = cause_rules[0]
category = F.when(cause.rlike(first_pattern), first_label)
for pattern, label in cause_rules[1:]:
    category = category.when(cause.rlike(pattern), label)

# Null causes are labelled "Unknown"; any other unmatched text becomes "Other".
category = category.when(cause.isNull(), "Unknown").otherwise("Other")

df = df.withColumn("cause_of_death_category", category).drop("cause_of_death")

In [0]:
# Keep only the columns that belong in the silver table, in their final order.
silver_columns = ["date", "name", "cause_of_death_category", "mountain", "ingested_at"]
df = df.select(*silver_columns)

In [0]:
# Render the DataFrame after the column selection for a visual check.
display(df)

> ## Mountain

In [0]:
# List the distinct mountain names to spot inconsistent spellings.
distinct_mountains = df.select("mountain").distinct()
display(distinct_mountains)

> ## Silver Transfer

In [0]:
# Save the cleaned DataFrame as the silver Delta table, using overwrite mode
# so existing contents are replaced.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("himalaya.silver.deaths")
)

In [0]:
# Read the silver table back and show five rows to confirm the write.
silver_df = spark.table("himalaya.silver.deaths")
display(silver_df.limit(5))