In [None]:
// Evaluate the `spark` handle supplied by the notebook environment
// (presumably the active SparkSession; it is used below via spark.read and spark.sql).
spark

In [None]:
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import spark.implicits._

In [None]:
%run "src/scala/register_tables.scala"

In [None]:
// Call register_tables with the session and the warehouse directory.
// NOTE(review): register_tables is presumably defined by the %run of
// src/scala/register_tables.scala and presumably registers the stored tables
// (such as `elders`, queried further down) with the session — confirm against that script.
register_tables(spark, "spark-warehouse/")

In [None]:
// Raw NPR extract read from CSV; the first line of each file supplies the column names.
val npr_raw = spark.read
    .format("csv")
    .option("header", true)
    .load("datasets/npr")

In [None]:
// Peek at a single row to check that the columns were parsed as expected.
npr_raw.show(numRows = 1)

In [None]:
// Reduced NPR frame: four columns are dropped and the two day-offset columns
// (diffdager_inn, diffdager_ut) are recast to integers.
val npr_red = Seq("diffdager_inn", "diffdager_ut")
    .foldLeft(npr_raw.drop("innmate", "pasfylke", "institusjon_navn", "omsorgsnivå")) { (frame, name) =>
        // Cast into a temporary "<name>2" column, remove the uncast original, then
        // rename the temporary back; the recast column therefore ends up last.
        val castName = name + "2"
        frame
            .withColumn(castName, col(name).cast("Int"))
            .drop(name)
            .withColumnRenamed(castName, name)
    }
// Number of rows in the reduced frame, counted over the patient-number column.
npr_red.select("pasientlopenr")
    .count


In [None]:
// Show the first five rows and the resulting column types of the reduced frame.
npr_red.show(numRows = 5)
npr_red.printSchema()

In [None]:
// Number of NPR rows per age group (aldersgrp), listed in ascending group order.
npr_red
    .groupBy("aldersgrp")
    .count()
    .orderBy($"aldersgrp".asc)
    .show()

In [None]:
// Number of NPR rows per patient number (pasientlopenr), exposed as
// num_hospitalizations. Rows without a patient number are ignored.
val npr_lopenr = npr_red
    .where("pasientlopenr is not null")
    .groupBy("pasientlopenr")
    .count
    // groupBy already yields exactly one row per pasientlopenr, so no distinct is needed.
    .select($"pasientlopenr", $"count".as("num_hospitalizations"))

// Number of rows per id in the `elders` table that have a non-null
// diff_utleveringdato, exposed as n_prescriptions.
// NOTE(review): despite the name, this frame counts prescriptions, not
// hospitalizations; the name is kept because later cells refer to it.
val elder_hosp_ids = spark.sql("select * from elders")
    .where("diff_utleveringdato is not null")
    .groupBy("id")
    .count
    // One row per id is guaranteed by the groupBy above.
    .select($"id", $"count".as("n_prescriptions"))

In [None]:
// Print the row count of each per-key frame, one count per line:
// first the NPR patient numbers, then the elders ids.
Seq(npr_lopenr, elder_hosp_ids).foreach(frame => println(frame.count))

#### We perform an inner join on the patient IDs (løpenr) of the elders and NPR datasets, so that only hospitalizations of patients who are present in the elders set are included

In [None]:
// Inner-join the per-patient hospitalization counts with the per-id
// prescription counts, keeping only patients that appear in both frames.
// The key equality is passed as the join condition, so the query does not
// depend on the optimizer rewriting a condition-less (Cartesian) join plus
// filter into an equi-join.
val npr_from_elders = npr_lopenr
    .join(elder_hosp_ids, $"pasientlopenr" === $"id", "inner")
    // Both inputs hold one row per key (each is the result of a groupBy), so the
    // join output has no duplicates; the select also removes the redundant
    // pasientlopenr key column.
    .select("id", "num_hospitalizations", "n_prescriptions")
    .sort(desc("num_hospitalizations"))
npr_from_elders.show()

## Save the table of hospitalizations for patients that exist in the elders dataset
* Each row also includes the patient's `num_hospitalizations` and `n_prescriptions`

In [None]:
// Attach the per-patient counts (num_hospitalizations, n_prescriptions) to every
// NPR row whose pasientlopenr matches an id in npr_from_elders. The equality is
// the join condition itself, so no Cartesian product of the counts with the
// full row-level NPR frame is ever expressed. NPR rows with a null
// pasientlopenr never match and are therefore left out.
val npr_hospitalizations_from_elders = npr_from_elders
    .join(npr_red, $"pasientlopenr" === $"id", "inner")

// Persist the joined rows as Parquet, replacing any previous output at this path.
npr_hospitalizations_from_elders.write.mode(SaveMode.Overwrite).parquet("spark-warehouse/npr_elders")

## Select patients with at most 5 hospitalizations
### Note that these are all patients also in the elders dataset
* 72238 patients in

In [None]:
// Count the patients with at most 5 hospitalizations (the bound is inclusive).
// The count starts from npr_from_elders, which only contains patients that are
// also present in the elders dataset; npr_lopenr would count every NPR patient.
npr_from_elders.where(col("num_hospitalizations") <= 5).count