# Analysis 2-1: Data selection

In [1]:
// Evaluate the session handle so the kernel starts the Scala interpreter and
// reports the SparkSession/SparkContext that the rest of the notebook uses.
spark

Intitializing Scala interpreter ...

Spark Web UI available at http://localhost:4041
SparkContext available as 'sc' (version = 2.3.2, master = local[*], app id = local-1556274831546)
SparkSession available as 'spark'


res0: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@602e2208


In [2]:
//%load src/scala/spark_imports.scala

In [3]:
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import spark.implicits._

import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import spark.implicits._


In [4]:
%run src/scala/register_tables.scala
// Call the helper loaded by the script above, pointing it at the warehouse
// directory. Per the cell output it registers the prescription, hospitalization
// ("elders") and NPR-elders tables as temporary tables and lists them.
register_tables(spark, "spark-warehouse/")

===Prescription dataset===
registering patients
registering prescriptions
registering drugs
ok!
===Hospitalization dataset===
registering all
registering patients
registering prescriptions
registering drugs
ok!
===NPR Elders===
ok!
+--------+--------------------------+-----------+
|database|tableName                 |isTemporary|
+--------+--------------------------+-----------+
|        |elders                    |true       |
|        |elders_drugs              |true       |
|        |elders_patients           |true       |
|        |elders_prescriptions      |true       |
|        |npr_elders                |true       |
|        |prescription_drugs        |true       |
|        |prescription_patients     |true       |
|        |prescription_prescriptions|true       |
+--------+--------------------------+-----------+

done!


In [5]:
// Every row of the registered `elders` table; the hospitalized/unhospitalized
// and male/female splits further down all start from this frame.
val elders: DataFrame = spark.sql("select * from elders")

elders: org.apache.spark.sql.DataFrame = [id: string, birthyear: int ... 21 more fields]


## Split the data by gender and whether they have hospitalizations or not

In [6]:
// Rows that carry a value in diff_utleveringdato form the hospitalized group.
val hospitalized = elders.filter("diff_utleveringdato is not null")

// The unhospitalized group has no diff_utleveringdato and no death_timestamp.
val unhospitalized = elders
    .filter("diff_utleveringdato is null")
    .filter("death_timestamp is null")

// Full NPR table, plus its distinct patient ids. The id column is exposed as
// `id2` so it can be joined against frames that already have an `id` column.
val npr = spark.sql("select * from npr_elders")
val npr_ids = npr
    .select($"id".as("id2"))
    .distinct()
//npr_ids.printSchema




hospitalized: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: string, birthyear: int ... 21 more fields]
unhospitalized: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: string, birthyear: int ... 21 more fields]
npr: org.apache.spark.sql.DataFrame = [id: string, num_hospitalizations: bigint ... 10 more fields]
npr_ids: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id2: string]


## Hospitalized patients: join with NPR (hospitalization register) so that we get the set of IDs:
$Hosp.IDs \cap NPR.IDs$

For both genders

In [7]:
%run src/scala/data_selection.scala
// Per the cell output, the script above registers unhospitalized_selector,
// hospitalized_selector and npr_selector, which the cells below call.

Registered functions: unhospitalized_selector, hospitalized_selector,  npr_selector
ok


## Male

In [8]:
// Restrict both groups to the male patients (gender code 1).
val male_hospitalized   = hospitalized.filter($"gender" === 1)
val male_unhospitalized = unhospitalized.filter($"gender" === 1)

// Distinct ids of hospitalized males that also occur in NPR. Only the NPR-side
// column (id2) is kept, so the result can be joined back onto frames with `id`.
val male_hosp_ids = male_hospitalized
    .select($"id")
    .distinct()
    .join(npr_ids, $"id" === $"id2")
    .drop("id")

// Rows of male_hospitalized for the matched ids, passed through hospitalized_selector.
val eligible_male_hosp_prescriptions = {
    val matched_rows = male_hospitalized.join(male_hosp_ids, $"id" === $"id2")
    hospitalized_selector(matched_rows)
}

// NPR rows for the matched ids, passed through npr_selector.
val eligible_male_hosp_npr = {
    val matched_npr = npr.join(male_hosp_ids, $"id" === $"id2")
    npr_selector(matched_npr)
}

// Distinct patient counts before and after the NPR match, for comparison.
val male_hosp_patients_count   = male_hospitalized.select($"id").distinct().count()
val male_after_filtering_count = male_hosp_ids.count()

male_hospitalized: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: string, birthyear: int ... 21 more fields]
male_unhospitalized: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: string, birthyear: int ... 21 more fields]
male_hosp_ids: org.apache.spark.sql.DataFrame = [id2: string]
eligible_male_hosp_prescriptions: org.apache.spark.sql.DataFrame = [id: string, birthyear: int ... 3 more fields]
eligible_male_hosp_npr: org.apache.spark.sql.DataFrame = [diffdager_inn: int, diffdager_ut: int ... 1 more field]
male_hosp_patients_count: Long = 80833
male_after_filtering_count: Long = 80833


## Female

In [9]:
// Restrict both groups to the female patients (gender code 2).
val female_hospitalized   = hospitalized.filter($"gender" === 2)
val female_unhospitalized = unhospitalized.filter($"gender" === 2)

// Distinct ids of hospitalized females that also occur in NPR. Only the NPR-side
// column (id2) is kept, so the result can be joined back onto frames with `id`.
val female_hosp_ids = female_hospitalized
    .select($"id")
    .distinct()
    .join(npr_ids, $"id" === $"id2")
    .drop("id")

// Rows of female_hospitalized for the matched ids, passed through hospitalized_selector.
val eligible_female_hosp_prescriptions = {
    val matched_rows = female_hospitalized.join(female_hosp_ids, $"id" === $"id2")
    hospitalized_selector(matched_rows)
}

// NPR rows for the matched ids, passed through npr_selector.
val eligible_female_hosp_npr = {
    val matched_npr = npr.join(female_hosp_ids, $"id" === $"id2")
    npr_selector(matched_npr)
}

// Distinct patient counts before and after the NPR match, for comparison.
val female_hosp_patients_count   = female_hospitalized.select($"id").distinct().count()
val female_after_filtering_count = female_hosp_ids.count()


female_hospitalized: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: string, birthyear: int ... 21 more fields]
female_unhospitalized: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: string, birthyear: int ... 21 more fields]
female_hosp_ids: org.apache.spark.sql.DataFrame = [id2: string]
eligible_female_hosp_prescriptions: org.apache.spark.sql.DataFrame = [id: string, birthyear: int ... 3 more fields]
eligible_female_hosp_npr: org.apache.spark.sql.DataFrame = [diffdager_inn: int, diffdager_ut: int ... 1 more field]
female_hosp_patients_count: Long = 91595
female_after_filtering_count: Long = 91595


### Save hospitalization datasets

In [10]:
// Root directory for the parquet output of the save cells below. Sub-paths are
// appended by plain string concatenation, so the trailing slash is required.
val savedir: String = "spark-warehouse/experiment2-data/"

savedir: String = spark-warehouse/experiment2-data/


In [11]:
// Persist the four hospitalized datasets as parquet under savedir, replacing any
// previous output. The pairs are written in the order listed.
// NOTE(review): the original cell carried a commented-out
// `eligible_female_hosp_prescriptions` next to the male prescriptions write,
// which suggests an earlier version saved the female set to "male/hosp_pres" by
// mistake — keep each dataset paired with its own sub-path.
Seq(
    eligible_female_hosp_npr           -> "female/hosp_npr",
    eligible_female_hosp_prescriptions -> "female/hosp_pres",
    eligible_male_hosp_npr             -> "male/hosp_npr",
    eligible_male_hosp_prescriptions   -> "male/hosp_pres"
).foreach { case (dataset, subPath) =>
    dataset.write.mode(SaveMode.Overwrite).parquet(savedir + subPath)
}
println("ok")

ok


### Save Unhospitalized sets

In [12]:
// Run each unhospitalized group through unhospitalized_selector and persist the
// result as parquet under savedir, replacing any previous output (male first,
// then female).
Seq(
    male_unhospitalized   -> "male/unhosp_pres",
    female_unhospitalized -> "female/unhosp_pres"
).foreach { case (group, subPath) =>
    unhospitalized_selector(group)
        .write.mode(SaveMode.Overwrite).parquet(savedir + subPath)
}
println("ok")

ok


## Convert elders active drug dataset

In [13]:
// Convert the elders drug-duration CSV export to parquet.
// Only four columns are kept: pasientlopenr and atckode are renamed to
// id / drugcode, and the two treatment bounds are cast from the CSV's string
// values to int.
val elders_active_drugs = spark.read
    .option("header", "true")
    .csv("datasets/elders_drug_duration")
    // The explicit select below already discards every other column (including
    // the unnamed "_c0" column), so no separate drop is needed.
    .select(
        $"pasientlopenr".as("id"),
        $"atckode".as("drugcode"),
        $"treatment_start".cast("int"),
        $"treatment_end".cast("int")
    )

// Repartition by id so rows of the same patient end up in the same partition,
// then write under the shared savedir (same location as before:
// spark-warehouse/experiment2-data/elders_drug_duration) so this output cannot
// drift from the other save cells if the directory is ever changed.
elders_active_drugs
    .repartition($"id")
    .write
    .mode(SaveMode.Overwrite)
    .parquet(savedir + "elders_drug_duration")
println("ok")

ok


elders_active_drugs: org.apache.spark.sql.DataFrame = [id: string, drugcode: string ... 2 more fields]
