# Analysis 1-1: Preparing data for analysis with Python

## Load elders dataset

In [1]:
// Evaluate the kernel-provided `spark` value so the REPL echoes the active SparkSession.
spark

Intitializing Scala interpreter ...

Spark Web UI available at http://localhost:4040
SparkContext available as 'sc' (version = 2.3.2, master = local[*], app id = local-1558352204896)
SparkSession available as 'spark'


res0: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@5137dcd5


In [2]:
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import spark.implicits._

import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import spark.implicits._


In [3]:
%run "src/scala/register_tables.scala"

In [4]:
// Invoke register_tables (loaded by the %run of src/scala/register_tables.scala) with the
// active session and the "spark-warehouse/" path. According to the output it prints, this
// registers temporary tables such as `elders` and `npr_elders`, which later cells query.
// NOTE(review): presumably the path is the directory the tables are read from — confirm
// against register_tables.scala.
register_tables(spark, "spark-warehouse/")

===Prescription dataset===
registering patients
registering prescriptions
registering drugs
ok!
===Hospitalization dataset===
registering all
registering patients
registering prescriptions
registering drugs
ok!
===NPR Elders===
ok!
+--------+--------------------------+-----------+
|database|tableName                 |isTemporary|
+--------+--------------------------+-----------+
|        |elders                    |true       |
|        |elders_drugs              |true       |
|        |elders_patients           |true       |
|        |elders_prescriptions      |true       |
|        |npr_elders                |true       |
|        |prescription_drugs        |true       |
|        |prescription_patients     |true       |
|        |prescription_prescriptions|true       |
+--------+--------------------------+-----------+

done!


In [5]:
// Bind the registered `elders` temporary table to a DataFrame used by the cells below.
val elders = spark.table("elders")

elders: org.apache.spark.sql.DataFrame = [id: string, birthyear: int ... 21 more fields]


In [6]:
// Print the column names and types of `elders` (side-effecting call, hence the parentheses).
elders.printSchema()

root
 |-- id: string (nullable = true)
 |-- birthyear: integer (nullable = true)
 |-- gender: integer (nullable = true)
 |-- fylke_id: integer (nullable = true)
 |-- fylke_name: integer (nullable = true)
 |-- drugcode: string (nullable = true)
 |-- DDD_value: string (nullable = true)
 |-- DDD_unit: string (nullable = true)
 |-- VareNavn: string (nullable = true)
 |-- prescription_year: timestamp (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- Diff_UtleveringDato: integer (nullable = true)
 |-- prescriber_id: string (nullable = true)
 |-- prescriber_birthyear: integer (nullable = true)
 |-- prescriber_gender: integer (nullable = true)
 |-- prescriber_no_id: integer (nullable = true)
 |-- Hjemmel: string (nullable = true)
 |-- Hjemmelnr: integer (nullable = true)
 |-- Kategori: string (nullable = true)
 |-- KategoriNr: integer (nullable = true)
 |-- OrdinasjonAntallPakninger: float (nullable = true)
 |-- OrdinasjonAntallDDD: float (nullable = true)
 |-- death_timestamp

In [10]:
// Partition the prescription rows of `elders`:
//   hospitalized         - rows whose diff_utleveringdato is set
//   unhospitalized       - rows whose diff_utleveringdato is null
//   unhospitalized_alive - unhospitalized rows that also have no death_timestamp
val hospitalized = elders.filter($"diff_utleveringdato".isNotNull)
val unhospitalized = elders.filter($"diff_utleveringdato".isNull)
val unhospitalized_alive = unhospitalized.filter($"death_timestamp".isNull)


hospitalized: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: string, birthyear: int ... 21 more fields]
unhospitalized: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: string, birthyear: int ... 21 more fields]
unhospitalized_alive: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: string, birthyear: int ... 21 more fields]


In [11]:
// Report how many unhospitalized prescription rows belong to dead vs. live patients.
// Each count launches a full Spark job, so every population is counted exactly once
// and the results are reused instead of recounting `unhospitalized_alive`.
val n_unhospitalized = unhospitalized.count
val n_unhospitalized_alive = unhospitalized_alive.count

println("prescriptions of dead unhospitalized patients = ~9M")
// Rows with a death_timestamp = all unhospitalized rows minus the alive ones.
println(n_unhospitalized - n_unhospitalized_alive)
println("prescriptions of live, unhospitalized patients = ~30M")
println(n_unhospitalized_alive)

prescriptions of dead unhospitalized patients = ~9M
8972091
prescriptions of live, unhospitalized patients = ~30M
30642882


### For now we are concerned with two populations, unhospitalized_alive, and hospitalized

#### Let's assume that we only want prescriptions and dates of unhospitalized patients
* timestamp type is encoded as epoch time, thus casting it to integer will yield an epoch int
* We take the following variables
    * id
    * birthyear
    * drugcode
    * timestamp

In [12]:
// Reduce the unhospitalized, alive rows to the four columns of interest.
// The timestamp column is cast to an integer so it is carried as an epoch value.
val uh_a_drug_date = unhospitalized_alive.select(
    col("id"),
    col("birthyear"),
    col("drugcode"),
    col("timestamp").cast(IntegerType)
)
// uh_a_drug_date.show(1)   // uncomment to preview a row

uh_a_drug_date: org.apache.spark.sql.DataFrame = [id: string, birthyear: int ... 2 more fields]


### Take 100.000 patients from the unhospitalized, "HEALTHY" dataset
#### With their prescriptions

In [13]:
// Number of patients to draw for each population.
val n_patients = 100000

// Pick up to n_patients distinct patient ids from the unhospitalized, alive rows and
// cache them so the same sample is reused by later actions.
// (The "10k" in the name is historical; the actual size is n_patients.)
val unhospitalized_10k_sample = uh_a_drug_date
    .select($"id")
    .distinct
    .limit(n_patients)
    .cache

// Attach every `elders` row of each sampled patient. The sampled id is renamed to
// pasientlopenr first so it does not clash with the `id` column coming from `elders`.
val unhosp_sample_prescriptions = unhospitalized_10k_sample
    .withColumnRenamed("id", "pasientlopenr")
    .join(elders, $"id" === $"pasientlopenr")

n_patients: Int = 100000
unhospitalized_10k_sample: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: string]
unhosp_sample_prescriptions: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [pasientlopenr: string, id: string ... 22 more fields]


#### Hospitalized patients have a different timestamp encoding
* We take the following
    * Id
    * birthyear
    * drugcode
    * timestamp

In [14]:
//val h_drug_reldate = hospitalized
//    .select($"id",$"birthyear",$"drugcode",$"diff_utleveringdato")
//    .where("diff_utleveringdato < 0")

In [None]:
// Load the registered `npr_elders` table without its `id` column; the cells below
// identify patients in this frame through `pasientlopenr` instead.
val hospitalizations = spark.table("npr_elders").drop("id")

### Take an equal number of hospitalized elders
* take patients with no more than 10 hospitalizations (the filter below is `num_hospitalizations <= 10`)

In [16]:
// Distinct (patient, hospitalization count, prescription count, hovedtilstand) rows
// for patients with at most 10 hospitalizations.
val eligible_hospitalized = hospitalizations
    .select($"pasientlopenr", $"num_hospitalizations", $"n_prescriptions", $"hovedtilstand")
    .where($"num_hospitalizations" <= 10)
    .distinct

eligible_hospitalized: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [pasientlopenr: string, num_hospitalizations: bigint ... 2 more fields]


In [None]:
// Pick up to n_patients distinct ids of hospitalized patients that also appear in
// eligible_hospitalized, and cache the sample so later cells reuse the same ids.
//
// The ids are de-duplicated *before* the join and matched with a left-semi join, so
// the join only tests for existence of a matching pasientlopenr. This avoids
// multiplying every prescription row by every eligible_hospitalized row of the same
// patient just to throw the duplicates away afterwards.
// (The "10k" in the name is historical; the actual size is n_patients.)
val hospitalized_10k_sample = hospitalized
    .select($"id")
    .distinct
    .join(eligible_hospitalized.select($"pasientlopenr"), $"id" === $"pasientlopenr", "left_semi")
    .limit(n_patients)
    .cache
//hospitalized_10k_sample.show(10)
// The sample is already distinct, so a plain count gives the number of patients.
hospitalized_10k_sample.count

#### take all prescriptions of the selected hospitalized patients

## Prescriptions of hospitalized patients

In [18]:
// Attach every `elders` row of each sampled hospitalized patient. The sampled id is
// renamed to pasientlopenr so it does not clash with the `id` column from `elders`.
val hosp_sample_prescriptions = hospitalized_10k_sample
    .withColumnRenamed("id", "pasientlopenr")
    .join(elders, $"id" === $"pasientlopenr")
// Materialize the join once and report the number of prescription rows.
hosp_sample_prescriptions.count

hosp_sample_prescriptions: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [pasientlopenr: string, id: string ... 22 more fields]
res8: Long = 11099081


## Hospitalization information

In [None]:
// Hospitalization rows of the sampled hospitalized patients, reduced to the
// diffdager_inn / diffdager_ut columns plus the patient id.
val hosp_sample_nprdata = hospitalized_10k_sample
    .join(hospitalizations, $"id" === $"pasientlopenr")
    .select($"diffdager_inn", $"diffdager_ut", $"id")

//hosp_sample_nprdata.show(2)
// Number of hospitalization rows kept for the sample.
hosp_sample_nprdata.count

## Save the data
* N assumed "Healthy" people
* Hospitalized:
    * N hospitalized people with at most 10 hospitalizations in the period
    * All of these patients' hospitalizations

In [22]:
// Root directory for everything this notebook writes.
val outputfolder: String = "spark-warehouse/experiment1-data/"
// Sub-directory (relative to outputfolder) for the hospitalized population.
val hospdir: String = "hospitalized/"
// Sub-directory (relative to outputfolder) for the "healthy" population.
val healthydir: String = "healthy/"

outputfolder: String = spark-warehouse/experiment1-data/
hospdir: String = hospitalized/
healthydir: String = healthy/


### Save hospitalization data

In [23]:
// Write the sampled hospitalization rows as Parquet, ordered by patient id,
// replacing any previous output at the same location.
hosp_sample_nprdata
    .orderBy($"id".asc)
    .write
    .mode("overwrite")
    .parquet(s"${outputfolder}${hospdir}hospitalizations/")
println("ok")

ok


### Save hospitalized patients' prescription data
* Take only 4 levels of the ATC hierarchy

In [24]:
// Print how many prescription rows the hospitalized sample contains before saving it.
val n_hosp_sample_prescriptions = hosp_sample_prescriptions.count
println(n_hosp_sample_prescriptions)
//hosp_sample_prescriptions.printSchema

11099081


In [25]:
%run "src/scala/transformations.scala"

In [32]:
// Write (id, diff_utleveringdato, 4L_atccode) for the hospitalized sample as Parquet,
// ordered by patient id, replacing any previous output.
// takeNatcLevels comes from src/scala/transformations.scala.
// NOTE(review): presumably takeNatcLevels(4) truncates the ATC drug code to its first
// four hierarchy levels — confirm against transformations.scala.
hosp_sample_prescriptions
    .select(
        $"id",
        $"diff_utleveringdato",
        takeNatcLevels(4)($"drugcode").as("4L_atccode")
    )
    .orderBy($"id".asc)
    .write
    .mode("overwrite")
    .parquet(s"${outputfolder}${hospdir}4L_prescriptions")
//    .show(5)
println("ok")

ok


### The following cell verifies that the intersection of IDs between NPR and hosp_prescription is correct

In [33]:
// Sanity check: the patient ids in the hospitalization data and in the prescription
// data of the hospitalized sample should coincide. Prints, in order:
//   1. the number of ids present in both frames,
//   2. the number of distinct ids in hosp_sample_nprdata,
//   3. the number of distinct ids in hosp_sample_prescriptions.
val common_ids = hosp_sample_nprdata
    .select($"id")
    .distinct
    .join(
        // Renamed to id2 so both id columns can be told apart in the join predicate.
        hosp_sample_prescriptions.select($"id".as("id2")).distinct,
        $"id" === $"id2"
    )
println(common_ids.count)
//common_ids.show(5)
println(hosp_sample_nprdata.select($"id").distinct.count)
println(hosp_sample_prescriptions.select($"id").distinct.count)

100000
100000
100000


common_ids: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: string, id2: string]


### Save "healthy" patients prescriptions

In [31]:
// Print the number of prescription rows in the "healthy" sample, then write
// (id, timestamp, 4L_atccode) as Parquet, replacing any previous output.
// `timestamp` is written with the type it has in `elders`; no epoch cast is applied here.
// takeNatcLevels comes from src/scala/transformations.scala.
// NOTE(review): presumably takeNatcLevels(4) truncates the ATC drug code to its first
// four hierarchy levels — confirm against transformations.scala.
println(unhosp_sample_prescriptions.count)
unhosp_sample_prescriptions
    .select(
        $"id",
        $"timestamp",
        takeNatcLevels(4)($"drugcode").as("4L_atccode")
    )
    .write
    .mode("overwrite")
    .parquet(s"${outputfolder}${healthydir}4L_prescriptions")
 //   .show(5)
println("ok")

5853822
ok
