# Cell to initialize the Spark runtime and import functionality

In [None]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import spark.implicits._

// Folder prefix that the save cells prepend to each output table name.
val basedir: String = "spark-warehouse/prescription/"
// Fetch all key/value pairs of the active Spark configuration.
sc.getConf.getAll
// Flip to true to also run the optional display code (show/count) guarded by this flag.
val execExtras: Boolean = false

### Load the dataset
- Drop rows where the patient has no ID (PasientUtenID == 1)

In [None]:
// Read the semicolon-delimited CSV input; the first line supplies the column names.
val path = "datasets/prescription"
val raw = spark.read
  .option("delimiter", ";")
  .option("header", "true")
  .csv(path)

// Renamed and typed view of the raw data, restricted to rows in which every
// selected field is non-null. Cached because several later tables derive from it.
val converted = {
  // Keep only rows whose PasientUtenID differs from "1".
  val identified = raw.where(col("PasientUtenID") =!= "1")

  val projected = identified.select(
    // NOTE(review): this column name carries a leading space; presumably it
    // mirrors the CSV header. Confirm against the data file.
    col(" PasientLopeNr").as("id"),
    col("PasientFodtAr").cast(IntegerType).as("birthyear"),
    col("PasientKjonn").cast(IntegerType).as("gender"),
    unix_timestamp(col("UtleveringsDato"), "yyyy.MM.dd").cast(TimestampType).as("timestamp"),
    col("ATCKode").as("drugcode")
  )

  // A row survives only if all five output columns are populated.
  val fullyPopulated = Seq("id", "birthyear", "gender", "timestamp", "drugcode")
    .map(name => col(name).isNotNull)
    .reduce(_ && _)

  projected.where(fullyPopulated).cache()
}

// Disabled diagnostic kept for reference: number of rows removed by the PasientUtenID filter.
// val fields = raw.filter(col("PasientUtenID").notEqual("1"))
// println("filtered away rows: ", raw.count-fields.count)

if (execExtras) converted.show(5)
converted.printSchema()

### Create Prescription Dataframe, taking into consideration only prescriptions and the patient's ID
* Take the patient ID, the date of prescription and the drug code (ATC code)

In [None]:
// Prescription table: the id, timestamp and drugcode columns of every row in `converted`.
val prescriptions = converted.select("id", "timestamp", "drugcode")

// Optional preview of the first rows.
if (execExtras) prescriptions.show(5)


### Create Patients Dataframe, carrying info of people
* We only have the ID, Birthyear and Gender

In [None]:
// Patient table: one row per distinct (id, birthyear, gender) combination in `converted`.
val patients = converted
  .select($"id", $"birthyear", $"gender")
  .distinct()

if (execExtras) {
  // Build the message with interpolation: passing two arguments to println
  // makes the compiler adapt them into a tuple, which prints as "(label,count)".
  println(s"Total unique people in dataset: ${patients.count}")
  patients.show(5)
}

### Drugs in dataset
* Contains 855 unique ATC codes

In [None]:
// Drug table: the distinct drug codes found in `converted`, in ascending order.
val drugs = converted.select($"drugcode").distinct().orderBy(asc("drugcode"))

if (execExtras) {
  drugs.show(5)
  // Print the count explicitly: as a bare trailing expression of an else-less
  // `if` it would widen the cell result to AnyVal and only be visible when the
  // kernel echoes the cell's value.
  println(s"Unique drug codes in dataset: ${drugs.count}")
}


## Save tables
* Should be saved in the "spark-warehouse/prescription/" folder from the root directory

In [None]:
// Write the patients table as Parquet under basedir, replacing any existing output.
patients.write.mode("overwrite").parquet(s"${basedir}patients")
println("ok")

In [None]:
// Write the prescriptions table as Parquet under basedir, replacing any existing output.
prescriptions.write.mode("overwrite").parquet(s"${basedir}prescriptions")
println("ok")

In [None]:
// Write the drugs table as Parquet under basedir, replacing any existing output.
drugs.write.mode("overwrite").parquet(s"${basedir}drugs")
println("ok")

In [None]:
// Optional check: display patients whose birth year is earlier than 1880.
if (execExtras) patients.filter($"birthyear" < 1880).show()