In [1]:
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.functions import col, datediff, lit, to_date, when
from pyspark.sql.types import *
from os import walk
import json
import os
import sys
import datetime
import math

sys.path.append('../')
from Helper_Code.helpfull_functions import *
from Helper_Code.helper_variables import *
from Helper_Code.quality_checks import *


In [2]:
# Build the Spark session shared by every cell below. get_spark_session is not
# defined in this notebook; presumably it comes from the Helper_Code star
# imports above -- TODO confirm which module owns it.
spark = get_spark_session("test", SparkConf())

22/07/13 14:36:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/07/13 14:36:48 WARN Utils: spark.executor.instances less than spark.dynamicAllocation.minExecutors is invalid, ignoring its setting, please update your configs.
22/07/13 14:36:49 WARN Utils: spark.executor.instances less than spark.dynamicAllocation.minExecutors is invalid, ignoring its setting, please update your configs.
22/07/13 14:36:49 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.


In [3]:
def numerical_column_selection(df, column, values):
    """Filter ``df`` on a numeric column.

    Args:
        df: Spark DataFrame to filter.
        column: Name of the column to compare.
        values: Sequence of bounds. Exactly two entries are treated as an
            inclusive ``[low, high]`` range; any other length is treated as an
            equality match against ``values[0]``.

    Returns:
        The filtered DataFrame.
    """
    if len(values) != 2:
        # Anything that is not a pair is handled as a single exact value.
        exact_match = col(column) == lit(values[0])
        return df.filter(exact_match)

    at_least_low = col(column) >= lit(values[0])
    at_most_high = col(column) <= lit(values[1])
    return df.filter(at_least_low & at_most_high)
    
def catagorical_column_selection(df, column, values):
    """Keep only the rows of ``df`` whose ``column`` value is one of ``values``.

    Args:
        df: Spark DataFrame to filter.
        column: Name of the column to test.
        values: Collection of accepted values, passed to ``Column.isin``.

    Returns:
        The filtered DataFrame.
    """
    is_accepted = col(column).isin(values)
    return df.filter(is_accepted)


def datetime_column_selection(df, values):
    """Filter ``df`` on its ``visit_start_datetime`` column.

    Args:
        df: Spark DataFrame exposing a ``visit_start_datetime`` column.
        values: Sequence of bounds. Exactly two entries are treated as an
            inclusive ``[start, end]`` window; any other length is treated as
            an equality match against ``values[0]``.

    Returns:
        The filtered DataFrame.
    """
    if len(values) != 2:
        # Anything that is not a pair is handled as a single exact timestamp.
        on_exact_value = df.visit_start_datetime == lit(values[0])
        return df.where(on_exact_value)

    not_before_start = df.visit_start_datetime >= lit(values[0])
    not_after_end = df.visit_start_datetime <= lit(values[1])
    return df.where(not_before_start & not_after_end)

In [4]:
def demographic_pull(spark, group_value, age = [], gender = [], race=[], ethnicity = [],
                     insurance = [], visit_date = [], visit_locaiton = {}, care_site = {},
                    rand_sample_size = 0, data_root = "/home/jupyter/omop-ed-datapipeline"):
    """Assemble a demographics / visit / insurance / care-site / location table.

    The person, visit_occurrence, payer_plan_period, care_site and location
    tables are loaded with ``merge_files`` from ``data_root`` and outer-joined
    into one DataFrame, which is then narrowed by the optional filters.

    Args:
        spark: Active SparkSession, forwarded to ``merge_files``.
        group_value: ``'visit_occurrence_id'`` or ``'person_id'``.
            * ``'visit_occurrence_id'``: age is computed at each visit's start
              and insurance is the plan period that covers the visit.
            * ``'person_id'``: age is computed at death (or now when
              ``death_datetime`` is null) and insurance is the person's plan
              period with the latest start date.
        age: ``[low, high]`` inclusive range or ``[value]``; empty = no filter.
        gender, race, ethnicity: Accepted ``*_source_value`` values; empty =
            no filter.
        insurance: Accepted ``plan_source_value`` values; empty = no filter.
        visit_date: ``[start, end]`` inclusive window or ``[value]`` applied to
            ``visit_start_datetime``; empty = no filter.
        visit_locaiton: Mapping ``{column_name: accepted_values}``; each entry
            becomes an ``isin`` filter. (The parameter name keeps its original
            spelling so existing keyword callers continue to work.)
        care_site: Mapping ``{column_name: accepted_values}``; each entry
            becomes an ``isin`` filter.
        rand_sample_size: Row cap for ``rand_df``; ``0`` means no cap.
        data_root: Directory that contains one sub-directory per table.

    Returns:
        ``(main_df, rand_df)``. ``main_df`` is the joined and filtered
        DataFrame. ``rand_df`` is taken from the joined table *before* the
        filters are applied (as in the original implementation) and is produced
        with ``limit``, so it is a row cap, not a statistically random sample.

    Raises:
        ValueError: If ``group_value`` is not one of the two supported values.

    Note:
        The list/dict defaults are never mutated here, so sharing them across
        calls is harmless; they are kept to leave the signature unchanged.
    """
    if group_value not in ('visit_occurrence_id', 'person_id'):
        raise ValueError("Invalid group_value. Options are visit_occurrence_id and person_id")

    person_columns = ['person_id', 'birth_datetime', 'death_datetime', 'gender_source_value',
                      'race_source_value', 'ethnicity_source_value', 'care_site_id', 'location_id']
    # Same columns with 'age' inserted where the original output had it.
    person_columns_with_age = ['person_id', 'birth_datetime', 'death_datetime', 'gender_source_value',
                               'race_source_value', 'ethnicity_source_value', 'age',
                               'care_site_id', 'location_id']
    visit_length_columns = ['person_id', 'visit_occurrence_id', 'visit_start_datetime', 'visit_end_datetime']

    visit_occurrence = merge_files(f"{data_root}/visit_occurrence", spark, show = False)
    # visit_length is end minus start after casting both to long (epoch seconds).
    visit_length = visit_occurrence.select(*visit_length_columns).withColumn(
        'visit_length',
        col("visit_end_datetime").cast("long") - col('visit_start_datetime').cast("long"))

    print("Visit Data Loaded...")

    person = merge_files(f"{data_root}/person", spark, show = False)
    demo = person.select(*person_columns)

    print("Demographic Data Loaded...")

    payer_plan_period = merge_files(f"{data_root}/payer_plan_period", spark, show= False)
    insurance_df = payer_plan_period.select(
        'person_id', 'payer_plan_period_start_date', 'payer_plan_period_end_date',
        'payer_source_value', 'plan_source_value')
    insurance_df = insurance_df.withColumn("payer_plan_period_start_datetime", to_date("payer_plan_period_start_date"))\
                        .withColumn("payer_plan_period_end_datetime", to_date("payer_plan_period_end_date"))

    print("Insurance Data Loaded...")

    care_site_df = merge_files(f"{data_root}/care_site", spark, show = False)
    care_site_df = care_site_df.select(
        'care_site_id', 'care_site_name', 'care_site_source_value', 'place_of_service_source_value')

    print("Care Site Data Loaded...")

    location = merge_files(f"{data_root}/location", spark, show = False)
    location = location.select('location_id', 'address_1', 'city', 'state', 'zip', 'county', 'country')

    print("Patient Locaiton Data Loaded...")

    if group_value == 'visit_occurrence_id':
        # Keep, for every visit, the plan periods that fully cover it.
        insurance_df = insurance_df.join(visit_length, on="person_id")
        insurance_df = insurance_df.where(
            (col("visit_start_datetime") >= col("payer_plan_period_start_datetime"))
            & (col("visit_end_datetime") <= col("payer_plan_period_end_datetime")))
        # De-duplicate AFTER projecting, otherwise rows differing only in the
        # dropped columns survive and multiply the final join.
        insurance_df = insurance_df.select(
            'visit_occurrence_id', 'payer_source_value', 'plan_source_value').distinct()

        # Age is a per-visit quantity, so compute it on the person x visit
        # join and keep that join as the main table. The original reduced the
        # ages back to person level and re-joined on person_id, pairing every
        # visit with the ages from all of that person's visits.
        main_df = demo.distinct().join(visit_length, on = "person_id", how = "outer")
        # F.floor is the Column-aware floor; math.floor raises TypeError on a Column.
        main_df = main_df.withColumn(
            "age", F.floor(datediff(col("visit_start_datetime"), col("birth_datetime")) / 365.25))
        main_df = main_df.select(
            *person_columns_with_age,
            'visit_occurrence_id', 'visit_start_datetime', 'visit_end_datetime', 'visit_length')

        print("Age and Insurance Provider Identified Data Loaded...")

    else:
        # Keep each person's plan period with the latest start date. The key
        # must include person_id: matching on the date alone keeps any row
        # whose start date equals some *other* person's maximum.
        latest_start = insurance_df.groupBy('person_id').agg(
            F.max('payer_plan_period_start_datetime').alias('payer_plan_period_start_datetime'))
        insurance_df = insurance_df.join(
            latest_start, on=['person_id', 'payer_plan_period_start_datetime'], how='leftsemi')
        insurance_df = insurance_df.select(
            'person_id', 'payer_source_value', 'plan_source_value').distinct()

        now = datetime.datetime.now()
        # Age at death when a death date exists, otherwise age as of now.
        demo = demo.withColumn(
            "age",
            when(col("death_datetime").isNull(),
                 F.floor(datediff(lit(now), col("birth_datetime")) / 365.25))
            .otherwise(F.floor(datediff(col('death_datetime'), col("birth_datetime")) / 365.25)))
        demo = demo.select(*person_columns_with_age).distinct()

        print("Age and Insurance Provider Identified Data Loaded...")

        main_df = demo.join(visit_length, on = "person_id", how = "outer")

    print("Demographics and Visit Length Information Combined...")

    main_df = main_df.join(insurance_df, on=group_value, how = "outer")
    print("Added Insurance Information Information...")

    main_df = main_df.join(care_site_df, on='care_site_id', how = "outer")
    print("Added Care Site Information...")

    main_df = main_df.join(location, on='location_id', how = "outer")
    print("Added Patient Locaiton Information...")

    # A cap of 0 means "everything": return the frame itself rather than
    # paying for a full count() just to feed it to limit().
    if rand_sample_size == 0:
        rand_df = main_df
    else:
        rand_df = main_df.limit(rand_sample_size)

    if age:
        main_df = numerical_column_selection(main_df, "age", age)
        print("Filtered Data for Age...")

    if gender:
        main_df = catagorical_column_selection(main_df, "gender_source_value", gender)
        print("Filtered Data for Gender...")

    if race:
        main_df = catagorical_column_selection(main_df, "race_source_value", race)
        print("Filtered Data for Race...")

    # Was `if b != []`, which raised NameError on every call.
    if ethnicity:
        main_df = catagorical_column_selection(main_df, "ethnicity_source_value", ethnicity)
        print("Filtered Data for Ethnicity...")

    if insurance:
        main_df = catagorical_column_selection(main_df, "plan_source_value", insurance)
        print("Filtered Data for Insurance...")

    if visit_date:
        main_df = datetime_column_selection(main_df, visit_date)
        print("Filtered Data for Visit Date...")

    # NOTE(review): the two progress messages below look swapped relative to
    # the parameter they report on; left unchanged pending confirmation.
    if visit_locaiton:
        # The original looped over the undefined name `visit_location`.
        for column_name, accepted in visit_locaiton.items():
            main_df = catagorical_column_selection(main_df, column_name, accepted)
        print("Filtered Data for Care Site...")

    if care_site:
        for column_name, accepted in care_site.items():
            main_df = catagorical_column_selection(main_df, column_name, accepted)
        print("Filtered Data for Location...")

    return main_df, rand_df
            
    

    
    
    
    
    

In [8]:
# Visit-level pull restricted to ages 30-50 (a two-element list is an inclusive
# range); rand_df is capped at 1000 rows via limit().
main_df, rand_df = demographic_pull(spark, "visit_occurrence_id", age = [30,50], rand_sample_size= 1000)

(5229969, 17)
Visit Data Loaded...
(459280, 20)
Demographic Data Loaded...
(5818931, 23)
Insurance Data Loaded...
(2693, 6)
Care Site Data Loaded...
(398511, 11)
Patient Locaiton Data Loaded...
Age and Insurance Provider Identified Data Loaded...
Demographics and Visit Length Information Combined...
Added Insurance Information Information...
Added Care Site Information...
Added Patient Locaiton Information...
Filtered Data for Age...


In [9]:
# Preview the capped sample returned by demographic_pull (this triggers the Spark job).
rand_df.show()

22/07/13 14:27:01 WARN ExecutorPodsSnapshotsStoreImpl: Exception when notifying snapshot subscriber.
io.fabric8.kubernetes.client.KubernetesClientException: Failure executing: POST at: https://kubernetes.default.svc.cluster.local/api/v1/namespaces/ke-decile-lab/pods. Message: Forbidden!Configured service account doesn't have access. Service account may have been revoked. pods "test-24844a81f7f2a509-exec-5" is forbidden: exceeded quota: ke-decile-lab-quota, requested: requests.cpu=2, used: requests.cpu=32, limited: requests.cpu=32.
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.requestFailure(OperationSupport.java:639)
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.assertResponseCode(OperationSupport.java:576)
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:543)
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:504)
	at io.fabric8.kubernetes.client.dsl.base.OperationSupp

+--------------------+------------+-------------------+---------+-------------------+--------------+-------------------+------------------+----------------------+-----------------+--------------------+-------------------+------------+------------------+-----------------+--------------+----------------------+-----------------------------+--------------------+----------+-----+-----+---------+--------------------+
|         location_id|care_site_id|visit_occurrence_id|person_id|     birth_datetime|death_datetime|gender_source_value| race_source_value|ethnicity_source_value|              age|visit_start_datetime| visit_end_datetime|visit_length|payer_source_value|plan_source_value|care_site_name|care_site_source_value|place_of_service_source_value|           address_1|      city|state|  zip|   county|             country|
+--------------------+------------+-------------------+---------+-------------------+--------------+-------------------+------------------+----------------------+--------

                                                                                

In [17]:
def ed_vitals_pull(spark, rand_sample_size = 0, merge_with = None,
                   data_root = "/home/jupyter/omop-ed-datapipeline"):
    """Summarise selected vital-sign measurements per visit.

    Loads the measurement and concept tables with ``merge_files``, keeps the
    measurements whose concept id is in the vital-sign code list, and computes
    the max / min / mean of ``value_as_number`` for every
    ``(visit_occurrence_id, measurement_source_value)`` pair.

    Args:
        spark: Active SparkSession, forwarded to ``merge_files``.
        rand_sample_size: Row cap for ``rand_df``; ``0`` means no cap.
        merge_with: Optional DataFrame with a ``visit_occurrence_id`` column.
            When given, the summary is left-joined onto it and the joined
            frame is returned as ``main_df``.
        data_root: Directory that contains one sub-directory per table.

    Returns:
        ``(main_df, rand_df)``. ``rand_df`` is taken from the summary before
        any ``merge_with`` join and is produced with ``limit``, so it is a row
        cap, not a statistically random sample.
    """
    codes = [3025315, 3012888, 3032652, 3027018, 3027598, 3024171, 3004249, 3026258, 3020891]
    # Body weight, Diastolic blood pressure, Glasgow coma scale, Heart rate, Mean blood pressure, Systolic blood pressure, Q-T interval corrected, Body temperature, RR

    # TODO (from original notes): first and last vitals for the visit, keeping
    # the time of each measurement.

    measurements = merge_files(f"{data_root}/measurement", spark, show = False)
    concept = merge_files(f"{data_root}/concept", spark, show = False)

    # The original filtered the *global* `measurement` frame here (defined only
    # in a later cell, and without a concept_id column), discarding the join.
    # Filter the local frame instead, and do it before the join: the inner join
    # equates measurement_concept_id with concept_id, so the result is the same
    # while far fewer rows are shuffled.
    vitals = measurements.filter(col('measurement_concept_id').isin(codes))
    vitals = vitals.join(concept, vitals.measurement_concept_id == concept.concept_id)

    main_df = vitals.groupby("visit_occurrence_id", "measurement_source_value").agg(
        F.max("value_as_number").alias("Max_value"),
        F.min("value_as_number").alias("Min_value"),
        F.mean("value_as_number").alias("Mean_value"))

    # A cap of 0 means "everything": return the frame itself rather than
    # paying for a full count() just to feed it to limit().
    if rand_sample_size == 0:
        rand_df = main_df
    else:
        rand_df = main_df.limit(rand_sample_size)

    if merge_with is not None:
        main_df = merge_with.join(main_df, on="visit_occurrence_id", how = "left")

    return main_df, rand_df




In [None]:
# Build the per-visit vitals summary; rand_vitals is capped at 1000 rows via limit().
ed_vitals, rand_vitals = ed_vitals_pull(spark, rand_sample_size = 1000)

22/07/13 15:14:10 WARN ExecutorPodsSnapshotsStoreImpl: Exception when notifying snapshot subscriber.
io.fabric8.kubernetes.client.KubernetesClientException: Failure executing: POST at: https://kubernetes.default.svc.cluster.local/api/v1/namespaces/ke-decile-lab/pods. Message: Forbidden!Configured service account doesn't have access. Service account may have been revoked. pods "test-0358db81f7fdb3a2-exec-504" is forbidden: exceeded quota: ke-decile-lab-quota, requested: requests.cpu=2, used: requests.cpu=32, limited: requests.cpu=32.
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.requestFailure(OperationSupport.java:639)
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.assertResponseCode(OperationSupport.java:576)
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:543)
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:504)
	at io.fabric8.kubernetes.client.dsl.base.OperationSu

(115424304, 24)


22/07/13 15:14:20 WARN DAGScheduler: Broadcasting large task binary with size 4.3 MiB
22/07/13 15:14:20 WARN DAGScheduler: Broadcasting large task binary with size 4.3 MiB


+---------+--------------+-----------+----------------------+----------------+--------------------+-------------------+----------------+---------------------------+-------------------+---------------+---------------+-------------------+---------------+---------+----------+-----------+-------------------+---------------+------------------------+----------------------------+-----------------------------+-----------------+------------------+
|person_id|measurement_id|specimen_id|measurement_concept_id|measurement_date|measurement_datetime|     order_datetime|measurement_time|measurement_type_concept_id|operator_concept_id|value_as_number|value_as_string|value_as_concept_id|unit_concept_id|range_low|range_high|provider_id|visit_occurrence_id|visit_detail_id|measurement_source_value|measurement_source_value_alt|measurement_source_concept_id|unit_source_value|value_source_value|
+---------+--------------+-----------+----------------------+----------------+--------------------+---------------



In [5]:
# Scratch exploration: load the raw measurement table (the recorded output
# reports 115,424,304 rows x 24 columns). show=True presumably also prints a
# preview -- confirm against merge_files.
measurement = merge_files("/home/jupyter/omop-ed-datapipeline/measurement", spark, show = True)

22/07/13 14:37:23 WARN ExecutorPodsSnapshotsStoreImpl: Exception when notifying snapshot subscriber.
io.fabric8.kubernetes.client.KubernetesClientException: Failure executing: POST at: https://kubernetes.default.svc.cluster.local/api/v1/namespaces/ke-decile-lab/pods. Message: Forbidden!Configured service account doesn't have access. Service account may have been revoked. pods "test-0358db81f7fdb3a2-exec-4" is forbidden: exceeded quota: ke-decile-lab-quota, requested: requests.cpu=2, used: requests.cpu=32, limited: requests.cpu=32.
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.requestFailure(OperationSupport.java:639)
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.assertResponseCode(OperationSupport.java:576)
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:543)
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:504)
	at io.fabric8.kubernetes.client.dsl.base.OperationSupp

(115424304, 24)


22/07/13 14:37:36 WARN DAGScheduler: Broadcasting large task binary with size 4.3 MiB
22/07/13 14:37:37 WARN DAGScheduler: Broadcasting large task binary with size 4.3 MiB
                                                                                

+---------+--------------+-----------+----------------------+----------------+--------------------+-------------------+----------------+---------------------------+-------------------+---------------+---------------+-------------------+---------------+---------+----------+-----------+-------------------+---------------+------------------------+----------------------------+-----------------------------+-----------------+------------------+
|person_id|measurement_id|specimen_id|measurement_concept_id|measurement_date|measurement_datetime|     order_datetime|measurement_time|measurement_type_concept_id|operator_concept_id|value_as_number|value_as_string|value_as_concept_id|unit_concept_id|range_low|range_high|provider_id|visit_occurrence_id|visit_detail_id|measurement_source_value|measurement_source_value_alt|measurement_source_concept_id|unit_source_value|value_source_value|
+---------+--------------+-----------+----------------------+----------------+--------------------+---------------

In [6]:
# Load the concept table; it is joined to the measurements in the next cell.
concept = merge_files("/home/jupyter/omop-ed-datapipeline/concept", spark, show = False)

(8107890, 10)


                                                                                

In [7]:
# Inner-join each measurement to its concept row (measurement_concept_id == concept_id).
measurement2 = measurement.join(concept, measurement.measurement_concept_id == concept.concept_id)

In [8]:
# Preview the joined measurement/concept rows.
measurement2.show()

22/07/13 14:37:43 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
22/07/13 14:37:47 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
22/07/13 14:37:48 WARN ExecutorPodsSnapshotsStoreImpl: Exception when notifying snapshot subscriber.
io.fabric8.kubernetes.client.KubernetesClientException: Failure executing: POST at: https://kubernetes.default.svc.cluster.local/api/v1/namespaces/ke-decile-lab/pods. Message: Forbidden!Configured service account doesn't have access. Service account may have been revoked. pods "test-0358db81f7fdb3a2-exec-10" is forbidden: exceeded quota: ke-decile-lab-quota, requested: requests.cpu=2, used: requests.cpu=32, limited: requests.cpu=32.
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.requestFailure(OperationSupport.java:639)
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.assertResponseCode(OperationSupport.

+---------+--------------+-----------+----------------------+----------------+--------------------+-------------------+----------------+---------------------------+-------------------+---------------+---------------+-------------------+---------------+---------+----------+-----------+-------------------+---------------+------------------------+----------------------------+-----------------------------+-----------------+------------------+----------+--------------------+-----------+-------------+----------------+----------------+------------+----------------+--------------+--------------+
|person_id|measurement_id|specimen_id|measurement_concept_id|measurement_date|measurement_datetime|     order_datetime|measurement_time|measurement_type_concept_id|operator_concept_id|value_as_number|value_as_string|value_as_concept_id|unit_concept_id|range_low|range_high|provider_id|visit_occurrence_id|visit_detail_id|measurement_source_value|measurement_source_value_alt|measurement_source_concept_id|

                                                                                

In [11]:
# List the distinct concept classes present in the joined measurements.
# NOTE: the recorded run of this cell aborted with a Spark
# MetadataFetchFailedException, so no result is shown below.
measurement2.select('concept_class_id').distinct().collect()


22/07/13 14:47:23 WARN ExecutorPodsSnapshotsStoreImpl: Exception when notifying snapshot subscriber.
io.fabric8.kubernetes.client.KubernetesClientException: Failure executing: POST at: https://kubernetes.default.svc.cluster.local/api/v1/namespaces/ke-decile-lab/pods. Message: Forbidden!Configured service account doesn't have access. Service account may have been revoked. pods "test-0358db81f7fdb3a2-exec-356" is forbidden: exceeded quota: ke-decile-lab-quota, requested: requests.cpu=2, used: requests.cpu=32, limited: requests.cpu=32.
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.requestFailure(OperationSupport.java:639)
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.assertResponseCode(OperationSupport.java:576)
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:543)
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:504)
	at io.fabric8.kubernetes.client.dsl.base.OperationSu

Py4JJavaError: An error occurred while calling o1421.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: ShuffleMapStage 275 (collect at /tmp/ipykernel_7010/3383850185.py:1) has failed the maximum allowable number of times: 4. Most recent failure reason:
org.apache.spark.shuffle.MetadataFetchFailedException: Missing an output location for shuffle 10 partition 44
	at org.apache.spark.MapOutputTracker$.validateStatus(MapOutputTracker.scala:1619)
	at org.apache.spark.MapOutputTracker$.$anonfun$convertMapStatuses$10(MapOutputTracker.scala:1566)
	at org.apache.spark.MapOutputTracker$.$anonfun$convertMapStatuses$10$adapted(MapOutputTracker.scala:1565)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.MapOutputTracker$.convertMapStatuses(MapOutputTracker.scala:1565)
	at org.apache.spark.MapOutputTrackerWorker.getMapSizesByExecutorIdImpl(MapOutputTracker.scala:1230)
	at org.apache.spark.MapOutputTrackerWorker.getMapSizesByExecutorId(MapOutputTracker.scala:1192)
	at org.apache.spark.shuffle.sort.SortShuffleManager.getReader(SortShuffleManager.scala:140)
	at org.apache.spark.shuffle.ShuffleManager.getReader(ShuffleManager.scala:63)
	at org.apache.spark.shuffle.ShuffleManager.getReader$(ShuffleManager.scala:57)
	at org.apache.spark.shuffle.sort.SortShuffleManager.getReader(SortShuffleManager.scala:73)
	at org.apache.spark.sql.execution.ShuffledRowRDD.compute(ShuffledRowRDD.scala:208)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.ZippedPartitionsRDD2.compute(ZippedPartitionsRDD.scala:89)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.base/java.lang.Thread.run(Unknown Source)

	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2454)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2403)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2402)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2402)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:1843)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2639)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2584)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2573)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)


In [14]:
# `codes` only exists as a local inside ed_vitals_pull, so on a fresh kernel
# this cell raised NameError. Define the same vital-sign concept ids here.
# Body weight, Diastolic blood pressure, Glasgow coma scale, Heart rate, Mean blood pressure, Systolic blood pressure, Q-T interval corrected, Body temperature, RR
codes = [3025315, 3012888, 3032652, 3027018, 3027598, 3024171, 3004249, 3026258, 3020891]

# Keep only the measurements whose concept id is one of the vital-sign codes.
df = measurement2.filter(col('concept_id').isin(codes))
df.show()

22/07/13 15:02:13 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
22/07/13 15:02:15 WARN ExecutorPodsSnapshotsStoreImpl: Exception when notifying snapshot subscriber.
io.fabric8.kubernetes.client.KubernetesClientException: Failure executing: POST at: https://kubernetes.default.svc.cluster.local/api/v1/namespaces/ke-decile-lab/pods. Message: Forbidden!Configured service account doesn't have access. Service account may have been revoked. pods "test-0358db81f7fdb3a2-exec-459" is forbidden: exceeded quota: ke-decile-lab-quota, requested: requests.cpu=2, used: requests.cpu=32, limited: requests.cpu=32.
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.requestFailure(OperationSupport.java:639)
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.assertResponseCode(OperationSupport.java:576)
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:543)
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleRes

+---------+--------------+-----------+----------------------+----------------+--------------------+-------------------+----------------+---------------------------+-------------------+---------------+---------------+-------------------+---------------+---------+----------+-----------+-------------------+---------------+------------------------+----------------------------+-----------------------------+-----------------+------------------+----------+----------------+-----------+-------------+--------------------+----------------+------------+----------------+--------------+--------------+
|person_id|measurement_id|specimen_id|measurement_concept_id|measurement_date|measurement_datetime|     order_datetime|measurement_time|measurement_type_concept_id|operator_concept_id|value_as_number|value_as_string|value_as_concept_id|unit_concept_id|range_low|range_high|provider_id|visit_occurrence_id|visit_detail_id|measurement_source_value|measurement_source_value_alt|measurement_source_concept_id|



In [15]:
# Per visit and measurement label, summarise value_as_number as max / min / mean.
# The cell rebinds `df`, so it relies on the filter cell above having just run.
df = (
    df.groupby("visit_occurrence_id", "measurement_source_value")
    .agg(
        F.max("value_as_number").alias("Max_value"),
        F.min("value_as_number").alias("Min_value"),
        F.mean("value_as_number").alias("Mean_value"),
    )
)

df.show()

22/07/13 15:07:39 WARN DAGScheduler: Broadcasting large task binary with size 1736.2 KiB
22/07/13 15:07:42 WARN ExecutorPodsSnapshotsStoreImpl: Exception when notifying snapshot subscriber.
io.fabric8.kubernetes.client.KubernetesClientException: Failure executing: POST at: https://kubernetes.default.svc.cluster.local/api/v1/namespaces/ke-decile-lab/pods. Message: Forbidden!Configured service account doesn't have access. Service account may have been revoked. pods "test-0358db81f7fdb3a2-exec-485" is forbidden: exceeded quota: ke-decile-lab-quota, requested: requests.cpu=2, used: requests.cpu=32, limited: requests.cpu=32.
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.requestFailure(OperationSupport.java:639)
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.assertResponseCode(OperationSupport.java:576)
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:543)
	at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handle

+-------------------+------------------------+---------+---------+-----------------+
|visit_occurrence_id|measurement_source_value|Max_value|Min_value|       Mean_value|
+-------------------+------------------------+---------+---------+-----------------+
|          112304711|      ECG - QTC Interval|    429.0|    429.0|            429.0|
|          108854665|      ECG - QTC Interval|    421.0|    421.0|            421.0|
|          117003266|      ECG - QTC Interval|    451.0|    451.0|            451.0|
|          268987732|      ECG - QTC Interval|    442.0|    442.0|            442.0|
|          177833976|      ECG - QTC Interval|    434.0|    434.0|            434.0|
|          268245323|      ECG - QTC Interval|    443.0|    443.0|            443.0|
|           28256750|      ECG - QTC Interval|    411.0|    411.0|            411.0|
|           60641581|      ECG - QTC Interval|    452.0|    452.0|            452.0|
|          101079276|      ECG - QTC Interval|    440.0|    440.0

