Join tasks data with fog

In [1]:
import os 
from utils.utils import *
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Get (or start) the Spark session for this notebook under the app name "tasks".
session_builder = SparkSession.builder.appName("tasks")
spark = session_builder.getOrCreate()

# Display the session's configuration as the cell output.
spark_conf = spark.sparkContext.getConf()
spark_conf.getAll()

[('spark.sql.catalogImplementation', 'hive'),
 ('spark.rpc.message.maxSize', '512'),
 ('spark.driver.port', '40627'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.id', 'local-1683583606786'),
 ('spark.driver.host', '7c16108dc9e9'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.app.name', 'tasks')]

### Tasks metadata description  
tasks.csv: Task metadata for series in the defog dataset. (Not relevant for the series in the tdcsfog or daily datasets.)    
&emsp; 1. Id: The data series where the task was measured.  
&emsp; 2. Begin: Time (s) the task began.  
&emsp; 3. End: Time (s) the task ended.  
&emsp; 4. Task: One of seven task types in the DeFOG protocol, described on this page.  


In [3]:
# Load the task metadata CSV; the first line of the file supplies the column names.
tasks_path = "gs://msca-bdp-student-gcs/parkinsons_data/tasks.csv"
tasks = spark.read.option("header", True).csv(tasks_path)

                                                                                

In [4]:
# 1. convert times from string to double
# 2. rename Task column to TaskType
# 3. create dummy variables
# 4. merge with fog dataset

In [5]:
# Cast the task window bounds (Begin, End) to double so they can be compared
# numerically later on.
tasks = tasks.withColumn("Begin", F.col("Begin").cast("double"))
tasks = tasks.withColumn("End", F.col("End").cast("double"))

In [6]:
# Rename the task label column Task -> TaskType. The fog data loaded further down
# has its own `Task` column, so this keeps the two apart once they are joined.
tasks = tasks.withColumnRenamed(existing="Task", new="TaskType")

In [7]:
# Count the rows per task type (most frequent first) and bring the result to the
# driver; these counts are the baseline for checking the dummy encoding later.
rows_per_task = tasks.groupBy("TaskType").count()
task_counts = rows_per_task.sort(F.desc("count")).collect()

# Each collected row unpacks as (TaskType, count); keep them as a lookup table.
task_dict = {task_name: n_rows for task_name, n_rows in task_counts}

                                                                                

In [8]:
# create dummy vars
# `create_dummies` comes from utils.utils (star import) and is not visible here.
# Judging by the `show` output below, it keeps the original columns and adds one
# 0/1 indicator column per distinct TaskType value -- TODO confirm against the helper.
tasks_dummy = create_dummies(tasks, "TaskType")

                                                                                

In [9]:
# check that we retained original counts
# groupBy().sum() with no arguments sums every numeric column into a single row
# whose fields are named "sum(<column>)". Summing a 0/1 dummy column gives the
# number of rows carrying that task, which must equal the count in `task_dict`.
test_count = tasks_dummy.groupBy().sum().collect()
test_dict = test_count[0].asDict()

mismatched = []
for k, v in test_dict.items():
    col = k[4:-1]  # strip the "sum(" prefix and the ")" suffix
    if col in ("Begin", "End"):
        # Begin/End are numeric time bounds, not dummy columns.
        continue
    # .get() reports a column with no baseline count as a mismatch instead of
    # raising KeyError.
    if v != task_dict.get(col):
        mismatched.append(col)

# Print a single verdict. Previously "True" was printed unconditionally, even
# right after a mismatch had printed "False".
print("We have retained the original counts:", not mismatched)
if mismatched:
    print("Columns with mismatched counts:", mismatched)

23/05/08 22:07:27 WARN org.apache.spark.util.Utils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.debug.maxToStringFields' in SparkEnv.conf.


We have retained the original counts: True


In [10]:
# Preview the first 5 rows of the dummy-encoded task table.
tasks_dummy.show(5)

+----------+------+------+--------+---+-----+-----+-----+---------+----+-----+----+---+-----+----+------+----------+------+-----+--------+---+-----+---+----------+----------+---+--------+---+---+---+-----+----+----+----------+----+
|        Id| Begin|   End|TaskType|MB9|Rest1|MB6-L|MB6-R|Turning-C|MB2a|MB3-L|MB12|MB5|MB3-R|MB13|TUG-DT|Turning-ST|TUG-ST|4MW-C|Hotspot2|MB6|TUG-C|4MW|Hotspot1-C|Hotspot2-C|MB8|Hotspot1|MB4|MB1|MB7|Rest2|MB2b|MB10|Turning-DT|MB11|
+----------+------+------+--------+---+-----+-----+-----+---------+----+-----+----+---+-----+----+------+----------+------+-----+--------+---+-----+---+----------+----------+---+--------+---+---+---+-----+----+----+----------+----+
|02ab235146|  10.0|190.48|   Rest1|  0|    1|    0|    0|        0|   0|    0|   0|  0|    0|   0|     0|         0|     0|    0|       0|  0|    0|  0|         0|         0|  0|       0|  0|  0|  0|    0|   0|   0|         0|   0|
|02ab235146|211.24|271.56|   Rest2|  0|    0|    0|    0|        0|   0|

In [11]:
# Load the processed fog training data. The path is a directory, so every CSV
# file under it is read; the first line of each file supplies the column names.
fog_path = "gs://msca-bdp-student-gcs/parkinsons_data/train/processed/"
fog = spark.read.option("header", True).csv(fog_path)

In [12]:
# Cast fog TimeSeconds to double so it is comparable with the tasks' Begin/End.
time_seconds = F.col("TimeSeconds").cast("double")
fog = fog.withColumn("TimeSeconds", time_seconds)

In [13]:
# confirm it worked: TimeSeconds should now be listed as double, while the
# remaining columns keep the type they were loaded with.
fog.printSchema()

root
 |-- Subject: string (nullable = true)
 |-- Visit: string (nullable = true)
 |-- Id: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- AccV: string (nullable = true)
 |-- AccML: string (nullable = true)
 |-- AccAP: string (nullable = true)
 |-- StartHesitation: string (nullable = true)
 |-- Turn: string (nullable = true)
 |-- Walking: string (nullable = true)
 |-- Valid: string (nullable = true)
 |-- Task: string (nullable = true)
 |-- SourceDefog: string (nullable = true)
 |-- Medication: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- YearsSinceDx: string (nullable = true)
 |-- UPDRSIII_On: string (nullable = true)
 |-- UPDRSIII_Off: string (nullable = true)
 |-- NFOGQ: string (nullable = true)
 |-- Test: string (nullable = true)
 |-- TimeSeconds: double (nullable = true)



In [14]:
# Join fog with tasks: attach to each fog sample the task of the same series (Id)
# whose [Begin, End] window contains the sample's TimeSeconds.
same_series = tasks_dummy.Id == fog.Id
within_task = fog.TimeSeconds.between(tasks_dummy.Begin, tasks_dummy.End)
cond = same_series & within_task

# A left join keeps fog samples that fall inside no task window; the copy of Id
# coming from the tasks side is dropped so only fog's Id remains.
fog_tasks = fog.join(tasks_dummy, on=cond, how='left').drop(tasks_dummy.Id)

In [126]:
# Write the joined data to GCS as CSV part files, each with a header row.
fog_tasks_uri = "gs://msca-bdp-student-gcs/parkinsons_data/train/fog_tasks/"
fog_tasks.write.csv(fog_tasks_uri, header=True)

                                                                                

In [15]:
# List what was written for the joined data set.
# `list_blobs` comes from utils.utils (star import) and is not visible here;
# presumably it returns the names of blobs in the given bucket that match
# `string_match` -- TODO confirm against the helper.
fog_tasks_path = "parkinsons_data/train/fog_tasks"
fog_task_files = list_blobs("msca-bdp-student-gcs", string_match=fog_tasks_path)

In [16]:
# Display the listing as the cell output.
fog_task_files

['/',
 '/_SUCCESS',
 '/part-00000-74e56169-b0fd-4f00-8040-080aa3b7a5b1-c000.csv',
 '/part-00001-74e56169-b0fd-4f00-8040-080aa3b7a5b1-c000.csv',
 '/part-00002-74e56169-b0fd-4f00-8040-080aa3b7a5b1-c000.csv']