Engineer task metadata (tasks.csv) to join with defog

In [1]:
import os 
from utils.utils import *
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Get the active Spark session, or start a new one named "tasks".
spark = (
    SparkSession.builder
    .appName("tasks")
    .getOrCreate()
)

# Last expression of the cell: display the session's configuration pairs.
spark.sparkContext.getConf().getAll()

[('spark.sql.catalogImplementation', 'hive'),
 ('spark.rpc.message.maxSize', '512'),
 ('spark.driver.port', '46809'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.host', '46e84e381bf4'),
 ('spark.app.id', 'local-1683402957400'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.app.name', 'tasks')]

### Tasks metadata description  
tasks.csv: Task metadata for series in the defog dataset. (Not relevant for the series in the tdcsfog or daily datasets.)    
&emsp; 1. Id: The data series where the task was measured.  
&emsp; 2. Begin: Time (s) the task began.  
&emsp; 3. End: Time (s) the task ended.  
&emsp; 4. Task: One of seven task types in the DeFOG protocol, described on this page.  


In [3]:
# Location of the task-metadata CSV in Google Cloud Storage.
tasks_path = "gs://msca-bdp-student-gcs/parkinsons_data/tasks.csv"

# The first row of the file supplies the column names. No schema is passed
# and schema inference is not enabled, so the columns are loaded as strings.
tasks = spark.read.option("header", True).csv(tasks_path)

                                                                                

In [4]:
# Add a Duration column: task end time minus task begin time (both in
# seconds), rounded to 3 decimal places.
tasks = tasks.withColumn(
    "Duration",
    F.round(F.col("End") - F.col("Begin"), 3),
)

In [5]:
# Number of rows per task type, most frequent first. These counts are the
# reference values for validating the dummy encoding later on.
task_counts = (
    tasks.groupBy("Task")
    .count()
    .orderBy(F.desc("count"))
    .collect()
)

# Map each task name to its row count.
task_dict = {row["Task"]: row["count"] for row in task_counts}

                                                                                

In [8]:
# Preview the first five rows, including the new Duration column.
tasks.show(n=5)

+----------+------+------+-----+--------+
|        Id| Begin|   End| Task|Duration|
+----------+------+------+-----+--------+
|02ab235146|  10.0|190.48|Rest1|  180.48|
|02ab235146|211.24|271.56|Rest2|   60.32|
|02ab235146|505.88| 522.4|  4MW|   16.52|
|02ab235146|577.96|594.64|4MW-C|   16.68|
|02ab235146|701.32|715.28|  MB1|   13.96|
+----------+------+------+-----+--------+
only showing top 5 rows



In [10]:
# Build the dummy-variable version of `tasks` from its "Task" column.
# NOTE(review): create_dummies is not defined in this notebook; it presumably
# comes from the wildcard import of utils.utils and adds one 0/1 indicator
# column per distinct Task value while keeping the original columns --
# confirm against the helper's definition.
tasks_dummy = create_dummies(tasks, "Task")

In [11]:
# Check that the dummy encoding retained the original per-task counts.
# A global sum over the numeric columns of `tasks_dummy` returns one
# "sum(<column>)" entry per column. For a 0/1 indicator column that total is
# the number of rows flagged with the task, so it should equal the count
# recorded for that task in `task_dict`.
test_count = tasks_dummy.groupBy().sum().collect()
test_dict = test_count[0].asDict()

# Strip the "sum(" prefix and ")" suffix to recover the plain column names.
dummy_totals = {name[4:-1]: total for name, total in test_dict.items()}

# Drive the comparison from `task_dict`: numeric columns that are not tasks
# (e.g. Duration) are ignored instead of raising KeyError, and a task with no
# dummy column is reported as a mismatch rather than silently skipped.
mismatched_tasks = [
    task
    for task, expected in task_dict.items()
    if dummy_totals.get(task) != expected
]

# Print a single verdict. Previously "True" was printed unconditionally, even
# right after a mismatch had printed "False".
if mismatched_tasks:
    print("Tasks whose dummy totals differ from the original counts:", mismatched_tasks)
print("We have retained the original counts:", not mismatched_tasks)

23/05/06 19:56:55 WARN org.apache.spark.util.Utils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.debug.maxToStringFields' in SparkEnv.conf.


We have retained the original counts: True


In [12]:
# Preview the first five rows of the dummy-encoded frame.
tasks_dummy.show(n=5)

+----------+------+------+-----+--------+---+-----+-----+-----+---------+----+-----+----+---+-----+----+------+----------+------+-----+--------+---+-----+---+----------+----------+---+--------+---+---+---+-----+----+----+----------+----+
|        Id| Begin|   End| Task|Duration|MB9|Rest1|MB6-L|MB6-R|Turning-C|MB2a|MB3-L|MB12|MB5|MB3-R|MB13|TUG-DT|Turning-ST|TUG-ST|4MW-C|Hotspot2|MB6|TUG-C|4MW|Hotspot1-C|Hotspot2-C|MB8|Hotspot1|MB4|MB1|MB7|Rest2|MB2b|MB10|Turning-DT|MB11|
+----------+------+------+-----+--------+---+-----+-----+-----+---------+----+-----+----+---+-----+----+------+----------+------+-----+--------+---+-----+---+----------+----------+---+--------+---+---+---+-----+----+----+----------+----+
|02ab235146|  10.0|190.48|Rest1|  180.48|  0|    1|    0|    0|        0|   0|    0|   0|  0|    0|   0|     0|         0|     0|    0|       0|  0|    0|  0|         0|         0|  0|       0|  0|  0|  0|    0|   0|   0|         0|   0|
|02ab235146|211.24|271.56|Rest2|   60.32|  0|   

In [15]:
# Inspect every task of one series in chronological order.
# Begin is a string column (the CSV is read without a schema), so ordering on
# it directly sorts lexicographically, e.g. "100.5" before "21.6". Cast to
# double in the sort key only; the displayed Begin values are unchanged.
(
    tasks_dummy
    .filter(F.col("Id") == "02ea782681")
    .orderBy(F.col("Begin").cast("double"))
    .show()
)

+----------+--------+--------+----------+--------+---+-----+-----+-----+---------+----+-----+----+---+-----+----+------+----------+------+-----+--------+---+-----+---+----------+----------+---+--------+---+---+---+-----+----+----+----------+----+
|        Id|   Begin|     End|      Task|Duration|MB9|Rest1|MB6-L|MB6-R|Turning-C|MB2a|MB3-L|MB12|MB5|MB3-R|MB13|TUG-DT|Turning-ST|TUG-ST|4MW-C|Hotspot2|MB6|TUG-C|4MW|Hotspot1-C|Hotspot2-C|MB8|Hotspot1|MB4|MB1|MB7|Rest2|MB2b|MB10|Turning-DT|MB11|
+----------+--------+--------+----------+--------+---+-----+-----+-----+---------+----+-----+----+---+-----+----+------+----------+------+-----+--------+---+-----+---+----------+----------+---+--------+---+---+---+-----+----+----+----------+----+
|02ea782681|    10.0|  21.618|       4MW|  11.618|  0|    0|    0|    0|        0|   0|    0|   0|  0|    0|   0|     0|         0|     0|    0|       0|  0|    0|  1|         0|         0|  0|       0|  0|  0|  0|    0|   0|   0|         0|   0|
|02ea782681|

In [16]:
# Write the dummy-encoded task table to GCS as CSV with a header row.
# The save mode is set to "overwrite" so the notebook can be re-run from a
# fresh kernel: with the default mode the write raises an error once the
# output directory already exists from a previous run.
tasks_dummy.write.mode("overwrite").option("header", True) \
    .csv("gs://msca-bdp-student-gcs/parkinsons_data/train/tasks_processed/")

                                                                                

In [17]:
# Path fragment of the directory the processed task files were written to.
tasks_proc_path = "parkinsons_data/train/tasks_processed"
# NOTE(review): list_blobs is not defined in this notebook; it presumably
# comes from the wildcard import of utils.utils and returns the names of the
# objects in the given bucket that match `string_match` -- confirm against
# the helper's definition.
task_files = list_blobs("msca-bdp-student-gcs", string_match=tasks_proc_path)

In [18]:
# Display the listed files to confirm that the write produced output.
task_files

['/', '/_SUCCESS', '/part-00000-d969f545-12a1-4a4f-a277-7ca09d7d8695-c000.csv']