# Feeder
## Contents
1) Feed and combine DeFog/TDCSFog files
2) Join on metadata
3) Join on subjects data
4) Combine DeFog and TDCSFog into complete fog training data.

In [1]:
import os 
from utils.utils import *
from pyspark.sql.functions import lit
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("PySpark Cloud Test")
    .getOrCreate()
)

In [2]:
# Permit Cartesian-product joins that are not written with explicit CROSS JOIN syntax.
# NOTE(review): the joins in this notebook are all keyed (on "Id" / "Subject"), so this
# setting is probably unnecessary for them — confirm before removing.
spark.conf.set(key="spark.sql.crossJoin.enabled", value="true")

In [3]:
# Display the SparkContext configuration as a list of (key, value) pairs.
spark_conf = spark.sparkContext.getConf()
spark_conf.getAll()

[('spark.sql.catalogImplementation', 'hive'),
 ('spark.rpc.message.maxSize', '512'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.host', '89b468a19a52'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.driver.port', '42867'),
 ('spark.master', 'local[*]'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.id', 'local-1683233390695'),
 ('spark.app.name', 'PySpark Cloud Test'),
 ('spark.ui.showConsoleProgress', 'true')]

## Feed and Combine DeFog/TDCSFog files

In [4]:
# GCS bucket that holds the raw training data (same bucket as the gs:// URLs used below).
bucket_name = "msca-bdp-student-gcs"

# Blob prefixes of the two raw datasets.
defog_path = 'parkinsons_data/train/defog/'
tdcsfog_path = 'parkinsons_data/train/tdcsfog/'

# Collect the blob names found under each prefix.
defog_files = list_blobs(bucket_name, string_match=defog_path)
tdcsfog_files = list_blobs(bucket_name, string_match=tdcsfog_path)

In [5]:
# Load the two per-recording metadata tables and the per-subject table.
# Only the header option is set (no schema inference), so columns are read as strings.
defog_md = (
    spark.read
    .option("header", True)
    .csv("gs://msca-bdp-student-gcs/parkinsons_data/defog_metadata.csv")
)
tdcsfog_md = (
    spark.read
    .option("header", True)
    .csv("gs://msca-bdp-student-gcs/parkinsons_data/tdcsfog_metadata.csv")
)
subjects = (
    spark.read
    .option("header", True)
    .csv("gs://msca-bdp-student-gcs/parkinsons_data/subjects.csv")
)

                                                                                

In [6]:
# Load all listed DeFog files into a single Spark DataFrame using the project helper.
# NOTE(review): `v=1` presumably turns on the "File i of N" progress lines — confirm in utils.utils.
defog = feed_files(defog_files, prefix=defog_path, v=1, spark=spark)

File 1 of 91
File 11 of 91
File 21 of 91
File 31 of 91
File 41 of 91
File 51 of 91
File 61 of 91
File 71 of 91
File 81 of 91
File 91 of 91


In [7]:
# Load all listed TDCSFog files into a single Spark DataFrame using the project helper.
# NOTE(review): `v=1` presumably turns on the "File i of N" progress lines — confirm in utils.utils.
tdcsfog = feed_files(tdcsfog_files, prefix=tdcsfog_path, v=1, spark=spark)

File 1 of 833
File 11 of 833
File 21 of 833
File 31 of 833
File 41 of 833
File 51 of 833
File 61 of 833
File 71 of 833
File 81 of 833
File 91 of 833
File 101 of 833
File 111 of 833
File 121 of 833
File 131 of 833
File 141 of 833
File 151 of 833
File 161 of 833
File 171 of 833
File 181 of 833
File 191 of 833
File 201 of 833
File 211 of 833
File 221 of 833
File 231 of 833
File 241 of 833
File 251 of 833
File 261 of 833
File 271 of 833
File 281 of 833
File 291 of 833
File 301 of 833
File 311 of 833
File 321 of 833
File 331 of 833
File 341 of 833
File 351 of 833
File 361 of 833
File 371 of 833
File 381 of 833
File 391 of 833
File 401 of 833
File 411 of 833
File 421 of 833
File 431 of 833
File 441 of 833
File 451 of 833
File 461 of 833
File 471 of 833
File 481 of 833
File 491 of 833
File 501 of 833
File 511 of 833
File 521 of 833
File 531 of 833
File 541 of 833
File 551 of 833
File 561 of 833
File 571 of 833
File 581 of 833
File 591 of 833
File 601 of 833
File 611 of 833
File 621 of 833
Fil

In [13]:
# Tag every row with its dataset of origin: 1 for DeFog rows, 0 for TDCSFog rows.
defog, tdcsfog = (
    frame.withColumn("source_defog", lit(flag))
    for frame, flag in ((defog, 1), (tdcsfog, 0))
)

In [14]:
# Record the row counts now; later cells compare post-join counts against these baselines.
defog_count, tdcsfog_count = defog.count(), tdcsfog.count()

In [20]:
# Preview the first five rows of each raw dataset.
defog.show(n=5)
tdcsfog.show(n=5)

+----+----+------------------+-----+---------------+----+-------+-----+-----+----------+------------+
|Time|AccV|             AccML|AccAP|StartHesitation|Turn|Walking|Valid| Task|        Id|source_defog|
+----+----+------------------+-----+---------------+----+-------+-----+-----+----------+------------+
|   0|-1.0|0.0441294600297506|-0.25|              0|   0|      0|false|false|02ea782681|           1|
|   1|-1.0|0.0344313599752663|-0.25|              0|   0|      0|false|false|02ea782681|           1|
|   2|-1.0|           0.03125|-0.25|              0|   0|      0|false|false|02ea782681|           1|
|   3|-1.0|           0.03125|-0.25|              0|   0|      0|false|false|02ea782681|           1|
|   4|-1.0|           0.03125|-0.25|              0|   0|      0|false|false|02ea782681|           1|
+----+----+------------------+-----+---------------+----+-------+-----+-----+----------+------------+
only showing top 5 rows

+----+-----------------+-----------------+---------------

In [19]:
# Report the baseline row counts, one line per dataset.
print(
    f"Number of defog observations: {defog_count}",
    f"Number of tdcsfog observations: {tdcsfog_count}",
    sep="\n",
)

Number of defog observations: 162907
Number of tdcsfog observations: 4682


## Join on Meta Data

In [18]:
# Preview the DeFog metadata (one row per recording Id).
defog_md.show(n=5)

+----------+-------+-----+----------+
|        Id|Subject|Visit|Medication|
+----------+-------+-----+----------+
|02ab235146| ab54e1|    2|        on|
|02ea782681| bf608b|    2|        on|
|06414383cf| c0b71e|    2|       off|
|092b4c1819| b6a627|    1|       off|
|0a900ed8a2| b7bd52|    2|        on|
+----------+-------+-----+----------+
only showing top 5 rows



In [23]:
# Preview the TDCSFog metadata; unlike the DeFog metadata it also carries a "Test" column.
tdcsfog_md.show(n=5)

+----------+-------+-----+----+----------+
|        Id|Subject|Visit|Test|Medication|
+----------+-------+-----+----+----------+
|003f117e14| 13abfd|    3|   2|        on|
|009ee11563| d81e3a|    4|   2|        on|
|011322847a| 203e85|    2|   2|        on|
|01d0fe7266| 203e85|    2|   1|       off|
|024418ba39| cecfb8|   19|   3|        on|
+----------+-------+-----+----+----------+
only showing top 5 rows



In [26]:
# Attach the recording-level metadata to each DeFog row.
# A left join keeps every sensor row even when its Id has no metadata entry.
defog1 = defog.join(other=defog_md, on=["Id"], how="left")
defog1.show(n=5)

+----------+----+----+------------------+-----+---------------+----+-------+-----+-----+------------+-------+-----+----------+
|        Id|Time|AccV|             AccML|AccAP|StartHesitation|Turn|Walking|Valid| Task|source_defog|Subject|Visit|Medication|
+----------+----+----+------------------+-----+---------------+----+-------+-----+-----+------------+-------+-----+----------+
|02ea782681|   0|-1.0|0.0441294600297506|-0.25|              0|   0|      0|false|false|           1| bf608b|    2|        on|
|02ea782681|   1|-1.0|0.0344313599752663|-0.25|              0|   0|      0|false|false|           1| bf608b|    2|        on|
|02ea782681|   2|-1.0|           0.03125|-0.25|              0|   0|      0|false|false|           1| bf608b|    2|        on|
|02ea782681|   3|-1.0|           0.03125|-0.25|              0|   0|      0|false|false|           1| bf608b|    2|        on|
|02ea782681|   4|-1.0|           0.03125|-0.25|              0|   0|      0|false|false|           1| bf608b|  

In [27]:
# Attach the recording-level metadata to each TDCSFog row.
# A left join keeps every sensor row even when its Id has no metadata entry.
tdcsfog1 = tdcsfog.join(other=tdcsfog_md, on=["Id"], how="left")
tdcsfog1.show(n=5)

+----------+----+-----------------+-----------------+-----------------+---------------+----+-------+------------+-------+-----+----+----------+
|        Id|Time|             AccV|            AccML|            AccAP|StartHesitation|Turn|Walking|source_defog|Subject|Visit|Test|Medication|
+----------+----+-----------------+-----------------+-----------------+---------------+----+-------+------------+-------+-----+----+----------+
|003f117e14|   0|-9.53393930253288|0.566321631981499|-1.41352531246173|              0|   0|      0|           0| 13abfd|    3|   2|        on|
|003f117e14|   1|-9.53614029997918|0.564136952175035| -1.4406209993301|              0|   0|      0|           0| 13abfd|    3|   2|        on|
|003f117e14|   2|-9.52934530245762|0.561764770716807|-1.42933154059721|              0|   0|      0|           0| 13abfd|    3|   2|        on|
|003f117e14|   3|-9.53123898558684|0.564227314497224|-1.41548975954121|              0|   0|      0|           0| 13abfd|    3|   2|    

In [28]:
# Sanity check: the metadata joins must neither drop nor duplicate rows.
print(
    f"Does defog post-join df have same number of observations as original? --> {defog1.count() == defog_count}",
    f"Does tdcsfog post-join df have same number of observations as original? --> {tdcsfog1.count() == tdcsfog_count}",
    sep="\n",
)

Does defog post-join df have same number of observations as original? --> True
Does tdcsfog post-join df have same number of observations as original? --> True


## Join on Subjects Meta Data

In [41]:
# Attach subject-level attributes.
# Both joins are inner joins (the default when `how` is omitted), so rows without a
# matching subject record are dropped; the row-count check in the next cell guards this.

# DeFog subjects are keyed by subject and visit.
defog2 = defog1.join(subjects, on=["Subject", "Visit"], how="inner")

# "Visit" field not available for tdcs subjects, so match on "Subject" alone.
# NOTE(review): if a subject had several rows in `subjects`, this would multiply rows —
# the count check would flag it.
subjects_without_visit = subjects.drop("Visit")
tdcsfog2 = tdcsfog1.join(subjects_without_visit, on=["Subject"], how="inner")

In [42]:
# Sanity check: the subject joins must neither drop nor duplicate rows.
print(
    f"Does defog post-join #2 df have same number of observations as original? --> {defog2.count() == defog_count}",
    f"Does tdcsfog post-join #2 df have same number of observations as original? --> {tdcsfog2.count() == tdcsfog_count}",
    sep="\n",
)

Does defog post-join #2 df have same number of observations as original? --> True
Does tdcsfog post-join #2 df have same number of observations as original? --> True


In [43]:
# Preview the DeFog rows after the subject attributes have been joined on.
defog2.show(n=5)

+-------+-----+----------+----+----+------------------+-----+---------------+----+-------+-----+-----+------------+----------+---+---+------------+-----------+------------+-----+
|Subject|Visit|        Id|Time|AccV|             AccML|AccAP|StartHesitation|Turn|Walking|Valid| Task|source_defog|Medication|Age|Sex|YearsSinceDx|UPDRSIII_On|UPDRSIII_Off|NFOGQ|
+-------+-----+----------+----+----+------------------+-----+---------------+----+-------+-----+-----+------------+----------+---+---+------------+-----------+------------+-----+
| bf608b|    2|02ea782681|   0|-1.0|0.0441294600297506|-0.25|              0|   0|      0|false|false|           1|        on| 67|  M|         7.0|         14|          57|   19|
| bf608b|    2|02ea782681|   1|-1.0|0.0344313599752663|-0.25|              0|   0|      0|false|false|           1|        on| 67|  M|         7.0|         14|          57|   19|
| bf608b|    2|02ea782681|   2|-1.0|           0.03125|-0.25|              0|   0|      0|false|false|   

In [45]:
# Preview the TDCSFog rows after the subject attributes have been joined on.
tdcsfog2.show(n=5)

+-------+----------+----+-----------------+-----------------+-----------------+---------------+----+-------+------------+-----+----+----------+---+---+------------+-----------+------------+-----+
|Subject|        Id|Time|             AccV|            AccML|            AccAP|StartHesitation|Turn|Walking|source_defog|Visit|Test|Medication|Age|Sex|YearsSinceDx|UPDRSIII_On|UPDRSIII_Off|NFOGQ|
+-------+----------+----+-----------------+-----------------+-----------------+---------------+----+-------+------------+-----+----+----------+---+---+------------+-----------+------------+-----+
| 13abfd|003f117e14|   0|-9.53393930253288|0.566321631981499|-1.41352531246173|              0|   0|      0|           0|    3|   2|        on| 68|  F|         9.0|         17|          15|   15|
| 13abfd|003f117e14|   1|-9.53614029997918|0.564136952175035| -1.4406209993301|              0|   0|      0|           0|    3|   2|        on| 68|  F|         9.0|         17|          15|   15|
| 13abfd|003f117e14|

## Combine files
Add the fields that each dataset is missing so that both share the same set of columns before they are combined.

In [48]:
# DeFog recordings have no "Test" column, so add one to match the TDCSFog schema.
# Use a real SQL NULL rather than lit('null'): the latter stores the four-character
# string "null", which isNull()/dropna() do not treat as missing and which is written
# to the CSV as literal text.
# The metadata CSVs are read without schema inference, so "Test" is a string column.
defog2 = defog2.withColumn("Test", lit(None).cast("string"))
defog2.show(5)

+-------+-----+----------+----+----+------------------+-----+---------------+----+-------+-----+-----+------------+----------+---+---+------------+-----------+------------+-----+----+
|Subject|Visit|        Id|Time|AccV|             AccML|AccAP|StartHesitation|Turn|Walking|Valid| Task|source_defog|Medication|Age|Sex|YearsSinceDx|UPDRSIII_On|UPDRSIII_Off|NFOGQ|Test|
+-------+-----+----------+----+----+------------------+-----+---------------+----+-------+-----+-----+------------+----------+---+---+------------+-----------+------------+-----+----+
| bf608b|    2|02ea782681|   0|-1.0|0.0441294600297506|-0.25|              0|   0|      0|false|false|           1|        on| 67|  M|         7.0|         14|          57|   19|null|
| bf608b|    2|02ea782681|   1|-1.0|0.0344313599752663|-0.25|              0|   0|      0|false|false|           1|        on| 67|  M|         7.0|         14|          57|   19|null|
| bf608b|    2|02ea782681|   2|-1.0|           0.03125|-0.25|              0|   

In [52]:
# TDCSFog recordings have no "Valid"/"Task" columns, so add them to match the DeFog schema.
# Use real SQL NULLs rather than lit('null'): the latter stores the four-character
# string "null", which isNull()/dropna() do not treat as missing and which is written
# to the CSV as literal text.
# Each NULL is cast to the type of the matching DeFog column so the two frames stay
# type-compatible when they are combined.
valid_type = defog2.schema["Valid"].dataType
task_type = defog2.schema["Task"].dataType
tdcsfog2 = (
    tdcsfog2
    .withColumn("Valid", lit(None).cast(valid_type))
    .withColumn("Task", lit(None).cast(task_type))
)
tdcsfog2.show(5)

+-------+----------+----+-----------------+-----------------+-----------------+---------------+----+-------+------------+-----+----+----------+---+---+------------+-----------+------------+-----+-----+----+
|Subject|        Id|Time|             AccV|            AccML|            AccAP|StartHesitation|Turn|Walking|source_defog|Visit|Test|Medication|Age|Sex|YearsSinceDx|UPDRSIII_On|UPDRSIII_Off|NFOGQ|Valid|Task|
+-------+----------+----+-----------------+-----------------+-----------------+---------------+----+-------+------------+-----+----+----------+---+---+------------+-----------+------------+-----+-----+----+
| 13abfd|003f117e14|   0|-9.53393930253288|0.566321631981499|-1.41352531246173|              0|   0|      0|           0|    3|   2|        on| 68|  F|         9.0|         17|          15|   15| null|null|
| 13abfd|003f117e14|   1|-9.53614029997918|0.564136952175035| -1.4406209993301|              0|   0|      0|           0|    3|   2|        on| 68|  F|         9.0|        

In [66]:
# Stack the DeFog and TDCSFog rows into one training frame.
# The two frames hold the same column names but in a different order (for example,
# "Visit" is the 2nd column of defog2 and the 11th of tdcsfog2). `union` pairs columns
# by position, so it would silently place TDCSFog values under the wrong headers.
# `unionByName` pairs them by column name instead.
fog = defog2.unionByName(tdcsfog2)
fog.show(5)

+-------+-----+----------+----+----+------------------+-----+---------------+----+-------+-----+-----+------------+----------+---+---+------------+-----------+------------+-----+----+
|Subject|Visit|        Id|Time|AccV|             AccML|AccAP|StartHesitation|Turn|Walking|Valid| Task|source_defog|Medication|Age|Sex|YearsSinceDx|UPDRSIII_On|UPDRSIII_Off|NFOGQ|Test|
+-------+-----+----------+----+----+------------------+-----+---------------+----+-------+-----+-----+------------+----------+---+---+------------+-----------+------------+-----+----+
| bf608b|    2|02ea782681|   0|-1.0|0.0441294600297506|-0.25|              0|   0|      0|false|false|           1|        on| 67|  M|         7.0|         14|          57|   19|null|
| bf608b|    2|02ea782681|   1|-1.0|0.0344313599752663|-0.25|              0|   0|      0|false|false|           1|        on| 67|  M|         7.0|         14|          57|   19|null|
| bf608b|    2|02ea782681|   2|-1.0|           0.03125|-0.25|              0|   

In [59]:
# Persist the combined training data to GCS as CSV part files with a header row.
# NOTE(review): no save mode is set, so Spark's default applies and a re-run against
# an existing output folder is expected to fail — confirm this is intended.
fog.write.csv(
    "gs://msca-bdp-student-gcs/parkinsons_data/train/processed/",
    header=True,
)

                                                                                

In [89]:
# List the blobs under the processed-output prefix to confirm the write produced files.
fog_path = "parkinsons_data/train/processed"
fog_files = list_blobs("msca-bdp-student-gcs", string_match=fog_path)

In [90]:
# Display the listed blob names (the output shows a _SUCCESS marker plus the CSV part files).
fog_files

['/',
 '/_SUCCESS',
 '/part-00000-a4d71fbb-fb61-410c-8a1e-3bf4fc19d28b-c000.csv',
 '/part-00001-a4d71fbb-fb61-410c-8a1e-3bf4fc19d28b-c000.csv',
 '/part-00002-a4d71fbb-fb61-410c-8a1e-3bf4fc19d28b-c000.csv',
 '/part-00003-a4d71fbb-fb61-410c-8a1e-3bf4fc19d28b-c000.csv']