# Feeder
## Contents
1) Feed and combine DeFog/TDCSFog files
2) Join on metadata
3) Join on subjects data
4) Combine DeFog and TDCSFog into complete fog training data.

In [31]:
import os 
from utils.utils import *
from pyspark.sql.functions import lit, col
from pyspark.sql.types import StringType, BooleanType, IntegerType, FloatType, DateType
spark = SparkSession.builder.appName("PySpark Cloud Test").getOrCreate()

In [2]:
spark.conf.set( "spark.sql.crossJoin.enabled" , "true" ) # enable left outer join

In [3]:
spark.sparkContext.getConf().getAll()

[('spark.driver.port', '42795'),
 ('spark.sql.catalogImplementation', 'hive'),
 ('spark.driver.host', '7a62544a6b07'),
 ('spark.rpc.message.maxSize', '512'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.id', 'local-1683234607479'),
 ('spark.app.name', 'PySpark Cloud Test'),
 ('spark.ui.showConsoleProgress', 'true')]

## Feed and Combine DeFog/TDCSFog files

In [4]:
defog_path = 'parkinsons_data/train/defog/'
tdcsfog_path = 'parkinsons_data/train/tdcsfog/'
defog_files = list_blobs("msca-bdp-student-gcs", string_match=defog_path)
tdcsfog_files = list_blobs("msca-bdp-student-gcs", string_match=tdcsfog_path)

In [5]:
defog_md = spark.read.csv("gs://msca-bdp-student-gcs/parkinsons_data/defog_metadata.csv", header=True)
tdcsfog_md = spark.read.csv("gs://msca-bdp-student-gcs/parkinsons_data/tdcsfog_metadata.csv", header=True)
subjects = spark.read.csv("gs://msca-bdp-student-gcs/parkinsons_data/subjects.csv", header=True)

                                                                                

In [45]:
defog = feed_files(defog_files, prefix=defog_path, v=1, spark=spark)

File 1 of 91
File 26 of 91
File 51 of 91
File 76 of 91


In [46]:
tdcsfog = feed_files(tdcsfog_files, prefix=tdcsfog_path, v=1, spark=spark)

File 1 of 833
File 26 of 833
File 51 of 833
File 76 of 833
File 101 of 833
File 126 of 833
File 151 of 833
File 176 of 833
File 201 of 833
File 226 of 833
File 251 of 833
File 276 of 833
File 301 of 833
File 326 of 833
File 351 of 833
File 376 of 833
File 401 of 833
File 426 of 833
File 451 of 833
File 476 of 833
File 501 of 833
File 526 of 833
File 551 of 833
File 576 of 833
File 601 of 833
File 626 of 833
File 651 of 833
File 676 of 833
File 701 of 833
File 726 of 833
File 751 of 833
File 776 of 833
File 801 of 833
File 826 of 833


In [47]:
# add data source flag: 1=defog, 0=tdcsfog
defog = defog.withColumn("SourceDefog", lit(1))
tdcsfog = tdcsfog.withColumn("SourceDefog", lit(0))

In [48]:
defog_count = defog.count()
tdcsfog_count = tdcsfog.count()

In [49]:
defog.show(5)
tdcsfog.show(5)

+----+----+------------------+-----+---------------+----+-------+-----+-----+----------+-----------+
|Time|AccV|             AccML|AccAP|StartHesitation|Turn|Walking|Valid| Task|        Id|SourceDefog|
+----+----+------------------+-----+---------------+----+-------+-----+-----+----------+-----------+
|   0|-1.0|0.0441294600297506|-0.25|              0|   0|      0|false|false|02ea782681|          1|
|   1|-1.0|0.0344313599752663|-0.25|              0|   0|      0|false|false|02ea782681|          1|
|   2|-1.0|           0.03125|-0.25|              0|   0|      0|false|false|02ea782681|          1|
|   3|-1.0|           0.03125|-0.25|              0|   0|      0|false|false|02ea782681|          1|
|   4|-1.0|           0.03125|-0.25|              0|   0|      0|false|false|02ea782681|          1|
+----+----+------------------+-----+---------------+----+-------+-----+-----+----------+-----------+
only showing top 5 rows

+----+-----------------+-----------------+-----------------+------

In [50]:
print(f"Number of defog observations: {defog_count}")
print(f"Number of tdcsfog observations: {tdcsfog_count}")

Number of defog observations: 162907
Number of tdcsfog observations: 4682


## Join on Meta Data

In [51]:
defog_md.show(5)

+----------+-------+-----+----------+
|        Id|Subject|Visit|Medication|
+----------+-------+-----+----------+
|02ab235146| ab54e1|    2|        on|
|02ea782681| bf608b|    2|        on|
|06414383cf| c0b71e|    2|       off|
|092b4c1819| b6a627|    1|       off|
|0a900ed8a2| b7bd52|    2|        on|
+----------+-------+-----+----------+
only showing top 5 rows



In [52]:
tdcsfog_md.show(5)

+----------+-------+-----+----+----------+
|        Id|Subject|Visit|Test|Medication|
+----------+-------+-----+----+----------+
|003f117e14| 13abfd|    3|   2|        on|
|009ee11563| d81e3a|    4|   2|        on|
|011322847a| 203e85|    2|   2|        on|
|01d0fe7266| 203e85|    2|   1|       off|
|024418ba39| cecfb8|   19|   3|        on|
+----------+-------+-----+----+----------+
only showing top 5 rows



In [53]:
defog1 = defog.join(defog_md, on="Id", how="left")
defog1.show(5)

+----------+----+----+------------------+-----+---------------+----+-------+-----+-----+-----------+-------+-----+----------+
|        Id|Time|AccV|             AccML|AccAP|StartHesitation|Turn|Walking|Valid| Task|SourceDefog|Subject|Visit|Medication|
+----------+----+----+------------------+-----+---------------+----+-------+-----+-----+-----------+-------+-----+----------+
|02ea782681|   0|-1.0|0.0441294600297506|-0.25|              0|   0|      0|false|false|          1| bf608b|    2|        on|
|02ea782681|   1|-1.0|0.0344313599752663|-0.25|              0|   0|      0|false|false|          1| bf608b|    2|        on|
|02ea782681|   2|-1.0|           0.03125|-0.25|              0|   0|      0|false|false|          1| bf608b|    2|        on|
|02ea782681|   3|-1.0|           0.03125|-0.25|              0|   0|      0|false|false|          1| bf608b|    2|        on|
|02ea782681|   4|-1.0|           0.03125|-0.25|              0|   0|      0|false|false|          1| bf608b|    2|    

In [54]:
tdcsfog1 = tdcsfog.join(tdcsfog_md, on="Id", how="left")
tdcsfog1.show(5)

+----------+----+-----------------+-----------------+-----------------+---------------+----+-------+-----------+-------+-----+----+----------+
|        Id|Time|             AccV|            AccML|            AccAP|StartHesitation|Turn|Walking|SourceDefog|Subject|Visit|Test|Medication|
+----------+----+-----------------+-----------------+-----------------+---------------+----+-------+-----------+-------+-----+----+----------+
|003f117e14|   0|-9.53393930253288|0.566321631981499|-1.41352531246173|              0|   0|      0|          0| 13abfd|    3|   2|        on|
|003f117e14|   1|-9.53614029997918|0.564136952175035| -1.4406209993301|              0|   0|      0|          0| 13abfd|    3|   2|        on|
|003f117e14|   2|-9.52934530245762|0.561764770716807|-1.42933154059721|              0|   0|      0|          0| 13abfd|    3|   2|        on|
|003f117e14|   3|-9.53123898558684|0.564227314497224|-1.41548975954121|              0|   0|      0|          0| 13abfd|    3|   2|        on|

In [55]:
print(f"Does defog post-join df have same number of observations as original? --> {defog1.count() == defog_count}")
print(f"Does tdcsfog post-join df have same number of observations as original? --> {tdcsfog1.count() == tdcsfog_count}")

Does defog post-join df have same number of observations as original? --> True
Does tdcsfog post-join df have same number of observations as original? --> True


## Join on Subjects Meta Data

In [56]:
defog2 = defog1.join(subjects, on=["Subject", "Visit"])
tdcsfog2 = tdcsfog1.join(subjects.drop("Visit"), on=["Subject"]) # "Visit" field not available for tdcs subjects 

In [57]:
print(f"Does defog post-join #2 df have same number of observations as original? --> {defog2.count() == defog_count}")
print(f"Does tdcsfog post-join #2 df have same number of observations as original? --> {tdcsfog2.count() == tdcsfog_count}")

Does defog post-join #2 df have same number of observations as original? --> True
Does tdcsfog post-join #2 df have same number of observations as original? --> True


In [58]:
defog2.show(5)

+-------+-----+----------+----+----+------------------+-----+---------------+----+-------+-----+-----+-----------+----------+---+---+------------+-----------+------------+-----+
|Subject|Visit|        Id|Time|AccV|             AccML|AccAP|StartHesitation|Turn|Walking|Valid| Task|SourceDefog|Medication|Age|Sex|YearsSinceDx|UPDRSIII_On|UPDRSIII_Off|NFOGQ|
+-------+-----+----------+----+----+------------------+-----+---------------+----+-------+-----+-----+-----------+----------+---+---+------------+-----------+------------+-----+
| bf608b|    2|02ea782681|   0|-1.0|0.0441294600297506|-0.25|              0|   0|      0|false|false|          1|        on| 67|  M|         7.0|         14|          57|   19|
| bf608b|    2|02ea782681|   1|-1.0|0.0344313599752663|-0.25|              0|   0|      0|false|false|          1|        on| 67|  M|         7.0|         14|          57|   19|
| bf608b|    2|02ea782681|   2|-1.0|           0.03125|-0.25|              0|   0|      0|false|false|        

In [59]:
tdcsfog2.show(5)

+-------+----------+----+-----------------+-----------------+-----------------+---------------+----+-------+-----------+-----+----+----------+---+---+------------+-----------+------------+-----+
|Subject|        Id|Time|             AccV|            AccML|            AccAP|StartHesitation|Turn|Walking|SourceDefog|Visit|Test|Medication|Age|Sex|YearsSinceDx|UPDRSIII_On|UPDRSIII_Off|NFOGQ|
+-------+----------+----+-----------------+-----------------+-----------------+---------------+----+-------+-----------+-----+----+----------+---+---+------------+-----------+------------+-----+
| 13abfd|003f117e14|   0|-9.53393930253288|0.566321631981499|-1.41352531246173|              0|   0|      0|          0|    3|   2|        on| 68|  F|         9.0|         17|          15|   15|
| 13abfd|003f117e14|   1|-9.53614029997918|0.564136952175035| -1.4406209993301|              0|   0|      0|          0|    3|   2|        on| 68|  F|         9.0|         17|          15|   15|
| 13abfd|003f117e14|   2|

## Combine files
Add in missing fields for each

In [60]:
defog_Hz = 100
defog2 = defog2.withColumn("Test", lit('null')) \
    .withColumn("TimeSeconds", convert_time(col("Time").cast(IntegerType()), Hz=defog_Hz))


In [61]:
defog2.select(["Time", "TimeSeconds"]).show(6)

+----+-----------+
|Time|TimeSeconds|
+----+-----------+
|   0|        0.0|
|   1|       0.01|
|   2|       0.02|
|   3|       0.03|
|   4|       0.04|
|   5|       0.05|
+----+-----------+
only showing top 6 rows



In [62]:
tdcsfog2_Hz = 128
tdcsfog2 = tdcsfog2.withColumn("Valid", lit('null')) \
    .withColumn("Task", lit('null')) \
    .withColumn("TimeSeconds", convert_time(col("Time").cast(IntegerType()), Hz=tdcsfog2_Hz))

In [63]:
defog2.select(["Time", "TimeSeconds"]).show(6)

+----+-----------+
|Time|TimeSeconds|
+----+-----------+
|   0|        0.0|
|   1|       0.01|
|   2|       0.02|
|   3|       0.03|
|   4|       0.04|
|   5|       0.05|
+----+-----------+
only showing top 6 rows



In [None]:
tdcsfog2 = tdcsfog2.select(defog2.columns)
if defog2.columns == tdcsfog2.columns:  
    fog = defog2.union(tdcsfog2)
    fog.show(5)
else:
    print("ERROR - MAKE SURE COLUMNS OF TWO DATASETS ARE EQUAL BEFORE PERFORMING UNION")

In [64]:
fog = defog2.union(tdcsfog2)
fog.show(5)

+-------+-----+----------+----+----+------------------+-----+---------------+----+-------+-----+-----+-----------+----------+---+---+------------+-----------+------------+-----+----+-----------+
|Subject|Visit|        Id|Time|AccV|             AccML|AccAP|StartHesitation|Turn|Walking|Valid| Task|SourceDefog|Medication|Age|Sex|YearsSinceDx|UPDRSIII_On|UPDRSIII_Off|NFOGQ|Test|TimeSeconds|
+-------+-----+----------+----+----+------------------+-----+---------------+----+-------+-----+-----+-----------+----------+---+---+------------+-----------+------------+-----+----+-----------+
| bf608b|    2|02ea782681|   0|-1.0|0.0441294600297506|-0.25|              0|   0|      0|false|false|          1|        on| 67|  M|         7.0|         14|          57|   19|null|        0.0|
| bf608b|    2|02ea782681|   1|-1.0|0.0344313599752663|-0.25|              0|   0|      0|false|false|          1|        on| 67|  M|         7.0|         14|          57|   19|null|       0.01|
| bf608b|    2|02ea782681

In [65]:
fog.printSchema()

root
 |-- Subject: string (nullable = true)
 |-- Visit: string (nullable = true)
 |-- Id: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- AccV: string (nullable = true)
 |-- AccML: string (nullable = true)
 |-- AccAP: string (nullable = true)
 |-- StartHesitation: string (nullable = true)
 |-- Turn: string (nullable = true)
 |-- Walking: string (nullable = true)
 |-- Valid: string (nullable = true)
 |-- Task: string (nullable = true)
 |-- SourceDefog: string (nullable = true)
 |-- Medication: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- YearsSinceDx: string (nullable = true)
 |-- UPDRSIII_On: string (nullable = true)
 |-- UPDRSIII_Off: string (nullable = true)
 |-- NFOGQ: string (nullable = true)
 |-- Test: string (nullable = false)
 |-- TimeSeconds: double (nullable = true)



In [66]:
# convert data types
fog = fog.withColumn("AccV", col("AccV").cast(FloatType())) \
    .withColumn("AccML", col("AccML").cast(FloatType())) \
    .withColumn("AccAP", col("AccAP").cast(FloatType())) \
    .withColumn("StartHesitation", col("StartHesitation").cast(IntegerType())) \
    .withColumn("Turn", col("Turn").cast(IntegerType())) \
    .withColumn("Walking", col("Walking").cast(IntegerType())) \
    .withColumn("StartHesitation", col("StartHesitation").cast(IntegerType())) \
    .withColumn("Valid", col("Valid").cast(BooleanType())) \
    .withColumn("Task", col("Task").cast(BooleanType())) \
    .withColumn("SourceDefog", col("SourceDefog").cast(IntegerType())) \
    .withColumn("Age", col("Age").cast(IntegerType())) \
    .withColumn("YearsSinceDx", col("YearsSinceDx").cast(IntegerType())) \
    .withColumn("UPDRSIII_On", col("UPDRSIII_On").cast(IntegerType())) \
    .withColumn("UPDRSIII_Off", col("UPDRSIII_Off").cast(IntegerType())) \
    .withColumn("NFOGQ", col("NFOGQ").cast(IntegerType()))



                

In [67]:
fog.printSchema()

root
 |-- Subject: string (nullable = true)
 |-- Visit: string (nullable = true)
 |-- Id: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- AccV: float (nullable = true)
 |-- AccML: float (nullable = true)
 |-- AccAP: float (nullable = true)
 |-- StartHesitation: integer (nullable = true)
 |-- Turn: integer (nullable = true)
 |-- Walking: integer (nullable = true)
 |-- Valid: boolean (nullable = true)
 |-- Task: boolean (nullable = true)
 |-- SourceDefog: integer (nullable = true)
 |-- Medication: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- YearsSinceDx: integer (nullable = true)
 |-- UPDRSIII_On: integer (nullable = true)
 |-- UPDRSIII_Off: integer (nullable = true)
 |-- NFOGQ: integer (nullable = true)
 |-- Test: string (nullable = false)
 |-- TimeSeconds: double (nullable = true)



In [72]:
fog.write.option("header",True) \
    .csv("gs://msca-bdp-student-gcs/parkinsons_data/train/processed/")

                                                                                

In [73]:
# check files wrote out
fog_path = "parkinsons_data/train/processed"
fog_files = list_blobs("msca-bdp-student-gcs", string_match=fog_path)

In [74]:
fog_files

['/',
 '/_SUCCESS',
 '/part-00000-559c010c-9dc3-4566-ba1f-2e70aa86e661-c000.csv',
 '/part-00001-559c010c-9dc3-4566-ba1f-2e70aa86e661-c000.csv',
 '/part-00002-559c010c-9dc3-4566-ba1f-2e70aa86e661-c000.csv',
 '/part-00003-559c010c-9dc3-4566-ba1f-2e70aa86e661-c000.csv']