# Feeder
## Contents
1) Feed and combine DeFog files
2) Join on metadata
3) Join on subjects data
4) Combine DeFog into complete fog training data.

In [10]:
import os 
from utils.utils import *

# from fog.code.utils.utils import *
from pyspark.sql.functions import lit, col
from pyspark.sql.types import StringType, BooleanType, IntegerType, FloatType, DateType
spark = SparkSession.builder.appName("PySpark Cloud Test").getOrCreate()
# conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '16g'), ('spark.app.name', 'Spark Updated Conf'), ('spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory','16g')])

In [11]:
spark.conf.set( "spark.sql.crossJoin.enabled" , "true" ) # enable left outer join

In [12]:
spark.sparkContext.getConf().getAll()

[('spark.sql.catalogImplementation', 'hive'),
 ('spark.rpc.message.maxSize', '512'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.port', '39731'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.id', 'local-1684349063489'),
 ('spark.app.name', 'PySpark Cloud Test'),
 ('spark.driver.host', 'd472094d0485'),
 ('spark.ui.showConsoleProgress', 'true')]

## Feed and Combine DeFog Files


In [13]:
defog_path = 'parkinsons_data/train/defog/'
defog_files = list_blobs("msca-bdp-student-gcs", string_match=defog_path)


In [14]:
defog_md = spark.read.csv("gs://msca-bdp-student-gcs/parkinsons_data/defog_metadata.csv", header=True)
subjects = spark.read.csv("gs://msca-bdp-student-gcs/parkinsons_data/subjects.csv", header=True)

In [15]:
defog = feed_files(defog_files, prefix=defog_path, v=1, spark=spark)

File 1 of 91
File 26 of 91
File 51 of 91
File 76 of 91


In [16]:
# add data source flag: 1=defog, 0=tdcsfog
defog = defog.withColumn("SourceDefog", lit(1))

In [17]:
defog_count = defog.count()

                                                                                

In [18]:
defog.show(5)

+----+----+------------------+-----+---------------+----+-------+-----+-----+----------+-----------+
|Time|AccV|             AccML|AccAP|StartHesitation|Turn|Walking|Valid| Task|        Id|SourceDefog|
+----+----+------------------+-----+---------------+----+-------+-----+-----+----------+-----------+
|   0|-1.0|0.0441294600297506|-0.25|              0|   0|      0|false|false|02ea782681|          1|
|   1|-1.0|0.0344313599752663|-0.25|              0|   0|      0|false|false|02ea782681|          1|
|   2|-1.0|           0.03125|-0.25|              0|   0|      0|false|false|02ea782681|          1|
|   3|-1.0|           0.03125|-0.25|              0|   0|      0|false|false|02ea782681|          1|
|   4|-1.0|           0.03125|-0.25|              0|   0|      0|false|false|02ea782681|          1|
+----+----+------------------+-----+---------------+----+-------+-----+-----+----------+-----------+
only showing top 5 rows



In [19]:
print(f"Number of defog observations: {defog_count}")

Number of defog observations: 13525702


## Join on Meta Data

In [20]:
defog_md.show(5)

+----------+-------+-----+----------+
|        Id|Subject|Visit|Medication|
+----------+-------+-----+----------+
|02ab235146| ab54e1|    2|        on|
|02ea782681| bf608b|    2|        on|
|06414383cf| c0b71e|    2|       off|
|092b4c1819| b6a627|    1|       off|
|0a900ed8a2| b7bd52|    2|        on|
+----------+-------+-----+----------+
only showing top 5 rows



In [21]:
defog1 = defog.join(defog_md, on="Id", how="left")
#defog1.show(5)

In [22]:
print(f"Does defog post-join df have same number of observations as original? --> {defog1.count() == defog_count}")



Does defog post-join df have same number of observations as original? --> True


                                                                                

## Join on Subjects Meta Data

In [23]:
defog2 = defog1.join(subjects, on=["Subject", "Visit"])

In [24]:
print(f"Does defog post-join #2 df have same number of observations as original? --> {defog2.count() == defog_count}")



Does defog post-join #2 df have same number of observations as original? --> True


                                                                                

In [25]:
defog2.show(5)

+-------+-----+----------+----+----+------------------+-----+---------------+----+-------+-----+-----+-----------+----------+---+---+------------+-----------+------------+-----+
|Subject|Visit|        Id|Time|AccV|             AccML|AccAP|StartHesitation|Turn|Walking|Valid| Task|SourceDefog|Medication|Age|Sex|YearsSinceDx|UPDRSIII_On|UPDRSIII_Off|NFOGQ|
+-------+-----+----------+----+----+------------------+-----+---------------+----+-------+-----+-----+-----------+----------+---+---+------------+-----------+------------+-----+
| bf608b|    2|02ea782681|   0|-1.0|0.0441294600297506|-0.25|              0|   0|      0|false|false|          1|        on| 67|  M|         7.0|         14|          57|   19|
| bf608b|    2|02ea782681|   1|-1.0|0.0344313599752663|-0.25|              0|   0|      0|false|false|          1|        on| 67|  M|         7.0|         14|          57|   19|
| bf608b|    2|02ea782681|   2|-1.0|           0.03125|-0.25|              0|   0|      0|false|false|        

## Combine files
Add in missing fields for each

In [26]:
defog_Hz = 100
defog2 = defog2.withColumn("Test", lit('null')) \
    .withColumn("TimeSeconds", convert_time(col("Time").cast(IntegerType()), Hz=defog_Hz))


In [27]:
defog2.select(["Time", "TimeSeconds"]).show(6)

+----+-----------+
|Time|TimeSeconds|
+----+-----------+
|   0|        0.0|
|   1|       0.01|
|   2|       0.02|
|   3|       0.03|
|   4|       0.04|
|   5|       0.05|
+----+-----------+
only showing top 6 rows



In [28]:
defog2.select(["Time", "TimeSeconds"]).show(6)

+----+-----------+
|Time|TimeSeconds|
+----+-----------+
|   0|        0.0|
|   1|       0.01|
|   2|       0.02|
|   3|       0.03|
|   4|       0.04|
|   5|       0.05|
+----+-----------+
only showing top 6 rows



In [31]:
defog2.show(1)

+-------+-----+----------+----+----+------------------+-----+---------------+----+-------+-----+-----+-----------+----------+---+---+------------+-----------+------------+-----+----+-----------+
|Subject|Visit|        Id|Time|AccV|             AccML|AccAP|StartHesitation|Turn|Walking|Valid| Task|SourceDefog|Medication|Age|Sex|YearsSinceDx|UPDRSIII_On|UPDRSIII_Off|NFOGQ|Test|TimeSeconds|
+-------+-----+----------+----+----+------------------+-----+---------------+----+-------+-----+-----+-----------+----------+---+---+------------+-----------+------------+-----+----+-----------+
| bf608b|    2|02ea782681|   0|-1.0|0.0441294600297506|-0.25|              0|   0|      0|false|false|          1|        on| 67|  M|         7.0|         14|          57|   19|null|        0.0|
+-------+-----+----------+----+----+------------------+-----+---------------+----+-------+-----+-----+-----------+----------+---+---+------------+-----------+------------+-----+----+-----------+
only showing top 1 row



In [32]:
defog2.printSchema()

root
 |-- Subject: string (nullable = true)
 |-- Visit: string (nullable = true)
 |-- Id: string (nullable = false)
 |-- Time: string (nullable = true)
 |-- AccV: string (nullable = true)
 |-- AccML: string (nullable = true)
 |-- AccAP: string (nullable = true)
 |-- StartHesitation: string (nullable = true)
 |-- Turn: string (nullable = true)
 |-- Walking: string (nullable = true)
 |-- Valid: string (nullable = true)
 |-- Task: string (nullable = true)
 |-- SourceDefog: integer (nullable = false)
 |-- Medication: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- YearsSinceDx: string (nullable = true)
 |-- UPDRSIII_On: string (nullable = true)
 |-- UPDRSIII_Off: string (nullable = true)
 |-- NFOGQ: string (nullable = true)
 |-- Test: string (nullable = false)
 |-- TimeSeconds: double (nullable = true)



In [33]:
# convert data types
defog2 = defog2.withColumn("AccV", col("AccV").cast(FloatType())) \
    .withColumn("AccML", col("AccML").cast(FloatType())) \
    .withColumn("AccAP", col("AccAP").cast(FloatType())) \
    .withColumn("StartHesitation", col("StartHesitation").cast(IntegerType())) \
    .withColumn("Turn", col("Turn").cast(IntegerType())) \
    .withColumn("Walking", col("Walking").cast(IntegerType())) \
    .withColumn("StartHesitation", col("StartHesitation").cast(IntegerType())) \
    .withColumn("Valid", col("Valid").cast(BooleanType())) \
    .withColumn("Task", col("Task").cast(BooleanType())) \
    .withColumn("SourceDefog", col("SourceDefog").cast(IntegerType())) \
    .withColumn("Age", col("Age").cast(IntegerType())) \
    .withColumn("YearsSinceDx", col("YearsSinceDx").cast(IntegerType())) \
    .withColumn("UPDRSIII_On", col("UPDRSIII_On").cast(IntegerType())) \
    .withColumn("UPDRSIII_Off", col("UPDRSIII_Off").cast(IntegerType())) \
    .withColumn("NFOGQ", col("NFOGQ").cast(IntegerType()))
                

In [34]:
defog2.printSchema()

root
 |-- Subject: string (nullable = true)
 |-- Visit: string (nullable = true)
 |-- Id: string (nullable = false)
 |-- Time: string (nullable = true)
 |-- AccV: float (nullable = true)
 |-- AccML: float (nullable = true)
 |-- AccAP: float (nullable = true)
 |-- StartHesitation: integer (nullable = true)
 |-- Turn: integer (nullable = true)
 |-- Walking: integer (nullable = true)
 |-- Valid: boolean (nullable = true)
 |-- Task: boolean (nullable = true)
 |-- SourceDefog: integer (nullable = false)
 |-- Medication: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- YearsSinceDx: integer (nullable = true)
 |-- UPDRSIII_On: integer (nullable = true)
 |-- UPDRSIII_Off: integer (nullable = true)
 |-- NFOGQ: integer (nullable = true)
 |-- Test: string (nullable = false)
 |-- TimeSeconds: double (nullable = true)



In [35]:
row_count = defog2.count()

                                                                                

In [36]:
#fog.write.option("header",True).parquet("gs://msca-bdp-student-gcs/parkinsons_data/train/processed/")
defog2.write.format("parquet").mode("overwrite").save("gs://msca-bdp-student-gcs/parkinsons_data/train/processed/defog")


                                                                                

In [37]:
# check files wrote out
fog_path = "parkinsons_data/train/processed/defog"
fog_files = list_blobs("msca-bdp-student-gcs", string_match=fog_path)

In [38]:
fog_files

['/',
 '/_SUCCESS',
 '/part-00000-a50e3852-5fe1-4e47-b2de-a30cd5f99ce6-c000.snappy.parquet',
 '/part-00001-a50e3852-5fe1-4e47-b2de-a30cd5f99ce6-c000.snappy.parquet',
 '/part-00002-a50e3852-5fe1-4e47-b2de-a30cd5f99ce6-c000.snappy.parquet',
 '/part-00003-a50e3852-5fe1-4e47-b2de-a30cd5f99ce6-c000.snappy.parquet',
 '/part-00004-a50e3852-5fe1-4e47-b2de-a30cd5f99ce6-c000.snappy.parquet',
 '/part-00005-a50e3852-5fe1-4e47-b2de-a30cd5f99ce6-c000.snappy.parquet',
 '/part-00006-a50e3852-5fe1-4e47-b2de-a30cd5f99ce6-c000.snappy.parquet',
 '/part-00007-a50e3852-5fe1-4e47-b2de-a30cd5f99ce6-c000.snappy.parquet',
 '/part-00008-a50e3852-5fe1-4e47-b2de-a30cd5f99ce6-c000.snappy.parquet',
 '/part-00009-a50e3852-5fe1-4e47-b2de-a30cd5f99ce6-c000.snappy.parquet',
 '/part-00010-a50e3852-5fe1-4e47-b2de-a30cd5f99ce6-c000.snappy.parquet',
 '/part-00011-a50e3852-5fe1-4e47-b2de-a30cd5f99ce6-c000.snappy.parquet',
 '/part-00012-a50e3852-5fe1-4e47-b2de-a30cd5f99ce6-c000.snappy.parquet',
 '/part-00013-a50e3852-5fe1-4e4

In [39]:
k = spark.read.csv("gs://msca-bdp-student-gcs/parkinsons_data/train/tdcsfog/0a89f859b5.csv", inferSchema=True, header=True)

In [40]:
k.count()

9155