### Read in and Combine Daily Data (stored as parquet files)

In [2]:
import os
# from utils.utils import *

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from google.cloud import storage
from pyspark.sql.functions import max, col, count, \
    lit, countDistinct, first
from pyspark.sql.types import FloatType

import matplotlib.pyplot as plt
%matplotlib inline

import time

In [3]:
start_time = time.time()

spark = SparkSession.builder.appName('ParquetExample').getOrCreate()
spark
# conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '4g'), ('spark.app.name', 'Spark Updated Conf'), ('spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory','4g')])
spark.sparkContext.getConf().getAll()

[('spark.stage.maxConsecutiveAttempts', '10'),
 ('spark.dynamicAllocation.minExecutors', '1'),
 ('spark.ui.proxyBase', '/proxy/application_1683664764646_0023'),
 ('spark.submit.pyFiles',
  '/root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,/root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,/root/.ivy2/jars/com.typesafe_config-1.4.2.jar,/root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,/root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,/root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,/root/.ivy2/jars/com.navigamez_greex-1.0.jar,/root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,/root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,/root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,/root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,/root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,/root/.ivy2/jars/com.google.guava_listenablefuture-99

### Paths and Data read

In [4]:
defog_metadata = spark.read.csv("gs://msca-bdp-student-gcs/parkinsons_data/defog_metadata.csv", header=True)
subjects = spark.read.csv("gs://msca-bdp-student-gcs/parkinsons_data/subjects.csv", header=True)
daily_metadata = spark.read.csv("gs://msca-bdp-student-gcs/parkinsons_data/daily_metadata.csv", header=True)

                                                                                

In [5]:
daily_path = "parkinsons_data/unlabeled"
top_bucket_name = "msca-bdp-student-gcs"
# full_path = "msca-bdp-student-gcs/parkinsons_data/unlabeled"

Read in 65 parquet files for daily data

In [6]:
# https://cloud.google.com/storage/docs/samples/storage-list-files#storage_list_files-python
# add file name as column - https://sparkbyexamples.com/pyspark/pyspark-lit-add-literal-constant/
# https://www.geeksforgeeks.org/how-to-union-multiple-dataframe-in-pyspark/

client = storage.Client()
blobs = client.list_blobs(top_bucket_name, prefix = daily_path)

daily_data = None
for blob in blobs:
    file_name = blob.name.split("/")[-1].split(".")[0]
    print("file name:", file_name)
    df = spark.read.parquet(f"gs://{top_bucket_name}/{blob.name}")
    df = df.withColumn("Id", lit(file_name))
    if daily_data is None:
        daily_data = df
    else:
        daily_data = daily_data.union(df)

   

file name: 00c4c9313d


                                                                                

file name: 07a96f89ec
file name: 0d1bc672a8
file name: 0e333c9833
file name: 164adaed7b
file name: 17e0c0dc86
file name: 1c3719ea59
file name: 1cf80df2d6
file name: 24016102f2
file name: 276630050d
file name: 28e6c306ba
file name: 2caa348298
file name: 32bdbc35a0
file name: 3ae6b0f79f
file name: 3bd159ded0
file name: 3f51a63612
file name: 3fc03f01ed
file name: 3fe2624b51
file name: 40bf6c162f
file name: 418a1ca2c1
file name: 43ac46d679
file name: 48081794eb
file name: 48b636e0f5
file name: 4b84027351
file name: 4e44a97a85
file name: 52fd07ea27
file name: 5535c94fc9
file name: 57741bad42
file name: 5bf570bb7b
file name: 5e13d48878
file name: 6e0303484e
file name: 6ed2f109c3
file name: 74f1e1e0ba
file name: 7ab610bb34
file name: 831c13620e
file name: 88f67f91db
file name: 8959244e1c
file name: 8ca674a988
file name: 924e997065
file name: 93abd37fee
file name: 96f57b4a40
file name: 9da3e3dc66
file name: 9fb7805d99
file name: a213c90b02
file name: b15168b788
file name: b18354d4aa
file name:

AnalysisException: Unable to infer schema for Parquet. It must be specified manually.

In [7]:
daily_data.printSchema()

root
 |-- Time: long (nullable = true)
 |-- AccV: double (nullable = true)
 |-- AccML: double (nullable = true)
 |-- AccAP: double (nullable = true)
 |-- Id: string (nullable = false)



In [8]:
daily_data.show(5)



+----+------------------+------------------+-----------------+----------+
|Time|              AccV|             AccML|            AccAP|        Id|
+----+------------------+------------------+-----------------+----------+
|   0|          0.328125|         -0.109375|         0.671875|00c4c9313d|
|   1| 0.453107990150706|  -0.1247208674257|0.811273150079803|00c4c9313d|
|   2| 0.423042391192052|-0.264046005447829|0.921238212647563|00c4c9313d|
|   3| 0.150014724987375|-0.310240837149531|0.937482659979879|00c4c9313d|
|   4|-0.202003096762013|-0.545907654638822|0.890842282170504|00c4c9313d|
+----+------------------+------------------+-----------------+----------+
only showing top 5 rows



                                                                                

In [9]:
#display number of records by partition

def displaypartitions(df):
    #number of records by partition
    num = df.rdd.getNumPartitions()
    print("Partitions:", num)
    df.withColumn("partitionId", F.spark_partition_id())\
        .groupBy("partitionId")\
        .count()\
        .orderBy(F.asc("count"))\
        .show(num)
    
# displaypartitions(daily_data)

In [None]:
# consider repartitioning by id

In [None]:
# daily_data = daily_data.repartition(F.col("Id"))
# displaypartitions(daily_data)

In [10]:
# Check number of ids = 65
num_ids = daily_data.select("Id").distinct().count()
print(f"Number of unique ids: {num_ids}")




Number of unique ids: 65


                                                                                

In [None]:
# max_time = daily_data.select(max("Time")).collect()[0][0]
# max_time


In [None]:
# max_AccV = daily_data.select(max("AccV")).collect()[0][0]
# max_AccV

In [11]:
daily_data.printSchema()

root
 |-- Time: long (nullable = true)
 |-- AccV: double (nullable = true)
 |-- AccML: double (nullable = true)
 |-- AccAP: double (nullable = true)
 |-- Id: string (nullable = false)



In [12]:
 daily_data = daily_data.withColumn("AccV", col("AccV").cast(FloatType())) \
    .withColumn("AccML", col("AccML").cast(FloatType())) \
    .withColumn("AccAP", col("AccAP").cast(FloatType()))

In [13]:
daily_data.printSchema()

root
 |-- Time: long (nullable = true)
 |-- AccV: float (nullable = true)
 |-- AccML: float (nullable = true)
 |-- AccAP: float (nullable = true)
 |-- Id: string (nullable = false)



## Join with Subject ID

In [14]:
daily_metadata.show(1)

+----------+-------+-----+------------------------------------+
|        Id|Subject|Visit|Beginning of recording [00:00-23:59]|
+----------+-------+-----+------------------------------------+
|00c4c9313d| 3d8b73|    1|                               10:19|
+----------+-------+-----+------------------------------------+
only showing top 1 row



In [15]:
daily_data = daily_data.join(daily_metadata, on="Id", how="left")


In [16]:
spark.conf.set("spark.sql.adaptive.enabled", "false")
daily_data.show(2)



+----------+----+----------+------------+----------+-------+-----+------------------------------------+
|        Id|Time|      AccV|       AccML|     AccAP|Subject|Visit|Beginning of recording [00:00-23:59]|
+----------+----+----------+------------+----------+-------+-----+------------------------------------+
|00c4c9313d|   0|  0.328125|   -0.109375|  0.671875| 3d8b73|    1|                               10:19|
|00c4c9313d|   1|0.45310798|-0.124720864|0.81127316| 3d8b73|    1|                               10:19|
+----------+----+----------+------------+----------+-------+-----+------------------------------------+
only showing top 2 rows



                                                                                

In [17]:
# Check merge - have 65 unique subjects
count_unique_subjects = daily_data.select("Subject").distinct().count()
count_unique_subjects

                                                                                

65

### Join with subject.csv, which has the NFOGQ variable
NFOGQ > 0 = in deFoG too (should be 45 people)

NFOGQ = 0 = not in deFoG (should be 20 people)

In [18]:
subjects_nfogq = subjects.select("Subject", "NFOGQ")
daily_data = daily_data.join(subjects_nfogq, on="Subject", how="left")
daily_data.select("Subject", "Id", "Accv", "NFOGq").show(10)




+-------+----------+------------+-----+
|Subject|        Id|        Accv|NFOGq|
+-------+----------+------------+-----+
| 3d8b73|00c4c9313d|    0.328125|    0|
| 3d8b73|00c4c9313d|  0.45310798|    0|
| 3d8b73|00c4c9313d|   0.4230424|    0|
| 3d8b73|00c4c9313d|  0.15001473|    0|
| 3d8b73|00c4c9313d| -0.20200309|    0|
| 3d8b73|00c4c9313d|-0.084644675|    0|
| 3d8b73|00c4c9313d|  0.04336725|    0|
| 3d8b73|00c4c9313d|  0.07791656|    0|
| 3d8b73|00c4c9313d| -0.21340606|    0|
| 3d8b73|00c4c9313d|-0.039476827|    0|
+-------+----------+------------+-----+
only showing top 10 rows



                                                                                

### Add Time in Seconds Variable

In [19]:
daily_data = daily_data.withColumn("TimeSeconds", col("Time") / 100)
daily_data.show(5)



+-------+----------+----+-----------+------------+----------+-----+------------------------------------+-----+-----------+
|Subject|        Id|Time|       AccV|       AccML|     AccAP|Visit|Beginning of recording [00:00-23:59]|NFOGQ|TimeSeconds|
+-------+----------+----+-----------+------------+----------+-----+------------------------------------+-----+-----------+
| 3d8b73|00c4c9313d|   0|   0.328125|   -0.109375|  0.671875|    1|                               10:19|    0|        0.0|
| 3d8b73|00c4c9313d|   1| 0.45310798|-0.124720864|0.81127316|    1|                               10:19|    0|       0.01|
| 3d8b73|00c4c9313d|   2|  0.4230424|   -0.264046| 0.9212382|    1|                               10:19|    0|       0.02|
| 3d8b73|00c4c9313d|   3| 0.15001473| -0.31024083|0.93748266|    1|                               10:19|    0|       0.03|
| 3d8b73|00c4c9313d|   4|-0.20200309|  -0.5459077|0.89084226|    1|                               10:19|    0|       0.04|
+-------+-------

                                                                                

### Count how many people have 0 nfogq, and > 0 nfogq 
collapsed_daily = daily_data.groupBy("Subject").agg(first("Id").alias("Id"), first("NFOGq").alias("NFOGq"))
collapsed_daily.show(2)

In [None]:
# Count subjects where nfogq = 0 NFOGQ, nfogq > 0
# collapsed_daily = daily_data.groupBy("Subject").agg(first("Id").alias("Id"), first("NFOGq").alias("NFOGq"))
# collapsed_daily.show(2)

In [None]:
collapsed_daily.count()

In [None]:
# zero_count_nfogq = collapsed_daily.filter(collapsed_daily["NFOGq"] == 0).count()
# nonzero_count_nonfogq = collapsed_daily.filter(collapsed_daily["NFOGq"] > 0).count()

In [None]:
# number of people for which nfogq = 0 (not in defog)
# zero_count_nfogq


In [None]:
# number of people for which nfogq != 0 (yes in defog)
# nonzero_count_nonfogq

### compare subject ids in daily metadata to the subject ids in defog data

In [20]:
distinct_defog_subj = defog_metadata.select("Subject")
distinct_defog_subj.distinct().count()

                                                                                

45

In [21]:
distinct_daily_subj = daily_metadata.select("Subject")
distinct_daily_subj.distinct().count()

                                                                                

65

In [22]:
combine_defog_daily_meta = distinct_daily_subj.join(distinct_defog_subj, on="Subject", how="inner")
combine_defog_daily_meta.show(5)
count = combine_defog_daily_meta.select("Subject").distinct().count()

print(f"{count} subject ids appear in both the daily data and the defog data")

+-------+
|Subject|
+-------+
| bf608b|
| b6a627|
| b7bd52|
| 39f9c0|
| d79889|
+-------+
only showing top 5 rows





36 subject ids appear in both the daily data and the defog data


                                                                                

### Create a dataframe with only those with fog (aka in the defog data), and dataframe for those not in it (Not sure we need this...)

In [None]:
# daily_in_defog = daily_data.filter(daily_data["NFOGq"] > 0)
# daily_no_defog = daily_data.filter(daily_data["NFOGq"] == 0)

In [None]:
# daily_in_defog.show(2)
# daily_no_defog.show(2)

In [23]:
end_time = time.time()
total = end_time - start_time
total


1120.0213534832

In [24]:
daily_data.show(1)



+-------+----------+----+--------+---------+--------+-----+------------------------------------+-----+-----------+
|Subject|        Id|Time|    AccV|    AccML|   AccAP|Visit|Beginning of recording [00:00-23:59]|NFOGQ|TimeSeconds|
+-------+----------+----+--------+---------+--------+-----+------------------------------------+-----+-----------+
| 3d8b73|00c4c9313d|   0|0.328125|-0.109375|0.671875|    1|                               10:19|    0|        0.0|
+-------+----------+----+--------+---------+--------+-----+------------------------------------+-----+-----------+
only showing top 1 row



                                                                                

In [None]:
# save the data 
# cut it down to what we need for the model
# save it as a parquet file on gcs
# daily_data.save("gs://msca-bdp-student-gcs/parkinsons_data/unlabeled_combined/daily_combined", format="parquet", mode = "overwrite")
daily_data = daily_data.withColumnRenamed('Beginning of recording [00:00-23:59]',"record_begin")
daily_data.printSchema()
daily_data.write.format("parquet").mode("overwrite").save("gs://msca-bdp-student-gcs/parkinsons_data/unlabeled_combined/daily_combined")


root
 |-- Subject: string (nullable = true)
 |-- Id: string (nullable = false)
 |-- Time: long (nullable = true)
 |-- AccV: float (nullable = true)
 |-- AccML: float (nullable = true)
 |-- AccAP: float (nullable = true)
 |-- Visit: string (nullable = true)
 |-- record_begin: string (nullable = true)
 |-- NFOGQ: string (nullable = true)
 |-- TimeSeconds: double (nullable = true)



                                                                                

### check out how many subject ids are in the defog data 