Join tasks data with fog, transform target variable

In [1]:
import os 
#from utils.utils import *
from fog.code.utils.utils import *
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Start (or reuse) the Spark session for this notebook under the app name "tasks".
spark = (
    SparkSession.builder
    .appName("tasks")
    .getOrCreate()
)
# Last expression of the cell: display the effective Spark configuration as (key, value) pairs.
spark.sparkContext.getConf().getAll()

[('spark.app.id', 'application_1684436604634_0023'),
 ('spark.dynamicAllocation.minExecutors', '1'),
 ('spark.driver.port', '35673'),
 ('spark.eventLog.enabled', 'true'),
 ('spark.dataproc.metrics.listener.metrics.collector.hostname', 'bdp-zm-m'),
 ('spark.dataproc.sql.parquet.enableFooterCache', 'true'),
 ('spark.sql.warehouse.dir', 'file:/spark-warehouse'),
 ('spark.dataproc.sql.joinConditionReorder.enabled', 'true'),
 ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS',
  'bdp-zm-m'),
 ('spark.executor.memory', '5739m'),
 ('spark.history.fs.logDirectory',
  'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/3b0a7a61-e426-4c65-998e-695fa4b5fd84/spark-job-history'),
 ('spark.yarn.am.memory', '640m'),
 ('spark.driver.host', 'bdp-zm-m.c.msca-bdp-student-ap.internal'),
 ('spark.hadoop.mapreduce.fileoutputcommitter.concurrent.write.enabled',
  'false'),
 ('spark.dataproc.sql.local.rank.pushdown.enabled', 'true'),
 ('spark.executor.instances', '2'),
 ('

### Tasks metadata description  
tasks.csv: Task metadata for series in the defog dataset. (Not relevant for the series in the tdcsfog or daily datasets.)    
&emsp; 1. Id: The data series where the task was measured.  
&emsp; 2. Begin: Time (s) the task began.  
&emsp; 3. End: Time (s) the task ended.  
&emsp; 4. Task: One of seven task types in the DeFOG protocol, described on this page.  


In [3]:
# Location of the DeFOG task metadata CSV.
tasks_path = "gs://msca-bdp-student-gcs/parkinsons_data/tasks.csv"
# The first CSV row supplies the column names; no schema is inferred, so values load as strings.
tasks = spark.read.option("header", True).csv(tasks_path)

                                                                                

1. convert times from string to double
2. rename Task column to TaskType
3. create dummy variables
4. merge with fog dataset

In [4]:
# Begin/End were read from the CSV as strings; cast both to double so they can be
# compared numerically against the fog timestamps later on.
tasks = (
    tasks
    .withColumn("Begin", F.col("Begin").cast("double"))
    .withColumn("End", F.col("End").cast("double"))
)

In [5]:
# Rename the "Task" column to "TaskType" (the name used by the later steps).
tasks = tasks.withColumnRenamed(existing="Task", new="TaskType")

In [6]:
# Baseline: number of rows per task type, most frequent first. Collected to the driver
# so it can be compared against the dummy-column sums later.
task_counts = (
    tasks
    .groupBy("TaskType")
    .count()
    .orderBy(F.col("count").desc())
    .collect()
)

# Map each task name to its row count.
task_dict = {row["TaskType"]: row["count"] for row in task_counts}

                                                                                

In [7]:
# create dummy vars
# create_dummies comes from the star-imported project utils, so its logic is not visible here.
# Judging by the displayed result it presumably keeps the original columns and adds one 0/1
# column per distinct TaskType value — TODO confirm against the helper's source.
tasks_dummy = create_dummies(tasks, "TaskType")

                                                                                

In [8]:
# check that we retained original counts
# Summing every numeric column gives, for each dummy column, the number of rows flagged with
# that task; it must equal the per-task row count collected before the dummies were created.
test_count = tasks_dummy.groupBy().sum().collect()
test_dict = test_count[0].asDict()

mismatches = {}
for agg_name, dummy_total in test_dict.items():
    col = agg_name[4:-1]  # aggregate columns are named "sum(<column>)"
    if col in ("Begin", "End"):
        # The numeric time columns are summed as well, but they are not dummy columns.
        continue
    # .get() so that a summed column with no matching task is reported as a mismatch
    # instead of aborting the whole check with a KeyError.
    expected = task_dict.get(col)
    if dummy_total != expected:
        mismatches[col] = (expected, dummy_total)

# Report the verdict exactly once: previously "True" was printed unconditionally,
# even after a mismatch had been detected.
for col, (expected, dummy_total) in mismatches.items():
    print(f"Count mismatch for {col}: expected {expected}, got {dummy_total}")
print("We have retained the original counts:", not mismatches)

23/05/18 21:35:20 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 12:>                                                         (0 + 1) / 1]

We have retained the original counts: True


                                                                                

In [9]:
# Preview the first 5 rows to eyeball the generated dummy columns.
tasks_dummy.show(5)

+----------+------+------+--------+---+-----+-----+-----+---------+----+-----+----+---+-----+----+------+----------+------+-----+--------+---+-----+---+----------+----------+---+--------+---+---+---+-----+----+----+----------+----+
|        Id| Begin|   End|TaskType|MB9|Rest1|MB6-L|MB6-R|Turning-C|MB2a|MB3-L|MB12|MB5|MB3-R|MB13|TUG-DT|Turning-ST|TUG-ST|4MW-C|Hotspot2|MB6|TUG-C|4MW|Hotspot1-C|Hotspot2-C|MB8|Hotspot1|MB4|MB1|MB7|Rest2|MB2b|MB10|Turning-DT|MB11|
+----------+------+------+--------+---+-----+-----+-----+---------+----+-----+----+---+-----+----+------+----------+------+-----+--------+---+-----+---+----------+----------+---+--------+---+---+---+-----+----+----+----------+----+
|02ab235146|  10.0|190.48|   Rest1|  0|    1|    0|    0|        0|   0|    0|   0|  0|    0|   0|     0|         0|     0|    0|       0|  0|    0|  0|         0|         0|  0|       0|  0|  0|  0|    0|   0|   0|         0|   0|
|02ab235146|211.24|271.56|   Rest2|  0|    0|    0|    0|        0|   0|

In [11]:
# Load the processed DeFOG sensor data (parquet dataset on GCS).
fog = (
    spark.read
    .format("parquet")
    .load("gs://msca-bdp-student-gcs/parkinsons_data/train/processed/defog")
)

In [12]:
# Cast TimeSeconds to double so it has the same type as the tasks' Begin/End bounds.
fog = fog.withColumn("TimeSeconds", F.col("TimeSeconds").cast("double"))

In [13]:
# Attach task information to every fog sample: same series Id, and the sample's timestamp
# lies within the task's [Begin, End] window.
# NOTE(review): Column.between is inclusive on BOTH bounds, so a sample that falls inside two
# task windows (e.g. exactly on a shared End/Begin boundary, or in overlapping windows) is
# emitted once per matching task. The recorded run of this notebook shows one more row after
# the join than before; presumably this is the cause — confirm by locating the duplicated
# sample before relying on the row count.
cond = (
    (tasks_dummy.Id == fog.Id)
    & (fog.TimeSeconds.between(tasks_dummy.Begin, tasks_dummy.End))
)

# Left join keeps fog samples that match no task; the duplicate Id column from the tasks side
# is dropped. fill(0) then replaces nulls in numeric columns with 0, so unmatched samples
# end up with 0 in the task columns.
fog_tasks = (
    fog
    .join(tasks_dummy, on=cond, how="left")
    .drop(tasks_dummy.Id)
    .na.fill(0)
)

In [14]:
# Row counts before and after the join (each count() runs a Spark job).
fog_count, fog_tasks_count = fog.count(), fog_tasks.count()

                                                                                

In [16]:
# A left join should keep exactly one output row per fog sample; report whether it did.
print(
    "Are counts same after join? "
    f"fog taks count: {fog_tasks_count}, fog count {fog_count} "
    f"-> {fog_tasks_count == fog_count}"
)

Are counts same after join? fog taks count: 13525703, fog count 13525702 -> False


In [20]:
# transform_target is a project helper (star-imported from the utils module); its logic is
# not visible in this notebook.
# NOTE(review): presumably performs the "transform target variable" step named in the
# notebook title — confirm against the helper's source.
fog_tasks1 = transform_target(fog_tasks)

In [21]:
# Row count after the target transform, for comparison with the original fog row count.
fog_tasks1_count = fog_tasks1.count()

                                                                                

In [22]:
# Display the post-transform row count.
fog_tasks1_count

13525703

In [23]:
# Compare the post-transform row count with the original fog row count.
print("Are counts same after transform? ->", fog_tasks1_count == fog_count)

Are counts same after transform? -> False


In [33]:
# Warn (but do not abort) when the row count drifted from the original fog data.
if not fog_tasks1_count == fog_count:
    print(f"WARNING:  SAVING, BUT ROW COUNT NOT PRESERVED. Original: {fog_count}, Saved: {fog_tasks1_count}")

# Persist the joined and transformed dataset as parquet, replacing any previous output.
(
    fog_tasks1.write
    .mode("overwrite")
    .parquet("gs://msca-bdp-student-gcs/parkinsons_data/train/processed/defog_tasks")
)



                                                                                