Skip to content

Commit

Permalink
add RDD name in debug output when running a task
Browse files Browse the repository at this point in the history
  • Loading branch information
svenkreiss committed May 21, 2017
1 parent 5a861cd commit b807133
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 3 deletions.
5 changes: 5 additions & 0 deletions pysparkling/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ def _run_task(task_context, rdd, func, partition):
"""
task_context.attempt_number += 1

log.debug('Running stage {} for partition {} of {}.'
''.format(task_context.stage_id,
task_context.partition_id,
rdd.name()))

try:
return func(task_context, rdd.compute(partition, task_context))
except Exception:
Expand Down
3 changes: 3 additions & 0 deletions pysparkling/rdd.py
Original file line number Diff line number Diff line change
Expand Up @@ -920,6 +920,9 @@ def min(self):

def name(self):
"""returns the name of the dataset"""
if self._name is None:
return 'RDD_{}'.format(self._rdd_id)

return self._name

def partitionBy(self, numPartitions, partitionFunc=None):
Expand Down
3 changes: 0 additions & 3 deletions pysparkling/task_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,6 @@
class TaskContext(object):
def __init__(self, stage_id=0, partition_id=0,
max_retries=3, retry_wait=0):
log.debug('Running stage {0} for partition {1}'
''.format(stage_id, partition_id))

self.stage_id = stage_id
self.partition_id = partition_id
self.max_retries = max_retries
Expand Down

0 comments on commit b807133

Please sign in to comment.