In [1]:
from pyspark import SparkContext

## Данные в Spark разбиваются на партиции. Партиция - это атомарная единица параллелизма. Но как именно происходит присваивание записи в RDD партиции на экзекьюторе?

![](pics/lesson01_02_shuffle_partitioning.png)

## Разбиение происходит с помощью объекта `partitioner`, который имеет следующий интерфейс:
* numPartitions - число партиций в RDD
* getPartition - возвращает соответствие ключа индексу партиции

In [5]:
nums = list(range(10))
nums

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [4]:
with SparkContext("local[1]") as sc:
    rdd = sc.parallelize(nums)
    
    print("Number of partitions: {}".format(rdd.getNumPartitions()))
    print("Partitioner: {}".format(rdd.partitioner))
    print("Partitions structure: {}".format(rdd.glom().collect()))

Number of partitions: 1
Partitioner: None
Partitions structure: [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]


## В случае отстутствия партишенера, разбиение данных не зависит от самих данных. Данные разбиваются равномерно, исходя из размера 

In [6]:
with SparkContext("local[2]") as sc:
    rdd = sc.parallelize(nums)
    
    print("Default parallelism: {}".format(sc.defaultParallelism))
    print("Number of partitions: {}".format(rdd.getNumPartitions()))
    print("Partitioner: {}".format(rdd.partitioner))
    print("Partitions structure: {}".format(rdd.glom().collect()))

Default parallelism: 2
Number of partitions: 2
Partitioner: None
Partitions structure: [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]


In [7]:
import random
random.shuffle(nums)
nums

[6, 1, 5, 4, 8, 0, 3, 9, 2, 7]

In [8]:
with SparkContext("local[2]") as sc:
    rdd = sc.parallelize(nums)
    
    print("Default parallelism: {}".format(sc.defaultParallelism))
    print("Number of partitions: {}".format(rdd.getNumPartitions()))
    print("Partitioner: {}".format(rdd.partitioner))
    print("Partitions structure: {}".format(rdd.glom().collect()))

Default parallelism: 2
Number of partitions: 2
Partitioner: None
Partitions structure: [[6, 1, 5, 4, 8], [0, 3, 9, 2, 7]]


## Что произойдет, если партиций больше, чем данных?

In [9]:
with SparkContext("local[1]") as sc:
    rdd = sc.parallelize(nums, 15)
    
    print("Default parallelism: {}".format(sc.defaultParallelism))
    print("Number of partitions: {}".format(rdd.getNumPartitions()))
    print("Partitioner: {}".format(rdd.partitioner))
    print("Partitions structure: {}".format(rdd.glom().collect()))

Default parallelism: 1
Number of partitions: 15
Partitioner: None
Partitions structure: [[], [6], [1], [], [5], [4], [], [8], [0], [], [3], [9], [], [2], [7]]


## Партишенеры работают с PairRDD

In [10]:
sc = SparkContext("local[2]")

In [11]:
rdd = sc.parallelize(nums) \
        .partitionBy(2) \
        .persist()
    
print("Number of partitions: {}".format(rdd.getNumPartitions()))
print("Partitioner: {}".format(rdd.partitioner))
print("Partitions structure: {}".format(rdd.glom().collect()))

Number of partitions: 2
Partitioner: <pyspark.rdd.Partitioner object at 0x7fa9069f0080>


Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 0.0 failed 1 times, most recent failure: Lost task 1.0 in stage 0.0 (TID 1, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Users/pklemenkov/anaconda3/envs/spark/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/Users/pklemenkov/anaconda3/envs/spark/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/Users/pklemenkov/anaconda3/envs/spark/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 142, in dump_stream
    for obj in iterator:
  File "/Users/pklemenkov/anaconda3/envs/spark/lib/python3.7/site-packages/pyspark/rdd.py", line 1771, in add_shuffle_key
    for k, v in iterator:
TypeError: cannot unpack non-iterable int object

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1124)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1130)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:989)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Users/pklemenkov/anaconda3/envs/spark/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/Users/pklemenkov/anaconda3/envs/spark/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/Users/pklemenkov/anaconda3/envs/spark/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 142, in dump_stream
    for obj in iterator:
  File "/Users/pklemenkov/anaconda3/envs/spark/lib/python3.7/site-packages/pyspark/rdd.py", line 1771, in add_shuffle_key
    for k, v in iterator:
TypeError: cannot unpack non-iterable int object

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1124)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1130)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [12]:
rdd = sc.parallelize(nums) \
        .map(lambda el: (el, el)) \
        .partitionBy(2) \
        .persist()
    
print("Number of partitions: {}".format(rdd.getNumPartitions()))
print("Partitioner: {}".format(rdd.partitioner))
print("Partitions structure: {}".format(rdd.glom().collect()))

Number of partitions: 2
Partitioner: <pyspark.rdd.Partitioner object at 0x7fa906a01898>
Partitions structure: [[(6, 6), (4, 4), (8, 8), (0, 0), (2, 2)], [(1, 1), (5, 5), (3, 3), (9, 9), (7, 7)]]


## Взглянем на партишенер по-умолчанию

In [23]:
import os
os.environ["PYTHONHASHSEED"] = "5757"

In [14]:
rdd.partitioner

<pyspark.rdd.Partitioner at 0x7fa906a01898>

https://github.com/apache/spark/blob/v2.4.7/python/pyspark/rdd.py#L165

In [15]:
rdd.partitioner.numPartitions

2

In [16]:
rdd.partitioner("Hello, world!")

1

In [17]:
rdd.partitioner.partitionFunc("Hello, world!")

9222607398166336219

## По-умолчанию, Spark использует HashPartitioner

In [18]:
from pyspark.rdd import portable_hash

num_partitions = 2
for el in nums:
    print("Element: [{}]: {} % {} = partition {}".format(
        el, portable_hash(el), num_partitions, portable_hash(el) % num_partitions))

Element: [6]: 6 % 2 = partition 0
Element: [1]: 1 % 2 = partition 1
Element: [5]: 5 % 2 = partition 1
Element: [4]: 4 % 2 = partition 0
Element: [8]: 8 % 2 = partition 0
Element: [0]: 0 % 2 = partition 0
Element: [3]: 3 % 2 = partition 1
Element: [9]: 9 % 2 = partition 1
Element: [2]: 2 % 2 = partition 0
Element: [7]: 7 % 2 = partition 1


In [19]:
sc.stop()

## Более реалистичный пример

In [20]:
transactions = [
    {'name': 'Bob', 'amount': 100, 'country': 'United Kingdom'},
    {'name': 'James', 'amount': 15, 'country': 'United Kingdom'},
    {'name': 'Marek', 'amount': 51, 'country': 'Poland'},
    {'name': 'Johannes', 'amount': 200, 'country': 'Germany'},
    {'name': 'Paul', 'amount': 75, 'country': 'Poland'},
]

In [25]:
def country_partitioner(country):
    return hash(country)

In [28]:
with SparkContext("local[2]") as sc:
    rdd = sc.parallelize(transactions) \
            .map(lambda el: (el['country'], el)) \
            .partitionBy(3, country_partitioner)
    
    for k in ("United Kingdom", "Germany", "Poland"):
        print(f"Key={k}, partition={rdd.partitioner(k)}")
    
    print("Number of partitions: {}".format(rdd.getNumPartitions()))
    print("Partitioner: {}".format(rdd.partitioner))
    print("Partitions structure: {}".format(rdd.glom().collect()))

Key=United Kingdom, partition=2
Key=Germany, partition=0
Key=Poland, partition=2
Number of partitions: 3
Partitioner: <pyspark.rdd.Partitioner object at 0x7fa906a0fcc0>
Partitions structure: [[('United Kingdom', {'name': 'Bob', 'amount': 100, 'country': 'United Kingdom'}), ('United Kingdom', {'name': 'James', 'amount': 15, 'country': 'United Kingdom'}), ('Germany', {'name': 'Johannes', 'amount': 200, 'country': 'Germany'})], [('Poland', {'name': 'Marek', 'amount': 51, 'country': 'Poland'}), ('Poland', {'name': 'Paul', 'amount': 75, 'country': 'Poland'})], []]


## Правильное использование партишенеров может сильно ускорить выполнение ваших программ!

## Важным понятием является "known partitioner", т.е. явно заданный партишенер. По-умолчанию, партишенер не задается

In [29]:
keys = ['a'] * 1000 + ['b'] * 1000 + ['c'] * 1000 + ['d'] * 1000
values_left = [1] * 1000 + [2] * 1000 + [3] * 1000 + [4] * 1000
values_right = [5] * 1000 + [6] * 1000 + [7] * 1000 + [8] * 1000

In [30]:
from pyspark import SparkConf

In [31]:
conf = SparkConf().set("spark.default.parallelism", "10")
sc = SparkContext("local[2]", appName="Optimization", conf=conf)

In [32]:
sc.defaultParallelism

10

In [33]:
left_rdd = sc.parallelize(zip(keys, values_left))

In [34]:
left_rdd.getNumPartitions()

10

In [36]:
print(left_rdd.partitioner)

None


In [37]:
left_rdd.take(5)

[('a', 1), ('a', 1), ('a', 1), ('a', 1), ('a', 1)]

In [38]:
right_rdd = sc.parallelize(zip(keys, values_right))

In [39]:
right_rdd.getNumPartitions()

10

In [40]:
right_rdd.partitioner

In [41]:
right_rdd.take(5)

[('a', 5), ('a', 5), ('a', 5), ('a', 5), ('a', 5)]

In [42]:
left_rdd.join(right_rdd).count()

4000000

In [43]:
left_rdd.join(right_rdd).getNumPartitions()

10

In [45]:
sc.stop()

## Если у двух RDD партишенеры явно заданы и они равны, то такие RDD называются ко-партиционированными

![](pics/lesson01_02_shuffle_partitioning_03.png)

In [46]:
sc = SparkContext("local[2]", appName="Optimization", conf=conf)

In [47]:
sc.defaultParallelism

10

In [48]:
left_rdd = sc.parallelize(zip(keys, values_left))

In [49]:
left_rdd = left_rdd.partitionBy(4)

In [50]:
left_rdd.partitioner

<pyspark.rdd.Partitioner at 0x7fa9087db780>

In [51]:
left_rdd.take(5)

[('d', 4), ('d', 4), ('d', 4), ('d', 4), ('d', 4)]

In [52]:
right_rdd = sc.parallelize(zip(keys, values_right))

In [53]:
right_rdd = right_rdd.partitionBy(4)

In [54]:
right_rdd.partitioner

<pyspark.rdd.Partitioner at 0x7fa9087eeba8>

In [55]:
right_rdd.take(5)

[('d', 8), ('d', 8), ('d', 8), ('d', 8), ('d', 8)]

In [56]:
left_rdd.partitioner == right_rdd.partitioner

True

In [57]:
left_rdd.join(right_rdd).count()

4000000

In [58]:
left_rdd.join(right_rdd).getNumPartitions()

10

In [59]:
sc.stop()

## Необходимо следить за партишенерами в широких трансформациях

In [60]:
sc = SparkContext("local[2]", appName="Optimization", conf=conf)

In [61]:
sc.defaultParallelism

10

In [62]:
left_rdd = sc.parallelize(zip(keys, values_left))

In [63]:
left_rdd = left_rdd.partitionBy(4)

In [64]:
left_rdd.partitioner

<pyspark.rdd.Partitioner at 0x7fa908812518>

In [65]:
left_rdd.count()

4000

In [66]:
right_rdd = sc.parallelize(zip(keys, values_right))

In [67]:
right_rdd = right_rdd.partitionBy(4)

In [68]:
right_rdd.partitioner

<pyspark.rdd.Partitioner at 0x7fa9087d5fd0>

In [69]:
right_rdd.count()

4000

In [70]:
left_rdd.partitioner == right_rdd.partitioner

True

In [71]:
left_rdd.join(right_rdd, numPartitions=4).count()

4000000

In [73]:
left_rdd.join(right_rdd, numPartitions=4).getNumPartitions()

4

## Сохранение партишенеров между трансформациями особенно полезно для широких трансформаций, потому что помогают снизить объем шаффла или вовсе его исключить

## Особенность, правда, заключается еще и в том, что узкие трансформации способны подгадить!

In [75]:
left_rdd.partitioner

<pyspark.rdd.Partitioner at 0x7fa908812518>

In [81]:
left_rdd_identity = left_rdd.map(lambda x: x, preservesPartitioning=True)

In [83]:
left_rdd_identity = left_rdd.mapValues(lambda x: x)

In [84]:
print(left_rdd_identity.partitioner)

<pyspark.rdd.Partitioner object at 0x7fa908812518>


In [79]:
left_rdd_identity.partitioner == right_rdd.partitioner

False

In [80]:
left_rdd_identity.join(right_rdd).count()

4000000

In [None]:
left_rdd_identity.join(right_rdd).partitioner == right_rdd.partitioner

## Это происходить из-за того, что трансформации типа `map()` могут изменять не только значения PairRDD, но и ключи! Выходом из ситуации может быть либо использования трансформаций над значениями, например `mapValues()`, либо передача параметра `preservesPartitioning=True`

In [None]:
left_rdd_identity = left_rdd.map(lambda x: (x[0], x[1]), preservesPartitioning=True)

In [None]:
left_rdd_identity.partitioner

In [None]:
left_rdd_identity.partitioner == right_rdd.partitioner

In [None]:
left_rdd_identity.join(right_rdd).count()

In [None]:
left_rdd_identity.join(right_rdd, numPartitions=4).partitioner == right_rdd.partitioner

In [None]:
sc.stop()