# 混洗（Shuffling）机制

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
# Local mode: "local[*]" runs Spark inside this process with one worker
# thread per logical core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Shuffling")
    .getOrCreate()
)
sc = spark.sparkContext
# Optional: sc.setLogLevel("ERROR") reduces Spark's console logging.
print(spark)
print(sc)

<pyspark.sql.session.SparkSession object at 0x00000201EA0571C0>
<SparkContext master=local[*] appName=Shuffling>


In [2]:
import numpy as np
# exp(1000) is far outside the float64 range. NumPy does not raise here:
# it emits an overflow RuntimeWarning and returns inf.
np.exp(1000.0) # overflows to inf (a warning, not an exception)

  np.exp(1000.0) # 报错


inf

In [3]:
import numpy as np
x = np.array([-2, -1, 3, 4, 5])
# The comparison is applied element-wise and produces a boolean mask.
print(x > 0)
a = np.array([1, 2, 3, 4, 5])
b = np.array([-1, -2, -3, -4, -5])
# Element-wise selection: take a[i] where x[i] > 0, otherwise b[i].
np.where(x > 0, a, b) # the counterpart of ifelse() in R

[False False  True  True  True]


array([-1, -2,  3,  4,  5])

创建一个简单的 RDD：

In [4]:
import string

# Distribute the uppercase alphabet as an RDD with one letter per element.
rdd = sc.parallelize(string.ascii_uppercase, numSlices=5) # split the data into 5 partitions
print(rdd.collect())

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']


In [5]:
# Confirm how many partitions the RDD was split into.
rdd.getNumPartitions()

5

将每个分区的字母合并成一个字符串，从而可以知道分区是如何划分的：

In [6]:
def concat_letters(iter):
    """Join every string of one partition into a single string.

    Intended for ``RDD.mapPartitions``: ``iter`` is the iterator over the
    elements of one partition. Exactly one string is yielded per partition;
    an empty partition yields the empty string.
    """
    merged = "".join(iter)
    yield merged

# One string per partition shows which letters ended up where.
rdd.mapPartitions(concat_letters).collect() # the partitions are roughly equal in size

['ABCDE', 'FGHIJ', 'KLMNO', 'PQRST', 'UVWXYZ']

将 RDD 重新分区，可以看出来字母的顺序产生了变化：

In [7]:
# repartition() redistributes the records through a shuffle, so collect()
# no longer returns the letters in their original order.
rdd2 = rdd.repartition(6)
print(rdd2.collect())

['A', 'B', 'C', 'D', 'E', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'K', 'L', 'M', 'N', 'O', 'F', 'G', 'H', 'I', 'J']


In [8]:
# Show how the shuffle spread the letters over the 6 partitions.
rdd2.mapPartitions(concat_letters).collect() # with so little data the split is fairly arbitrary (some partitions stay empty)
# Take care whenever an algorithm depends on the order of the data.

['', 'ABCDE', 'PQRSTUVWXYZ', '', 'KLMNO', 'FGHIJ']

使用 `toDebugString()` 查看 RDD 包含的操作：

In [9]:
# Lineage of the original RDD; the result is decoded before printing.
print(rdd.toDebugString().decode("UTF-8"))
print("\n\n")
# Lineage of the repartitioned RDD, for comparison.
print(rdd2.toDebugString().decode("UTF-8"))
# Increasing the number of partitions readily triggers a shuffle.

(5) ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274 []



(6) MapPartitionsRDD[6] at coalesce at NativeMethodAccessorImpl.java:0 []
 |  CoalescedRDD[5] at coalesce at NativeMethodAccessorImpl.java:0 []
 |  ShuffledRDD[4] at coalesce at NativeMethodAccessorImpl.java:0 []
 +-(5) MapPartitionsRDD[3] at coalesce at NativeMethodAccessorImpl.java:0 []
    |  PythonRDD[2] at RDD at PythonRDD.scala:53 []
    |  ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274 []


可以看出来 `rdd2` 包含了混洗（shuffling）操作，这也是数据顺序打乱的原因。

如果需要减小分区数目，可以使用 `coalesce()` 函数，避免混洗操作：

In [10]:
# Reduce to 2 partitions; shuffle=False asks Spark to merge existing
# partitions instead of redistributing the records.
rdd3 = rdd.coalesce(numPartitions=2, shuffle=False)
# The collected letters keep their original order.
print(rdd3.collect())
rdd3.mapPartitions(concat_letters).collect()

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']


['ABCDEFGHIJ', 'KLMNOPQRSTUVWXYZ']

In [11]:
# Lineage of the coalesced RDD.
print(rdd3.toDebugString().decode("UTF-8"))

(2) CoalescedRDD[8] at coalesce at NativeMethodAccessorImpl.java:0 []
 |  ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274 []


如果确实需要增加分区数目，同时希望保持数据顺序，可以在原始 RDD 中增加索引信息：

In [12]:
# Pair every element with its position in the RDD.
print(rdd.zipWithIndex().collect()) # to skip a given number of rows, filter on this index

[('A', 0), ('B', 1), ('C', 2), ('D', 3), ('E', 4), ('F', 5), ('G', 6), ('H', 7), ('I', 8), ('J', 9), ('K', 10), ('L', 11), ('M', 12), ('N', 13), ('O', 14), ('P', 15), ('Q', 16), ('R', 17), ('S', 18), ('T', 19), ('U', 20), ('V', 21), ('W', 22), ('X', 23), ('Y', 24), ('Z', 25)]


In [15]:
# Attach the index before repartitioning so that each letter carries its
# original position through the shuffle.
rdd4 = rdd.zipWithIndex().repartition(5)
print(rdd4.collect())

[('A', 0), ('B', 1), ('C', 2), ('D', 3), ('E', 4), ('K', 10), ('L', 11), ('M', 12), ('N', 13), ('O', 14), ('P', 15), ('Q', 16), ('R', 17), ('S', 18), ('T', 19), ('F', 5), ('G', 6), ('H', 7), ('I', 8), ('J', 9), ('U', 20), ('V', 21), ('W', 22), ('X', 23), ('Y', 24), ('Z', 25)]


然后定义一个分区映射函数，在合并数据行时获取分区中第一条数据的索引：

In [16]:
def concat_letters_with_order(iter):
    """Merge one partition of ``(letter, index)`` pairs into a single record.

    The pairs are sorted by index before merging, so the joined letters follow
    the original order and the reported index is the smallest one in the
    partition, whatever order the records arrive in after a shuffle.

    Args:
        iter: iterator over the ``(letter, index)`` tuples of one partition,
            as passed in by ``RDD.mapPartitions``.

    Yields:
        ``(combined_letters, first_index)`` for a non-empty partition, or the
        empty tuple ``()`` as a placeholder for an empty partition.
    """
    # Spark does not guarantee the order of records inside a shuffled
    # partition, so restore it explicitly from the stored index.
    letters_and_indices = sorted(iter, key=lambda pair: pair[1])
    if not letters_and_indices:
        # Placeholder for an empty partition; it has length 0, so a
        # len(x) > 1 filter downstream removes it.
        yield ()
    else:
        first_ind = letters_and_indices[0][1]
        combined_letters = "".join(pair[0] for pair in letters_and_indices)
        yield combined_letters, first_ind

# Collapse every partition into one (letters, first_index) record; empty
# partitions produce an empty tuple.
rdd5 = rdd4.mapPartitions(concat_letters_with_order)
print(rdd5.collect())

[(), ('ABCDE', 0), ('KLMNOPQRST', 10), ('FGHIJ', 5), ('UVWXYZ', 20)]


然后过滤空分区，并按生成的索引对 RDD 进行排序：

In [17]:
# Drop the empty tuples that stand for empty partitions (their length is 0).
rdd5.filter(lambda x: len(x) > 1).collect()

[('ABCDE', 0), ('KLMNOPQRST', 10), ('FGHIJ', 5), ('UVWXYZ', 20)]

In [18]:
# Discard the empty-partition placeholders, then restore the original order
# by sorting on the index stored in position 1 of each record.
(
    rdd5
    .filter(lambda record: len(record) > 1)
    .sortBy(lambda record: record[1], ascending=True)
    .collect()
)

[('ABCDE', 0), ('FGHIJ', 5), ('KLMNOPQRST', 10), ('UVWXYZ', 20)]