In [1]:
"""
- Chaining: cascade syntax in which the intermediate RDD stays anonymous (it is never bound to a name).
Sequence of execution: perform the map (transformation) => store the result in an RDD => perform the reduce (action).
"""
# By default, the number of partitions used by the SparkContext equals the number of workers.
from pyspark import SparkConf, SparkContext
# Spark context with 4 local workers.
# getOrCreate() returns the already-running context when there is one, so
# re-executing this cell no longer fails with
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[4]"))

In [3]:
# Turn the local sequence 0, 1, 2, 3 into an RDD.
B = sc.parallelize(range(0, 4))
# Bring every element of the RDD back to the driver as a plain list.
B.collect()

[0, 1, 2, 3]

In [4]:
# Clumsy method: the intermediate RDD is bound to a name before being reduced.
# Transformation: square every element.
Squares = B.map(lambda value: value * value)
# Action: add the squares together.
Squares.reduce(lambda left, right: left + right)

14

In [5]:
# Chained method: the intermediate RDD is never visible!
# Parentheses replace the backslash continuation; the value of the
# expression is still what the cell displays.
(
    B.map(lambda value: value * value)
     .reduce(lambda left, right: left + right)
)

14

In [18]:
# Build an RDD of one million elements (the list [1, 2, 3, 4] repeated 250000 times).
n = 1000000
# Floor division yields the integer repeat count directly, without the
# int(float) round-trip of int(n/4).
B = sc.parallelize([1, 2, 3, 4] * (n // 4))
print('Count elems : ', B.count())
print('First elem : ', B.first())
print('5 first elems : ', B.take(5))
# Sampling: a fraction of 10/n means that about 10 elements are drawn on average.
# No seed is passed, so the two calls below return different samples
# (and a different number of elements) on every run.
print('Sample RDD 1 : ', B.sample(False, 10 / n).collect())
print('Sample RDD 2 : ', B.sample(False, 10 / n).collect())

Count elems :  1000000
First elem :  1
5 first elems :  [1, 2, 3, 4, 1]
Sample RDD 1 :  [3, 2, 2, 3, 4, 2, 2, 4, 1, 4, 1, 4]
Sample RDD 2 :  [4, 4, 4, 3, 4, 3, 4, 1, 1, 3]


In [20]:
# Filter an RDD: keep only the elements satisfying the predicate, then count them.
print(
    'The number of elements >= 3 : ',
    B.filter(lambda value: value >= 3).count()
)

The number of elements >= 3 :  500000


In [24]:
# Remove duplicates with distinct().
# Note: the recorded output shows that the original order is not preserved.
B = sc.parallelize([0, 1, 1, 3, 5, 5, 6])
print(
    'Duplicate RDD : ',
    B.collect()
)
print(
    'Distinct RDD : ',
    B.distinct().collect()
)

Duplicate RDD :  [0, 1, 1, 3, 5, 5, 6]
Distinct RDD :  [0, 1, 5, 6, 3]


In [28]:
# map or flatMap? => well suited to text data
T = sc.parallelize([
    'The world was young',
    'the mountains green',
    'the moon was seen',
    'Far over the misty mountains cold',
    'to dungeaons deep and caverns old',
    'we must away and break a day to seek a pen and shires a rock',
])
# map: yields one list of words per input sentence (a list of lists).
print(
    'Map function Split with spaces : ',
    T.map(lambda sentence: sentence.split(" ")).collect()
)
# flatMap: the per-sentence word lists are flattened into one single list.
print(
    'FlatMap function Split with spaces : ',
    T.flatMap(lambda sentence: sentence.split(" ")).collect()
)

Map function Split with spaces :  [['The', 'world', 'was', 'young'], ['the', 'mountains', 'green'], ['the', 'moon', 'was', 'seen'], ['Far', 'over', 'the', 'misty', 'mountains', 'cold'], ['to', 'dungeaons', 'deep', 'and', 'caverns', 'old'], ['we', 'must', 'away', 'and', 'break', 'a', 'day', 'to', 'seek', 'a', 'pen', 'and', 'shires', 'a', 'rock']]
Flatmap function Split with spaces :  ['The', 'world', 'was', 'young', 'the', 'mountains', 'green', 'the', 'moon', 'was', 'seen', 'Far', 'over', 'the', 'misty', 'mountains', 'cold', 'to', 'dungeaons', 'deep', 'and', 'caverns', 'old', 'we', 'must', 'away', 'and', 'break', 'a', 'day', 'to', 'seek', 'a', 'pen', 'and', 'shires', 'a', 'rock']


In [31]:
# Union: the elements of both RDDs put together, duplicates included.
rdd1 = sc.parallelize([
    1, 56464, 1216, 46, 4, 31, 36, 1, 31, 31, 3561654, 96, 4, 631, 23
])
rdd2 = sc.parallelize([
    654, 65, 46, 461, 31, 321, 65, 4, 6, 45
])
print(
    'Union of 2 RDDs : ',
    rdd1.union(rdd2).collect()
)
# Chaining distinct() after the union drops the repeated values.
print(
    'Union distinct of 2 RDDs : ',
    rdd1.union(rdd2).distinct().collect()
)

Union of 2 RDDs :  [1, 56464, 1216, 46, 4, 31, 36, 1, 31, 31, 3561654, 96, 4, 631, 23, 654, 65, 46, 461, 31, 321, 65, 4, 6, 45]
Union distinct of 2 RDDs :  [56464, 1216, 96, 1, 65, 321, 4, 36, 461, 45, 46, 3561654, 654, 6, 31, 631, 23]


In [32]:
# Intersection: the values found in both rdd1 and rdd2.
# The recorded output lists each common value once, even though 31
# appears several times in rdd1.
print(
    'intersection of 2 RDDs : ',
    rdd1.intersection(rdd2).collect()
)

intersection of 2 RDDs :  [4, 46, 31]


In [33]:
# Exclusion (subtract): the elements of rdd1 whose value does not occur in rdd2.
# The recorded output keeps rdd1's own duplicates (1 appears twice).
print(
    'subtract of RDD 1 of 2 : ',
    rdd1.subtract(rdd2).collect()
)

subtract of RDD 1 of 2 :  [56464, 1216, 96, 1, 1, 36, 3561654, 631, 23]


In [34]:
# Cartesian product => builds pairs (x, y) with x taken from rdd1 and y from rdd2.
print(
    'cartesian product of 2 RDDs: ',
    rdd1.cartesian(rdd2).collect()
)

cartesian product of 2 RDDs:  [(1, 654), (1, 65), (56464, 654), (56464, 65), (1216, 654), (1216, 65), (1, 46), (1, 461), (56464, 46), (56464, 461), (1216, 46), (1216, 461), (1, 31), (1, 321), (56464, 31), (56464, 321), (1216, 31), (1216, 321), (1, 65), (1, 4), (56464, 65), (56464, 4), (1216, 65), (1216, 4), (1, 6), (1, 45), (56464, 6), (56464, 45), (1216, 6), (1216, 45), (46, 654), (46, 65), (4, 654), (4, 65), (31, 654), (31, 65), (46, 46), (46, 461), (4, 46), (4, 461), (31, 46), (31, 461), (46, 31), (46, 321), (4, 31), (4, 321), (31, 31), (31, 321), (46, 65), (46, 4), (4, 65), (4, 4), (31, 65), (31, 4), (46, 6), (46, 45), (4, 6), (4, 45), (31, 6), (31, 45), (36, 654), (36, 65), (1, 654), (1, 65), (31, 654), (31, 65), (36, 46), (36, 461), (1, 46), (1, 461), (31, 46), (31, 461), (36, 31), (36, 321), (1, 31), (1, 321), (31, 31), (31, 321), (36, 65), (36, 4), (1, 65), (1, 4), (31, 65), (31, 4), (36, 6), (36, 45), (1, 6), (1, 45), (31, 6), (31, 45), (31, 654), (31, 65), (3561654, 654), (35