In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

In [2]:
conf = SparkConf().setAppName("RDD_doc_notes")
sc = SparkContext.getOrCreate(conf=conf)

In [21]:
sc

In [22]:
data = [1, 2, 3, 4, 5]
distData = sc.parallelize(data)
distData.map(lambda x: "*" * x).reduce(lambda a, b: a + b)

'***************'

In [4]:
rdd = sc.parallelize(range(1, 4)).map(lambda x: (x, "a" * x))
rdd

PythonRDD[3] at RDD at PythonRDD.scala:53

In [6]:
rdd.saveAsSequenceFile("path/to/file")

In [7]:
sorted(sc.sequenceFile("path/to/file").collect())

[(1, 'a'), (2, 'aa'), (3, 'aaa')]

In [8]:
lines = sc.textFile("/home/nghiaht/pyspark/mastering-large-datasets/Ch07/*.txt")
lineLengths = lines.map(lambda s: len(s))
lineLengths.persist()

lineLengths

PythonRDD[15] at RDD at PythonRDD.scala:53

In [9]:
totalLength = lineLengths.reduce(lambda a, b: a + b)
totalLength

82364

In [10]:
lineLengths

PythonRDD[15] at RDD at PythonRDD.scala:53

In [17]:
def score(word):
    total = 0
    for i, char in enumerate(word):
        if char.lower() in "dlcu":
            total += 1
        elif char.lower() in "mwfbygpvk":
            total += 2
        elif char.lower() in "jxqz":
            total += 4
        if i >= 4:
            total += 2
    return total


import re

PAT = re.compile(r"[-./:\s\xa0]+")

spark = SparkSession(sc)
spark

In [19]:
lines
xs = (
    lines.flatMap(lambda x: PAT.split(x))
    .filter(lambda x: len(x) > 10)
    .map(lambda x: (x, score(x)))
)
xs
print(xs.collect())

[('Reflections', 18), ('Skyscrapers', 21), ('confessions', 17), ("Something's", 18), ('entertainers,', 18), ('Truthfully,', 22), ("Everybody's", 23), ("Everybody's", 23), ("Everybody's", 23), ("everywhere's", 22), ('proclamations', 24), ('opportunities', 23), ('apprehended', 20), ('“apprehended”', 24), ('“apprehended”', 24), ('Christendom', 18), ('satisfaction', 19), ('irreligious', 18), ('irreligious', 18), ('discussions', 17), ('interminable', 21), ('conversations', 21), ('improvement', 22), ('themselves,', 19), ('Shakespeare’s', 22), ('contemptible', 24), ('“ministering”', 22), ('organisation', 18), ('impossible,', 21), ('associations', 17), ('intolerable', 18), ('unconditional', 22), ('Institutions,', 19), ('responsibility,', 29), ('possession_', 16), ('possession,', 16), ('discretion,', 16), ('intelligence,', 23), ('carelessness', 18), ('intelligence,', 23), ('_Mistresses_', 18), ('discretion,', 16), ('discretion?', 16), ('Mistresses,', 16), ('authority”?', 17), ('“authority”?', 1

In [20]:
df = spark.createDataFrame(xs, ["word", "score"])
df.show()

+-------------+-----+
|         word|score|
+-------------+-----+
|  Reflections|   18|
|  Skyscrapers|   21|
|  confessions|   17|
|  Something's|   18|
|entertainers,|   18|
|  Truthfully,|   22|
|  Everybody's|   23|
|  Everybody's|   23|
|  Everybody's|   23|
| everywhere's|   22|
|proclamations|   24|
|opportunities|   23|
|  apprehended|   20|
|“apprehended”|   24|
|“apprehended”|   24|
|  Christendom|   18|
| satisfaction|   19|
|  irreligious|   18|
|  irreligious|   18|
|  discussions|   17|
+-------------+-----+
only showing top 20 rows



In [34]:
# example accumlator

acc = sc.accumulator(0)
acc

Accumulator<id=2, value=0>

In [35]:
sc.parallelize((range(0, 10))).foreach(lambda x: acc.add(x))

In [36]:
acc

Accumulator<id=2, value=45>

In [37]:
acc.value

45

some transform and action

In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

In [3]:
conf = SparkConf().setAppName("some transform and action")
sc = SparkContext.getOrCreate(conf=conf)

In [13]:
rdd = sc.parallelize(range(20), 2)

# sorted(rdd.glom().collect())

In [17]:
rdd.mapPartitions(lambda x: [i * 2 for i in x]).glom().collect()

[[0, 2, 4, 6, 8, 10, 12, 14, 16, 18], [20, 22, 24, 26, 28, 30, 32, 34, 36, 38]]

In [19]:
rdd.glom().collect()

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]

In [68]:
r_dd = sc.parallelize(range(20))

In [45]:
sets = pairs.partitionBy(2).glom().collect()
sets

[[(2, '**'), (4, '****'), (2, '**'), (4, '****')],
 [(1, '*'), (3, '***'), (1, '*')]]

In [94]:
gb = sc.parallelize(["nghia", "airflow", "spark", "scala", "apache storm", "hadoop"])
result = gb.groupBy(lambda x: x[0]).collect()
[(x, tuple(y)) for (x, y) in result]

[('s', ('spark', 'scala')),
 ('a', ('airflow', 'apache storm')),
 ('h', ('hadoop',)),
 ('n', ('nghia',))]

In [95]:
rdd = sc.parallelize([1, 1, 2, 3, 5, 8])
result = rdd.groupBy(lambda x: x % 2).collect()
result
# sorted([(x, sorted(y)) for (x, y) in result])

[(0, <pyspark.resultiterable.ResultIterable at 0x7f54975b65e0>),
 (1, <pyspark.resultiterable.ResultIterable at 0x7f5497b22af0>)]

+---------------+-----+
|           name|bonus|
+---------------+-----+
|    James,Smith|300.0|
|      Anna,Rose|410.0|
|Robert,Williams|620.0|
+---------------+-----+

