In [2]:
#For Local Machine
import findspark as fs
fs.init()
fs.find()

'C:\\pyspark'

In [3]:
#Creating SPARK session
from pyspark.sql import SparkSession

spark=SparkSession. \
builder. \
appName('pySpark RDD W/ Pictures'). \
master('local'). \
getOrCreate()

In [4]:
sc=spark.sparkContext

In [5]:
sc

In [6]:
sc.getConf().getAll()

[(u'spark.master', u'local'),
 (u'spark.app.id', u'local-1589372449715'),
 (u'spark.rdd.compress', u'True'),
 (u'spark.app.name', u'pySpark RDD W/ Pictures'),
 (u'spark.serializer.objectStreamReset', u'100'),
 (u'spark.driver.host', u'DESKTOP-249MU1B.mshome.net'),
 (u'spark.executor.id', u'driver'),
 (u'spark.submit.deployMode', u'client'),
 (u'spark.driver.port', u'52566'),
 (u'spark.ui.showConsoleProgress', u'true')]

In [7]:
RDD=sc.parallelize([])

In [8]:
sc.parallelize?

[1;31mSignature:[0m [0msc[0m[1;33m.[0m[0mparallelize[0m[1;33m([0m[0mc[0m[1;33m,[0m [0mnumSlices[0m[1;33m=[0m[0mNone[0m[1;33m)[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Distribute a local Python collection to form an RDD. Using xrange
is recommended if the input represents a range for performance.

>>> sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect()
[[0], [2], [3], [4], [6]]
>>> sc.parallelize(xrange(0, 6, 2), 5).glom().collect()
[[], [0], [], [2], [4]]
[1;31mFile:[0m      c:\pyspark\python\pyspark\context.py
[1;31mType:[0m      instancemethod


### Introduction

* Each mnemonic uses blue rectangles to represent elements in the original RDD (resilient distributed dataset).
* The left side represents the input RDD and the right side represents the output.
* Elements in the output RDD may contain original elements (blue), elements with potentially different values (purple), or elements with potentially different data types (orange).
* In some cases, the output is a python object on the driver (dotted rectangle).
* When key-value pairs are critical to the operation, the 'key' is represented by a black square in the upper left corner, and the 'value' is represented by the remaining rectangle.
* User defined functions are represented by a green rectangle.
* When relevant to the core operation, partitions are represented by diagonal lines to the left of the RDD.  

<a>
<img align=left src="Share/files/images/pyspark-page2.svg" width=500 height=500>
</a>

<a href="http://spark.apache.org/docs/2.4.5/api/python/pyspark.html#pyspark.RDD.map">
<img align=left src="Share/files/images/pyspark-page3.svg" width=500 height=500 />
</a>

In [None]:
RDD.map?

In [None]:
# map1
x = sc.parallelize([1,2,3]) # sc = spark context, parallelize creates an RDD from the passed object
y = x.map(lambda x: (x,x**2))
print(x.collect())  # collect copies RDD elements to a list on the driver
print(y.collect())

In [None]:
# map2
x=sc.parallelize(['b','a','c'])
y=sorted(x.map(lambda x:(x,1)).collect())
print(y)

In [None]:
sorted?

In [None]:
RDD.collect?

<a href="http://spark.apache.org/docs/2.4.5/api/python/pyspark.html#pyspark.RDD.flatMap">
<img align=left src="Share/files/images/pyspark-page4.svg" width=500 height=500 />
</a>

In [None]:
RDD.flatMap?

In [None]:
# flatMap1
x = sc.parallelize([1,2,3])
y = x.flatMap(lambda x: (x, 100*x, x**2))
print(x.collect())
print(y.collect())

In [None]:
#flatMap2
x=sc.parallelize([2,3,4])
y=x.flatMap(lambda x:range(1,x)).collect()
print(y)

<a href="http://spark.apache.org/docs/2.4.5/api/python/pyspark.html#pyspark.RDD.mapPartitions">
<img align=left src="Share/files/images/pyspark-page5.svg" width=500 height=500 />
</a>

In [None]:
RDD.mapPartitions?

In [None]:
# mapPartitions1
x = sc.parallelize([1,2,3,4], 2)
def f(iterator): yield sum(iterator)
y = x.mapPartitions(f)
print(x.glom().collect())  # glom() flattens elements on the same partition
print(y.glom().collect())

In [None]:
RDD.glom?

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.mapPartitionsWithIndex">
<img align=left src="Share/files/images/pyspark-page6.svg" width=500 height=500 />
</a>

In [None]:
RDD.mapPartitionsWithIndex?

In [None]:
# mapPartitionsWithIndex
x = sc.parallelize([1,2,3,4], 2)
def f(partitionIndex, iterator): yield (partitionIndex,sum(iterator))
y = x.mapPartitionsWithIndex(f)
print(x.glom().collect())  # glom() flattens elements on the same partition
print(y.glom().collect())

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.getNumPartitions">
<img align=left src="Share/files/images/pyspark-page7.svg" width=500 height=500 />
</a>

In [7]:
# getNumPartitions
x = sc.parallelize([1,2,3,4], 3)
y = x.getNumPartitions()
print(x.glom().collect())
print(y)

[[1], [2], [3, 4]]
3


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.filter">
<img align=left src="Share/files/images/pyspark-page8.svg" width=500 height=500 />
</a>

In [8]:
RDD.filter?

[0;31mSignature:[0m [0mRDD[0m[0;34m.[0m[0mfilter[0m[0;34m([0m[0mf[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return a new RDD containing only the elements that satisfy a predicate.

>>> rdd = sc.parallelize([1, 2, 3, 4, 5])
>>> rdd.filter(lambda x: x % 2 == 0).collect()
[2, 4]
[0;31mFile:[0m      /opt/spark/python/pyspark/rdd.py
[0;31mType:[0m      method


In [9]:
# filter
x = sc.parallelize([1,2,3])
y = x.filter(lambda x: x%2 == 1)  # filters out even elements
print(x.collect())
print(y.collect())

[1, 2, 3]
[1, 3]


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.distinct">
<img align=left src="Share/files/images/pyspark-page9.svg" width=500 height=500 />
</a>

In [10]:
RDD.distinct?

[0;31mSignature:[0m [0mRDD[0m[0;34m.[0m[0mdistinct[0m[0;34m([0m[0mnumPartitions[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return a new RDD containing the distinct elements in this RDD.

>>> sorted(sc.parallelize([1, 1, 2, 3]).distinct().collect())
[1, 2, 3]
[0;31mFile:[0m      /opt/spark/python/pyspark/rdd.py
[0;31mType:[0m      method


In [12]:
# distinct
x = sc.parallelize(['A','A','B'])
y = x.distinct()
print(x.collect())
print(y.collect())

['A', 'A', 'B']
['A', 'B']


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.sample">
<img align=left src="Share/files/images/pyspark-page10.svg" width=500 height=500 />
</a>

In [27]:
# sample
x = sc.parallelize(range(7))
ylist = [x.sample(withReplacement=False, fraction=0.5) for i in range(5)] # call 'sample' 5 times
print('x = ' + str(x.collect()))
for cnt,y in zip(range(len(ylist)), ylist):
    print('sample:' + str(cnt) + ' y = ' +  str(y.collect()))

x = [0, 1, 2, 3, 4, 5, 6]
sample:0 y = [6]
sample:1 y = [1, 4, 5]
sample:2 y = [0, 3, 4, 5]
sample:3 y = [0, 2, 3, 5]
sample:4 y = [1, 3, 4, 5, 6]


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.takeSample">
<img align=left src="Share/files/images/pyspark-page11.svg" width=500 height=500 />
</a>

In [17]:
RDD.takeSample?

[0;31mSignature:[0m [0mRDD[0m[0;34m.[0m[0mtakeSample[0m[0;34m([0m[0mwithReplacement[0m[0;34m,[0m [0mnum[0m[0;34m,[0m [0mseed[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return a fixed-size sampled subset of this RDD.

.. note:: This method should only be used if the resulting array is expected
    to be small, as all the data is loaded into the driver's memory.

>>> rdd = sc.parallelize(range(0, 10))
>>> len(rdd.takeSample(True, 20, 1))
20
>>> len(rdd.takeSample(False, 5, 2))
5
>>> len(rdd.takeSample(False, 15, 3))
10
[0;31mFile:[0m      /opt/spark/python/pyspark/rdd.py
[0;31mType:[0m      method


In [16]:
# takeSample
x = sc.parallelize(range(7))
ylist = [x.takeSample(withReplacement=False, num=3) for i in range(5)]  # call 'sample' 5 times
print('x = ' + str(x.collect()))
for cnt,y in zip(range(len(ylist)), ylist):
    print('sample:' + str(cnt) + ' y = ' +  str(y))  # no collect on y

x = [0, 1, 2, 3, 4, 5, 6]
sample:0 y = [3, 0, 1]
sample:1 y = [2, 0, 1]
sample:2 y = [2, 4, 0]
sample:3 y = [2, 3, 1]
sample:4 y = [6, 5, 1]


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.union">
<img align=left src="Share/files/images/pyspark-page12.svg" width=500 height=500 />
</a>

In [20]:
RDD.union?

[0;31mSignature:[0m [0mRDD[0m[0;34m.[0m[0munion[0m[0;34m([0m[0mother[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return the union of this RDD and another one.

>>> rdd = sc.parallelize([1, 1, 2, 3])
>>> rdd.union(rdd).collect()
[1, 1, 2, 3, 1, 1, 2, 3]
[0;31mFile:[0m      /opt/spark/python/pyspark/rdd.py
[0;31mType:[0m      method


In [None]:
RDD>union

In [22]:
# union
x = sc.parallelize(['A','A','B'])
y = sc.parallelize(['D','C','A'])
print(x.union(y).collect())

['A', 'A', 'B', 'D', 'C', 'A']


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.intersection">
<img align=left src="Share/files/images/pyspark-page13.svg" width=500 height=500 />
</a>

In [26]:
RDD.intersection?

[0;31mSignature:[0m [0mRDD[0m[0;34m.[0m[0mintersection[0m[0;34m([0m[0mother[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return the intersection of this RDD and another one. The output will
not contain any duplicate elements, even if the input RDDs did.

.. note:: This method performs a shuffle internally.

>>> rdd1 = sc.parallelize([1, 10, 2, 3, 4, 5])
>>> rdd2 = sc.parallelize([1, 6, 2, 3, 7, 8])
>>> rdd1.intersection(rdd2).collect()
[1, 2, 3]
[0;31mFile:[0m      /opt/spark/python/pyspark/rdd.py
[0;31mType:[0m      method


In [29]:
# intersection
x = sc.parallelize(['A','A','B'])
y = sc.parallelize(['A','B','D'])
z = x.intersection(y).collect()
print(x.collect())
print(y.collect())
print(z)

['A', 'A', 'B']
['A', 'B', 'D']
['A', 'B']


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.sortByKey">
<img align=left src="Share/files/images/pyspark-page14.svg" width=500 height=500 />
</a>

In [31]:
RDD.sortByKey?

[0;31mSignature:[0m
[0mRDD[0m[0;34m.[0m[0msortByKey[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mascending[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnumPartitions[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkeyfunc[0m[0;34m=[0m[0;34m<[0m[0mfunction[0m [0mRDD[0m[0;34m.[0m[0;34m<[0m[0;32mlambda[0m[0;34m>[0m [0mat[0m [0;36m0x7f3868036488[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Sorts this RDD, which is assumed to consist of (key, value) pairs.

>>> tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]
>>> sc.parallelize(tmp).sortByKey().first()
('1', 3)
>>> sc.parallelize(tmp).sortByKey(True, 1).collect()
[('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]
>>> sc.parallelize(tmp).sortByKey(True, 2).collect()
[('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]
>>> tmp2 = [('Mary', 1), ('had', 2), ('a', 3), ('little', 4), ('lamb', 5)]

In [48]:
# sortByKey1
x = sc.parallelize([('1',1),('a',2),('C',3)])
y = x.sortByKey()
print(x.collect())
print(y.collect())

[('1', 1), ('a', 2), ('C', 3)]
[('1', 1), ('C', 3), ('a', 2)]


In [45]:
# sprtByKey2
tmp=[('Marry',1),('had',2),('a',3),('little',4),('lamb',5)]
tmp.extend([('fleece',6)])
x=sc.parallelize(tmp)
print(x.sortByKey(True,3).collect())
print(x.sortByKey(True,3,keyfunc=lambda k: k.lower()).collect())

In [58]:
tmp.extend?

[0;31mDocstring:[0m L.extend(iterable) -> None -- extend list by appending elements from the iterable
[0;31mType:[0m      builtin_function_or_method


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.sortBy">
<img align=left src="Share/files/images/pyspark-page15.svg" width=500 height=500 />
</a>

In [32]:
RDD.sortBy?

[0;31mSignature:[0m [0mRDD[0m[0;34m.[0m[0msortBy[0m[0;34m([0m[0mkeyfunc[0m[0;34m,[0m [0mascending[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0mnumPartitions[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Sorts this RDD by the given keyfunc

>>> tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]
>>> sc.parallelize(tmp).sortBy(lambda x: x[0]).collect()
[('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]
>>> sc.parallelize(tmp).sortBy(lambda x: x[1]).collect()
[('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]
[0;31mFile:[0m      /opt/spark/python/pyspark/rdd.py
[0;31mType:[0m      method


In [57]:
# sortBy
x = sc.parallelize(['Cat','Apple','Bat'])
y = x.sortBy(lambda k: k[0]).collect()
print(y)

['Apple', 'Bat', 'Cat']


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.glom">
<img align=left src="Share/files/images/pyspark-page16.svg" width=500 height=500 />
</a>

In [60]:
RDD.glom?

[0;31mSignature:[0m [0mRDD[0m[0;34m.[0m[0mglom[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return an RDD created by coalescing all elements within each partition
into a list.

>>> rdd = sc.parallelize([1, 2, 3, 4], 2)
>>> sorted(rdd.glom().collect())
[[1, 2], [3, 4]]
[0;31mFile:[0m      /opt/spark/python/pyspark/rdd.py
[0;31mType:[0m      method


In [65]:
# glom
x = sc.parallelize(['C','B','A'], 2)
y = x.glom()
print(x.collect()) 
print(y.collect())

['C', 'B', 'A']
[['C'], ['B', 'A']]


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.cartesian">
<img align=left src="Share/files/images/pyspark-page17.svg" width=500 height=500 />
</a>

In [67]:
RDD.cartesian?

[0;31mSignature:[0m [0mRDD[0m[0;34m.[0m[0mcartesian[0m[0;34m([0m[0mother[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return the Cartesian product of this RDD and another one, that is, the
RDD of all pairs of elements C{(a, b)} where C{a} is in C{self} and
C{b} is in C{other}.

>>> rdd = sc.parallelize([1, 2])
>>> sorted(rdd.cartesian(rdd).collect())
[(1, 1), (1, 2), (2, 1), (2, 2)]
[0;31mFile:[0m      /opt/spark/python/pyspark/rdd.py
[0;31mType:[0m      method


In [66]:
# cartesian
x = sc.parallelize(['A','B'])
y = sc.parallelize(['C','D'])
z = x.cartesian(y)
print(x.collect())
print(y.collect())
print(z.collect())

['A', 'B']
['C', 'D']
[('A', 'C'), ('A', 'D'), ('B', 'C'), ('B', 'D')]


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.groupBy">
<img align=left src="Share/files/images/pyspark-page18.svg" width=500 height=500 />
<

In [71]:
RDD.groupBy?

[0;31mSignature:[0m
[0mRDD[0m[0;34m.[0m[0mgroupBy[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mf[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnumPartitions[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpartitionFunc[0m[0;34m=[0m[0;34m<[0m[0mfunction[0m [0mportable_hash[0m [0mat[0m [0;36m0x7f3870378ea0[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return an RDD of grouped items.

>>> rdd = sc.parallelize([1, 1, 2, 3, 5, 8])
>>> result = rdd.groupBy(lambda x: x % 2).collect()
>>> sorted([(x, sorted(y)) for (x, y) in result])
[(0, [2, 8]), (1, [1, 1, 3, 5])]
[0;31mFile:[0m      /opt/spark/python/pyspark/rdd.py
[0;31mType:[0m      method


In [82]:
x=sc.parallelize([1,1,2,3,5,8])
result=x.groupBy(lambda x: 'A' if x%2==0 else 'B').collect()
print(sorted([(x, sorted(y)) for (x, y) in result]))

[('A', [2, 8]), ('B', [1, 1, 3, 5])]


In [13]:
# groupBy
x = sc.parallelize([1,2,3])
y = x.groupBy(lambda x: 'A' if (x%2 == 0) else 'B' ).collect()
print(x.collect())
print([(j[0],[i for i in j[1]]) for j in y])# y is nested, this iterates through it
#different way
print(sorted([(x,sorted(y)) for (x,y) in y]))

[1, 2, 3]
[('B', [1, 3]), ('A', [2])]
[('A', [2]), ('B', [1, 3])]


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.pipe">
<img align=left src="Share/files/images/pyspark-page19.svg" width=500 height=500 />
</a>

In [89]:
RDD.pipe?

[0;31mSignature:[0m [0mRDD[0m[0;34m.[0m[0mpipe[0m[0;34m([0m[0mcommand[0m[0;34m,[0m [0menv[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mcheckCode[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return an RDD created by piping elements to a forked external process.

>>> sc.parallelize(['1', '2', '', '3']).pipe('cat').collect()
['1', '2', '', '3']

:param checkCode: whether or not to check the return value of the shell command.
[0;31mFile:[0m      /opt/spark/python/pyspark/rdd.py
[0;31mType:[0m      method


In [17]:
#Linux
x=sc.parallelize(['1','2','','3'])
y=x.pipe('grep "3"')
print(y.collect())

['3']


## Actions & Transformations difference
Map is a transformation, thus when you perform a map you apply a function to each element in the RDD and return a new RDD where additional transformations or actions can be called.

Foreach is an action, it takes each element and applies a function, but it does not return a value. This is particularly useful in you have to call perform some calculation on an RDD and log the result somewhere else, for example a database or call a REST API with each element in the RDD.

For example let's say that you have an RDD with many queries that you wish to log in another system. The queries are stored in an RDD.

Then you want to save those queries in another system via a call to another 

Now you have executed the log_query on each element of the RDD. If you have done a map, nothing would have happened yet, until you called an action.

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.foreach">
<img align=left src="Share/files/images/pyspark-page20.svg" width=500 height=500 />
</a>

In [7]:
RDD.foreach?

[0;31mSignature:[0m [0mRDD[0m[0;34m.[0m[0mforeach[0m[0;34m([0m[0mf[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Applies a function to all elements of this RDD.

>>> def f(x): print(x)
>>> sc.parallelize([1, 2, 3, 4, 5]).foreach(f)
[0;31mFile:[0m      /opt/spark/python/pyspark/rdd.py
[0;31mType:[0m      method


In [70]:
# foreach action best example
from __future__ import print_function
x = sc.parallelize([1,2,3,2,1])
def f(el): 
    '''side effect: append the current RDD elements to a file'''
    f1=open("Share/foreachExample.txt", 'a+') 
    print(el,file=f1)

open('Share/foreachExample.txt', 'w').close()  # first clear the file contents

y = x.foreach(f) # writes into foreachExample.txt

print(x.collect())
print(y) # foreach returns 'None'
# print the contents of foreachExample.txt
f1=open("Share/foreachExample.txt", 'r')
print(f1.read())

[1, 2, 3, 2, 1]
None
1
2
3
2
1



In [93]:
#again this is bullshit
def f(x):
    f=open("Share/exp1.txt",'a+')
    print([x],file=f)
x=sc.parallelize([1,2,3,4,5])
x.foreach(f)
print(x.glom().collect())

[[1, 2, 3, 4, 5]]


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.foreachPartition">
<img align=left src="Share/files/images/pyspark-page21.svg" width=500 height=500 />
</a>

In [75]:
RDD.foreachPartition?

[0;31mSignature:[0m [0mRDD[0m[0;34m.[0m[0mforeachPartition[0m[0;34m([0m[0mf[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Applies a function to each partition of this RDD.

>>> def f(iterator):
...     for x in iterator:
...          print(x)
>>> sc.parallelize([1, 2, 3, 4, 5]).foreachPartition(f)
[0;31mFile:[0m      /opt/spark/python/pyspark/rdd.py
[0;31mType:[0m      method


In [82]:
# foreachPartition
#from __future__ import print_function
x = sc.parallelize([1,2,3,4,5,6,7,8],5)
def f(parition):
    '''side effect: append the current RDD partition contents to a file'''
    f1=open("Share/foreachPartitionExample.txt", 'a+') 
    print([el for el in parition],file=f1)

open('Share/foreachPartitionExample.txt', 'w').close()  # first clear the file contents

y = x.foreachPartition(f) # writes into foreachExample.txt

print(x.glom().collect())
print(y)  # foreach returns 'None'
# print the contents of foreachExample.txt
f1 = open("Share/foreachPartitionExample.txt", "r")
print (f1.read())

[[1], [2, 3], [4], [5, 6], [7, 8]]
None
[1]
[2, 3]
[4]
[5, 6]
[7, 8]



In [91]:
#Not BUllshit anywhere
def f(x):
    f=open("Share/exp.txt",'a+')
    print([i for i in x],file=f)
x=sc.parallelize([1,2,3,4,5],3)
x.foreachPartition(f)
print(x.glom().collect())

[[1], [2, 3], [4, 5]]


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.collect">
<img align=left src="Share/files/images/pyspark-page22.svg" width=500 height=500 />
</a>

In [94]:
RDD.collect?

[0;31mSignature:[0m [0mRDD[0m[0;34m.[0m[0mcollect[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return a list that contains all of the elements in this RDD.

.. note:: This method should only be used if the resulting array is expected
    to be small, as all the data is loaded into the driver's memory.
[0;31mFile:[0m      /opt/spark/python/pyspark/rdd.py
[0;31mType:[0m      method


In [96]:
# collect
x = sc.parallelize([1,2,3])
y = x.collect()
print(x,y)  # distributed # not distributed

ParallelCollectionRDD[169] at parallelize at PythonRDD.scala:195 [1, 2, 3]


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.reduce">
<img align=left src="Share/files/images/pyspark-page23.svg" width=500 height=500 />
</a>

In [97]:
RDD.reduce?

[0;31mSignature:[0m [0mRDD[0m[0;34m.[0m[0mreduce[0m[0;34m([0m[0mf[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Reduces the elements of this RDD using the specified commutative and
associative binary operator. Currently reduces partitions locally.

>>> from operator import add
>>> sc.parallelize([1, 2, 3, 4, 5]).reduce(add)
15
>>> sc.parallelize((2 for _ in range(10))).map(lambda x: 1).cache().reduce(add)
10
>>> sc.parallelize([]).reduce(add)
Traceback (most recent call last):
    ...
ValueError: Can not reduce() empty RDD
[0;31mFile:[0m      /opt/spark/python/pyspark/rdd.py
[0;31mType:[0m      method


In [124]:
print(sc.parallelize([1,2,3,4,5]).reduce(add))

15


In [130]:
print(sc.parallelize((6 for i in range(10))).map(lambda x:1).reduce(add))

10


In [133]:
# reduce
x = sc.parallelize([1,2,3])
y = x.reduce(add)  # computes a cumulative sum
print(x.collect())
print(y)

[1, 2, 3]
6


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.fold">
<img align=left src="Share/files/images/pyspark-page24.svg" width=500 height=500 />
</a>

In [134]:
RDD.fold?

[0;31mSignature:[0m [0mRDD[0m[0;34m.[0m[0mfold[0m[0;34m([0m[0mzeroValue[0m[0;34m,[0m [0mop[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Aggregate the elements of each partition, and then the results for all
the partitions, using a given associative function and a neutral "zero value."

The function C{op(t1, t2)} is allowed to modify C{t1} and return it
as its result value to avoid object allocation; however, it should not
modify C{t2}.

This behaves somewhat differently from fold operations implemented
for non-distributed collections in functional languages like Scala.
This fold operation may be applied to partitions individually, and then
fold those results into the final result, rather than apply the fold
to each element sequentially in some defined ordering. For functions
that are not commutative, the result may differ from that of a fold
applied to a non-distributed collection.

>>> from operator import add
>>> sc.parallelize([1, 2, 3, 4, 5]).fold(

In [9]:
# fold
x = sc.parallelize([1,2,3])
neutral_zero_value = 1  # 0 for sum, 1 for multiplication
y = x.fold(neutral_zero_value,lambda obj, accumulated: accumulated * obj) # computes cumulative sum
print(x.collect())
print(y)

[1, 2, 3]
6


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.aggregate">
<img align=left src="Share/files/images/pyspark-page25.svg" width=500 height=500 />
</a>

Description:

aggregate() lets you take an RDD and generate a single value that is of a different type than what was stored in the original RDD.

zeroValue: The initialization value, for your result, in the desired format.
seqOp: The operation you want to apply to RDD records. Runs once for every record in a partition.
combOp: Defines how the resulted objects (one for every partition), gets combined.

In [38]:
seqOp=(lambda x,y:(x[0]+y,x[1]+1))
combOp=(lambda x,y:(x[0]+y[0],x[1]+y[1]))
y=sc.parallelize([1,2,3,4,5],2).aggregate((0,0),seqOp,combOp)
print(y)

(15, 5)


In [41]:
# aggregate Addition and Multiplication
x = sc.parallelize([1,2,3,4],2)
neutral_zero_value = (0,1) # sum: x+0 = x, product: 1*x = x
seqOp = (lambda aggregated, el: (aggregated[0] + el, aggregated[1] * el)) 
combOp = (lambda aggregated, el: (aggregated[0] + el[0], aggregated[1] * el[1]))
y = x.aggregate(neutral_zero_value,seqOp,combOp)  # computes (cumulative sum, cumulative product)
print(x.collect())
print(y)

[1, 2, 3, 4]
(10, 24)


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.max">
<img align=left src="Share/files/images/pyspark-page26.svg" width=500 height=500 />
</a>

In [42]:
RDD.max?

[0;31mSignature:[0m [0mRDD[0m[0;34m.[0m[0mmax[0m[0;34m([0m[0mkey[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Find the maximum item in this RDD.

:param key: A function used to generate key for comparing

>>> rdd = sc.parallelize([1.0, 5.0, 43.0, 10.0])
>>> rdd.max()
43.0
>>> rdd.max(key=str)
5.0
[0;31mFile:[0m      /opt/spark/python/pyspark/rdd.py
[0;31mType:[0m      method


In [57]:
# max
x = sc.parallelize([1.0,5.0,10.0])
y1=x.max() #Actual max value
y2=x.max(key=str) #convert to string and find largest string (5.0>10.0)
y = x.max(key=lambda x:-x) #-1.0 greater than all - Use max ax min
print(x.collect())
print(y1)
print(y2)
print(y)

[1.0, 5.0, 10.0]
10.0
5.0
1.0


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.min">
<img align=left src="Share/files/images/pyspark-page27.svg" width=500 height=500 />
</a>

In [75]:
RDD.min?

[0;31mSignature:[0m [0mRDD[0m[0;34m.[0m[0mmin[0m[0;34m([0m[0mkey[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Find the minimum item in this RDD.

:param key: A function used to generate key for comparing

>>> rdd = sc.parallelize([2.0, 5.0, 43.0, 10.0])
>>> rdd.min()
2.0
>>> rdd.min(key=str)
10.0
[0;31mFile:[0m      /opt/spark/python/pyspark/rdd.py
[0;31mType:[0m      method


In [None]:
RDD>min

In [60]:
# min
x = sc.parallelize([1,3,2])
y = x.min(key=lambda x:-x)
print(x.collect())
print(y)

[1, 3, 2]
3


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.sum">
<img align=left src="Share/files/images/pyspark-page28.svg" width=500 height=500 />
</a>

In [62]:
RDD.sum?

[0;31mSignature:[0m [0mRDD[0m[0;34m.[0m[0msum[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Add up the elements in this RDD.

>>> sc.parallelize([1.0, 2.0, 3.0]).sum()
6.0
[0;31mFile:[0m      /opt/spark/python/pyspark/rdd.py
[0;31mType:[0m      method


In [74]:
# sum
x = sc.parallelize([1,3,2,4,5,6],3)
y=x.sum()
print(x.glom().collect())
print(y)

[[1, 3], [2, 4], [5, 6]]
21


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.count">
<img align=left src="Share/files/images/pyspark-page29.svg" width=500 height=500 />
</a>

In [8]:
RDD.count?

[0;31mSignature:[0m [0mRDD[0m[0;34m.[0m[0mcount[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return the number of elements in this RDD.

>>> sc.parallelize([2, 3, 4]).count()
3
[0;31mFile:[0m      /opt/spark/python/pyspark/rdd.py
[0;31mType:[0m      method


In [33]:
# count
x = sc.parallelize([1,3,2],3)
y = x.count()
print(x.collect())
print(y)

[1, 3, 2]
3


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.histogram">
<img align=left src="Share/files/images/pyspark-page30.svg" width=500 height=500 />
</a>

In [36]:
RDD.histogram?

[0;31mSignature:[0m [0mRDD[0m[0;34m.[0m[0mhistogram[0m[0;34m([0m[0mbuckets[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Compute a histogram using the provided buckets. The buckets
are all open to the right except for the last which is closed.
e.g. [1,10,20,50] means the buckets are [1,10) [10,20) [20,50],
which means 1<=x<10, 10<=x<20, 20<=x<=50. And on the input of 1
and 50 we would have a histogram of 1,0,1.

If your histogram is evenly spaced (e.g. [0, 10, 20, 30]),
this can be switched from an O(log n) inseration to O(1) per
element (where n is the number of buckets).

Buckets must be sorted, not contain any duplicates, and have
at least two elements.

If `buckets` is a number, it will generate buckets which are
evenly spaced between the minimum and maximum of the RDD. For
example, if the min value is 0 and the max is 100, given `buckets`
as 2, the resulting buckets will be [0,50) [50,100]. `buckets` must
be at least 1. An exception is raised if the RD

In [45]:
# histogram (example #1)
x = sc.parallelize([1,3,1,2,3])
y = x.histogram(2)
print(x.collect())
print(y)

[1, 3, 1, 2, 3]
([1, 2, 3], [2, 3])


In [51]:
# histogram (example #2)
x = sc.parallelize([1,3,1,2,3])
y = x.histogram([0,0.5,1,1.5,2,2.5,3,3.5])
print(x.collect())
print(y)

[1, 3, 1, 2, 3]
([0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5], [0, 0, 2, 0, 1, 0, 2])


In [56]:
print(sc.parallelize([0,10,20,30,40,50,60]).histogram(2))

([0, 30, 60], [3, 4])


In [40]:
print(sc.parallelize(range(5)).histogram([0,5,10,15,20]))

([0, 5, 10, 15, 20], [5, 0, 0, 0])


In [29]:
print(sc.parallelize(range(51)).histogram(2))

([0, 25, 50], [25, 26])


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.mean">
<img align=left src="Share/files/images/pyspark-page31.svg" width=500 height=500 />
</a>

In [57]:
RDD.mean?

[1;31mSignature:[0m [0mRDD[0m[1;33m.[0m[0mmean[0m[1;33m([0m[1;33m)[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Compute the mean of this RDD's elements.

>>> sc.parallelize([1, 2, 3]).mean()
2.0
[1;31mFile:[0m      c:\pyspark\python\pyspark\rdd.py
[1;31mType:[0m      instancemethod


In [59]:
# mean
x = sc.parallelize([1,3,2,4,5,6,7,8],3)
y = x.mean()
print(x.collect())
print(y)

[1, 3, 2, 4, 5, 6, 7, 8]
4.5


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.variance">
<img align=left src="Share/files/images/pyspark-page32.svg" width=500 height=500 />
</a>

In [60]:
RDD.variance?

[1;31mSignature:[0m [0mRDD[0m[1;33m.[0m[0mvariance[0m[1;33m([0m[1;33m)[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Compute the variance of this RDD's elements.

>>> sc.parallelize([1, 2, 3]).variance()
0.666...
[1;31mFile:[0m      c:\pyspark\python\pyspark\rdd.py
[1;31mType:[0m      instancemethod


In [61]:
# variance
x = sc.parallelize([1,3,2])
y = x.variance()  # divides by N
print(x.collect())
print(y)

[1, 3, 2]
0.666666666667


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.stdev">
<img align=left src="Share/files/images/pyspark-page33.svg" width=500 height=500 />
</a>

In [62]:
RDD.stdev?

[1;31mSignature:[0m [0mRDD[0m[1;33m.[0m[0mstdev[0m[1;33m([0m[1;33m)[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Compute the standard deviation of this RDD's elements.

>>> sc.parallelize([1, 2, 3]).stdev()
0.816...
[1;31mFile:[0m      c:\pyspark\python\pyspark\rdd.py
[1;31mType:[0m      instancemethod


In [63]:
# stdev
x = sc.parallelize([1,3,2])
y = x.stdev()  # divides by N
print(x.collect())
print(y)

[1, 3, 2]
0.816496580927726


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.sampleStdev">
<img align=left src="Share/files/images/pyspark-page34.svg" width=500 height=500 />
</a>

In [65]:
RDD.sampleStdev?

[1;31mSignature:[0m [0mRDD[0m[1;33m.[0m[0msampleStdev[0m[1;33m([0m[1;33m)[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Compute the sample standard deviation of this RDD's elements (which
corrects for bias in estimating the standard deviation by dividing by
N-1 instead of N).

>>> sc.parallelize([1, 2, 3]).sampleStdev()
1.0
[1;31mFile:[0m      c:\pyspark\python\pyspark\rdd.py
[1;31mType:[0m      instancemethod


In [66]:
# sampleStdev
x = sc.parallelize([1,3,2])
y = x.sampleStdev() # divides by N-1
print(x.collect())
print(y)

[1, 3, 2]
1.0


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.sampleVariance">
<img align=left src="Share/files/images/pyspark-page35.svg" width=500 height=500 />
</a>

In [67]:
RDD.sampleVariance?

[1;31mSignature:[0m [0mRDD[0m[1;33m.[0m[0msampleVariance[0m[1;33m([0m[1;33m)[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Compute the sample variance of this RDD's elements (which corrects
for bias in estimating the variance by dividing by N-1 instead of N).

>>> sc.parallelize([1, 2, 3]).sampleVariance()
1.0
[1;31mFile:[0m      c:\pyspark\python\pyspark\rdd.py
[1;31mType:[0m      instancemethod


In [68]:
# sampleVariance
x = sc.parallelize([1,3,2])
y = x.sampleVariance()  # divides by N-1
print(x.collect())
print(y)

[1, 3, 2]
1.0


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.countByValue">
<img align=left src="Share/files/images/pyspark-page36.svg" width=500 height=500 />
</a>

In [8]:
RDD.countByValue?

[1;31mSignature:[0m [0mRDD[0m[1;33m.[0m[0mcountByValue[0m[1;33m([0m[1;33m)[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Return the count of each unique value in this RDD as a dictionary of
(value, count) pairs.

>>> sorted(sc.parallelize([1, 2, 1, 2, 2], 2).countByValue().items())
[(1, 2), (2, 3)]
[1;31mFile:[0m      c:\pyspark\python\pyspark\rdd.py
[1;31mType:[0m      instancemethod


In [18]:
# countByValue
x = sc.parallelize(["a","b","a"],2)
y = x.countByValue().items()
print(x.collect())
print(y)

['a', 'b', 'a']
[('a', 2), ('b', 1)]


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.top">
<img align=left src="Share/files/images/pyspark-page37.svg" width=500 height=500 />
</a>

In [11]:
RDD.top?

[1;31mSignature:[0m [0mRDD[0m[1;33m.[0m[0mtop[0m[1;33m([0m[0mnum[0m[1;33m,[0m [0mkey[0m[1;33m=[0m[0mNone[0m[1;33m)[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Get the top N elements from an RDD.

.. note:: This method should only be used if the resulting array is expected
    to be small, as all the data is loaded into the driver's memory.

.. note:: It returns the list sorted in descending order.

>>> sc.parallelize([10, 4, 2, 12, 3]).top(1)
[12]
>>> sc.parallelize([2, 3, 4, 5, 6], 2).top(2)
[6, 5]
>>> sc.parallelize([10, 4, 2, 12, 3]).top(3, key=str)
[4, 3, 2]
[1;31mFile:[0m      c:\pyspark\python\pyspark\rdd.py
[1;31mType:[0m      instancemethod


In [28]:
# top - Sorts indescending order and Limits number of O/P
x = sc.parallelize(["a","b","c"])
y = x.top(num = 2)
print(x.collect())
print(y)

['a', 'b', 'c']
['c', 'b']


In [29]:
x = sc.parallelize([1,2,10,4,12,3])
y = x.top(num = 3, key=str)
print(x.collect())
print(y)

[1, 2, 10, 4, 12, 3]
[4, 3, 2]


In [30]:
x = sc.parallelize([1,2,10,4,12,3])
y = x.top(num = 3, key=lambda x:-x)
print(x.collect())
print(y)

[1, 2, 10, 4, 12, 3]
[1, 2, 3]


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.takeOrdered">
<img align=left src="Share/files/images/pyspark-page38.svg" width=500 height=500 />
</a>

In [24]:
RDD.takeOrdered?

[1;31mSignature:[0m [0mRDD[0m[1;33m.[0m[0mtakeOrdered[0m[1;33m([0m[0mnum[0m[1;33m,[0m [0mkey[0m[1;33m=[0m[0mNone[0m[1;33m)[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Get the N elements from an RDD ordered in ascending order or as
specified by the optional key function.

.. note:: this method should only be used if the resulting array is expected
    to be small, as all the data is loaded into the driver's memory.

>>> sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7]).takeOrdered(6)
[1, 2, 3, 4, 5, 6]
>>> sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7], 2).takeOrdered(6, key=lambda x: -x)
[10, 9, 7, 6, 5, 4]
[1;31mFile:[0m      c:\pyspark\python\pyspark\rdd.py
[1;31mType:[0m      instancemethod


In [31]:
# takeOrdered
x = sc.parallelize([1,3,1,2,3])
y = x.takeOrdered(num = 3)
print(x.collect())
print(y)

[1, 3, 1, 2, 3]
[1, 1, 2]


In [26]:
x = sc.parallelize([1,3,1,2,3])
y = x.takeOrdered(num = 3,key=lambda x:-x)
print(x.collect())
print(y)

[1, 3, 1, 2, 3]
[3, 3, 2]


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.take">
<img align=left src="Share/files/images/pyspark-page39.svg" width=500 height=500 />
</a>

In [32]:
RDD.take?

[1;31mSignature:[0m [0mRDD[0m[1;33m.[0m[0mtake[0m[1;33m([0m[0mnum[0m[1;33m)[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Take the first num elements of the RDD.

It works by first scanning one partition, and use the results from
that partition to estimate the number of additional partitions needed
to satisfy the limit.

Translated from the Scala implementation in RDD#take().

.. note:: this method should only be used if the resulting array is expected
    to be small, as all the data is loaded into the driver's memory.

>>> sc.parallelize([2, 3, 4, 5, 6]).cache().take(2)
[2, 3]
>>> sc.parallelize([2, 3, 4, 5, 6]).take(10)
[2, 3, 4, 5, 6]
>>> sc.parallelize(range(100), 100).filter(lambda x: x > 90).take(3)
[91, 92, 93]
[1;31mFile:[0m      c:\pyspark\python\pyspark\rdd.py
[1;31mType:[0m      instancemethod


In [39]:
# take
x = sc.parallelize([1,3,1,2,3,45,67,33],3).cache()
y = x.take(num = 3)
print(x.glom().collect())
print(y)

[[1, 3], [1, 2], [3, 45, 67, 33]]
[1, 3, 1]


In [46]:
x=sc.parallelize(range(100),50)
y=x.filter(lambda x:x>90).take(3)
print(y)

[91, 92, 93]


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.first">
<img align=left src="Share/files/images/pyspark-page40.svg" width=500 height=500 />
</a>

In [47]:
RDD.first?

[1;31mSignature:[0m [0mRDD[0m[1;33m.[0m[0mfirst[0m[1;33m([0m[1;33m)[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Return the first element in this RDD.

>>> sc.parallelize([2, 3, 4]).first()
2
>>> sc.parallelize([]).first()
Traceback (most recent call last):
    ...
ValueError: RDD is empty
[1;31mFile:[0m      c:\pyspark\python\pyspark\rdd.py
[1;31mType:[0m      instancemethod


In [None]:
# first
x = sc.parallelize([1,3,1,2,3])
y = x.first()
print(x.collect())
print(y)

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.collectAsMap">
<img align=left src="Share/files/images/pyspark-page41.svg" width=500 height=500 />
</a>

In [48]:
RDD.collectAsMap?

[1;31mSignature:[0m [0mRDD[0m[1;33m.[0m[0mcollectAsMap[0m[1;33m([0m[1;33m)[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Return the key-value pairs in this RDD to the master as a dictionary.

.. note:: this method should only be used if the resulting data is expected
    to be small, as all the data is loaded into the driver's memory.

>>> m = sc.parallelize([(1, 2), (3, 4)]).collectAsMap()
>>> m[1]
2
>>> m[3]
4
[1;31mFile:[0m      c:\pyspark\python\pyspark\rdd.py
[1;31mType:[0m      instancemethod


In [28]:
# collectAsMap
x = sc.parallelize([('C',3),('A',1),('B',2)])
y = x.collectAsMap()
print(x.collect())
print(sorted(y))

[('C', 3), ('A', 1), ('B', 2)]
['A', 'B', 'C']


In [24]:
# collectAsMap
x = sc.parallelize([(1,2),(3,4)])
y = x.collectAsMap()
print(y[1])
print(y[3])

2
4


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.keys">
<img align=left src="Share/files/images/pyspark-page42.svg" width=500 height=500 />
</a>

In [25]:
RDD.keys?

[1;31mSignature:[0m [0mRDD[0m[1;33m.[0m[0mkeys[0m[1;33m([0m[1;33m)[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Return an RDD with the keys of each tuple.

>>> m = sc.parallelize([(1, 2), (3, 4)]).keys()
>>> m.collect()
[1, 3]
[1;31mFile:[0m      c:\pyspark\python\pyspark\rdd.py
[1;31mType:[0m      instancemethod


In [29]:
# keys
x = sc.parallelize([('C',3),('A',1),('B',2)])
y = x.keys()
print(type(y))
print(x.collect())
print(sorted(y.collect()))

<class 'pyspark.rdd.PipelinedRDD'>
[('C', 3), ('A', 1), ('B', 2)]
['A', 'B', 'C']


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.values">
<img align=left src="Share/files/images/pyspark-page43.svg" width=500 height=500 />
</a>

In [30]:
RDD.values?

[1;31mSignature:[0m [0mRDD[0m[1;33m.[0m[0mvalues[0m[1;33m([0m[1;33m)[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Return an RDD with the values of each tuple.

>>> m = sc.parallelize([(1, 2), (3, 4)]).values()
>>> m.collect()
[2, 4]
[1;31mFile:[0m      c:\pyspark\python\pyspark\rdd.py
[1;31mType:[0m      instancemethod


In [9]:
# values
x = sc.parallelize([('C',3),('A',1),('B',2)])
y = x.values()
print(x.collect())
print(y.collect())

[('C', 3), ('A', 1), ('B', 2)]
[3, 1, 2]


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.reduceByKey">
<img align=left src="Share/files/images/pyspark-page44.svg" width=500 height=500 />
</a>

In [11]:
RDD.reduceByKey?

[1;31mSignature:[0m [0mRDD[0m[1;33m.[0m[0mreduceByKey[0m[1;33m([0m[0mfunc[0m[1;33m,[0m [0mnumPartitions[0m[1;33m=[0m[0mNone[0m[1;33m,[0m [0mpartitionFunc[0m[1;33m=[0m[1;33m<[0m[0mfunction[0m [0mportable_hash[0m [0mat[0m [1;36m0x00000000068E3438[0m[1;33m>[0m[1;33m)[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Merge the values for each key using an associative and commutative reduce function.

This will also perform the merging locally on each mapper before
sending results to a reducer, similarly to a "combiner" in MapReduce.

Output will be partitioned with C{numPartitions} partitions, or
the default parallelism level if C{numPartitions} is not specified.
Default partitioner is hash-partition.

>>> from operator import add
>>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
>>> sorted(rdd.reduceByKey(add).collect())
[('a', 2), ('b', 1)]
[1;31mFile:[0m      c:\pyspark\python\pyspark\rdd.py
[1;31mType:[0m      instancemethod


In [10]:
# reduceByKey
x = sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])
y = x.reduceByKey(lambda agg, obj: agg + obj)
print(x.collect())
print(y.collect())

[('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)]
[('A', 12), ('B', 3)]


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.reduceByKeyLocally">
<img align=left src="Share/files/images/pyspark-page45.svg" width=500 height=500 />
</a>

In [12]:
RDD.reduceByKeyLocally?

[1;31mSignature:[0m [0mRDD[0m[1;33m.[0m[0mreduceByKeyLocally[0m[1;33m([0m[0mfunc[0m[1;33m)[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Merge the values for each key using an associative and commutative reduce function, but
return the results immediately to the master as a dictionary.

This will also perform the merging locally on each mapper before
sending results to a reducer, similarly to a "combiner" in MapReduce.

>>> from operator import add
>>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
>>> sorted(rdd.reduceByKeyLocally(add).items())
[('a', 2), ('b', 1)]
[1;31mFile:[0m      c:\pyspark\python\pyspark\rdd.py
[1;31mType:[0m      instancemethod


In [24]:
# reduceByKeyLocally
x = sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])
y = x.reduceByKeyLocally(lambda agg, obj: agg + obj)
print(x.collect())
print(y)
print(type(y['A']))
print(y.items()[0])

[('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)]
{'A': 12, 'B': 3}
<type 'int'>
('A', 12)


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.countByKey">
<img align=left src="Share/files/images/pyspark-page46.svg" width=500 height=500 />
</a>

In [34]:
RDD.countByValue?

[1;31mSignature:[0m [0mRDD[0m[1;33m.[0m[0mcountByValue[0m[1;33m([0m[1;33m)[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Return the count of each unique value in this RDD as a dictionary of
(value, count) pairs.

>>> sorted(sc.parallelize([1, 2, 1, 2, 2], 2).countByValue().items())
[(1, 2), (2, 3)]
[1;31mFile:[0m      c:\pyspark\python\pyspark\rdd.py
[1;31mType:[0m      instancemethod


In [36]:
# countByKey
x = sc.parallelize([('B',1),('B',2),('A',1),('A',2),('A',5)])
y = x.countByValue()
print(x.collect())
print(y.items())

[('B', 1), ('B', 2), ('A', 1), ('A', 2), ('A', 5)]
[(('A', 1), 1), (('B', 2), 1), (('B', 1), 1), (('A', 5), 1), (('A', 2), 1)]


<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.join">
<img align=left src="Share/files/images/pyspark-page47.svg" width=500 height=500 />
</a>

In [33]:
RDD.join?

[1;31mSignature:[0m [0mRDD[0m[1;33m.[0m[0mjoin[0m[1;33m([0m[0mother[0m[1;33m,[0m [0mnumPartitions[0m[1;33m=[0m[0mNone[0m[1;33m)[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Return an RDD containing all pairs of elements with matching keys in
C{self} and C{other}.

Each pair of elements will be returned as a (k, (v1, v2)) tuple, where
(k, v1) is in C{self} and (k, v2) is in C{other}.

Performs a hash join across the cluster.

>>> x = sc.parallelize([("a", 1), ("b", 4)])
>>> y = sc.parallelize([("a", 2), ("a", 3)])
>>> sorted(x.join(y).collect())
[('a', (1, 2)), ('a', (1, 3))]
[1;31mFile:[0m      c:\pyspark\python\pyspark\rdd.py
[1;31mType:[0m      instancemethod


In [None]:
# join
x = sc.parallelize([('C',4),('B',3),('A',2),('A',1)])
y = sc.parallelize([('A',8),('B',7),('A',6),('D',5)])
z = x.join(y)
print(x.collect())
print(y.collect())
print(z.collect())

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.leftOuterJoin">
<img align=left src="Share/files/images/pyspark-page48.svg" width=500 height=500 />
</a>

In [None]:
# leftOuterJoin
x = sc.parallelize([('C',4),('B',3),('A',2),('A',1)])
y = sc.parallelize([('A',8),('B',7),('A',6),('D',5)])
z = x.leftOuterJoin(y)
print(x.collect())
print(y.collect())
print(z.collect())

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.rightOuterJoin">
<img align=left src="Share/files/images/pyspark-page49.svg" width=500 height=500 />
</a>

In [None]:
# rightOuterJoin
x = sc.parallelize([('C',4),('B',3),('A',2),('A',1)])
y = sc.parallelize([('A',8),('B',7),('A',6),('D',5)])
z = x.rightOuterJoin(y)
print(x.collect())
print(y.collect())
print(z.collect())

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.partitionBy">
<img align=left src="Share/files/images/pyspark-page50.svg" width=500 height=500 />
</a>

In [None]:
# partitionBy
x = sc.parallelize([(0,1),(1,2),(2,3)],2)
y = x.partitionBy(numPartitions = 3, partitionFunc = lambda x: x)  # only key is passed to paritionFunc
print(x.glom().collect())
print(y.glom().collect())

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.combineByKey">
<img align=left src="Share/files/images/pyspark-page51.svg" width=500 height=500 />
</a>

In [None]:
# combineByKey
x = sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])
createCombiner = (lambda el: [(el,el**2)]) 
mergeVal = (lambda aggregated, el: aggregated + [(el,el**2)]) # append to aggregated
mergeComb = (lambda agg1,agg2: agg1 + agg2 )  # append agg1 with agg2
y = x.combineByKey(createCombiner,mergeVal,mergeComb)
print(x.collect())
print(y.collect())

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.aggregateByKey">
<img align=left src="Share/files/images/pyspark-page52.svg" width=500 height=500 />
</a>

In [None]:
# aggregateByKey
x = sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])
zeroValue = [] # empty list is 'zero value' for append operation
mergeVal = (lambda aggregated, el: aggregated + [(el,el**2)])
mergeComb = (lambda agg1,agg2: agg1 + agg2 )
y = x.aggregateByKey(zeroValue,mergeVal,mergeComb)
print(x.collect())
print(y.collect())

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.foldByKey">
<img align=left src="Share/files/images/pyspark-page53.svg" width=500 height=500 />
</a>

In [None]:
# foldByKey
x = sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])
zeroValue = 1 # one is 'zero value' for multiplication
y = x.foldByKey(zeroValue,lambda agg,x: agg*x )  # computes cumulative product within each key
print(x.collect())
print(y.collect())

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.groupByKey">
<img align=left src="Share/files/images/pyspark-page54.svg" width=500 height=500 />
</a>

In [None]:
# groupByKey
x = sc.parallelize([('B',5),('B',4),('A',3),('A',2),('A',1)])
y = x.groupByKey()
print(x.collect())
print([(j[0],[i for i in j[1]]) for j in y.collect()])

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.flatMapValues">
<img align=left src="Share/files/images/pyspark-page55.svg" width=500 height=500 />
</a>

In [None]:
# flatMapValues
x = sc.parallelize([('A',(1,2,3)),('B',(4,5))])
y = x.flatMapValues(lambda x: [i**2 for i in x]) # function is applied to entire value, then result is flattened
print(x.collect())
print(y.collect())

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.mapValues">
<img align=left src="Share/files/images/pyspark-page56.svg" width=500 height=500 />
</a>

In [None]:
# mapValues
x = sc.parallelize([('A',(1,2,3)),('B',(4,5))])
y = x.mapValues(lambda x: [i**2 for i in x]) # function is applied to entire value
print(x.collect())
print(y.collect())

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.groupWith">
<img align=left src="Share/files/images/pyspark-page57.svg" width=500 height=500 />
</a>

In [None]:
# groupWith
x = sc.parallelize([('C',4),('B',(3,3)),('A',2),('A',(1,1))])
y = sc.parallelize([('B',(7,7)),('A',6),('D',(5,5))])
z = sc.parallelize([('D',9),('B',(8,8))])
a = x.groupWith(y,z)
print(x.collect())
print(y.collect())
print(z.collect())
print("Result:")
for key,val in list(a.collect()): 
    print(key, [list(i) for i in val])

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.cogroup">
<img align=left src="Share/files/images/pyspark-page58.svg" width=500 height=500 />
</a>

In [None]:
# cogroup
x = sc.parallelize([('C',4),('B',(3,3)),('A',2),('A',(1,1))])
y = sc.parallelize([('A',8),('B',7),('A',6),('D',(5,5))])
z = x.cogroup(y)
print(x.collect())
print(y.collect())
for key,val in list(z.collect()):
    print(key, [list(i) for i in val])

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.sampleByKey">
<img align=left src="Share/files/images/pyspark-page59.svg" width=500 height=500 />
</a>

In [None]:
# sampleByKey
x = sc.parallelize([('A',1),('B',2),('C',3),('B',4),('A',5)])
y = x.sampleByKey(withReplacement=False, fractions={'A':0.5, 'B':1, 'C':0.2})
print(x.collect())
print(y.collect())

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.subtractByKey">
<img align=left src="Share/files/images/pyspark-page60.svg" width=500 height=500 />
</a>

In [None]:
# subtractByKey
x = sc.parallelize([('C',1),('B',2),('A',3),('A',4)])
y = sc.parallelize([('A',5),('D',6),('A',7),('D',8)])
z = x.subtractByKey(y)
print(x.collect())
print(y.collect())
print(z.collect())

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.subtract">
<img align=left src="Share/files/images/pyspark-page61.svg" width=500 height=500 />
</a>

In [None]:
# subtract
x = sc.parallelize([('C',4),('B',3),('A',2),('A',1)])
y = sc.parallelize([('C',8),('A',2),('D',1)])
z = x.subtract(y)
print(x.collect())
print(y.collect())
print(z.collect())

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.keyBy">
<img align=left src="Share/files/images/pyspark-page62.svg" width=500 height=500 />
</a>

In [None]:
# keyBy
x = sc.parallelize([1,2,3])
y = x.keyBy(lambda x: x**2)
print(x.collect())
print(y.collect())

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.repartition">
<img align=left src="Share/files/images/pyspark-page63.svg" width=500 height=500 />
</a>

In [None]:
# repartition
x = sc.parallelize([1,2,3,4,5],2)
y = x.repartition(numPartitions=3)
print(x.glom().collect())
print(y.glom().collect())

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.coalesce">
<img align=left src="Share/files/images/pyspark-page64.svg" width=500 height=500 />
</a>

In [None]:
# coalesce
x = sc.parallelize([1,2,3,4,5],2)
y = x.coalesce(numPartitions=1)
print(x.glom().collect())
print(y.glom().collect())

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.zip">
<img align=left src="Share/files/images/pyspark-page65.svg" width=500 height=500 />
</a>

In [None]:
# zip
x = sc.parallelize(['B','A','A'])
y = x.map(lambda x: ord(x))  # zip expects x and y to have same #partitions and #elements/partition
z = x.zip(y)
print(x.collect())
print(y.collect())
print(z.collect())

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.zipWithIndex">
<img align=left src="Share/files/images/pyspark-page66.svg" width=500 height=500 />
</a>

In [None]:
# zipWithIndex
x = sc.parallelize(['B','A','A'],2)
y = x.zipWithIndex()
print(x.glom().collect())
print(y.collect())

<a href="http://spark.apache.org/docs/1.2.0/api/python/pyspark.html#pyspark.RDD.zipWithUniqueId">
<img align=left src="Share/files/images/pyspark-page67.svg" width=500 height=500 />
</a>

In [None]:
# zipWithUniqueId
x = sc.parallelize(['B','A','A'],2)
y = x.zipWithUniqueId()
print(x.glom().collect())
print(y.collect())

In [31]:
import os

In [32]:
os.getcwd()

'D:\\Windows\\Documents\\GitHub\\Hadoop'