# Spark RDD Transformations

In [1]:
import findspark
findspark.init()  # locate the local Spark installation so pyspark can be imported
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Build the session with getOrCreate() and take the context from it, so that
# re-running this cell reuses the live context instead of failing with
# "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.master("local").getOrCreate()
sc = spark.sparkContext

# Try not to use the following transformations

They cause a huge shuffle: all data items with the same key are moved to a single cluster node, which can cause cluster problems when the data is large.

**groupByKey([numPartitions])**

When called on a dataset of **(K, V)** pairs, returns a dataset of **(K, Iterable[V])** pairs (in PySpark each group of values is a `ResultIterable`).

In [2]:
# Small pair RDD with repeated keys, used to demonstrate key-based grouping.
rdd2 = sc.parallelize(
    [
        ('the', 1),
        ('the', 1),
        ('the', 2),
        ('for', 1),
        ('for', 2),
        ('for', 1),
    ]
)

# Each key comes back paired with a ResultIterable holding its values.
rdd2.groupByKey().collect()


[('the', <pyspark.resultiterable.ResultIterable at 0x2e17ff0f4c0>),
 ('for', <pyspark.resultiterable.ResultIterable at 0x2e17ff0eda0>)]

In [3]:
# Add up the grouped values of every key; only the sums are kept, the keys are dropped.
rdd2.groupByKey().values().map(sum).collect()


[4, 4]

**sortByKey([ascending], [numPartitions])**

When called on a dataset of **(K, V)** pairs where K implements Ordered, returns a dataset of **(K, V)** pairs sorted by key in ascending or descending order, as specified by the boolean `ascending` argument.

In [4]:
# Example text data: a single-element list holding one long headline string.
# NOTE: the runs of spaces inside the text are part of the string, so splitting
# on a single space produces empty-string tokens between some words.
a = [
    'What Will It Take for BU Commuters to Leave Their Cars for the MBTA? University'
    '      boosts T pass subsidies to cover half the cost, raises parking fees, all part of '
    '      broader strategy to build a greener BU'
]
rdd = sc.parallelize(a)

# Break the text into tokens and pair every token with an initial count of 1.
counts = (
    rdd.flatMap(lambda line: line.split(' '))
       .map(lambda token: (token, 1))
)



In [5]:
# Bring every (token, 1) pair back to the driver for inspection; the '' keys
# come from runs of consecutive spaces in the source text.
counts.collect()

[('What', 1),
 ('Will', 1),
 ('It', 1),
 ('Take', 1),
 ('for', 1),
 ('BU', 1),
 ('Commuters', 1),
 ('to', 1),
 ('Leave', 1),
 ('Their', 1),
 ('Cars', 1),
 ('for', 1),
 ('the', 1),
 ('MBTA?', 1),
 ('University', 1),
 ('', 1),
 ('', 1),
 ('', 1),
 ('', 1),
 ('', 1),
 ('boosts', 1),
 ('T', 1),
 ('pass', 1),
 ('subsidies', 1),
 ('to', 1),
 ('cover', 1),
 ('half', 1),
 ('the', 1),
 ('cost,', 1),
 ('raises', 1),
 ('parking', 1),
 ('fees,', 1),
 ('all', 1),
 ('part', 1),
 ('of', 1),
 ('', 1),
 ('', 1),
 ('', 1),
 ('', 1),
 ('', 1),
 ('', 1),
 ('broader', 1),
 ('strategy', 1),
 ('to', 1),
 ('build', 1),
 ('a', 1),
 ('greener', 1),
 ('BU', 1)]