# RDD - Basics

RDD Creation

RDD can be created in 2 ways:
1. Parallelizing an existing collection.
2. Loading a dataset from an external storage system.

In [2]:
from pyspark.sql import SparkSession

In [39]:
# sc.setLogLevel("ERROR")
spark = SparkSession.builder.master("local[*]").appName("RDD_example1").getOrCreate()

In [4]:
sc

''

In [5]:
spark

In [6]:
sc = spark.sparkContext

In [7]:
type(spark.sparkContext)

pyspark.context.SparkContext

In [8]:
type(spark)

pyspark.sql.session.SparkSession

In [9]:
sc

# Parallelize

In [10]:
list_data = list(range(20))

In [11]:
list_data

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [12]:
rdd_list = sc.parallelize(list_data)

In [13]:
rdd_list

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274

In [14]:
type(rdd_list)

pyspark.rdd.RDD

In [19]:
rdd_list.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [20]:
type(rdd_list.collect())

list

Parallelize -> Distribute single node list to various partitions (cores and nodes)
Collect -> Opposite of distributing among partiotions. It gathers the data from different partitions into a list

# File import

In [15]:
#File that needs to be imported: /sparkdata/word_count_data.txt
rdd_from_file = spark.sparkContext.textFile("/sparkdata/word_count_data.txt")

In [18]:
type(rdd_from_file.collect())

list

# Partitioning

In [23]:
rdd_list.getNumPartitions() #I have 2 cores assigned to this linux virtual box

2

In [24]:
rdd_from_file.getNumPartitions() #I have 2 cores assigned to this linux virtual box

2

In [25]:
rdd_list2 = sc.parallelize(list_data,4)

In [26]:
rdd_list2.getNumPartitions()

4

In [27]:
rdd_from_file2 = spark.sparkContext.textFile("/sparkdata/word_count_data.txt",5)

In [28]:
rdd_from_file2.getNumPartitions()

5

Repartition and Coalesce

Changing number of partitions of an existing RDD

In [33]:
repartitionedRDD = rdd_list.repartition(10) #full shuffle of data to achieve even distribution

In [35]:
repartitionedRDD = rdd_list.coalesce(3) #avoids shuffle, bundles data into reduced partition | skewed data distribution

# RDD Operations

![image.png](attachment:image.png)

# Transformations

![image.png](attachment:image.png)

![image.png](attachment:image.png)

# Word Count Problem

In [73]:
from pyspark.sql import SparkSession

In [78]:
spark = SparkSession.builder.appName("word_count").getOrCreate()

In [79]:
dataRDD = spark.sparkContext.textFile("/sparkdata/word_count_data2.txt")

In [80]:
dataRDD.collect()

[Stage 30:>                                                         (0 + 2) / 2]                                                                                

['Now the way that the book winds up is this: Tom and me found the money',
 'that the robbers hid in the cave, and it made us rich. We got six',
 'thousand dollars apiece--all gold. It was an awful sight of money when',
 'it was piled up. Well, Judge Thatcher he took it and put it out at',
 'interest, and it fetched us a dollar a day apiece all the year round--',
 'more than a body could tell what to do with. The Widow Douglas she took',
 'me for her son, and allowed she would sivilize me; but it was rough',
 'living in the house all the time, considering how dismal regular and',
 "decent the widow was in all her ways; and so when I couldn't stand it no",
 'longer I lit out. I got into my old rags and my sugar-hogshead again,',
 'and was free and satisfied. But Tom Sawyer he hunted me up and said he',
 'was going to start a band of robbers, and I might join if I would go back',
 'to the widow and be respectable. So I went back.']

In [82]:
dataRDD.map(lambda line: line.split(" ")).collect() # using map would result in list of lists

[['Now',
  'the',
  'way',
  'that',
  'the',
  'book',
  'winds',
  'up',
  'is',
  'this:',
  'Tom',
  'and',
  'me',
  'found',
  'the',
  'money'],
 ['that',
  'the',
  'robbers',
  'hid',
  'in',
  'the',
  'cave,',
  'and',
  'it',
  'made',
  'us',
  'rich.',
  'We',
  'got',
  'six'],
 ['thousand',
  'dollars',
  'apiece--all',
  'gold.',
  'It',
  'was',
  'an',
  'awful',
  'sight',
  'of',
  'money',
  'when'],
 ['it',
  'was',
  'piled',
  'up.',
  'Well,',
  'Judge',
  'Thatcher',
  'he',
  'took',
  'it',
  'and',
  'put',
  'it',
  'out',
  'at'],
 ['interest,',
  'and',
  'it',
  'fetched',
  'us',
  'a',
  'dollar',
  'a',
  'day',
  'apiece',
  'all',
  'the',
  'year',
  'round--'],
 ['more',
  'than',
  'a',
  'body',
  'could',
  'tell',
  'what',
  'to',
  'do',
  'with.',
  'The',
  'Widow',
  'Douglas',
  'she',
  'took'],
 ['me',
  'for',
  'her',
  'son,',
  'and',
  'allowed',
  'she',
  'would',
  'sivilize',
  'me;',
  'but',
  'it',
  'was',
  'rough'],
 [

In [88]:
#Use flatMap to get a single list
flatRDD = dataRDD.flatMap(lambda element: element.split()).flatMap(lambda element: element.split("--"))
#split out any special character on top of normal split

In [89]:
flatRDD.collect()

['Now',
 'the',
 'way',
 'that',
 'the',
 'book',
 'winds',
 'up',
 'is',
 'this:',
 'Tom',
 'and',
 'me',
 'found',
 'the',
 'money',
 'that',
 'the',
 'robbers',
 'hid',
 'in',
 'the',
 'cave,',
 'and',
 'it',
 'made',
 'us',
 'rich.',
 'We',
 'got',
 'six',
 'thousand',
 'dollars',
 'apiece',
 'all',
 'gold.',
 'It',
 'was',
 'an',
 'awful',
 'sight',
 'of',
 'money',
 'when',
 'it',
 'was',
 'piled',
 'up.',
 'Well,',
 'Judge',
 'Thatcher',
 'he',
 'took',
 'it',
 'and',
 'put',
 'it',
 'out',
 'at',
 'interest,',
 'and',
 'it',
 'fetched',
 'us',
 'a',
 'dollar',
 'a',
 'day',
 'apiece',
 'all',
 'the',
 'year',
 'round',
 '',
 'more',
 'than',
 'a',
 'body',
 'could',
 'tell',
 'what',
 'to',
 'do',
 'with.',
 'The',
 'Widow',
 'Douglas',
 'she',
 'took',
 'me',
 'for',
 'her',
 'son,',
 'and',
 'allowed',
 'she',
 'would',
 'sivilize',
 'me;',
 'but',
 'it',
 'was',
 'rough',
 'living',
 'in',
 'the',
 'house',
 'all',
 'the',
 'time,',
 'considering',
 'how',
 'dismal',
 'regul

In [93]:
mappedRDD = flatRDD.map(lambda word: (word,1))

In [94]:
mappedRDD.collect()

[('Now', 1),
 ('the', 1),
 ('way', 1),
 ('that', 1),
 ('the', 1),
 ('book', 1),
 ('winds', 1),
 ('up', 1),
 ('is', 1),
 ('this:', 1),
 ('Tom', 1),
 ('and', 1),
 ('me', 1),
 ('found', 1),
 ('the', 1),
 ('money', 1),
 ('that', 1),
 ('the', 1),
 ('robbers', 1),
 ('hid', 1),
 ('in', 1),
 ('the', 1),
 ('cave,', 1),
 ('and', 1),
 ('it', 1),
 ('made', 1),
 ('us', 1),
 ('rich.', 1),
 ('We', 1),
 ('got', 1),
 ('six', 1),
 ('thousand', 1),
 ('dollars', 1),
 ('apiece', 1),
 ('all', 1),
 ('gold.', 1),
 ('It', 1),
 ('was', 1),
 ('an', 1),
 ('awful', 1),
 ('sight', 1),
 ('of', 1),
 ('money', 1),
 ('when', 1),
 ('it', 1),
 ('was', 1),
 ('piled', 1),
 ('up.', 1),
 ('Well,', 1),
 ('Judge', 1),
 ('Thatcher', 1),
 ('he', 1),
 ('took', 1),
 ('it', 1),
 ('and', 1),
 ('put', 1),
 ('it', 1),
 ('out', 1),
 ('at', 1),
 ('interest,', 1),
 ('and', 1),
 ('it', 1),
 ('fetched', 1),
 ('us', 1),
 ('a', 1),
 ('dollar', 1),
 ('a', 1),
 ('day', 1),
 ('apiece', 1),
 ('all', 1),
 ('the', 1),
 ('year', 1),
 ('round', 1),


In [98]:
reducedRDD = mappedRDD.reduceByKey(lambda val1,val2: val1+val2)

In [99]:
reducedRDD.collect()

[('Now', 1),
 ('way', 1),
 ('book', 1),
 ('is', 1),
 ('Tom', 2),
 ('money', 2),
 ('in', 3),
 ('us', 2),
 ('got', 2),
 ('thousand', 1),
 ('It', 1),
 ('was', 6),
 ('an', 1),
 ('of', 2),
 ('when', 2),
 ('he', 3),
 ('took', 2),
 ('put', 1),
 ('out', 1),
 ('at', 1),
 ('interest,', 1),
 ('fetched', 1),
 ('year', 1),
 ('round', 1),
 ('', 1),
 ('more', 1),
 ('than', 1),
 ('tell', 1),
 ('do', 1),
 ('The', 1),
 ('Widow', 1),
 ('her', 2),
 ('would', 2),
 ('sivilize', 1),
 ('but', 1),
 ('living', 1),
 ('house', 1),
 ('dismal', 1),
 ('regular', 1),
 ('decent', 1),
 ("couldn't", 1),
 ('stand', 1),
 ('no', 1),
 ('longer', 1),
 ('lit', 1),
 ('out.', 1),
 ('into', 1),
 ('rags', 1),
 ('sugar-hogshead', 1),
 ('again,', 1),
 ('free', 1),
 ('But', 1),
 ('hunted', 1),
 ('said', 1),
 ('start', 1),
 ('band', 1),
 ('join', 1),
 ('go', 1),
 ('respectable.', 1),
 ('the', 10),
 ('that', 2),
 ('winds', 1),
 ('up', 2),
 ('this:', 1),
 ('and', 13),
 ('me', 3),
 ('found', 1),
 ('robbers', 1),
 ('hid', 1),
 ('cave,', 

In [104]:
#sort based on occurence
sortedRDD = reducedRDD.map(lambda tup: (tup[1],tup[0])).sortByKey(ascending=False)

In [105]:
sortedRDD.collect()

[(13, 'and'),
 (10, 'the'),
 (7, 'it'),
 (6, 'was'),
 (6, 'I'),
 (4, 'all'),
 (4, 'a'),
 (3, 'in'),
 (3, 'he'),
 (3, 'me'),
 (3, 'to'),
 (2, 'Tom'),
 (2, 'money'),
 (2, 'us'),
 (2, 'got'),
 (2, 'of'),
 (2, 'when'),
 (2, 'took'),
 (2, 'her'),
 (2, 'would'),
 (2, 'that'),
 (2, 'up'),
 (2, 'apiece'),
 (2, 'she'),
 (2, 'widow'),
 (2, 'my'),
 (1, 'Now'),
 (1, 'way'),
 (1, 'book'),
 (1, 'is'),
 (1, 'thousand'),
 (1, 'It'),
 (1, 'an'),
 (1, 'put'),
 (1, 'out'),
 (1, 'at'),
 (1, 'interest,'),
 (1, 'fetched'),
 (1, 'year'),
 (1, 'round'),
 (1, ''),
 (1, 'more'),
 (1, 'than'),
 (1, 'tell'),
 (1, 'do'),
 (1, 'The'),
 (1, 'Widow'),
 (1, 'sivilize'),
 (1, 'but'),
 (1, 'living'),
 (1, 'house'),
 (1, 'dismal'),
 (1, 'regular'),
 (1, 'decent'),
 (1, "couldn't"),
 (1, 'stand'),
 (1, 'no'),
 (1, 'longer'),
 (1, 'lit'),
 (1, 'out.'),
 (1, 'into'),
 (1, 'rags'),
 (1, 'sugar-hogshead'),
 (1, 'again,'),
 (1, 'free'),
 (1, 'But'),
 (1, 'hunted'),
 (1, 'said'),
 (1, 'start'),
 (1, 'band'),
 (1, 'join'),
 (1, 

In [110]:
#filter words that start with 'a'
aFilteredRDD = sortedRDD.filter(lambda tup: 'a' in tup[1])

In [111]:
aFilteredRDD.collect()

[(13, 'and'),
 (6, 'was'),
 (4, 'all'),
 (4, 'a'),
 (2, 'that'),
 (2, 'apiece'),
 (1, 'way'),
 (1, 'thousand'),
 (1, 'an'),
 (1, 'at'),
 (1, 'year'),
 (1, 'than'),
 (1, 'dismal'),
 (1, 'regular'),
 (1, 'stand'),
 (1, 'rags'),
 (1, 'sugar-hogshead'),
 (1, 'again,'),
 (1, 'said'),
 (1, 'start'),
 (1, 'band'),
 (1, 'respectable.'),
 (1, 'cave,'),
 (1, 'made'),
 (1, 'dollars'),
 (1, 'awful'),
 (1, 'Thatcher'),
 (1, 'dollar'),
 (1, 'day'),
 (1, 'what'),
 (1, 'Douglas'),
 (1, 'allowed'),
 (1, 'ways;'),
 (1, 'satisfied.'),
 (1, 'Sawyer'),
 (1, 'back'),
 (1, 'back.')]

In [116]:
#Which is the most frequently occuring word?
mostFreqWord = sortedRDD.collect()[0][1]

In [117]:
mostFreqWord

'and'

# Actions

In [118]:
actionRDD = spark.sparkContext.parallelize([1,2,3,4,5,6,7,8,9])

In [119]:
actionRDD.count()

9

In [175]:
spark.sparkContext.parallelize([1,2,3,1,4,6,5,3,2,1,4,8,9,9,7,8]).countByValue()

defaultdict(int, {1: 3, 2: 2, 3: 2, 4: 2, 6: 1, 5: 1, 8: 2, 9: 2, 7: 1})

In [176]:
spark.sparkContext.parallelize("Tamaghna Banerjee").countByValue()

defaultdict(int,
            {'T': 1,
             'a': 4,
             'm': 1,
             'g': 1,
             'h': 1,
             'n': 2,
             ' ': 1,
             'B': 1,
             'e': 3,
             'r': 1,
             'j': 1})

In [134]:
actionRDD.first()

1

In [127]:
actionRDD.max()

9

In [128]:
actionRDD.min()

1

In [123]:
actionRDD.take(5)

[1, 2, 3, 4, 5]

In [130]:
actionRDD.top(6)

[9, 8, 7, 6, 5, 4]

In [124]:
actionRDD.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [132]:
actionRDD.saveAsTextFile("/sparkdata/actionRDD2.txt")

[Stage 104:>                                                        (0 + 2) / 2]                                                                                

In [137]:
actionRDD.reduce(lambda val1,val2: val1 + val2) #iterative sum of elements in the RDD

45

In [138]:
actionRDD.reduce(lambda val1,val2: val1 * val2) #iterative multiplication of elements in the RDD

362880

In [151]:
actionRDD.fold(0,lambda t1,t2: t1+t2)

45

In [140]:
actionRDD.fold(0,lambda t1,t2: t1*t2)

0

In [144]:
spark.sparkContext.parallelize([2,3,4]).fold(1,lambda t1,t2: t1+t2)

12

In [148]:
actionRDD.glom().collect()

[[1, 2, 3, 4], [5, 6, 7, 8, 9]]

In [152]:
actionRDD.aggregate(0,lambda t1,t2: t1+t2, lambda t1,t2: t1+t2)

45

In [171]:
actionRDD.aggregate(1,lambda t1,t2: t1+t2, lambda t1,t2: t1*t2)

396

# Types of RDD

# cache() and persist()

# Shared Variables - Broadcast and Accumulators

Broadcast Variables

In [180]:
data = [\
        ("jon","snow","USA","AK"),\
        ("arya","stark","USA","WA"),\
        ("danny","targ","USA","CA"),\
        ("cersei","lann","USA","FL")\
       ]

In [183]:
states = {"AK":"Alaska","CA":"California","FL":"Florida","WA":"Washington"}

In [184]:
dataRDD = spark.sparkContext.parallelize(data)

In [185]:
dataRDD.collect()

[('jon', 'snow', 'USA', 'AK'),
 ('arya', 'stark', 'USA', 'WA'),
 ('danny', 'targ', 'USA', 'CA'),
 ('cersei', 'lann', 'USA', 'FL')]

In [189]:
#create broadcast variable
broadcast_variable = spark.sparkContext.broadcast(states)

In [191]:
type(broadcast_variable)

pyspark.broadcast.Broadcast

In [199]:
broadcast_variable.value

{'AK': 'Alaska', 'CA': 'California', 'FL': 'Florida', 'WA': 'Washington'}

In [None]:
type(broadcast_variable.value)

In [206]:
dataRDD.map(lambda tup: (tup[0],tup[1],tup[2],broadcast_variable.value[tup[3]]))

[('jon', 'snow', 'USA', 'Alaska'),
 ('arya', 'stark', 'USA', 'Washington'),
 ('danny', 'targ', 'USA', 'California'),
 ('cersei', 'lann', 'USA', 'Florida')]

Accumulators

In [207]:
accumulator_variable = spark.sparkContext.accumulator(0)

In [208]:
accumulator_variable

Accumulator<id=0, value=0>

In [209]:
type(accumulator_variable)

pyspark.accumulators.Accumulator

In [210]:
dataRDD = spark.sparkContext.parallelize([1,2,3,4,5,6,7,8,9])

In [212]:
def counter(num):
    global accumulator_variable
    accumulator_variable += num

In [213]:
dataRDD.foreach(lambda item: counter(item))

                                                                                

In [214]:
accumulator_variable.value

45