In [1]:
#https://spark.apache.org/docs/1.1.1/api/python/pyspark.rdd.RDD-class.html


import findspark
findspark.init()
findspark.find()
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Spark configuration: application name 'SparkApp', running on the 'local' master.
conf = SparkConf().setAppName('SparkApp').setMaster('local')

# getOrCreate() reuses an already-running context/session instead of raising
# "Cannot run multiple SparkContexts at once" when this cell is executed again
# in the same kernel.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()


In [2]:
# Build RDDs from in-memory Python lists.
words = sc.parallelize([
    "Big Data",
    "Data Science",
    "Intro to Web",
    "Web Engineering",
    "Network Theory",
    "Machine Learning",
])
num_list = list(range(1, 6))  # [1, 2, 3, 4, 5]
numbers = sc.parallelize(num_list)

# TRANSFORMATIONS

# 1. filter: keep only the titles that contain the substring "Data".
words_filter = words.filter(lambda title: "Data" in title)
words_filter.collect()

['Big Data', 'Data Science']

In [3]:
# filter on numbers: keep only the even values.
number_filter = numbers.filter(lambda value: value % 2 == 0)
number_filter.collect()

[2, 4]

In [4]:
# 2. map: turn every title into a (title, "a") tuple.
words_map = words.map(lambda title: (title, "a"))
words_map.collect()

[('Big Data', 'a'),
 ('Data Science', 'a'),
 ('Intro to Web', 'a'),
 ('Web Engineering', 'a'),
 ('Network Theory', 'a'),
 ('Machine Learning', 'a')]

In [5]:
# map on numbers: replace each value with its difference from 10.
number_map = numbers.map(lambda value: 10 - value)
number_map.collect()


[9, 8, 7, 6, 5]

In [6]:
# reduce is an action: it combines all elements of the RDD with the given
# function and returns a single result.
# The same result can be written as one chain starting from `numbers`:
#   numbers.map(lambda x: (10 - x)).reduce(add)
from operator import add

number_map.reduce(add)

35

In [7]:
# fold is an action like reduce, but it takes an explicit zero value.
# The doubled values sum to 30 (2 + 4 + 6 + 8 + 10), yet the result shown for
# this cell is 2030: the zero value 1000 is added once for the partition and
# once more when partition results are merged. With a non-neutral zero value
# the result therefore depends on the number of partitions.
doubled = numbers.map(lambda value: value * 2)
doubled.fold(1000, add)

2030

In [8]:
# 3. flatMap: split each element and flatten the pieces into one RDD.
# Note: the input is a list holding a single sentence string.
mycollection = ["My name is Priya Yadav"]
rdd_word = sc.parallelize(mycollection)

sentence_words = rdd_word.flatMap(lambda sentence: sentence.split(" "))
sentence_words.collect()

['My', 'name', 'is', 'Priya', 'Yadav']

In [9]:
# Read a text file into an RDD; each element is one line of the file.
rdd_txt = sc.textFile("alice.txt")

# Transformations such as flatMap return a NEW RDD and leave rdd_txt unchanged,
# so calling flatMap here without keeping its result would have no effect.
# This cell only displays the raw lines.
rdd_txt.collect()

["                ALICE'S ADVENTURES IN WONDERLAND",
 '',
 '                          Lewis Carroll',
 '',
 '               THE MILLENNIUM FULCRUM EDITION 3.0',
 '',
 '',
 '',
 '',
 '                            CHAPTER I',
 '',
 '                      Down the Rabbit-Hole',
 '',
 '',
 '  Alice was beginning to get very tired of sitting by her sister',
 'on the bank, and of having nothing to do:  once or twice she had',
 'peeped into the book her sister was reading, but it had no',
 "pictures or conversations in it, `and what is the use of a book,'",
 "thought Alice `without pictures or conversation?'",
 '',
 '  So she was considering in her own mind (as well as she could,',
 'for the hot day made her feel very sleepy and stupid), whether',
 'the pleasure of making a daisy-chain would be worth the trouble',
 'of getting up and picking the daisies, when suddenly a White',
 'Rabbit with pink eyes ran close by her.',
 '',
 '  There was nothing so VERY remarkable in that; nor did Alice',
 '

In [10]:
# Split every line on single spaces and drop the empty strings produced by
# consecutive spaces and blank lines.
# NOTE: this rebinds `words`, which earlier held the RDD of course titles;
# re-running the earlier filter/map cells after this one will use the book text.
tokens = rdd_txt.flatMap(lambda text_line: text_line.split(" "))
words = tokens.filter(lambda token: len(token) > 0)
words.collect()

["ALICE'S",
 'ADVENTURES',
 'IN',
 'WONDERLAND',
 'Lewis',
 'Carroll',
 'THE',
 'MILLENNIUM',
 'FULCRUM',
 'EDITION',
 '3.0',
 'CHAPTER',
 'I',
 'Down',
 'the',
 'Rabbit-Hole',
 'Alice',
 'was',
 'beginning',
 'to',
 'get',
 'very',
 'tired',
 'of',
 'sitting',
 'by',
 'her',
 'sister',
 'on',
 'the',
 'bank,',
 'and',
 'of',
 'having',
 'nothing',
 'to',
 'do:',
 'once',
 'or',
 'twice',
 'she',
 'had',
 'peeped',
 'into',
 'the',
 'book',
 'her',
 'sister',
 'was',
 'reading,',
 'but',
 'it',
 'had',
 'no',
 'pictures',
 'or',
 'conversations',
 'in',
 'it,',
 '`and',
 'what',
 'is',
 'the',
 'use',
 'of',
 'a',
 "book,'",
 'thought',
 'Alice',
 '`without',
 'pictures',
 'or',
 "conversation?'",
 'So',
 'she',
 'was',
 'considering',
 'in',
 'her',
 'own',
 'mind',
 '(as',
 'well',
 'as',
 'she',
 'could,',
 'for',
 'the',
 'hot',
 'day',
 'made',
 'her',
 'feel',
 'very',
 'sleepy',
 'and',
 'stupid),',
 'whether',
 'the',
 'pleasure',
 'of',
 'making',
 'a',
 'daisy-chain',
 'would

In [11]:
# Build a key/value (pair) RDD of (word, 1) and sum the ones per word to get
# the number of occurrences of each distinct word.
(
    words
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .collect()
)

[("ALICE'S", 3),
 ('ADVENTURES', 1),
 ('IN', 2),
 ('WONDERLAND', 1),
 ('Lewis', 1),
 ('Carroll', 1),
 ('THE', 9),
 ('MILLENNIUM', 1),
 ('FULCRUM', 1),
 ('EDITION', 1),
 ('3.0', 1),
 ('CHAPTER', 12),
 ('I', 261),
 ('Down', 1),
 ('the', 1505),
 ('Rabbit-Hole', 1),
 ('Alice', 221),
 ('was', 328),
 ('beginning', 11),
 ('to', 703),
 ('get', 43),
 ('very', 126),
 ('tired', 7),
 ('of', 490),
 ('sitting', 10),
 ('by', 53),
 ('her', 203),
 ('sister', 5),
 ('on', 138),
 ('bank,', 2),
 ('and', 714),
 ('having', 10),
 ('nothing', 22),
 ('do:', 1),
 ('once', 18),
 ('or', 68),
 ('twice', 1),
 ('she', 484),
 ('had', 175),
 ('peeped', 3),
 ('into', 67),
 ('book', 3),
 ('reading,', 1),
 ('but', 102),
 ('it', 346),
 ('no', 64),
 ('pictures', 4),
 ('conversations', 1),
 ('in', 344),
 ('it,', 38),
 ('`and', 39),
 ('what', 85),
 ('is', 63),
 ('use', 16),
 ('a', 606),
 ("book,'", 2),
 ('thought', 63),
 ('`without', 1),
 ("conversation?'", 1),
 ('So', 23),
 ('considering', 3),
 ('own', 9),
 ('mind', 4),
 ('(

In [12]:
# Word counts ordered by key (the word) in ascending order.
(
    words
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .sortByKey()
    .collect()
)

[('\x1a', 1),
 ('"\'TIS', 1),
 ('"--SAID', 1),
 ('"Come', 1),
 ('"Coming', 1),
 ('"Edwin', 1),
 ('"French,', 1),
 ('"HOW', 1),
 ('"He\'s', 1),
 ('"How', 1),
 ('"I', 8),
 ('"I\'ll', 2),
 ('"Keep', 1),
 ('"Let', 1),
 ('"Such', 1),
 ('"THEY', 1),
 ('"There', 2),
 ('"There\'s', 1),
 ('"Too', 1),
 ('"Turtle', 1),
 ('"Twinkle,', 1),
 ('"Uglification,"\'', 1),
 ('"Up', 1),
 ('"What', 2),
 ('"Who', 1),
 ('"William', 1),
 ('"With', 1),
 ('"YOU', 1),
 ('"You', 2),
 ('"come', 1),
 ('"it"', 2),
 ('"much', 1),
 ('"poison"', 1),
 ('"purpose"?\'', 1),
 ("'em", 3),
 ("'tis", 2),
 ('(Alice', 4),
 ('(And,', 1),
 ('(As', 1),
 ('(Before', 1),
 ('(Dinah', 1),
 ('(For,', 1),
 ('(He', 1),
 ('(IF', 1),
 ('(In', 1),
 ('(It', 1),
 ('(Sounds', 1),
 ('(The', 3),
 ('(WITH', 1),
 ('(We', 1),
 ('(Which', 2),
 ('(`I', 2),
 ("(`That's", 1),
 ('(`the', 1),
 ('(`which', 1),
 ('(a', 1),
 ('(and', 1),
 ('(as', 2),
 ('(for', 1),
 ('(for,', 1),
 ('(it', 4),
 ('(look', 1),
 ('(luckily', 1),
 ('(not', 1),
 ('(or', 1),
 ('(poi

In [13]:
# Word counts ordered by key (the word) in descending order.
(
    words
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .sortByKey(ascending=False)
    .collect()
)

[('zigzag,', 1),
 ("youth,'", 3),
 ('youth,', 3),
 ("yourself.'", 2),
 ("yourself,'", 1),
 ('yourself,', 1),
 ("yourself!'", 1),
 ('yourself', 5),
 ('yours."\'', 1),
 ('yours', 1),
 ('your', 53),
 ('young', 5),
 ("you?'", 7),
 ('you?', 2),
 ('you:', 1),
 ("you.'", 1),
 ('you.', 1),
 ('you--are', 1),
 ('you--all', 1),
 ("you,'", 6),
 ('you,', 25),
 ("you've", 5),
 ("you're", 15),
 ("you'll", 4),
 ("you'd", 8),
 ("you!'", 3),
 ('you!', 2),
 ('you', 252),
 ("yet?'", 2),
 ("yet.'", 2),
 ("yet--it's", 1),
 ('yet--and', 1),
 ('yet--Oh!', 1),
 ("yet,'", 4),
 ('yet,', 3),
 ("yet!'", 1),
 ('yet', 6),
 ('yesterday,', 1),
 ('yesterday', 2),
 ('yer', 4),
 ('yelp', 1),
 ('yelled', 1),
 ('years,', 1),
 ('year', 2),
 ('ye;', 1),
 ('yawning.', 1),
 ('yawning', 1),
 ('yawned', 2),
 ('yards', 1),
 ('wrote', 3),
 ('wrong,', 2),
 ("wrong!'", 1),
 ('wrong', 2),
 ('written', 6),
 ('writing-desks,', 1),
 ("writing-desk?'", 1),
 ('writing', 4),
 ('write', 5),
 ('wriggling', 1),
 ('wretched', 2),
 ('wrapping',