### Creating a base RDD and Pair RDDs

#### [1]. Create a base RDD

In [2]:
from pyspark import SparkConf,SparkContext
# getOrCreate() returns the already-running SparkContext when this cell is
# executed again; constructing SparkContext(...) a second time in the same
# kernel raises "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=SparkConf().setAppName("MyApp").setMaster("local"))
pyList = ['hello','Hi','fine','well','ok']
# Distribute the Python list over 4 partitions to form the base RDD.
wordRDD = sc.parallelize(pyList,4)
print(type(wordRDD))

<class 'pyspark.rdd.RDD'>


In [3]:
def salutation(word, name="Vinod Vukkalam"):
    """Return ``word`` followed by a single space and ``name``.

    Args:
        word: The greeting text placed first.
        name: The name appended after the space. Defaults to
            "Vinod Vukkalam", so one-argument calls (for example through
            ``RDD.map``) produce the same result as before.

    Returns:
        The string ``word + " " + name``.
    """
    return word + " " + name

In [4]:
# Apply salutation() to every word of the base RDD, then bring the results
# back to the driver as a list and print them.
salutationRDD = wordRDD.map(salutation)
print(salutationRDD.collect())

['hello Vinod Vukkalam', 'Hi Vinod Vukkalam', 'fine Vinod Vukkalam', 'well Vinod Vukkalam', 'ok Vinod Vukkalam']


In [5]:
# Length of each salutation string.
# NOTE: despite the "RDD" suffix, collect() returns a plain Python list, so
# this name is bound to a list of ints rather than to an RDD.
salutationLengthRDD = salutationRDD.map(len).collect()
print(salutationLengthRDD)

[20, 17, 19, 19, 17]


#### [2]. Pair RDDs

The next step is to create a new kind of RDD called a pair RDD. A pair RDD is an RDD in which every element is a two-element tuple `(k, v)`, i.e. a key-value pair.

In [6]:
# Turn every word into a (word, 1) key-value tuple.
wordPairRDD = wordRDD.map(lambda word: (word, 1))
print(wordPairRDD.collect())

[('hello', 1), ('Hi', 1), ('fine', 1), ('well', 1), ('ok', 1)]


In [7]:
# Sample data with repeated words: 3 x 'cat', 5 x 'Dog', 2 x 'Rat'.
newList = ['cat'] * 3 + ['Dog'] * 5 + ['Rat'] * 2
newListRDD = sc.parallelize(newList)
print(type(newListRDD))

<class 'pyspark.rdd.RDD'>


In [8]:
# Pair every word with the count 1 so the pairs can later be grouped by word.
newWordPairRDD = newListRDD.map(lambda word: (word, 1))
print(newWordPairRDD.collect())

[('cat', 1), ('cat', 1), ('cat', 1), ('Dog', 1), ('Dog', 1), ('Dog', 1), ('Dog', 1), ('Dog', 1), ('Rat', 1), ('Rat', 1)]


### Part 2: Counting with pair RDDs

#### Group By Key Approach

In [12]:
# Group the (word, 1) pairs by word: each key ends up with an iterable of its 1s.
wordGroupedByKeyRDD = newWordPairRDD.groupByKey()


In [13]:
# Show each word with the list of 1s that were grouped under it.
for word, ones in wordGroupedByKeyRDD.collect():
    grouped = list(ones)
    print("{0} : {1}".format(word, grouped))

cat : [1, 1, 1]
Dog : [1, 1, 1, 1, 1]
Rat : [1, 1]


In [14]:
# Summing the 1s grouped under each word gives that word's occurrence count.
for word, ones in wordGroupedByKeyRDD.collect():
    total = sum(ones)
    print("{0} : {1}".format(word, total))

cat : 3
Dog : 5
Rat : 2


In [15]:
%%writefile helloword.txt
How are you
Where are you
What are you doing
How good do you think you are 

Writing helloword.txt


In [16]:
# Load helloword.txt as an RDD; each element is one line of the file.
text_file = sc.textFile("helloword.txt")

In [17]:
# Key by the whole line (not by word), so this counts identical lines.
counts = text_file.map(lambda line: (line, 1)).groupByKey()

In [18]:
# Print how many times each distinct line occurs.
for line, ones in counts.collect():
    total = sum(ones)
    print("{0} : {1}".format(line, total))

How are you : 1
Where are you : 1
What are you doing : 1
How good do you think you are  : 1


### Word Count Program

In [19]:
# Read helloword.txt into an RDD of lines as input for the word count.
textfile = sc.textFile("helloword.txt")

In [21]:
# split() with no argument splits on runs of whitespace and drops empty
# strings. split(" ") turned the trailing space on the last line of
# helloword.txt into an empty-string "word" that was then counted.
counts = textfile.flatMap(lambda line:line.split()).map(lambda x:(x,1)).groupByKey()

In [22]:
# Print each distinct word with its number of occurrences.
for word, ones in counts.collect():
    total = sum(ones)
    print("{0} : {1}".format(word, total))

How : 2
are : 4
you : 5
Where : 1
What : 1
doing : 1
good : 1
do : 1
think : 1
 : 1
