# Creating a Spark Session

In [1]:
import os
import sys

# Make PySpark use the same Python interpreter that is running this kernel,
# for both the PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON settings.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [2]:
import findspark
# Run before the pyspark import in the next cell. findspark is expected to locate the
# local Spark installation and make pyspark importable — NOTE(review): confirm that
# SPARK_HOME (or an equivalent install location) is available in this environment.
findspark.init()

In [3]:
from pyspark.sql import SparkSession

In [5]:
# Build (or reuse, if one already exists) the SparkSession used by every cell below.
# 'local[1]' is the master URL: run Spark locally with one worker thread.
spark = (
    SparkSession.builder
    .master('local[1]')
    .appName('SparkSession')
    .getOrCreate()
)

# Reading a Text File into an RDD

In [8]:
# Load ./test.txt (relative to the notebook's working directory) as an RDD of strings;
# the next cell splits each element into words.
rdd = spark.sparkContext.textFile("./test.txt")

## flatMap()

In [18]:
# Split every line into words and flatten the per-line lists into a single RDD of words.
# str.split() with no argument splits on any run of whitespace, so repeated spaces, tabs
# and blank lines do not produce empty-string "words" (split(" ") would emit '' for each).
rdd2 = rdd.flatMap(lambda line: line.split())

In [19]:
# collect() returns the whole RDD as a Python list on the driver; acceptable for this
# small sample file, but prefer take(n) when inspecting larger data.
rdd2.collect()

['Project',
 'Gutenberg’s',
 'Alice’s',
 'Adventures',
 'in',
 'Wonderland',
 'by',
 'Lewis',
 'Carroll',
 'This',
 'eBook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'at',
 'no',
 'cost',
 'and',
 'with',
 'Alice’s',
 'Adventures',
 'in',
 'Wonderland',
 'by',
 'Lewis',
 'Carroll',
 'This',
 'eBook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'at',
 'no',
 'cost',
 'and',
 'with',
 'This',
 'eBook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'at',
 'no',
 'cost',
 'and',
 'with',
 'Project',
 'Gutenberg’s',
 'Alice’s',
 'Adventures',
 'in',
 'Wonderland',
 'by',
 'Lewis',
 'Carroll',
 'This',
 'eBook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'at',
 'no',
 'cost',
 'and',
 'with',
 'Alice’s',
 'Adventures',
 'in',
 'Wonderland',
 'by',
 'Lewis',
 'Carroll',
 'This',
 'eBook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'at',
 'no',
 'cost',
 'and',
 'with',
 'This',
 'eBook',
 'is',
 

## map()

In [17]:
# Pair every word with an initial count of 1, giving (word, 1) tuples to aggregate later.
rdd3 = rdd2.map(lambda word: (word, 1))

In [20]:
# Display every (word, 1) pair; this brings the full RDD back to the driver as a list.
rdd3.collect()

[('Project', 1),
 ('Gutenberg’s', 1),
 ('Alice’s', 1),
 ('Adventures', 1),
 ('in', 1),
 ('Wonderland', 1),
 ('by', 1),
 ('Lewis', 1),
 ('Carroll', 1),
 ('This', 1),
 ('eBook', 1),
 ('is', 1),
 ('for', 1),
 ('the', 1),
 ('use', 1),
 ('of', 1),
 ('anyone', 1),
 ('anywhere', 1),
 ('at', 1),
 ('no', 1),
 ('cost', 1),
 ('and', 1),
 ('with', 1),
 ('Alice’s', 1),
 ('Adventures', 1),
 ('in', 1),
 ('Wonderland', 1),
 ('by', 1),
 ('Lewis', 1),
 ('Carroll', 1),
 ('This', 1),
 ('eBook', 1),
 ('is', 1),
 ('for', 1),
 ('the', 1),
 ('use', 1),
 ('of', 1),
 ('anyone', 1),
 ('anywhere', 1),
 ('at', 1),
 ('no', 1),
 ('cost', 1),
 ('and', 1),
 ('with', 1),
 ('This', 1),
 ('eBook', 1),
 ('is', 1),
 ('for', 1),
 ('the', 1),
 ('use', 1),
 ('of', 1),
 ('anyone', 1),
 ('anywhere', 1),
 ('at', 1),
 ('no', 1),
 ('cost', 1),
 ('and', 1),
 ('with', 1),
 ('Project', 1),
 ('Gutenberg’s', 1),
 ('Alice’s', 1),
 ('Adventures', 1),
 ('in', 1),
 ('Wonderland', 1),
 ('by', 1),
 ('Lewis', 1),
 ('Carroll', 1),
 ('This', 1)

## reduceByKey()

In [22]:
# Add up the 1s for each distinct word, producing one (word, total_count) pair per word.
rdd4 = rdd3.reduceByKey(lambda left, right: left + right)

In [23]:
# Show the aggregated (word, count) pairs — one entry per distinct word.
rdd4.collect()

[('Project', 9),
 ('Gutenberg’s', 9),
 ('Alice’s', 18),
 ('Adventures', 18),
 ('in', 18),
 ('Wonderland', 18),
 ('by', 18),
 ('Lewis', 18),
 ('Carroll', 18),
 ('This', 27),
 ('eBook', 27),
 ('is', 27),
 ('for', 27),
 ('the', 27),
 ('use', 27),
 ('of', 27),
 ('anyone', 27),
 ('anywhere', 27),
 ('at', 27),
 ('no', 27),
 ('cost', 27),
 ('and', 27),
 ('with', 27)]

## sortByKey()

In [24]:
# Swap each (word, count) pair to (count, word) so the count becomes the key,
# then sort by that key (sortByKey defaults to ascending order).
rdd5 = (
    rdd4
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey()
)
print(rdd5.collect())

[(9, 'Project'), (9, 'Gutenberg’s'), (18, 'Alice’s'), (18, 'Adventures'), (18, 'in'), (18, 'Wonderland'), (18, 'by'), (18, 'Lewis'), (18, 'Carroll'), (27, 'This'), (27, 'eBook'), (27, 'is'), (27, 'for'), (27, 'the'), (27, 'use'), (27, 'of'), (27, 'anyone'), (27, 'anywhere'), (27, 'at'), (27, 'no'), (27, 'cost'), (27, 'and'), (27, 'with')]


## filter()

In [34]:
# Keep only the (count, word) records whose word contains the substring 'any'.
rdd6 = rdd5.filter(lambda record: 'any' in record[1])

In [35]:
# Materialise the filtered (count, word) pairs and print them as a plain list.
print(rdd6.collect())

[(27, 'anyone'), (27, 'anywhere')]


# RDD Actions

## Count

In [38]:
# count() returns the number of elements in rdd5. Because rdd5 is derived from the
# reduceByKey result, this is the number of distinct words, not the total word count.
rdd_count = rdd5.count()
print("Count :", rdd_count)

Count : 23


## First

In [41]:
# first() returns the first (count, word) element of the sorted RDD.
rdd_first = rdd5.first()
# print() already applies str() to each argument, so no explicit conversion is needed.
print("First Record :", rdd_first[0], ",", rdd_first[1])

First Record : 9 , Project


## Max

In [48]:
# max() with no key compares the (count, word) tuples directly: the highest count wins,
# and ties on count are decided by comparing the words themselves.
rdd_max = rdd5.max()
print("The Max element:", rdd_max[0], ",", rdd_max[1])

The Max element: 27 , with


## Reduce

In [52]:
# Fold all (count, word) records into one tuple whose first slot is the sum of the counts.
# The second slot just carries along the word of the left-hand operand; only
# total_count[0] is meaningful.
total_count = rdd5.reduce(lambda acc, rec: (acc[0] + rec[0], acc[1]))
print(total_count[0])

522


## Take

In [55]:
# take(3) brings only the first three (count, word) records back to the driver.
data3 = rdd5.take(3)
for freq, word in data3:
    print(freq, word)

9 Project
9 Gutenberg’s
18 Alice’s
