In [None]:
import findspark
findspark.init()
findspark.find()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

spark = (
            SparkSession
                .builder
                .appName("RDDApp")
                .master("local[4]")
                .getOrCreate()
        )

In [3]:
spark

### Create RDD - Option 1: Parallelize

In [4]:
# Create a variable for SparkContext

sc = spark.sparkContext

In [5]:
# Create RDD using parallelize

numbersRdd = sc.parallelize( [1, 2, 3, 4, 5] )

In [6]:
# Check RDD partitions

numbersRdd.getNumPartitions()

4

In [7]:
# Get result from RDD

output = numbersRdd.collect()

print(output)

[1, 2, 3, 4, 5]


In [8]:
# Get any 2 records from RDD

numbersRdd.take(2)

[1, 2]

In [9]:
# Get first record from RDD
# first() returns an element, while take(1) returns an array with one element

numbersRdd.first()

1

In [10]:
# Create RDD with complex types

employeesRdd = sc.parallelize(
                                [
                                    [ 1, "Neha",  10000 ],
                                    [ 2, "Steve", 20000 ],
                                    [ 3, "Kari",  30000 ],
                                    [ 4, "Ivan",  40000 ],
                                    [ 5, "Mohit", 50000 ]
                                ]
                              )

In [11]:
# Get first record from RDD

employeesRdd.first()

[1, 'Neha', 10000]

### Create RDD - Option 2: Read a File

In [12]:
# Read TaxiZones.csv file and create RDD

taxiZonesRdd = sc.textFile("C:\development\my\python\pyspark-playground\DataFiles\Raw\TaxiZones.csv")

In [13]:
# Get 10 records

taxiZonesRdd.take(10)

['1,EWR,Newark Airport,EWR',
 '2,Queens,Jamaica Bay,Boro Zone',
 '3,Bronx,Allerton/Pelham Gardens,Boro Zone',
 '4,Manhattan,Alphabet City,Yellow Zone',
 '5,Staten Island,Arden Heights,Boro Zone',
 '6,Staten Island,Arrochar/Fort Wadsworth,Boro Zone',
 '7,Queens,Astoria,Boro Zone',
 '8,Queens,Astoria Park,Boro Zone',
 '9,Queens,Auburndale,Boro Zone',
 '10,Queens,Baisley Park,Boro Zone']

In [14]:
# Check RDD partitions

taxiZonesRdd.getNumPartitions()

2

In [15]:
# Read TaxiZones.csv file and create RDD (with 4 partitions)

taxiZonesRdd = sc.textFile("C:\development\my\python\pyspark-playground\DataFiles\Raw\TaxiZones.csv", 4)

taxiZonesRdd.getNumPartitions()

4

### Create RDD - Option 3: Convert another RDD

In [16]:
# Apply map operation to split each record by comma, and get array of items

taxiZonesWithColsRdd = (
                            taxiZonesRdd
    
                                .map( lambda zone: zone.split(",") )
                       )

taxiZonesWithColsRdd.take(5)

[['1', 'EWR', 'Newark Airport', 'EWR'],
 ['2', 'Queens', 'Jamaica Bay', 'Boro Zone'],
 ['3', 'Bronx', 'Allerton/Pelham Gardens', 'Boro Zone'],
 ['4', 'Manhattan', 'Alphabet City', 'Yellow Zone'],
 ['5', 'Staten Island', 'Arden Heights', 'Boro Zone']]

In [17]:
# Apply filter operation
# Get data where Borough is 'Manhattan' and Zone begins with 'central'

filteredZonesRdd = (
                        taxiZonesWithColsRdd
    
                            .filter( lambda zoneRow: zoneRow[1] == "Manhattan" 

                                                 and zoneRow[2].lower().startswith("central")
                                   )
                   )

filteredZonesRdd.take(5)

[['41', 'Manhattan', 'Central Harlem', 'Boro Zone'],
 ['42', 'Manhattan', 'Central Harlem North', 'Boro Zone'],
 ['43', 'Manhattan', 'Central Park', 'Yellow Zone']]

In [18]:
# Get list of Zones with even LocationId

evenZoneIds = (
                    taxiZonesRdd
    
                        .filter( lambda zoneRow: int(zoneRow[0]) % 2 == 0 )
              )

evenZoneIds.take(5)

['2,Queens,Jamaica Bay,Boro Zone',
 '4,Manhattan,Alphabet City,Yellow Zone',
 '6,Staten Island,Arrochar/Fort Wadsworth,Boro Zone',
 '8,Queens,Astoria Park,Boro Zone',
 '20,Bronx,Belmont,Boro Zone']

### Pair RDDs

In [19]:
import math

numbersRDD = sc.parallelize( [2, 3, 4, 5, 6] )

# Create Pair RDD
numsWithSquareRootRdd = (
                            numbersRDD
    
                                .map(lambda num: (
                                                    num,
                                                    math.sqrt(num)
                                                 )
                                    )
                        )

numsWithSquareRootRdd.collect()

[(2, 1.4142135623730951),
 (3, 1.7320508075688772),
 (4, 2.0),
 (5, 2.23606797749979),
 (6, 2.449489742783178)]

### Exercise 1

Use taxiZonesWithColsRdd and create a Pair RDD (LocationId as Key, Zone Details as Value)

In [None]:
# Exercise 1 Answer

taxiZonesPairRdd = (
                        taxiZonesWithColsRdd
    
                            .map(lambda zoneRow: ( 
                                                    zoneRow[0],
                                                    zoneRow
                                                 )
                                )
                   )

taxiZonesPairRdd.take(10)

### Calculate count of records for each Borough

In [20]:
# Create Pair RDD (locationId as Key, Value as 1)

taxiZonesPairRdd = (
                        taxiZonesWithColsRdd
    
                            .map(lambda zoneRow: ( 
                                                    zoneRow[1],   # Key   - Borough
                                                    1             # Value - 1
                                                 )
                                )
                   )

taxiZonesPairRdd.take(10)

[('EWR', 1),
 ('Queens', 1),
 ('Bronx', 1),
 ('Manhattan', 1),
 ('Staten Island', 1),
 ('Staten Island', 1),
 ('Queens', 1),
 ('Queens', 1),
 ('Queens', 1),
 ('Queens', 1)]

In [21]:
# Calculate count of records for each Borough

boroughCountRdd = (
                        taxiZonesPairRdd
    
                            .reduceByKey( lambda value1, value2: value1 + value2 )
                  )

boroughCountRdd.collect()

[('Bronx', 43),
 ('Staten Island', 20),
 ('EWR', 1),
 ('Manhattan', 69),
 ('Brooklyn', 61),
 ('Unknown', 2),
 ('Queens', 69)]

In [22]:
# Sort the data by Key

(
    taxiZonesPairRdd
    
        .sortByKey()
    
        .collect()
)

[('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn',

In [23]:
# Only return the keys

(
    taxiZonesPairRdd
    
        .keys()
    
        .take(10)
)

['EWR',
 'Queens',
 'Bronx',
 'Manhattan',
 'Staten Island',
 'Staten Island',
 'Queens',
 'Queens',
 'Queens',
 'Queens']

In [24]:
# Only return the values

(
    taxiZonesPairRdd
    
        .values()
    
        .take(10)
)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [25]:
# Return distinct pairs

(
    taxiZonesPairRdd
    
        .distinct()
    
        .collect()
)

[('Bronx', 1),
 ('Staten Island', 1),
 ('Queens', 1),
 ('EWR', 1),
 ('Manhattan', 1),
 ('Brooklyn', 1),
 ('Unknown', 1)]