In [1]:
# Bootstrap: locate the local Spark installation before any pyspark import.
import findspark
# presumably makes the pyspark package importable in this kernel — confirm against findspark docs
findspark.init()
# Last expression of the cell: displays the Spark home path that was found.
findspark.find()

'C:\\Program Files\\spark-3.4.0'

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Create the SparkSession for this notebook, or reuse one that already exists.
# App name "SparkConceptsApp" is what shows up in the Spark UI; master is "local[4]".
spark = SparkSession.builder.appName("SparkConceptsApp").master("local[4]").getOrCreate()

sc = spark.sparkContext

spark

### Identify Entities

1. Narrow Transformations
2. Wide Transformations
3. Actions
4. Jobs
5. Stages
6. Tasks
7. Partitions

### 1. Read CSV File

<i>Identify the transformations, actions, jobs, stages, partitions, and tasks.</i>

In [3]:
# Read File - TaxiZones.csv
# The path is a raw string so the Windows backslashes are taken literally
# instead of being parsed as (invalid) escape sequences such as "\d" or "\p".
# The second argument (4) is the requested number of partitions.
taxiZonesRdd = sc.textFile(r"C:\development\my\python\pyspark-playground\DataFiles\Raw\TaxiZones.csv", 4)

# Action - return every line of the file to the driver
taxiZonesRdd.collect()

['1,EWR,Newark Airport,EWR',
 '2,Queens,Jamaica Bay,Boro Zone',
 '3,Bronx,Allerton/Pelham Gardens,Boro Zone',
 '4,Manhattan,Alphabet City,Yellow Zone',
 '5,Staten Island,Arden Heights,Boro Zone',
 '6,Staten Island,Arrochar/Fort Wadsworth,Boro Zone',
 '7,Queens,Astoria,Boro Zone',
 '8,Queens,Astoria Park,Boro Zone',
 '9,Queens,Auburndale,Boro Zone',
 '10,Queens,Baisley Park,Boro Zone',
 '11,Brooklyn,Bath Beach,Boro Zone',
 '12,Manhattan,Battery Park,Yellow Zone',
 '13,Manhattan,Battery Park City,Yellow Zone',
 '14,Brooklyn,Bay Ridge,Boro Zone',
 '15,Queens,Bay Terrace/Fort Totten,Boro Zone',
 '16,Queens,Bayside,Boro Zone',
 '17,Brooklyn,Bedford,Boro Zone',
 '18,Bronx,Bedford Park,Boro Zone',
 '19,Queens,Bellerose,Boro Zone',
 '20,Bronx,Belmont,Boro Zone',
 '21,Brooklyn,Bensonhurst East,Boro Zone',
 '22,Brooklyn,Bensonhurst West,Boro Zone',
 '23,Staten Island,Bloomfield/Emerson Hill,Boro Zone',
 '24,Manhattan,Bloomingdale,Yellow Zone',
 '25,Brooklyn,Boerum Hill,Boro Zone',
 '26,Brooklyn,

### 2. Additional step: Split by comma

<i>Identify the transformations, actions, jobs, stages, partitions, and tasks.</i>

In [4]:
# Read File - TaxiZones.csv
# Raw string: the backslashes in the Windows path must not be read as escape sequences.
# NOTE(review): this path has no "Raw" directory, unlike the read in the previous
# section — confirm which location is the intended one.
taxiZonesRdd = sc.textFile(r"C:\development\my\python\pyspark-playground\DataFiles\TaxiZones.csv", 4)


# Map Operation - Split data by comma (each line becomes a list of column strings)
taxiZonesWithColsRdd = taxiZonesRdd.map( lambda zone: zone.split(",") )


# Action - return all split rows to the driver
taxiZonesWithColsRdd.collect()

[['1', 'EWR', 'Newark Airport', 'EWR'],
 ['2', 'Queens', 'Jamaica Bay', 'Boro Zone'],
 ['3', 'Bronx', 'Allerton/Pelham Gardens', 'Boro Zone'],
 ['4', 'Manhattan', 'Alphabet City', 'Yellow Zone'],
 ['5', 'Staten Island', 'Arden Heights', 'Boro Zone'],
 ['6', 'Staten Island', 'Arrochar/Fort Wadsworth', 'Boro Zone'],
 ['7', 'Queens', 'Astoria', 'Boro Zone'],
 ['8', 'Queens', 'Astoria Park', 'Boro Zone'],
 ['9', 'Queens', 'Auburndale', 'Boro Zone'],
 ['10', 'Queens', 'Baisley Park', 'Boro Zone'],
 ['11', 'Brooklyn', 'Bath Beach', 'Boro Zone'],
 ['12', 'Manhattan', 'Battery Park', 'Yellow Zone'],
 ['13', 'Manhattan', 'Battery Park City', 'Yellow Zone'],
 ['14', 'Brooklyn', 'Bay Ridge', 'Boro Zone'],
 ['15', 'Queens', 'Bay Terrace/Fort Totten', 'Boro Zone'],
 ['16', 'Queens', 'Bayside', 'Boro Zone'],
 ['17', 'Brooklyn', 'Bedford', 'Boro Zone'],
 ['18', 'Bronx', 'Bedford Park', 'Boro Zone'],
 ['19', 'Queens', 'Bellerose', 'Boro Zone'],
 ['20', 'Bronx', 'Belmont', 'Boro Zone'],
 ['21', 'Brookl

In [5]:
# Compare the partition count of the source RDD with that of the mapped RDD

print(f"After reading file = {taxiZonesRdd.getNumPartitions()}")

print(f"After applying map = {taxiZonesWithColsRdd.getNumPartitions()}")

After reading file = 4
After applying map = 4


### 3. Additional step: Create Pair RDD

<i>Identify the transformations, actions, jobs, stages, partitions, and tasks.</i>

In [None]:
# Read File - TaxiZones.csv
# Raw string: the backslashes in the Windows path must not be read as escape sequences.
taxiZonesRdd = sc.textFile(r"D:\DemoFiles\SparkCourseFiles\TaxiZones.csv", 4)


# Map Operation - Split data by comma
taxiZonesWithColsRdd = taxiZonesRdd.map( lambda zone: zone.split(",") )


# Map Operation - Create Pair RDD with <Borough, 1>
# (column index 1 of each split row is used as the key)
taxiZonesPairRdd = taxiZonesWithColsRdd.map( lambda zoneRow: (zoneRow[1], 1) )


# Action - count the pairs
taxiZonesPairRdd.count()

### 4. Additional step: Find distinct records

<i>Identify the transformations, actions, jobs, stages, partitions, and tasks.</i>

In [6]:
# Read File - TaxiZones.csv
# Raw string: the backslashes in the Windows path must not be read as escape sequences.
taxiZonesRdd = sc.textFile(r"D:\DemoFiles\SparkCourseFiles\TaxiZones.csv", 4)


# Map Operation - Split data by comma
taxiZonesWithColsRdd = taxiZonesRdd.map( lambda zone: zone.split(",") )


# Map Operation - Create Pair RDD with <Borough, 1>
taxiZonesPairRdd = taxiZonesWithColsRdd.map( lambda zoneRow: (zoneRow[1], 1) )


# Distinct Operation - Find distinct items
# Every pair has the value 1, so this leaves exactly one pair per borough.
distinctZonesRdd = taxiZonesPairRdd.distinct()


# Action - return the distinct pairs to the driver
distinctZonesRdd.collect()

[('Bronx', 1),
 ('Staten Island', 1),
 ('Queens', 1),
 ('EWR', 1),
 ('Manhattan', 1),
 ('Brooklyn', 1),
 ('Unknown', 1)]

### 5. Additional steps: 
#### a) Group by Borough
#### b) Get rows with Borough count > 10

<i>Identify the transformations, actions, jobs, stages, partitions, and tasks.</i>

In [7]:
# Read File - TaxiZones.csv
# Raw string: the backslashes in the Windows path must not be read as escape sequences.
taxiZonesRdd = sc.textFile(r"D:\DemoFiles\SparkCourseFiles\TaxiZones.csv", 4)


# Map Operation - Split data by comma
taxiZonesWithColsRdd = taxiZonesRdd.map( lambda zone: zone.split(",") )


# Map Operation - Create Pair RDD with <Borough, 1>
taxiZonesPairRdd = taxiZonesWithColsRdd.map( lambda zoneRow: (zoneRow[1], 1) )


# Distinct Operation - Find distinct items
distinctZonesRdd = taxiZonesPairRdd.distinct()


# ReduceByKey Operation - Group by Borough and calculate count
# NOTE(review): the input is the *distinct* pair RDD, which holds a single
# (borough, 1) pair per borough, so every summed count is 1. To count zones per
# borough, reduce taxiZonesPairRdd instead. The pipeline is kept as-is here
# because the exercise is about identifying the stages it produces.
boroughCountRdd = distinctZonesRdd.reduceByKey( lambda value1, value2: value1 + value2 )


# Filter Operation - Get rows with Borough count > 10
# With all counts equal to 1 (see note above) nothing passes this filter.
filteredZonesRdd = boroughCountRdd.filter( lambda row: row[1] > 10 )


# Action - return the filtered rows to the driver
filteredZonesRdd.collect()

[]

### 6. Run the `first` and `take` methods

<i>Identify the transformations, actions, jobs, stages, partitions, and tasks.</i>

In [8]:
# Read File - TaxiZones.csv
# Raw string: the backslashes in the Windows path must not be read as escape sequences.
taxiZonesRdd = sc.textFile(r"D:\DemoFiles\SparkCourseFiles\TaxiZones.csv", 4)


# Build the <Borough, 1> pair RDD from the RDD read above, so this cell uses its
# own read instead of relying on a taxiZonesPairRdd left over from an earlier cell.
taxiZonesPairRdd = (
    taxiZonesRdd
        .map( lambda zone: zone.split(",") )
        .map( lambda zoneRow: (zoneRow[1], 1) )
)


# Action - first element of the pair RDD
print ( taxiZonesPairRdd.first() )

# Action - first two elements of the pair RDD
print ( taxiZonesPairRdd.take(2) )

('EWR', 1)
[('EWR', 1), ('Queens', 1)]
