In [1]:
# findspark is initialised in this first cell, ahead of the pyspark imports in the
# next cell (presumably so that the local Spark installation can be located and
# pyspark becomes importable - confirm against the findspark documentation).
import findspark
findspark.init()
# The value returned by find() is not used by this cell.
findspark.find()

# Inject a CSS rule so that <pre> blocks keep their whitespace verbatim
# (white-space: pre), overriding the notebook's own rule via !important.
# Presumably this keeps wide DataFrame.show() tables from wrapping - confirm.
from IPython.display import *
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Start from the shared session builder, naming the application and
# pointing it at the "local[4]" master.
sessionBuilder = (
    SparkSession.builder
    .appName("WindowOperationsApp")
    .master("local[4]")
)

# Explicitly switch off dynamic allocation and adaptive query execution.
sessionBuilder = sessionBuilder.config("spark.dynamicAllocation.enabled", "false")
sessionBuilder = sessionBuilder.config("spark.sql.adaptive.enabled", "false")

# Create the session (or pick up an already running one) and keep a
# handle on its SparkContext for later cells.
spark = sessionBuilder.getOrCreate()
sc = spark.sparkContext

# Last expression of the cell: render the session summary in the notebook.
spark

In [3]:
# Create schema for Yellow Taxi data.
# The schema is passed explicitly to the reader below; every column is declared nullable.
yellowTaxiSchema = StructType([
    StructField("VendorId"               , IntegerType()   , True),
    StructField("PickupTime"             , TimestampType() , True),
    StructField("DropTime"               , TimestampType() , True),
    StructField("PassengerCount"         , DoubleType()    , True),
    StructField("TripDistance"           , DoubleType()    , True),
    StructField("RateCodeId"             , DoubleType()    , True),
    StructField("StoreAndFwdFlag"        , StringType()    , True),
    StructField("PickupLocationId"       , IntegerType()   , True),
    StructField("DropLocationId"         , IntegerType()   , True),
    StructField("PaymentType"            , IntegerType()   , True),
    StructField("FareAmount"             , DoubleType()    , True),
    StructField("Extra"                  , DoubleType()    , True),
    StructField("MtaTax"                 , DoubleType()    , True),
    StructField("TipAmount"              , DoubleType()    , True),
    StructField("TollsAmount"            , DoubleType()    , True),
    StructField("ImprovementSurcharge"   , DoubleType()    , True),
    StructField("TotalAmount"            , DoubleType()    , True),
    StructField("CongestionSurcharge"    , DoubleType()    , True),
    StructField("AirportFee"             , DoubleType()    , True),
])


# Read Yellow Taxis file (the CSV's first line is a header row).
# The path is a raw string: a Windows path is full of backslashes, and in a
# normal string literal "\D", "\S" and "\Y" are invalid escape sequences that
# current Python versions warn about. The raw string has the same runtime value.
yellowTaxiDF = (
    spark
        .read
        .option("header", "true")
        .schema(yellowTaxiSchema)
        .csv(r"D:\DemoFiles\SparkCourseFiles\YellowTaxis_202210.csv")
)


# Create temp view so the data can be queried from spark.sql() as "YellowTaxis"
yellowTaxiDF.createOrReplaceTempView("YellowTaxis")

In [4]:
# Create schema for Taxi Zones data (DDL-style schema string).
# The id column is named PickupLocationId, matching the YellowTaxis column
# it is joined against in the queries further down.
taxiZonesSchema = "PickupLocationId INT, Borough STRING, Zone STRING, ServiceZone STRING"


# Read Taxi Zones file.
# Unlike the Yellow Taxis read, no "header" option is set here - presumably
# TaxiZones.csv has no header row; confirm against the file.
# The path is a raw string: in a normal string literal "\D", "\S" and "\T" are
# invalid escape sequences that current Python versions warn about. The raw
# string has the same runtime value.
taxiZonesDF = (
    spark
        .read
        .schema(taxiZonesSchema)
        .csv(r"D:\DemoFiles\SparkCourseFiles\TaxiZones.csv")
)


# Create temp view so the data can be queried from spark.sql() as "TaxiZones"
taxiZonesDF.createOrReplaceTempView("TaxiZones")

## Windows - Find share of each borough in terms of rides

### 1. Get rides for each borough

In [5]:
# Ride count per borough: each trip is matched to its pickup zone, and the
# matched rows are counted per borough.
boroughRideCountSql = """
                SELECT tz.Borough
                     , COUNT(*)     AS RideCount

                FROM TaxiZones tz
                    INNER JOIN YellowTaxis yt ON yt.PickupLocationId = tz.PickupLocationId

                GROUP BY tz.Borough
   """

taxiRidesDF = spark.sql(boroughRideCountSql)

# Make the aggregate available to later SQL cells under the name "TaxiRides".
taxiRidesDF.createOrReplaceTempView("TaxiRides")

# Preview, sorted by borough name.
ridesByBorough = taxiRidesDF.orderBy("Borough")
ridesByBorough.show()

+-------------+---------+
|      Borough|RideCount|
+-------------+---------+
|        Bronx|     4511|
|     Brooklyn|    28089|
|          EWR|     1157|
|    Manhattan|  3250695|
|       Queens|   333922|
|Staten Island|      303|
|      Unknown|    56735|
+-------------+---------+



### 2. Calculate total rides across all boroughs

a) Create a window over the entire table <br/>
b) Add total rides (across all boroughs) against each row

In [6]:
# SUM(...) OVER () uses an empty window specification, so every row receives
# the sum of RideCount across all rows of TaxiRides.
totalRideCountSql = """
                    SELECT *
                         , SUM (RideCount)   OVER ()   AS TotalRideCount

                    FROM TaxiRides
    """

taxiRidesWindowDF = spark.sql(totalRideCountSql)

# Preview, sorted by borough name.
windowedByBorough = taxiRidesWindowDF.orderBy("Borough")
windowedByBorough.show()

# Register the result so the next query can read it as "TaxiRidesWindow".
taxiRidesWindowDF.createOrReplaceTempView("TaxiRidesWindow")

+-------------+---------+--------------+
|      Borough|RideCount|TotalRideCount|
+-------------+---------+--------------+
|        Bronx|     4511|       3675412|
|     Brooklyn|    28089|       3675412|
|          EWR|     1157|       3675412|
|    Manhattan|  3250695|       3675412|
|       Queens|   333922|       3675412|
|Staten Island|      303|       3675412|
|      Unknown|    56735|       3675412|
+-------------+---------+--------------+



### 3. Find share of each borough in terms of rides

Divide each borough's ride count by the total ride count (across all boroughs), and multiply by 100 to express the share as a percentage

In [7]:
# Share of each borough: its ride count as a percentage of the overall total,
# rounded to two decimals and listed in borough order.
boroughShareSql = """

            SELECT *
                  , ROUND( (RideCount * 100) / TotalRideCount, 2)   AS RidesSharePercent

            FROM TaxiRidesWindow
            ORDER BY Borough

"""

boroughShareDF = spark.sql(boroughShareSql)
boroughShareDF.show()

+-------------+---------+--------------+-----------------+
|      Borough|RideCount|TotalRideCount|RidesSharePercent|
+-------------+---------+--------------+-----------------+
|        Bronx|     4511|       3675412|             0.12|
|     Brooklyn|    28089|       3675412|             0.76|
|          EWR|     1157|       3675412|             0.03|
|    Manhattan|  3250695|       3675412|            88.44|
|       Queens|   333922|       3675412|             9.09|
|Staten Island|      303|       3675412|             0.01|
|      Unknown|    56735|       3675412|             1.54|
+-------------+---------+--------------+-----------------+



## Window Partitions - Find share of each zone in terms of rides, within their borough

### 1. Get rides for each zone

Zone is part of Borough

In [8]:
# Ride count per (borough, zone): same join as the borough-level query, but
# grouped one level finer.
zoneRideCountSql = """
                    SELECT tz.Borough
                         , tz.Zone
                         , COUNT(*)     AS RideCount

                    FROM TaxiZones tz
                        INNER JOIN YellowTaxis yt ON yt.PickupLocationId = tz.PickupLocationId

                    GROUP BY tz.Borough
                           , tz.Zone                        
    """

taxiRidesDF = spark.sql(zoneRideCountSql)

# Preview without truncating the zone names.
ridesByZone = taxiRidesDF.orderBy("Borough", "Zone")
ridesByZone.show(truncate=False)

# NOTE: this re-registers the "TaxiRides" view, replacing the borough-level
# view registered earlier in the notebook under the same name.
taxiRidesDF.createOrReplaceTempView("TaxiRides")

+-------+--------------------------------+---------+
|Borough|Zone                            |RideCount|
+-------+--------------------------------+---------+
|Bronx  |Allerton/Pelham Gardens         |51       |
|Bronx  |Bedford Park                    |92       |
|Bronx  |Belmont                         |59       |
|Bronx  |Bronx Park                      |22       |
|Bronx  |Bronxdale                       |48       |
|Bronx  |City Island                     |11       |
|Bronx  |Claremont/Bathgate              |98       |
|Bronx  |Co-Op City                      |200      |
|Bronx  |Country Club                    |7        |
|Bronx  |Crotona Park                    |2        |
|Bronx  |Crotona Park East               |45       |
|Bronx  |East Concourse/Concourse Village|180      |
|Bronx  |East Tremont                    |113      |
|Bronx  |Eastchester                     |73       |
|Bronx  |Fordham South                   |52       |
|Bronx  |Highbridge                      |158 

### 2. Calculate total rides across each borough

a) Create a window over the table, partitioned by Borough <br/>
b) Add total rides (across all zones in a borough) against each row

In [9]:
# SUM(...) OVER (PARTITION BY Borough) attaches to every zone row the sum of
# RideCount over all rows that share its borough.
boroughTotalSql = """
  
            SELECT *
                 , SUM (RideCount)     OVER (PARTITION BY Borough)    AS TotalRideCountByBorough

            FROM TaxiRides
  """

taxiRidesWindowDF = spark.sql(boroughTotalSql)

# Preview without truncating the zone names.
windowedByZone = taxiRidesWindowDF.orderBy("Borough", "Zone")
windowedByZone.show(truncate=False)

# Register the result so the next query can read it as "TaxiRidesWindow".
taxiRidesWindowDF.createOrReplaceTempView("TaxiRidesWindow")

+-------+--------------------------------+---------+-----------------------+
|Borough|Zone                            |RideCount|TotalRideCountByBorough|
+-------+--------------------------------+---------+-----------------------+
|Bronx  |Allerton/Pelham Gardens         |51       |4511                   |
|Bronx  |Bedford Park                    |92       |4511                   |
|Bronx  |Belmont                         |59       |4511                   |
|Bronx  |Bronx Park                      |22       |4511                   |
|Bronx  |Bronxdale                       |48       |4511                   |
|Bronx  |City Island                     |11       |4511                   |
|Bronx  |Claremont/Bathgate              |98       |4511                   |
|Bronx  |Co-Op City                      |200      |4511                   |
|Bronx  |Country Club                    |7        |4511                   |
|Bronx  |Crotona Park                    |2        |4511                   |

### 3. Find share of each zone in terms of rides, within their borough

Divide each zone's ride count by its borough's ride count, and multiply by 100 to express the share as a percentage

In [10]:
# Share of each zone within its borough: the zone's ride count as a percentage
# of the borough total, rounded to two decimals, ordered by borough and zone.
zoneShareSql = """

    SELECT *
          , ROUND( (RideCount * 100) / TotalRideCountByBorough, 2)   AS RidesSharePercentInBorough

    FROM TaxiRidesWindow
    ORDER BY Borough, Zone

"""

zoneShareDF = spark.sql(zoneShareSql)
zoneShareDF.show(truncate=False)

+-------+--------------------------------+---------+-----------------------+--------------------------+
|Borough|Zone                            |RideCount|TotalRideCountByBorough|RidesSharePercentInBorough|
+-------+--------------------------------+---------+-----------------------+--------------------------+
|Bronx  |Allerton/Pelham Gardens         |51       |4511                   |1.13                      |
|Bronx  |Bedford Park                    |92       |4511                   |2.04                      |
|Bronx  |Belmont                         |59       |4511                   |1.31                      |
|Bronx  |Bronx Park                      |22       |4511                   |0.49                      |
|Bronx  |Bronxdale                       |48       |4511                   |1.06                      |
|Bronx  |City Island                     |11       |4511                   |0.24                      |
|Bronx  |Claremont/Bathgate              |98       |4511        