In [1]:
import findspark
findspark.init()
findspark.find()

'C:\\Installation\\spark-3.3.2-bin-hadoop3'

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = (
            SparkSession
                .builder
                .appName("MultipleDatasetsApp")
                .master("local[4]")
    
                .config("spark.dynamicAllocation.enabled", "false")
                .config("spark.sql.adaptive.enabled", "false")
                
                .getOrCreate()
        )

sc = spark.sparkContext

spark

In [3]:
from IPython.display import *
display(HTML("<style>pre { white-space: pre !important; }</style>"))

## JOIN Operations

In [4]:
# Create schema for Yellow Taxi data
yellowTaxiSchema = (
                        StructType
                        ([ 
                            StructField("VendorId"               , IntegerType()   , True),
                            StructField("PickupTime"             , TimestampType() , True),
                            StructField("DropTime"               , TimestampType() , True),                            
                            StructField("PassengerCount"         , DoubleType()    , True),
                            StructField("TripDistance"           , DoubleType()    , True),
                            StructField("RateCodeId"             , DoubleType()    , True),                            
                            StructField("StoreAndFwdFlag"        , StringType()    , True),
                            StructField("PickupLocationId"       , IntegerType()   , True),
                            StructField("DropLocationId"         , IntegerType()   , True),                            
                            StructField("PaymentType"            , IntegerType()   , True),                            
                            StructField("FareAmount"             , DoubleType()    , True),
                            StructField("Extra"                  , DoubleType()    , True),
                            StructField("MtaTax"                 , DoubleType()    , True),
                            StructField("TipAmount"              , DoubleType()    , True),
                            StructField("TollsAmount"            , DoubleType()    , True),
                            StructField("ImprovementSurcharge"   , DoubleType()    , True),
                            StructField("TotalAmount"            , DoubleType()    , True),
                            StructField("CongestionSurcharge"    , DoubleType()    , True),
                            StructField("AirportFee"             , DoubleType()    , True)
                        ])
                   )


# Read Yellow Taxis file
yellowTaxiDF = (
                  spark
                    .read
                    .option("header", "true")    
                    .schema(yellowTaxiSchema)    
                    .csv("C:\SparkCourse\DataFiles\Raw\YellowTaxis_202210.csv")
               )


# Create temp view
yellowTaxiDF.createOrReplaceTempView("YellowTaxis")


# Print the schema
yellowTaxiDF.printSchema()

root
 |-- VendorId: integer (nullable = true)
 |-- PickupTime: timestamp (nullable = true)
 |-- DropTime: timestamp (nullable = true)
 |-- PassengerCount: double (nullable = true)
 |-- TripDistance: double (nullable = true)
 |-- RateCodeId: double (nullable = true)
 |-- StoreAndFwdFlag: string (nullable = true)
 |-- PickupLocationId: integer (nullable = true)
 |-- DropLocationId: integer (nullable = true)
 |-- PaymentType: integer (nullable = true)
 |-- FareAmount: double (nullable = true)
 |-- Extra: double (nullable = true)
 |-- MtaTax: double (nullable = true)
 |-- TipAmount: double (nullable = true)
 |-- TollsAmount: double (nullable = true)
 |-- ImprovementSurcharge: double (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- CongestionSurcharge: double (nullable = true)
 |-- AirportFee: double (nullable = true)



In [5]:
# Create schema for Taxi Zones data
taxiZonesSchema = "PickupLocationId INT, Borough STRING, Zone STRING, ServiceZone STRING"


# Read Taxi Zones file
taxiZonesDF = (
                  spark
                    .read                    
                    .schema(taxiZonesSchema)
                    .csv("C:\SparkCourse\DataFiles\Raw\TaxiZones.csv")
              )


# Create temp view
taxiZonesDF.createOrReplaceTempView("TaxiZones")


# Print the schema
taxiZonesDF.printSchema()

root
 |-- PickupLocationId: integer (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- ServiceZone: string (nullable = true)



### Display location information for each ride

Join DataFrames of Yellow Taxis and Taxi Zones

In [6]:
joinedDF = (
                yellowTaxiDF

                    .join
                    (
                        taxiZonesDF,

                        yellowTaxiDF.PickupLocationId == taxiZonesDF.PickupLocationId, 
                        
                                  # [condition1, condition2...]

                        "inner"   # left, leftouter, right, rightouter, full etc.
                    )
            )
                    
joinedDF.printSchema()

root
 |-- VendorId: integer (nullable = true)
 |-- PickupTime: timestamp (nullable = true)
 |-- DropTime: timestamp (nullable = true)
 |-- PassengerCount: double (nullable = true)
 |-- TripDistance: double (nullable = true)
 |-- RateCodeId: double (nullable = true)
 |-- StoreAndFwdFlag: string (nullable = true)
 |-- PickupLocationId: integer (nullable = true)
 |-- DropLocationId: integer (nullable = true)
 |-- PaymentType: integer (nullable = true)
 |-- FareAmount: double (nullable = true)
 |-- Extra: double (nullable = true)
 |-- MtaTax: double (nullable = true)
 |-- TipAmount: double (nullable = true)
 |-- TollsAmount: double (nullable = true)
 |-- ImprovementSurcharge: double (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- CongestionSurcharge: double (nullable = true)
 |-- AirportFee: double (nullable = true)
 |-- PickupLocationId: integer (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- ServiceZone: string (nullable

In [8]:
# Remove duplicate 'PickupLocationId' column

joinedDF = (
                yellowTaxiDF

                    .join
                    (
                        taxiZonesDF,

                        yellowTaxiDF.PickupLocationId == taxiZonesDF.PickupLocationId, 

                                  # [condition1, condition2...]

                        "inner"   # left, leftouter, right, rightouter, full etc.
                    )

                    .drop(col("tz.PickupLocationId"))
           )    
                    
joinedDF.printSchema()

root
 |-- VendorId: integer (nullable = true)
 |-- PickupTime: timestamp (nullable = true)
 |-- DropTime: timestamp (nullable = true)
 |-- PassengerCount: double (nullable = true)
 |-- TripDistance: double (nullable = true)
 |-- RateCodeId: double (nullable = true)
 |-- StoreAndFwdFlag: string (nullable = true)
 |-- PickupLocationId: integer (nullable = true)
 |-- DropLocationId: integer (nullable = true)
 |-- PaymentType: integer (nullable = true)
 |-- FareAmount: double (nullable = true)
 |-- Extra: double (nullable = true)
 |-- MtaTax: double (nullable = true)
 |-- TipAmount: double (nullable = true)
 |-- TollsAmount: double (nullable = true)
 |-- ImprovementSurcharge: double (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- CongestionSurcharge: double (nullable = true)
 |-- AirportFee: double (nullable = true)
 |-- PickupLocationId: integer (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- ServiceZone: string (nullable

In [9]:
# Join on same column name
# Only one column shows up in output

joinedDF = (
                yellowTaxiDF.alias("yt")
    
                    .join
                    (
                        taxiZonesDF.alias("tz"),
                        
                        #col("yt.PickupLocationId") == col("tz.PickupLocationId")
                        
                        ['PickupLocationId'],   # only one PickupLocationId column will be kept
                        
                        "inner"
                    )
           )

joinedDF.printSchema()

root
 |-- PickupLocationId: integer (nullable = true)
 |-- VendorId: integer (nullable = true)
 |-- PickupTime: timestamp (nullable = true)
 |-- DropTime: timestamp (nullable = true)
 |-- PassengerCount: double (nullable = true)
 |-- TripDistance: double (nullable = true)
 |-- RateCodeId: double (nullable = true)
 |-- StoreAndFwdFlag: string (nullable = true)
 |-- DropLocationId: integer (nullable = true)
 |-- PaymentType: integer (nullable = true)
 |-- FareAmount: double (nullable = true)
 |-- Extra: double (nullable = true)
 |-- MtaTax: double (nullable = true)
 |-- TipAmount: double (nullable = true)
 |-- TollsAmount: double (nullable = true)
 |-- ImprovementSurcharge: double (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- CongestionSurcharge: double (nullable = true)
 |-- AirportFee: double (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- ServiceZone: string (nullable = true)



### Exercise 1

Find all LocationIds in TaxiZones, from where no pickups have happened

In [None]:
# Exercise 1 Answer

(spark.sql("""

    SELECT DISTINCT tz.*

    FROM TaxiZones tz

        LEFT JOIN YellowTaxis yt ON yt.PickupLocationId = tz.PickupLocationId
        
        WHERE yt.PickupLocationId IS NULL

""")).show()

## SET Operations

In [10]:
# Read Drivers file
driversDF = (
                  spark
                    .read                    
                    .option("header", "true")
                    .option("inferSchema", "true")
                    .csv("C:\SparkCourse\DataFiles\Raw\Drivers.csv")
            )

# Create temp view
driversDF.createOrReplaceTempView("Drivers")


driversDF.show()

+-------------------+--------------------+--------------------+--------------+---------------+
|DriverLicenseNumber|                Name|                Type|ExpirationDate|LastDateUpdated|
+-------------------+--------------------+--------------------+--------------+---------------+
|            5430898|   ABDEL-BAR,ESLAM,M|MEDALLION TAXI DR...|    04/12/2023|     04/22/2020|
|            5363749|ABDOUSAMADOV,ALIC...|MEDALLION TAXI DR...|    06/01/2020|     04/22/2020|
|            5534446|  ABDUHALIKOV,RUSTAM|MEDALLION TAXI DR...|    06/16/2020|     04/22/2020|
|            5935702|   ABDULLAEV,JONIBEK|MEDALLION TAXI DR...|    03/14/2022|     04/22/2020|
|            5255097|ABDULNABI,MASHHOUR,H|MEDALLION TAXI DR...|    03/16/2021|     04/22/2020|
|            5778633|ABDUSALOMOV,IKROMJON|MEDALLION TAXI DR...|    06/02/2023|     04/22/2020|
|            5934755|ABDUVOKHIDOV,MURO...|MEDALLION TAXI DR...|    02/27/2022|     04/22/2020|
|             443085|         ABEDIN,MD,J|MEDALLIO

In [11]:
# Read Cabs file
cabsDF = (
              spark
                .read                    
                .option("header", "true")
                .option("inferSchema", "true")
                .csv("C:\SparkCourse\DataFiles\Raw\Cabs.csv")
         )

# Create temp view
cabsDF.createOrReplaceTempView("Cabs")


cabsDF.show()

+---------+--------------------+--------------------+----------------+------+-------------------+-----------------+--------------------+-----------+-----------+---------------+--------------------+--------------------+---------------+
|CabNumber|VehicleLicenseNumber|                Name|     LicenseType|Active|PermitLicenseNumber| VehicleVinNumber|WheelchairAccessible|VehicleYear|VehicleType|TelephoneNumber|             Website|             Address|LastDateUpdated|
+---------+--------------------+--------------------+----------------+------+-------------------+-----------------+--------------------+-----------+-----------+---------------+--------------------+--------------------+---------------+
| T802127C|              C19641|          ABCON INC.|OWNER MUST DRIVE|   YES|               null|5TDBK3EH0DS268018|                null|       2016|       null|  (718)438-1100|                null|41-24   38 STREET...|     04/22/2020|
| T525963C|             5362996| ACCEPTABLE TAXI LLC|    NAM

### 1. Create list of all drivers

In [12]:
(spark.sql("""

    (
        SELECT Name 
        FROM Cabs
        WHERE LicenseType = 'OWNER MUST DRIVE'
    )

    UNION ALL

    (
        SELECT Name
        FROM Drivers
    )

""")).count()

157716

### 2. Create list of unique drivers

In [13]:
(spark.sql("""

    (
        SELECT Name 
        FROM Cabs
        WHERE LicenseType = 'OWNER MUST DRIVE'
    )

    UNION

    (
        SELECT Name
        FROM Drivers
    )

""")).count()

156566

### 3. Create list of all registered drivers who are driving cabs

In [14]:
(spark.sql("""

    (
        SELECT Name 
        FROM Cabs
        WHERE LicenseType = 'OWNER MUST DRIVE'
    )

    INTERSECT

    (
        SELECT Name
        FROM Drivers
    )

""")).count()

1150

### 4. Create list of drivers driving cabs, but not registered

In [15]:
(spark.sql("""
    
    (
        SELECT Name 
        FROM Cabs
        WHERE LicenseType = 'OWNER MUST DRIVE'
    )

    EXCEPT

    (
        SELECT Name
        FROM Drivers
    )

""")).count()

1940