In [1]:
import findspark
# Initialise findspark so the local Spark installation is usable from this kernel.
findspark.init()
# Last expression of the cell: displays the Spark home directory findspark resolved.
findspark.find()

'C:\\Installation\\spark-3.3.2-bin-hadoop3'

In [2]:
from IPython.display import *
# Inject CSS that forces <pre> output to keep its whitespace as-is (no wrapping),
# so wide .show() tables stay on one line per row.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Build (or reuse) the Spark session for this notebook:
#   - runs locally with 4 worker threads
#   - dynamic allocation and adaptive query execution are switched off
#   - Hive support is enabled so databases/tables are stored in a metastore
spark = (
    SparkSession.builder
    .appName("SparkTablesApp")
    .master("local[4]")
    .config("spark.dynamicAllocation.enabled", "false")
    .config("spark.sql.adaptive.enabled", "false")
    .enableHiveSupport()
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session
sc = spark.sparkContext

# Last expression of the cell: renders the session summary
spark

In [4]:
# Explicit schema for the Yellow Taxi data, built from (column name, Spark type)
# pairs in file order; every column is declared nullable.
# NOTE(review): the "lpep_" prefix is normally used by the green-taxi feed
# (yellow-taxi files usually use "tpep_") - confirm the intended column names.
yellowTaxiSchema = StructType(
    [
        StructField(columnName, columnType, True)
        for columnName, columnType in (
            ("VendorId",              IntegerType()),
            ("lpep_pickup_datetime",  TimestampType()),
            ("lpep_dropoff_datetime", TimestampType()),
            ("passenger_count",       DoubleType()),
            ("trip_distance",         DoubleType()),
            ("RatecodeID",            DoubleType()),
            ("store_and_fwd_flag",    StringType()),
            ("PULocationID",          IntegerType()),
            ("DOLocationID",          IntegerType()),
            ("payment_type",          IntegerType()),
            ("fare_amount",           DoubleType()),
            ("extra",                 DoubleType()),
            ("mta_tax",               DoubleType()),
            ("tip_amount",            DoubleType()),
            ("tolls_amount",          DoubleType()),
            ("improvement_surcharge", DoubleType()),
            ("total_amount",          DoubleType()),
            ("congestion_surcharge",  DoubleType()),
            ("airport_fee",           DoubleType()),
        )
    ]
)

# Read the YellowTaxis CSV file (first line is a header) using the explicit schema.
# The Windows path is written as a raw string: in a normal string literal its
# backslashes form invalid escape sequences (\S, \D, \R, \Y), which Python reports
# with a warning and may reject in a future version. The path value is unchanged.
yellowTaxiDF = (
                  spark
                    .read
                    .option("header", "true")
                    .schema(yellowTaxiSchema)
                    .csv(r"C:\SparkCourse\DataFiles\Raw\YellowTaxis_202210.csv")
               )

# Print the schema to confirm the declared column names and types were applied
yellowTaxiDF.printSchema()

root
 |-- VendorId: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



### Create Database in Hive metastore

In [5]:
# List the databases that exist before the new one is created
(
    spark
        .sql("""

SHOW DATABASES

""")
        .show()
)

+---------+
|namespace|
+---------+
|  default|
+---------+



In [6]:
# Create the TaxisDB database; IF NOT EXISTS makes the statement safe to re-run.
# The resulting (empty) DataFrame is the cell's displayed value.
(
    spark.sql("""

CREATE DATABASE IF NOT EXISTS TaxisDB

""")
)

DataFrame[]

In [7]:
# List the databases again to verify that TaxisDB is now present
(
    spark
        .sql("""

SHOW DATABASES

""")
        .show()
)

+---------+
|namespace|
+---------+
|  default|
|  taxisdb|
+---------+



### Save DataFrame as a Managed Spark Table in Hive

In [8]:
# Persist the DataFrame as the table TaxisDB.YellowTaxisManaged.
# No path is given here, and "overwrite" mode replaces any existing table contents.
yellowTaxiDF.write.mode("overwrite").saveAsTable("TaxisDB.YellowTaxisManaged")

In [9]:
# List the tables registered in TaxisDB (up to 50 rows, values not truncated)
(
    spark
        .sql("""

SHOW TABLES IN TaxisDB

""")
        .show(50, truncate=False)
)

+---------+------------------+-----------+
|namespace|tableName         |isTemporary|
+---------+------------------+-----------+
|taxisdb  |yellowtaxismanaged|false      |
+---------+------------------+-----------+



### Run queries on Managed Spark Table

In [10]:
# Query the managed table with SQL and display the first 10 rows
(
    spark
        .sql("""

SELECT *

FROM TaxisDB.YellowTaxisManaged

LIMIT 10

""")
        .show()
)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorId|lpep_pickup_datetime|lpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       2| 2022-10-09 18:33:31|  2022-10-09 19:06:57|            1.0|         2.65|       1.0|                 N|         151|         142|           1|       21.0|  0.0|    0.5|      4.8

In [11]:
# Same data accessed through the DataFrame API instead of SQL:
# load the managed table by name, then display 10 rows.
outputDF = spark.read.table("TaxisDB.YellowTaxisManaged")

outputDF.limit(10).show()

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorId|lpep_pickup_datetime|lpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       2| 2022-10-09 18:33:31|  2022-10-09 19:06:57|            1.0|         2.65|       1.0|                 N|         151|         142|           1|       21.0|  0.0|    0.5|      4.8

In [12]:
# Show column definitions plus extended table metadata for the managed table
# (up to 50 rows, values not truncated)
(
    spark
        .sql("""

DESCRIBE TABLE EXTENDED TaxisDB.YellowTaxisManaged

""")
        .show(50, truncate=False)
)

+----------------------------+------------------------------------------------------------------+-------+
|col_name                    |data_type                                                         |comment|
+----------------------------+------------------------------------------------------------------+-------+
|VendorId                    |int                                                               |null   |
|lpep_pickup_datetime        |timestamp                                                         |null   |
|lpep_dropoff_datetime       |timestamp                                                         |null   |
|passenger_count             |double                                                            |null   |
|trip_distance               |double                                                            |null   |
|RatecodeID                  |double                                                            |null   |
|store_and_fwd_flag          |string          

### Save DataFrame as an Unmanaged / External Spark Table in Hive

The only change compared with the managed-table write is that an explicit storage `path` is supplied.

In [13]:
# Persist the DataFrame as the table TaxisDB.YellowTaxis, storing the data files at an
# explicitly chosen location (this is what makes the table unmanaged / external).
(
    yellowTaxiDF
            .write

            .mode("overwrite")

            # Raw string: in a normal literal the backslashes form invalid escape
            # sequences (\S, \D, \O, \Y) that Python warns about. Value is unchanged.
            .option("path", r"C:\SparkCourse\DataFiles\Output\YellowTaxisOutput.parquet")

            # The storage format defaults to 'parquet'. To write another format, add
            # .format("csv") to this chain - the format is not set via .option(...).

            .saveAsTable("TaxisDB.YellowTaxis")
)

In [14]:
# Show column definitions plus extended table metadata for the external table
# (up to 50 rows, values not truncated)
(
    spark
        .sql("""

DESCRIBE TABLE EXTENDED TaxisDB.YellowTaxis

""")
        .show(50, truncate=False)
)

+----------------------------+---------------------------------------------------------------+-------+
|col_name                    |data_type                                                      |comment|
+----------------------------+---------------------------------------------------------------+-------+
|VendorId                    |int                                                            |null   |
|lpep_pickup_datetime        |timestamp                                                      |null   |
|lpep_dropoff_datetime       |timestamp                                                      |null   |
|passenger_count             |double                                                         |null   |
|trip_distance               |double                                                         |null   |
|RatecodeID                  |double                                                         |null   |
|store_and_fwd_flag          |string                                     

### Drop External table and recreate using stored files

In [15]:
# Drop the external table definition so it can be re-created from the stored files.
# IF EXISTS keeps the cell re-runnable: without it, running the cell a second time
# (when the table is already gone) raises an error instead of being a no-op.
spark.sql("""

DROP TABLE IF EXISTS TaxisDB.YellowTaxis

""")

DataFrame[]

In [16]:
# Re-create TaxisDB.YellowTaxis as a parquet table on top of the files already stored
# at the given location. The resulting (empty) DataFrame is the cell's displayed value.
(
    spark.sql("""

CREATE TABLE TaxisDB.YellowTaxis

USING PARQUET

LOCATION "C:/SparkCourse/DataFiles/Output/YellowTaxisOutput.parquet/"

""")
)

DataFrame[]

In [17]:
# Load the re-created external table back into a DataFrame and display its rows.
# NOTE(review): this rebinds yellowTaxiDF, which until now held the CSV-backed
# DataFrame. If the earlier write cells are re-run afterwards without a kernel
# restart, they would read from the same table they overwrite - consider a
# distinct name if no later cell depends on this one.
yellowTaxiDF = spark.read.table("TaxisDB.YellowTaxis")

yellowTaxiDF.show()

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorId|lpep_pickup_datetime|lpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       2| 2022-10-09 18:33:31|  2022-10-09 19:06:57|            1.0|         2.65|       1.0|                 N|         151|         142|           1|       21.0|  0.0|    0.5|      4.8