In [1]:
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) a SparkSession with Hive support enabled.
# The chain is wrapped in parentheses so no line-continuation backslashes are needed.
spark = (
    SparkSession.builder
    .appName("ev data load to hive")
    # Cluster master URL the session connects to.
    .config("spark.master", "spark://spark-master:7077")
    # Hudi bundle jar; the notebook later reads with format("hudi").
    .config("spark.jars", "/opt/spark/jars/hudi-spark3-bundle_2.12-1.0.0.jar")
    # Warehouse directory on HDFS and the Hive metastore thrift endpoint.
    .config("spark.sql.warehouse.dir", "hdfs://namenode:9000/user/hive/warehouse")
    .config("hive.metastore.uris", "thrift://hive:9083")
    .config("spark.executor.cores", "2")
    .enableHiveSupport()
    .getOrCreate()
)

# Sanity check: list the databases this session can see, to confirm Hive access works.
databases_query = "SHOW DATABASES"
result = spark.sql(databases_query)
result.show()

+---------+
|namespace|
+---------+
|  default|
|  test_db|
+---------+



In [13]:
# List the databases again and rebind `result` to the fresh listing.
result = spark.sql(sqlQuery="SHOW DATABASES")
result.show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [14]:
# Create the target database; IF NOT EXISTS keeps the cell safe to re-run.
create_db_statement = "CREATE DATABASE IF NOT EXISTS ev_data"
spark.sql(create_db_statement)

DataFrame[]

In [15]:
# Make ev_data the session's current database.
spark.catalog.setCurrentDatabase(dbName="ev_data")

In [3]:
# Load the cleaned EV dataset from its Hudi table location on HDFS.
hudi_source_path = "hdfs://namenode:9000/data/hudi/ev_data_cleaned"
df_read = spark.read.format("hudi").load(hudi_source_path)

In [16]:
# Persist the Hudi-sourced DataFrame as a Hive table, replacing any existing copy.
# NOTE(review): df_read still carries the _hoodie_* metadata columns, so they are
# written into the Hive table as well — confirm whether that is intended.
df_read.write.saveAsTable("ev_data.ev_vehicles_cleaned", mode="overwrite")

In [18]:
# Read the new Hive table back and print a sample of rows to verify the write.
ev_vehicles_preview = spark.sql("SELECT * from ev_data.ev_vehicles_cleaned")
ev_vehicles_preview.show()

+-------------------+--------------------+------------------+----------------------+--------------------+--------------+------------+-------------+-----------+----------+--------+--------------+--------------------+--------------------+--------------+---------+--------------------+--------------------+----------+--------+--------------------+-----+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|vehical_number|      county|         city|postal_code|model_year|    make|         model|        vehical_type|    cavf_eligibility|electric_range|base_msrp|legislative_district|    electric_utility| longitude|latitude|          event_time|state|
+-------------------+--------------------+------------------+----------------------+--------------------+--------------+------------+-------------+-----------+----------+--------+--------------+--------------------+--------------------+--------------+---------+--------------------+----------------

In [7]:
# Print df_read's columns with their types and nullability as a tree.
df_read.printSchema()

root
 |-- _hoodie_commit_time: string (nullable = true)
 |-- _hoodie_commit_seqno: string (nullable = true)
 |-- _hoodie_record_key: string (nullable = true)
 |-- _hoodie_partition_path: string (nullable = true)
 |-- _hoodie_file_name: string (nullable = true)
 |-- vehical_number: string (nullable = true)
 |-- county: string (nullable = false)
 |-- city: string (nullable = false)
 |-- postal_code: integer (nullable = false)
 |-- model_year: integer (nullable = false)
 |-- make: string (nullable = true)
 |-- model: string (nullable = true)
 |-- vehical_type: string (nullable = true)
 |-- cavf_eligibility: string (nullable = true)
 |-- electric_range: integer (nullable = false)
 |-- base_msrp: integer (nullable = false)
 |-- legislative_district: integer (nullable = false)
 |-- electric_utility: string (nullable = false)
 |-- longitude: double (nullable = false)
 |-- latitude: double (nullable = false)
 |-- event_time: timestamp (nullable = false)
 |-- state: string (nullable = true)



In [45]:
# Top five makes (manufacturers) among rows with state = 'WA' whose
# cavf_eligibility starts with 'Clean Alternative'.
# The query groups by `make`, not `model`, so the result is per manufacturer.
# COUNT(vehical_number) counts only rows with a non-null vehical_number.
# `make ASC` is a secondary sort key so that ties on count at the LIMIT
# boundary resolve the same way on every run.
spark.sql("""
SELECT make, COUNT(vehical_number) AS count
FROM ev_data.ev_vehicles_cleaned
WHERE state = 'WA' AND cavf_eligibility LIKE 'Clean Alternative%'
GROUP BY make
ORDER BY count DESC, make ASC
LIMIT 5
""").show()

+---------+-----+
|     make|count|
+---------+-----+
|    TESLA|  850|
|CHEVROLET|  626|
|    VOLVO|  581|
|      BMW|  323|
|      KIA|  311|
+---------+-----+

