# **1. Import necessary lib** #

In [1]:
# Import pyspark, SparkSession
import pyspark
from pyspark.sql import SparkSession

# Import SparkFunction
import pyspark.sql.functions as F
from pyspark.sql.types import *

StatementMeta(, 6da1c0f3-f991-4d11-ac70-1bbb95d9abb9, 3, Finished, Available)

In [2]:
# Create Spark Session
spark = SparkSession.builder.appName("ETL_Uber_Analytics") \
                            .config('spark.cores.max', "16") \
                            .config("spark.executor.memory", "70g") \
                            .config("spark.driver.memory", "50g") \
                            .config("spark.memory.offHeap.enabled",True) \
                            .config("spark.memory.offHeap.size","16g") \
                            .getOrCreate()

StatementMeta(, 6da1c0f3-f991-4d11-ac70-1bbb95d9abb9, 4, Finished, Available)

# **2. Extract data** #

In [3]:
# Define file_path
file_path = "Files/uber_data.csv"

# Read dataset
df_uber = spark.read.format("csv") \
                    .option("header", True) \
                    .option("inferSchema", True) \
                    .load(file_path)

# Show dataframe
df_uber.show(5)

StatementMeta(, 6da1c0f3-f991-4d11-ac70-1bbb95d9abb9, 5, Finished, Available)

+--------+--------------------+---------------------+---------------+-------------+------------------+------------------+----------+------------------+------------------+------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|  pickup_longitude|   pickup_latitude|RatecodeID|store_and_fwd_flag| dropoff_longitude|  dropoff_latitude|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|
+--------+--------------------+---------------------+---------------+-------------+------------------+------------------+----------+------------------+------------------+------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+
|       1| 2016-03-01 00:00:00|  2016-03-01 00:07:55|              1|          2.5|-73.97674560546875|40.765151977539055|         1|    

# **3. Transformation data** #

In [4]:
# Drop Duplicates data
df_uber.dropDuplicates()

StatementMeta(, 6da1c0f3-f991-4d11-ac70-1bbb95d9abb9, 6, Finished, Available)

DataFrame[VendorID: int, tpep_pickup_datetime: timestamp, tpep_dropoff_datetime: timestamp, passenger_count: int, trip_distance: double, pickup_longitude: double, pickup_latitude: double, RatecodeID: int, store_and_fwd_flag: string, dropoff_longitude: double, dropoff_latitude: double, payment_type: int, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double]

In [8]:
# Create function for dim_datetime, dim_pickup, dim_dropoff
def create_dim_withidcolumn(df_base, column_from_base, column_id, name_temp_view, query):
    # Choose column from uber
    dim_create_new_df = df_base[column_from_base]

    # Create column ID with Auto Increment
    dim_create_new_df = dim_create_new_df.withColumn(column_id, 
                                                F.monotonically_increasing_id() + 1)
    
    # Create temp view
    dim_create_new_df.createOrReplaceTempView(name_temp_view)

    # Query for change order column
    dim_create_new_df = spark.sql(query)

    # Drop Temp View
    spark.catalog.dropTempView(name_temp_view)

    return dim_create_new_df

StatementMeta(, 6da1c0f3-f991-4d11-ac70-1bbb95d9abb9, 10, Finished, Available)

## 3.1. Create Dim_DateTime ##

In [9]:
# Define parameter for Dim_DateTime
column_from_base = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']
column_id = "DateTimeID"
name_temp_view = "dim_datetime"
query = ''' SELECT DateTimeID, tpep_pickup_datetime, tpep_dropoff_datetime
            FROM dim_datetime '''

# Use function
dim_datetime_df = create_dim_withidcolumn(df_uber, column_from_base, 
                             column_id, name_temp_view, query)

# Show Dataframe
dim_datetime_df.show(5)

StatementMeta(, 6da1c0f3-f991-4d11-ac70-1bbb95d9abb9, 11, Finished, Available)

+----------+--------------------+---------------------+
|DateTimeID|tpep_pickup_datetime|tpep_dropoff_datetime|
+----------+--------------------+---------------------+
|         1| 2016-03-01 00:00:00|  2016-03-01 00:07:55|
|         2| 2016-03-01 00:00:00|  2016-03-01 00:11:06|
|         3| 2016-03-01 00:00:00|  2016-03-01 00:31:06|
|         4| 2016-03-01 00:00:00|  2016-03-01 00:00:00|
|         5| 2016-03-01 00:00:00|  2016-03-01 00:00:00|
+----------+--------------------+---------------------+
only showing top 5 rows



## 3.2. Create Dim_PickUp ##

In [10]:
# Define parameter for Dim_PickUp
column_from_base = ['pickup_longitude', 'pickup_latitude']
column_id = "PickUpID"
name_temp_view = "dim_pickup"
query = ''' SELECT PickUpID, pickup_longitude, pickup_latitude
            FROM dim_pickup '''

# Use function
dim_pickup_df = create_dim_withidcolumn(df_uber, column_from_base, 
                             column_id, name_temp_view, query)

# Show Dataframe
dim_pickup_df.show(5)

StatementMeta(, 6da1c0f3-f991-4d11-ac70-1bbb95d9abb9, 12, Finished, Available)

+--------+------------------+------------------+
|PickUpID|  pickup_longitude|   pickup_latitude|
+--------+------------------+------------------+
|       1|-73.97674560546875|40.765151977539055|
|       2|-73.98348236083984|40.767925262451165|
|       3|-73.78202056884764| 40.64480972290039|
|       4|-73.86341857910156|40.769813537597656|
|       5|-73.97174072265625| 40.79218292236328|
+--------+------------------+------------------+
only showing top 5 rows



## 3.3. Create Dim_DropOff ##

In [11]:
# Define parameter for Dim_DropOff
column_from_base = ['dropoff_longitude', 'dropoff_latitude']
column_id = "DropOffID"
name_temp_view = "dim_dropoff"
query = ''' SELECT DropOffID, dropoff_longitude, dropoff_latitude
            FROM dim_dropoff '''

# Use function
dim_dropoff_df = create_dim_withidcolumn(df_uber, column_from_base, 
                             column_id, name_temp_view, query)

# Show Dataframe
dim_dropoff_df.show(5)

StatementMeta(, 6da1c0f3-f991-4d11-ac70-1bbb95d9abb9, 13, Finished, Available)

+---------+------------------+------------------+
|DropOffID| dropoff_longitude|  dropoff_latitude|
+---------+------------------+------------------+
|        1|-74.00426483154298| 40.74612808227539|
|        2|-74.00594329833984|  40.7331657409668|
|        3|-73.97454071044923|  40.6757698059082|
|        4|-73.96965026855469|40.757766723632805|
|        5|-74.17716979980467| 40.69505310058594|
+---------+------------------+------------------+
only showing top 5 rows



## 3.4. Create Dim_RateCode ##

In [12]:
# Choose column
dim_ratecode_df = df_uber[['RateCodeID']]

# Drop duplicates values
dim_ratecode_df = dim_ratecode_df.dropDuplicates()

# List data with dictionary
dict_ratecode_type = {
    '1':"Standard rate",
    '2':"JFK",
    '3':"Newark",
    '4':"Nassau or Westchester",
    '5':"Negotiated fare",
    '6':"Group ride"
}
    
    # Convert dict_value to a list of tuples
data_list = list(dict_ratecode_type.items())

# Define Schema
schema = StructType([
    StructField('RateCodeID', StringType(), True),
    StructField('rate_code_type', StringType(), True)
])

# Convert to DataFrame
df_dict = spark.createDataFrame(data = data_list, schema = schema)

# Change Order of Dataframe
    # Create TempView
dim_ratecode_df.createOrReplaceTempView("Dim_RateCode")
df_dict.createOrReplaceTempView("Dim_Dict")
    
    # Join two tables
dim_ratecode_df = spark.sql(''' 
                SELECT DC.RateCodeID, DD.rate_code_type
                FROM Dim_RateCode DC join Dim_Dict DD on DC.RateCodeID = DD.RateCodeID
                ''')
                    
    # Drop Temp View
spark.catalog.dropTempView("Dim_RateCode")
spark.catalog.dropTempView("Dim_Dict")

# Show Dataframe
dim_ratecode_df.show(5)

StatementMeta(, 6da1c0f3-f991-4d11-ac70-1bbb95d9abb9, 14, Finished, Available)

+----------+--------------------+
|RateCodeID|      rate_code_type|
+----------+--------------------+
|         1|       Standard rate|
|         2|                 JFK|
|         3|              Newark|
|         4|Nassau or Westche...|
|         5|     Negotiated fare|
+----------+--------------------+
only showing top 5 rows



## 3.5. Create Dim_Payment ##

# 4. Load to table(Lakehouse) #

In [14]:
# Create function for save as table
def save_table_lakehouse(tables_name, df_save_table, format_table):
    # Define delta_path
    delta_path = f"Tables/{tables_name}"

    # Save as table
    df_save_table.write.format(format_table).mode("overwrite").save(delta_path)

StatementMeta(, 6da1c0f3-f991-4d11-ac70-1bbb95d9abb9, 16, Finished, Available)