# 1. Import needed libraries #

In [1]:
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col, when, row_number, udf
from pyspark.sql.types import StringType
from pyspark.sql.window import Window

# 2. Create Spark Session #

In [2]:
# Build (or reuse) the Hive-enabled Spark session shared by every cell below.
# NOTE: if a session already exists, getOrCreate() hands it back and Spark warns
# that only runtime SQL configurations from this builder take effect.
spark = (
    SparkSession.builder.appName("ET_Spark")
    # Core / memory sizing
    .config('spark.cores.max', "16")
    .config("spark.executor.memory", "70g")
    .config("spark.driver.memory", "50g")
    # Off-heap memory
    .config("spark.memory.offHeap.enabled", True)
    .config("spark.memory.offHeap.size", "16g")
    # Hive warehouse directory on HDFS
    .config("spark.sql.warehouse.dir", "hdfs://localhost:9000/user/hive/warehouse")
    .enableHiveSupport()
    .getOrCreate()
)

24/01/31 00:36:43 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


# 3. Create functions for the Dimension and Fact tables #

In [3]:
# Extract data from HDFS into a Spark DataFrame
def extract_data(hdfs_path):
    """Read CSV data from ``hdfs_path`` with the global ``spark`` session.

    Parameters
    ----------
    hdfs_path : str
        Path (or glob pattern) of the CSV file(s) to read.

    Returns
    -------
    DataFrame
        The loaded data. The first line of each file is used as the header
        and column types are inferred by Spark.
    """
    csv_options = {"header": True, "inferSchema": True}
    return spark.read.csv(hdfs_path, **csv_options)

In [4]:
# Location of the raw trip CSV files in the HDFS data lake
hdfs_path = "hdfs://localhost:9000/user/thanhphat/datalake/uberdata/*.csv"

# Load every matching file and discard rows that are exact duplicates
df_uber = extract_data(hdfs_path).dropDuplicates()

# Preview the first 10 rows
df_uber.show(n=10)



+--------+--------------------+---------------------+---------------+-------------+------------------+------------------+----------+------------------+------------------+------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|  pickup_longitude|   pickup_latitude|RatecodeID|store_and_fwd_flag| dropoff_longitude|  dropoff_latitude|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|
+--------+--------------------+---------------------+---------------+-------------+------------------+------------------+----------+------------------+------------------+------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+
|       1| 2016-03-01 00:00:19|  2016-03-01 00:18:02|              1|          7.7|-74.01290130615233| 40.70807266235352|         1|    

                                                                                

In [5]:
def create_dim_datetime(df_base):
    """Build the Dim_DateTime table from the base trip data.

    Parameters
    ----------
    df_base : DataFrame
        Trip data containing ``tpep_pickup_datetime`` and
        ``tpep_dropoff_datetime``.

    Returns
    -------
    DataFrame
        Columns ``DateTimeId``, ``tpep_pickup_datetime``,
        ``tpep_dropoff_datetime`` (in that order).
    """
    datetime_columns = ["tpep_pickup_datetime", "tpep_dropoff_datetime"]

    # Every row gets the same sort key (a constant literal), so row_number()
    # simply enumerates the rows 1..N. The window has no partitioning, which is
    # why Spark warns about moving all data to a single partition.
    # NOTE(review): the resulting row order is presumably not guaranteed to be
    # stable across runs -- confirm before relying on the ids for joins.
    id_window = Window.orderBy(lit('A'))

    return (
        df_base.select(*datetime_columns)
        .withColumn("DateTimeId", row_number().over(id_window))
        # Surrogate key first, then the two timestamps
        .select("DateTimeId", *datetime_columns)
    )

In [6]:
# Build Dim_DateTime from the de-duplicated trips and preview 5 rows
dim_datetime_df = create_dim_datetime(df_base=df_uber)
dim_datetime_df.show(n=5)

24/01/31 00:36:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/01/31 00:36:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/01/31 00:36:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/01/31 00:36:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/01/31 00:36:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/01/31 00:36:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/01/31 0

+----------+--------------------+---------------------+
|DateTimeId|tpep_pickup_datetime|tpep_dropoff_datetime|
+----------+--------------------+---------------------+
|         1| 2016-03-01 00:00:19|  2016-03-01 00:18:02|
|         2| 2016-03-01 00:00:32|  2016-03-01 00:08:59|
|         3| 2016-03-10 07:09:22|  2016-03-10 07:20:38|
|         4| 2016-03-10 07:10:25|  2016-03-10 07:19:30|
|         5| 2016-03-10 07:12:16|  2016-03-10 07:28:38|
+----------+--------------------+---------------------+
only showing top 5 rows



In [7]:
# Function to create Dim_Pickup
def create_dim_pickup(df_base):
    """Build the pickup-location dimension from the base trip data.

    Parameters
    ----------
    df_base : DataFrame
        Trip data containing ``pickup_longitude`` and ``pickup_latitude``.

    Returns
    -------
    DataFrame
        Columns ``PickUpID``, ``pickup_longtitude``, ``pickup_latitude``
        (in that order).
    """
    # The dimension exposes the longitude under the name "pickup_longtitude"
    renamed = df_base.withColumnRenamed("pickup_longitude", "pickup_longtitude")
    coord_columns = ["pickup_longtitude", "pickup_latitude"]

    # Constant sort key: row_number() just enumerates the rows 1..N.
    # NOTE(review): presumably not a stable order across runs -- confirm.
    id_window = Window.orderBy(lit('A'))

    return (
        renamed.select(*coord_columns)
        .withColumn("PickUpID", row_number().over(id_window))
        # Surrogate key first, then the coordinates
        .select("PickUpID", *coord_columns)
    )

In [8]:
# Function to create Dim_Dropoff
def create_dim_dropoff(df_base):
    """Build the dropoff-location dimension from the base trip data.

    Parameters
    ----------
    df_base : DataFrame
        Trip data containing ``dropoff_longitude`` and ``dropoff_latitude``.

    Returns
    -------
    DataFrame
        Columns ``DropOffID``, ``dropoff_longtitude``, ``dropoff_latitude``
        (in that order).
    """
    # Constant sort key: row_number() just enumerates the rows 1..N.
    # NOTE(review): presumably not a stable order across runs -- confirm.
    id_window = Window.orderBy(lit('A'))

    # The dimension exposes the longitude under the name "dropoff_longtitude"
    coords = df_base.withColumnRenamed(
        "dropoff_longitude", "dropoff_longtitude"
    ).select("dropoff_longtitude", "dropoff_latitude")

    with_id = coords.withColumn('DropOffID', row_number().over(id_window))

    # Surrogate key first, then the coordinates
    return with_id.select('DropOffID', "dropoff_longtitude", "dropoff_latitude")

In [9]:
# Dim_Pickup: pickup coordinates of every trip plus a PickUpID surrogate key
dim_pickup_df = create_dim_pickup(df_uber)

# Dim_Dropoff: dropoff coordinates of every trip plus a DropOffID surrogate key
dim_dropoff_df = create_dim_dropoff(df_uber)

In [10]:
# Preview the first 5 rows of Dim_Pickup
dim_pickup_df.show(n=5)

24/01/31 00:36:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/01/31 00:36:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/01/31 00:36:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/01/31 00:36:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/01/31 00:36:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+--------+------------------+------------------+
|PickUpID| pickup_longtitude|   pickup_latitude|
+--------+------------------+------------------+
|       1|-74.01290130615233| 40.70807266235352|
|       2|-73.97491455078125| 40.76174545288086|
|       3|      -73.96484375| 40.75577926635742|
|       4| -74.0048599243164|40.737674713134766|
|       5| -73.9735336303711|40.786991119384766|
+--------+------------------+------------------+
only showing top 5 rows



24/01/31 00:36:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/01/31 00:36:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [11]:
# Preview the first 5 rows of Dim_Dropoff
dim_dropoff_df.show(n=5)

24/01/31 00:36:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/01/31 00:36:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/01/31 00:36:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/01/31 00:36:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/01/31 00:36:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/01/31 00:36:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/01/31 0

+---------+------------------+------------------+
|DropOffID|dropoff_longtitude|  dropoff_latitude|
+---------+------------------+------------------+
|        1|-73.96126556396483| 40.64998245239258|
|        2| -73.9890899658203|40.732303619384766|
|        3|-73.98269653320312| 40.76639175415039|
|        4|-73.99419403076173| 40.72831344604492|
|        5|-73.99043273925781| 40.75632095336913|
+---------+------------------+------------------+
only showing top 5 rows



In [14]:
# Turn a Python dict into a Spark UDF that looks values up by key
def translate(mapping):
    """Return a string-typed UDF that maps a column value through ``mapping``.

    Parameters
    ----------
    mapping : dict
        Lookup table from column value to display string.

    Returns
    -------
    A UDF declared with ``StringType``; call it with a column (name) to get
    the mapped column.
    """
    def translate_(key):
        # dict.get returns None when the key has no entry in the mapping
        return mapping.get(key)

    lookup_udf = udf(translate_, returnType=StringType())
    return lookup_udf

In [15]:
# Function to create Dim_RateCode
def create_dim_ratecode(df_base):
    """Build the rate-code dimension from the base trip data.

    Parameters
    ----------
    df_base : DataFrame
        Trip data containing a ``RatecodeID`` column.

    Returns
    -------
    DataFrame
        One row per distinct ``RatecodeID``, ordered by ``RatecodeID``, with
        the matching description in ``rate_code_type``. Codes missing from
        the lookup table get no description (the lookup returns None).
    """
    # Description of each rate code
    dict_ratecode_type = {
        1: "Standard rate",
        2: "JFK",
        3: "Newark",
        4: "Nassau or Westchester",
        5: "Negotiated fare",
        6: "Group ride",
    }

    # Distinct rate codes in ascending order
    dim_ratecode_df = (
        df_base.select("RatecodeID")
        .dropDuplicates()
        .orderBy("RatecodeID")
    )

    # Map each code through the lookup table defined above. The original
    # referenced an undefined name (dict_ratecode_name), which raises
    # NameError when the notebook is run on a fresh kernel.
    to_ratecode_type = translate(dict_ratecode_type)
    return dim_ratecode_df.withColumn("rate_code_type", to_ratecode_type("RatecodeID"))

In [29]:
# Build Dim_RateCode and preview up to 10 rows
dim_ratecode_df = create_dim_ratecode(df_base=df_uber)
dim_ratecode_df.show(n=10)

+----------+--------------------+
|RatecodeID|      rate_code_name|
+----------+--------------------+
|         1|       Standard rate|
|         2|                 JFK|
|         3|              Newark|
|         4|Nassau or Westche...|
|         5|     Negotiated fare|
|         6|          Group ride|
+----------+--------------------+



In [27]:
# Function to create Dim_Payment
def create_dim_payment(df_base):
    """Build the payment-type dimension from the base trip data.

    Parameters
    ----------
    df_base : DataFrame
        Trip data containing a ``payment_type`` column.

    Returns
    -------
    DataFrame
        One row per distinct ``payment_type``, ordered by ``payment_type``,
        with the matching display name in ``payment_type_name``.
    """
    # Display name of each payment_type code
    dict_payment_type_name = {
        1:"Credit card",
        2:"Cash",
        3:"No charge",
        4:"Dispute"
    }

    # Distinct payment codes in ascending order
    payment_codes = (
        df_base.select("payment_type")
        .dropDuplicates()
        .orderBy("payment_type")
    )

    # Attach the display name looked up from the dict
    to_payment_name = translate(dict_payment_type_name)
    return payment_codes.withColumn("payment_type_name", to_payment_name("payment_type"))

In [28]:
# Build Dim_Payment and preview up to 10 rows
dim_payment_df = create_dim_payment(df_base=df_uber)
dim_payment_df.show(n=10)

+------------+-----------------+
|payment_type|payment_type_name|
+------------+-----------------+
|           1|      Credit card|
|           2|             Cash|
|           3|        No charge|
|           4|          Dispute|
+------------+-----------------+



In [30]:
# Function for create Fact_Table