In [1]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Project3") \
    .config("spark.driver.host", "localhost") \
    .config("spark.driver.bindAddress", "localhost") \
    .config("spark.executor.memory", "8g") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()
spark

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/21 10:18:54 WARN Utils: Your hostname, thorsten-pc, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/09/21 10:18:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/21 10:18:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Bronze

In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType, BooleanType, TimestampType

schema = StructType([
    StructField("rider_id", IntegerType(), False),
    StructField("first", StringType(), True),
    StructField("last", StringType(), True),
    StructField("address", StringType(), True),
    StructField("birthday", DateType(), True),
    StructField("account_start_date", DateType(), True),
    StructField("account_end_date", DateType(), True),
    StructField("is_member", BooleanType(), True)
])

bronze_riders = spark.read.csv(
    'data/riders.csv', 
    schema=schema
)
bronze_riders.printSchema()

root
 |-- rider_id: integer (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- address: string (nullable = true)
 |-- birthday: date (nullable = true)
 |-- account_start_date: date (nullable = true)
 |-- account_end_date: date (nullable = true)
 |-- is_member: boolean (nullable = true)



In [3]:
schema = StructType([
    StructField("payment_id", IntegerType(), False),
    StructField("date", DateType(), True),
    StructField("amount", DoubleType(), True),
    StructField("rider_id", IntegerType(), True)
])
bronze_payments = spark.read.csv(
    'data/payments.csv', 
    schema=schema
)
bronze_payments.show(5)
bronze_payments.printSchema()

+----------+----------+------+--------+
|payment_id|      date|amount|rider_id|
+----------+----------+------+--------+
|         1|2019-05-01|   9.0|    1000|
|         2|2019-06-01|   9.0|    1000|
|         3|2019-07-01|   9.0|    1000|
|         4|2019-08-01|   9.0|    1000|
|         5|2019-09-01|   9.0|    1000|
+----------+----------+------+--------+
only showing top 5 rows
root
 |-- payment_id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- amount: double (nullable = true)
 |-- rider_id: integer (nullable = true)



In [4]:
schema = StructType([
    StructField("station_id", StringType(), False),
    StructField("name", StringType(), False),
    StructField("latitude", DoubleType(), False),
    StructField("longitude", DoubleType(), False)
])

bronze_stations = spark.read.csv(
    'data/stations.csv', 
    schema=schema
)
bronze_stations.show(5)
bronze_stations.printSchema()

+------------+--------------------+-----------------+------------------+
|  station_id|                name|         latitude|         longitude|
+------------+--------------------+-----------------+------------------+
|         525|Glenwood Ave & To...|        42.012701|-87.66605799999999|
|KA1503000012|  Clark St & Lake St|41.88579466666667|-87.63110066666668|
|         637|Wood St & Chicago...|        41.895634|        -87.672069|
|       13216|  State St & 33rd St|       41.8347335|       -87.6258275|
|       18003|Fairbanks St & Su...|41.89580766666667|-87.62025316666669|
+------------+--------------------+-----------------+------------------+
only showing top 5 rows
root
 |-- station_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)



In [5]:
schema = StructType([
    StructField("trip_id", StringType(), False),
    StructField("rideable_type", StringType(), True),
    StructField("start_at", TimestampType(), True),
    StructField("ended_at", TimestampType(), True),
    StructField("start_station_id", StringType(), True),
    StructField("end_station_id", StringType(), True),
    StructField("rider_id", IntegerType(), True)
])

bronze_trips = spark.read.csv(
    'data/trips.csv',
    schema=schema
)
bronze_trips.show(5)
bronze_trips.printSchema()

+----------------+-------------+-------------------+-------------------+----------------+--------------+--------+
|         trip_id|rideable_type|           start_at|           ended_at|start_station_id|end_station_id|rider_id|
+----------------+-------------+-------------------+-------------------+----------------+--------------+--------+
|89E7AA6C29227EFF| classic_bike|2021-02-12 16:14:56|2021-02-12 16:21:43|             525|           660|   71934|
|0FEFDE2603568365| classic_bike|2021-02-14 17:52:38|2021-02-14 18:12:09|             525|         16806|   47854|
|E6159D746B2DBB91|electric_bike|2021-02-09 19:10:18|2021-02-09 19:19:10|    KA1503000012|  TA1305000029|   70870|
|B32D3199F1C2E75B| classic_bike|2021-02-02 17:49:41|2021-02-02 17:54:06|             637|  TA1305000034|   58974|
|83E463F23575F4BF|electric_bike|2021-02-23 15:07:23|2021-02-23 15:22:37|           13216|  TA1309000055|   39608|
+----------------+-------------+-------------------+-------------------+----------------

In [24]:
bronze_payments.write.mode("overwrite").format('csv').save('datalake/bronze_payments')
bronze_stations.write.mode("overwrite").format('csv').save('datalake/bronze_stations')
bronze_riders.write.mode("overwrite").format('csv').save('datalake/bronze_riders')
bronze_trips.write.mode("overwrite").format('csv').save('datalake/bronze_trips')

                                                                                

# Gold

In [6]:
bronze_riders.createOrReplaceTempView("Bronze_Riders")
bronze_payments.createOrReplaceTempView("Bronze_Payments")
bronze_stations.createOrReplaceTempView("Bronze_Stations")
bronze_trips.createOrReplaceTempView("Bronze_Trips")

### Dimension Station

In [7]:
%%file sql/dim_station.sql

SELECT 
    station_id,
    name,
    latitude,
    longitude
FROM Bronze_Stations 

Overwriting sql/dim_station.sql


In [8]:
with open('sql/dim_station.sql', 'r') as f:
    gold_dim_station = spark.sql(f.read())

gold_dim_station.show(5)
gold_dim_station.printSchema()
gold_dim_station

+------------+--------------------+-----------------+------------------+
|  station_id|                name|         latitude|         longitude|
+------------+--------------------+-----------------+------------------+
|         525|Glenwood Ave & To...|        42.012701|-87.66605799999999|
|KA1503000012|  Clark St & Lake St|41.88579466666667|-87.63110066666668|
|         637|Wood St & Chicago...|        41.895634|        -87.672069|
|       13216|  State St & 33rd St|       41.8347335|       -87.6258275|
|       18003|Fairbanks St & Su...|41.89580766666667|-87.62025316666669|
+------------+--------------------+-----------------+------------------+
only showing top 5 rows
root
 |-- station_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)



DataFrame[station_id: string, name: string, latitude: double, longitude: double]

### Dimension Rider

In [9]:
%%file sql/dim_rider.sql

SELECT 
    rider_id, 
    address,
    first as first_name,
    last as last_name,
    birthday,
    is_member,
    account_start_date,
    account_end_date
FROM Bronze_Riders 

Overwriting sql/dim_rider.sql


In [10]:
with open('sql/dim_rider.sql', 'r') as f:
    gold_dim_rider = spark.sql(f.read())

gold_dim_rider.show(5)
gold_dim_rider.printSchema()

+--------+--------------------+----------+---------+----------+---------+------------------+----------------+
|rider_id|             address|first_name|last_name|  birthday|is_member|account_start_date|account_end_date|
+--------+--------------------+----------+---------+----------+---------+------------------+----------------+
|    1000| 1200 Alyssa Squares|     Diana|    Clark|1989-02-13|     true|        2019-04-23|            NULL|
|    1001|     397 Diana Ferry|  Jennifer|    Smith|1976-08-10|     true|        2019-11-01|      2020-09-01|
|    1002|644 Brittany Row ...|     Karen|    Smith|1998-08-10|     true|        2022-02-04|            NULL|
|    1003|996 Dickerson Tur...|     Bryan|  Roberts|1999-03-29|    false|        2019-08-26|            NULL|
|    1004|7009 Nathan Expre...|     Jesse|Middleton|1969-04-11|     true|        2019-09-14|            NULL|
+--------+--------------------+----------+---------+----------+---------+------------------+----------------+
only showi

### Dimension Time

In [11]:
%%file sql/dim_time.sql

SELECT DISTINCT
    date_format(a_timestamp, 'HHmmss') as time_string,
    CAST(a_timestamp AS timestamp) as time,
    hour(a_timestamp) as hour,
    minute(a_timestamp) as minute,
    second(a_timestamp) as second
FROM (
    SELECT start_at as a_timestamp FROM Bronze_Trips
    UNION ALL
    SELECT ended_at as a_timestamp FROM Bronze_Trips
) tmp

Overwriting sql/dim_time.sql


In [12]:
with open('sql/dim_time.sql', 'r') as f:
    gold_dim_time = spark.sql(f.read())

gold_dim_time.show(5)
gold_dim_time.printSchema()



+-----------+-------------------+----+------+------+
|time_string|               time|hour|minute|second|
+-----------+-------------------+----+------+------+
|     205002|2021-02-23 20:50:02|  20|    50|     2|
|     154934|2021-02-26 15:49:34|  15|    49|    34|
|     204034|2021-02-03 20:40:34|  20|    40|    34|
|     155241|2021-02-26 15:52:41|  15|    52|    41|
|     165447|2021-02-28 16:54:47|  16|    54|    47|
+-----------+-------------------+----+------+------+
only showing top 5 rows
root
 |-- time_string: string (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (nullable = true)
 |-- second: integer (nullable = true)



                                                                                

### Dimension Date

In [13]:
from pyspark.sql import functions as F
from pyspark.sql.types import DateType
import datetime

def last_day_of_quarter(dt):
    if dt is None:
        return None
    q_month = ((dt.month - 1) // 3 + 1) * 3
    if q_month == 12:
        return datetime.date(dt.year, 12, 31)
    next_month_first = datetime.date(dt.year, q_month + 1, 1)
    return next_month_first - datetime.timedelta(days=1)

spark.udf.register("last_day_of_quarter_udf", last_day_of_quarter, DateType())

<function __main__.last_day_of_quarter(dt)>

In [14]:
%%file sql/dim_date.sql

SELECT DISTINCT
    date_format(a_date, 'yyyyMMdd') AS date_string,
    CAST(a_date AS date) AS date,
    year(a_date) AS year,
    quarter(a_date) AS quarter,
    month(a_date) AS month,
    weekofyear(a_date) AS week,
    day(a_date) AS day,
    dayofweek(a_date) AS weekday,
    date_format(a_date, 'EEEE') AS weekday_name,
    date_format(a_date, 'MMMM') AS month_name,
    trunc(a_date, 'YEAR') AS first_of_year,
    date_add(trunc(a_date, 'YEAR'), CASE 
        WHEN ((year(a_date) % 4 = 0 AND year(a_date) % 100 <> 0) OR year(a_date) % 400 = 0) THEN 366
        ELSE 365
    END - 1) AS last_of_year,
    trunc(a_date, 'QUARTER') AS first_of_quarter,
    last_day_of_quarter_udf(a_date) AS last_of_quarter,
    trunc(a_date, 'MONTH') AS first_of_month,
    last_day(a_date) AS last_of_month,
    ((year(a_date) % 4 = 0 AND year(a_date) % 100 <> 0) OR year(a_date) % 400 = 0) AS is_leap_year,
    (dayofweek(a_date) IN (1,7)) AS is_weekend
FROM (
    SELECT start_at AS a_date FROM Bronze_Trips
    UNION ALL
    SELECT ended_at AS a_date FROM Bronze_Trips
    UNION ALL
    SELECT date AS a_date FROM Bronze_Payments
) tmp

Overwriting sql/dim_date.sql


In [15]:
with open('sql/dim_date.sql', 'r') as f:
    gold_dim_date = spark.sql(f.read())

gold_dim_date.show(5)
gold_dim_date.printSchema()



+-----------+----------+----+-------+-----+----+---+-------+------------+----------+-------------+------------+----------------+---------------+--------------+-------------+------------+----------+
|date_string|      date|year|quarter|month|week|day|weekday|weekday_name|month_name|first_of_year|last_of_year|first_of_quarter|last_of_quarter|first_of_month|last_of_month|is_leap_year|is_weekend|
+-----------+----------+----+-------+-----+----+---+-------+------------+----------+-------------+------------+----------------+---------------+--------------+-------------+------------+----------+
|   20210315|2021-03-15|2021|      1|    3|  11| 15|      2|      Monday|     March|   2021-01-01|  2021-12-31|      2021-01-01|     2021-03-31|    2021-03-01|   2021-03-31|       false|     false|
|   20210331|2021-03-31|2021|      1|    3|  13| 31|      4|   Wednesday|     March|   2021-01-01|  2021-12-31|      2021-01-01|     2021-03-31|    2021-03-01|   2021-03-31|       false|     false|
|   202102

                                                                                

### Fact Payment

In [16]:
%%file sql/fact_payment.sql

SELECT 
    payment_id,
    to_char(date, 'yyyyMMdd') as payment_date,
    rider_id,
    amount
FROM Bronze_Payments

Overwriting sql/fact_payment.sql


In [17]:
with open('sql/fact_payment.sql', 'r') as f:
    gold_fact_payment = spark.sql(f.read())

gold_fact_payment.show(5)
gold_fact_payment.printSchema()

+----------+------------+--------+------+
|payment_id|payment_date|rider_id|amount|
+----------+------------+--------+------+
|         1|    20190501|    1000|   9.0|
|         2|    20190601|    1000|   9.0|
|         3|    20190701|    1000|   9.0|
|         4|    20190801|    1000|   9.0|
|         5|    20190901|    1000|   9.0|
+----------+------------+--------+------+
only showing top 5 rows
root
 |-- payment_id: integer (nullable = true)
 |-- payment_date: string (nullable = true)
 |-- rider_id: integer (nullable = true)
 |-- amount: double (nullable = true)



### Fact Trip

In [18]:
%%file sql/fact_trip.sql

SELECT
    bt.trip_id,
    br.rider_id,
    bt.start_station_id,
    bt.end_station_id,
    date_format(bt.start_at, 'yyyyMMdd') AS start_date,
    date_format(bt.ended_at, 'yyyyMMdd') AS end_date,
    date_format(bt.start_at, 'HHmmss') AS start_time,
    date_format(bt.ended_at, 'HHmmss') AS end_time,
    bt.rideable_type,
    (unix_timestamp(bt.ended_at) - unix_timestamp(bt.start_at)) AS trip_duration_seconds,
    floor(months_between(bt.ended_at, br.birthday) / 12) AS rider_age
FROM Bronze_Trips bt
JOIN Bronze_Riders br
    ON bt.rider_id = br.rider_id


Overwriting sql/fact_trip.sql


In [22]:
with open('sql/fact_trip.sql', 'r') as f:
    gold_fact_trip = spark.sql(f.read())

gold_fact_trip.show(5)
gold_fact_trip.printSchema()

+----------------+--------+----------------+--------------+----------+--------+----------+--------+-------------+---------------------+---------+
|         trip_id|rider_id|start_station_id|end_station_id|start_date|end_date|start_time|end_time|rideable_type|trip_duration_seconds|rider_age|
+----------------+--------+----------------+--------------+----------+--------+----------+--------+-------------+---------------------+---------+
|89E7AA6C29227EFF|   71934|             525|           660|  20210212|20210212|    161456|  162143| classic_bike|                  407|       37|
|0FEFDE2603568365|   47854|             525|         16806|  20210214|20210214|    175238|  181209| classic_bike|                 1171|       38|
|E6159D746B2DBB91|   70870|    KA1503000012|  TA1305000029|  20210209|20210209|    191018|  191910|electric_bike|                  532|       33|
|B32D3199F1C2E75B|   58974|             637|  TA1305000034|  20210202|20210202|    174941|  175406| classic_bike|           

In [23]:
gold_dim_station.write.mode("overwrite").format('csv').save('datalake/gold_dim_station')
gold_dim_rider.write.mode("overwrite").format('csv').save('datalake/gold_dim_rider')
gold_dim_time.write.mode("overwrite").format('csv').save('datalake/gold_dim_time')
gold_dim_date.write.mode("overwrite").format('csv').save('datalake/gold_dim_date')
gold_fact_payment.write.mode("overwrite").format('csv').save('datalake/gold_fact_payment')
gold_fact_trip.write.mode("overwrite").format('csv').save('datalake/gold_fact_trip')

                                                                                