# ETL with Spark (Local)

In [1]:
from pyspark.sql import SparkSession
# from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, DateType, TimestampType

# import pyspark.sql.functions as F

In [2]:
import pandas as pd
import glob

In [3]:
# List the JSON event files under data/. glob.glob() returns matches in a
# file-system-dependent order, so sort them to keep the listing reproducible.
p = sorted(glob.glob("data/*.json"))

In [4]:
p

['data/github_events_01.json', 'data/github_events_02.json']

In [None]:
# File name of the first events file.
# NOTE(review): unlike the glob results, this has no "data/" prefix, and the name
# `data` is rebound to a Spark DataFrame by the later spark.read cell.
data = "github_events_01.json"

In [None]:
# File name of the second events file; in this notebook it is only referenced by a
# commented-out spark.read call.
data_2 = "github_events_02.json"

In [5]:
# Build (or reuse, via getOrCreate) the Spark session for the "ETL" application.
spark = (
    SparkSession.builder
    .appName("ETL")
    .getOrCreate()
)

In [6]:
# Folder handed to the Spark JSON reader; the glob listing shows the
# github_events_*.json files that live in it.
data_folder = "data"

In [7]:
# Read the JSON under data_folder into a DataFrame with the reader's
# "multiline" option switched on.
data = (
    spark.read
    .option("multiline", "true")
    .json(data_folder)
)

In [None]:
# data = spark.read.option("multiline", "true").json(data_2)

In [8]:
# Print a preview of the loaded events.
data.show()

+--------------------+--------------------+-----------+--------------------+--------------------+------+--------------------+--------------------+
|               actor|          created_at|         id|                 org|             payload|public|                repo|                type|
+--------------------+--------------------+-----------+--------------------+--------------------+------+--------------------+--------------------+
|{https://avatars....|2022-08-17T15:52:40Z|23487963576|{https://avatars....|{started, NULL, N...|  true|{6296790, spring-...|          WatchEvent|
|{https://avatars....|2022-08-17T15:52:40Z|23487963624|                NULL|{NULL, NULL, NULL...|  true|{525860969, gurra...|         CreateEvent|
|{https://avatars....|2022-08-17T15:52:40Z|23487963529|                NULL|{NULL, e80c84c7bb...|  true|{350706029, afbel...|           PushEvent|
|{https://avatars....|2022-08-17T15:52:40Z|23487963558|{https://avatars....|{created, NULL, {...|  true|{226399669, CM

In [None]:
# Print the schema of the loaded events (no explicit schema was passed to the reader).
data.printSchema()

In [9]:
# Print only the id and type columns of the events.
data.select("id", "type").show()

+-----------+--------------------+
|         id|                type|
+-----------+--------------------+
|23487963576|          WatchEvent|
|23487963624|         CreateEvent|
|23487963529|           PushEvent|
|23487963558|   IssueCommentEvent|
|23487963581|    PullRequestEvent|
|23487963532|           PushEvent|
|23487963524|           PushEvent|
|23487963526|           PushEvent|
|23487963492|           PushEvent|
|23487963504|         DeleteEvent|
|23487963536|PullRequestReview...|
|23487963495|         CreateEvent|
|23487963522|           PushEvent|
|23487963444|           PushEvent|
|23487963462|    PullRequestEvent|
|23487963480|         IssuesEvent|
|23487963457|           PushEvent|
|23487963413|           PushEvent|
|23487963429|           PushEvent|
|23487963448|           PushEvent|
+-----------+--------------------+
only showing top 20 rows



In [10]:
# Register the events DataFrame as the temp view "staging_events" so the
# spark.sql(...) queries can select from it by name.
data.createOrReplaceTempView("staging_events")

In [11]:
# Select every column of the staging view and keep the resulting DataFrame in
# `table`. DataFrame.show() only prints and returns None, so it is called as a
# separate statement; chaining it onto the assignment would bind `table` to None.
table = spark.sql("""
    select
        *
        
    from
        staging_events
""")
table.show()

+--------------------+--------------------+-----------+--------------------+--------------------+------+--------------------+--------------------+
|               actor|          created_at|         id|                 org|             payload|public|                repo|                type|
+--------------------+--------------------+-----------+--------------------+--------------------+------+--------------------+--------------------+
|{https://avatars....|2022-08-17T15:52:40Z|23487963576|{https://avatars....|{started, NULL, N...|  true|{6296790, spring-...|          WatchEvent|
|{https://avatars....|2022-08-17T15:52:40Z|23487963624|                NULL|{NULL, NULL, NULL...|  true|{525860969, gurra...|         CreateEvent|
|{https://avatars....|2022-08-17T15:52:40Z|23487963529|                NULL|{NULL, e80c84c7bb...|  true|{350706029, afbel...|           PushEvent|
|{https://avatars....|2022-08-17T15:52:40Z|23487963558|{https://avatars....|{created, NULL, {...|  true|{226399669, CM

In [12]:
# Flatten the staging events into one row per staging row: id, type and created_at,
# a date and a year derived from created_at, and the login/url and name/url fields
# pulled out of the nested `actor` and `repo` columns. Unaliased nested fields keep
# their leaf names (the result shows them as `login` and `name`).
table = spark.sql("""
    select
        id
        , type
        , created_at
        , to_date(created_at) as date
        , year(created_at) as year
        , actor.login
        , actor.url as actor_url
        , repo.name
        , repo.url as repo_url
        
    from
        staging_events
""")

In [13]:
# Print a preview of the DataFrame currently bound to `table`.
table.show()

+-----------+--------------------+--------------------+----------+----+--------------+--------------------+--------------------+--------------------+
|         id|                type|          created_at|      date|year|         login|           actor_url|                name|            repo_url|
+-----------+--------------------+--------------------+----------+----+--------------+--------------------+--------------------+--------------------+
|23487963576|          WatchEvent|2022-08-17T15:52:40Z|2022-08-17|2022|    evilgaoshu|https://api.githu...|spring-projects/s...|https://api.githu...|
|23487963624|         CreateEvent|2022-08-17T15:52:40Z|2022-08-17|2022|      gurram47|https://api.githu...|gurram47/AP201100...|https://api.githu...|
|23487963529|           PushEvent|2022-08-17T15:52:40Z|2022-08-17|2022|    afbeltranr|https://api.githu...| afbeltranr/Agrilab2|https://api.githu...|
|23487963558|   IssueCommentEvent|2022-08-17T15:52:40Z|2022-08-17|2022|      karla-vm|https://api.gi

In [14]:
# Relative output paths used by the CSV and Parquet writes of `table`.
output_csv = "output_csv"
output_parquet = "output_parquet"

In [15]:
# Save `table` as CSV at output_csv, partitioned by the year column, in overwrite mode.
(
    table.write
    .partitionBy("year")
    .mode("overwrite")
    .csv(output_csv)
)

In [16]:
# Save `table` as Parquet at output_parquet, partitioned by the year column, in overwrite mode.
(
    table.write
    .partitionBy("year")
    .mode("overwrite")
    .parquet(output_parquet)
)

In [17]:
# Rebind `table` to a date-oriented projection of the staging events: id, type and
# created_at, plus day, month, year and date values derived from created_at.
table = spark.sql("""
    select
        id
        , type
        , created_at
        , day(created_at) as day
        , month(created_at) as month
        , year(created_at) as year
        , date(created_at) as date
    from
        staging_events
""")

In [None]:
# Print a preview of the DataFrame currently bound to `table`.
table.show()

In [18]:
# Relative output path for the partitioned event CSV files.
destination = "events"

In [19]:
# Save `table` as CSV at destination, partitioned by year, then month, then day,
# in overwrite mode.
(
    table.write
    .partitionBy("year", "month", "day")
    .mode("overwrite")
    .csv(destination)
)

In [None]:
# Alternative layout: partition the event CSV files by the `date` column.
# NOTE(review): this uses the same `destination` and mode("overwrite") as the
# year/month/day write, so it presumably replaces that output — confirm which
# layout is wanted before running both.
table.write.partitionBy("date").mode("overwrite").csv(destination)

In [None]:
# Actor extract: for every staging row, the actor's login, the id of the event it
# came from (as event_id) and the actor's url, saved as CSV under "actors".
table = spark.sql("""
    select
        actor.login
        , id as event_id
        , actor.url as actor_url
    from
        staging_events
""")
destination = "actors"
(
    table.write
    .mode("overwrite")
    .csv(destination)
)

In [None]:
# Repository extract: for every staging row, the repo's name, the id of the event it
# came from (as event_id) and the repo's url, saved as CSV under "repos".
table = spark.sql("""
    select
        repo.name
        , id as event_id
        , repo.url as repo_url
        
    from
        staging_events
""")
destination = "repos"
(
    table.write
    .mode("overwrite")
    .csv(destination)
)