# ETL with Spark (Local)

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, DateType, TimestampType

import pyspark.sql.functions as F

In [14]:
data = "github_events_01.json"

In [3]:
spark = SparkSession.builder \
    .appName("ETL") \
    .getOrCreate()

In [4]:
data = spark.read.option("multiline", "true").json(data)

In [5]:
# Print the schema of the loaded events DataFrame as a tree. No explicit
# schema was supplied when reading, so this is what Spark derived from the file.
data.printSchema()

root
 |-- actor: struct (nullable = true)
 |    |-- avatar_url: string (nullable = true)
 |    |-- display_login: string (nullable = true)
 |    |-- gravatar_id: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- login: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- id: string (nullable = true)
 |-- org: struct (nullable = true)
 |    |-- avatar_url: string (nullable = true)
 |    |-- gravatar_id: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- login: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- payload: struct (nullable = true)
 |    |-- action: string (nullable = true)
 |    |-- comment: struct (nullable = true)
 |    |    |-- author_association: string (nullable = true)
 |    |    |-- body: string (nullable = true)
 |    |    |-- created_at: string (nullable = true)
 |    |    |-- html_url: string (nullable = true)
 |    |    |-- id: long (nullable =

In [6]:
# Expose the events DataFrame to Spark SQL under the view name
# `staging_events`; the queries in the following cells select from it.
data.createOrReplaceTempView("staging_events")

In [10]:
# Preview every column of the staged events.
# `DataFrame.show()` only prints and returns None, so chaining it onto the
# assignment left `table` bound to None. Bind the DataFrame first, then
# display it in a separate statement.
table = spark.sql("""
    select
        *
        
    from
        staging_events
""")
table.show()

+--------------------+--------------------+-----------+--------------------+--------------------+------+--------------------+-----------------+
|               actor|          created_at|         id|                 org|             payload|public|                repo|             type|
+--------------------+--------------------+-----------+--------------------+--------------------+------+--------------------+-----------------+
|{https://avatars....|2022-08-17T15:51:05Z|23487929637|{https://avatars....|{created, {COLLAB...|  true|{75340147, 350org...|IssueCommentEvent|
+--------------------+--------------------+-----------+--------------------+--------------------+------+--------------------+-----------------+



In [11]:
# Build the output table from the `staging_events` view: keep the event id,
# type and raw timestamp string, and derive a calendar `date` and a `year`
# column from `created_at` (the year is used later as the partition key).
table = spark.table("staging_events").select(
    "id",
    "type",
    "created_at",
    F.to_date("created_at").alias("date"),
    F.year("created_at").alias("year"),
)

In [12]:
# Display the derived table to check the new `date` and `year` columns.
table.show()

+-----------+-----------------+--------------------+----------+----+
|         id|             type|          created_at|      date|year|
+-----------+-----------------+--------------------+----------+----+
|23487929637|IssueCommentEvent|2022-08-17T15:51:05Z|2022-08-17|2022|
+-----------+-----------------+--------------------+----------+----+



In [17]:
# Destination directories (relative to the working directory) for the two
# export formats written by the cells below.
output_csv, output_parquet = "output_csv", "output_parquet"

In [18]:
# Export the table as CSV, one sub-directory per `year` value, replacing
# whatever a previous run left at the destination.
table.write.save(output_csv, format="csv", mode="overwrite", partitionBy="year")

In [19]:
# Export the same table as Parquet, again partitioned by `year` and
# overwriting any existing output at the destination.
table.write.parquet(output_parquet, mode="overwrite", partitionBy="year")