### Read a CSV file using Apache Spark

In [1]:
from pyspark.sql import SparkSession

In [None]:
# Obtain the SparkSession used by every cell below
# (getOrCreate() reuses an existing session or builds a new one)
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [None]:
# Python version check: the leading `!` is IPython/Jupyter syntax that hands the
# rest of the line to the shell, so this prints the version reported by the
# interpreter at /opt/conda/bin/python.
# Alternative, left disabled: show which `python` is found on PATH.
# !which python
!/opt/conda/bin/python --version

In [None]:
# Spark version check
# NOTE(review): relies on the notebook echoing the value of a cell's last
# expression; presumably the SparkSession summary it renders includes the
# Spark version - confirm.
spark

In [None]:
# Load the CSV with no reader options set: columns receive default names and
# every column keeps the default data type (string)
log_access = spark.read.csv(path="data/log_access.csv")
log_access.printSchema()
log_access.show()

In [None]:
# header="true": take the column names from the first line of the file
# (the option key is "header", not "head")
log_access = spark.read.csv("data/log_access.csv", header="true")
log_access.printSchema()
log_access.show()

In [None]:
# header="true" keeps the column names; inferSchema="true" additionally asks
# Spark to infer each column's data type instead of leaving it as a string
log_access = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("data/log_access.csv")
)
log_access.printSchema()
log_access.show()

### Comparison of two Spark API styles

#### 1) Calling the structured (DataFrame) API

from pyspark.sql.functions import unix_timestamp, from_unixtime, to_timestamp, to_date, col, lit

# Load the activity data set from its JSON directory.
# NOTE(review): "inferSchema" is documented as a CSV reader option, so the JSON
# reader probably ignores it - confirm before removing it.
df = spark.read.option("inferSchema", "true").json("data/activity-data")

# Arrival_Time is divided by 1000 before from_unixtime(), i.e. it is treated as
# epoch milliseconds. lit() wraps the Python constant 1000 in a literal Column.
# from_unixtime() renders the result as a 'yyyy-MM-dd HH:mm:ss' string.
arrival_time_str = from_unixtime(col('Arrival_Time') / lit(1000))

timestamp = df.select(
    "Arrival_Time",
    to_timestamp(arrival_time_str, 'yyyy-MM-dd HH:mm:ss').alias('String_Datetime'),
    # No pattern is passed to to_date(): the input string still carries a time
    # part, and Spark 3's strict datetime parser rejects a 'yyyy-MM-dd' pattern
    # that leaves " HH:mm:ss" unconsumed. Without a pattern the string is cast
    # to a date, which works on both Spark 2.x and 3.x.
    to_date(arrival_time_str).alias('String_Date'),
)
timestamp.show(5)

#### 2) Using SQL expressions

In [None]:
# Using SQL expressions: each output column is described by a SQL string.
# Arrival_Time is divided by 1000 before from_unixtime(), i.e. it is treated as
# epoch milliseconds.
ts = df.selectExpr(
    "Arrival_Time",
    "to_timestamp(from_unixtime(Arrival_Time / 1000), 'yyyy-MM-dd HH:mm:ss') as String_Datetime",
    # to_date() is called without a pattern: from_unixtime() yields a
    # 'yyyy-MM-dd HH:mm:ss' string, and Spark 3's strict datetime parser rejects
    # a 'yyyy-MM-dd' pattern that does not consume the time part. The
    # pattern-less form casts the string to a date on both Spark 2.x and 3.x.
    "to_date(from_unixtime(Arrival_Time / 1000)) as String_Date",
)
ts.show(5)

In [None]:
# Count rows per user among records whose index exceeds 100, written twice:
# first with a Column predicate ...
(
    df.filter(col("index") > 100)
    .select("index", "user")
    .groupBy("user")
    .count()
    .show()
)
# ... then with the same predicate given as a SQL string
(
    df.filter("index > 100")
    .select("index", "user")
    .groupBy("user")
    .count()
    .show()
)

### Read a JSON file using Apache Spark

In [None]:
# read.json: load a single part file of the activity data set
json = spark.read.json(
    "data/activity-data/part-00000-tid-730451297822678341-1dda7027-2071-4d73-a0e2-7fb6a91e1d1f-0-c000.json"
)
# Per-user row counts, restricted to records whose index is above 100
users = (
    json.filter("index > 100")
    .select("index", "user")
    .groupBy("user")
    .count()
)
users.show(5)

### Create and query a temporary view

In [None]:
# Register `users` as a temporary view named "users"; the view can be queried
# by name through spark.sql(), but only within the current session
users.createOrReplaceTempView("users")
(
    spark
    .sql("select * from users where count is not null and count > 9000 order by count desc")
    .show(5)
)

In [None]:
# Three equivalent spellings for reading the same JSON file

# (1) name the format first, then load()
df = (
    spark.read
    .format("json")
    .load("./data/flight-data/json/2015-summary.json")
)
df.printSchema()

# (2) load() with the format passed as a keyword argument
df2 = spark.read.load(
    path="./data/flight-data/json/2015-summary.json",
    format="json",
)
df2.printSchema()

# (3) the json() shortcut on the reader
df3 = spark.read.json(path="./data/flight-data/json/2015-summary.json")
df3.printSchema()

### Practice

In [None]:
# Ex 1
# Read the user table with no reader options set, then inspect schema and rows
df1 = spark.read.csv(path="data/tbl_user.csv")
df1.printSchema()
df1.show()

In [None]:
# Ex 2
# Read the purchase table, taking the column names from the header row.
df2 = spark.read.option("header", "true").csv("data/tbl_purchase.csv")
df2.show(5)

# Replace p_time with its calendar date and keep the other purchase columns.
# p_time is divided by 1000 before from_unixtime(), i.e. it is treated as epoch
# milliseconds.
df2_new = df2.select(
    # to_date() is called without a pattern: from_unixtime() yields a
    # 'yyyy-MM-dd HH:mm:ss' string, and Spark 3's strict datetime parser rejects
    # a 'yyyy-MM-dd' pattern that does not consume the time part. The
    # pattern-less form casts the string to a date on both Spark 2.x and 3.x.
    to_date(from_unixtime(col('p_time') / 1000)).alias("p_time"),
    "p_uid", "p_id", "p_name", "p_amount",
)
df2_new.show(5)