In [0]:
file_path = "/databricks-datasets/nyctaxi/tables/nyctaxi_yellow"

# Load the dataset stored in Delta format at `file_path`.
taxi_df = spark.read.load(file_path, format="delta")

# Preview the first 7 rows; display() is used here instead of df.show().
display(taxi_df.limit(7))

In [0]:
from pyspark.sql.functions import avg, col

# Average fare per payment type, sorted with the highest average first.
taxi2_df = (
    taxi_df.groupBy("payment_type")
    .agg(avg("fare_amount").alias("Avg_Fare_Amount"))
    .orderBy(col("Avg_Fare_Amount").desc())
)

display(taxi2_df)

# Observation: the result of the query above contains invalid values in
# "payment_type". The data looks shifted, i.e. the schema appears to be broken. :(
# (Kept as comments rather than a bare string literal, which would be evaluated
# as the cell's last expression and echoed as cell output.)

In [0]:
# Let's try another dataset, a more explicit one.

# The source is a .json file, so it is read with the JSON reader; the CSV reader
# would treat the JSON text as delimited rows instead of parsing it.
# NOTE(review): multiLine=True is kept from the original call. For the JSON reader
# it means each file holds one JSON document spanning several lines; drop it if
# the file is newline-delimited JSON — TODO confirm against the file.
# NOTE(review): raw_df is not used by any cell visible here — confirm it is still needed.
raw_df = spark.read.json("/databricks-datasets/nyctaxi/sample/json/2015_T1.json", multiLine=True)

# Managed sample table that the following cells work with.
df = spark.read.table("samples.nyctaxi.trips")

# Preview the first 10 rows.
display(df.limit(10))

In [0]:
from pyspark.sql.functions import avg, col, round

# Add a new column, Distance_Group, holding trip_distance rounded to 0 decimal
# places, then compute the average fare (rounded to 2 decimals) per group.
# Usage: .withColumn("<name of the column to create or modify>", <Column object>)
# `round` here is the one imported from pyspark.sql.functions, not the Python builtin.
distance_analysis = (
    df.withColumn("Distance_Group", round(col("trip_distance"), 0))
    .groupBy("Distance_Group")
    .agg(round(avg("fare_amount"), 2).alias("Avg_Fare"))
    .orderBy("Distance_Group")
)

display(distance_analysis)

In [0]:
from pyspark.sql.functions import when

# Try out conditional expressions: label each trip by its distance.
# Conditions are checked in order, so "Medium" covers distances above 2 and up
# to 10; any row matching neither condition is labelled "Long".
trip_type_expr = (
    when(col("trip_distance") <= 2, "Short")
    .when(col("trip_distance") <= 10, "Medium")
    .otherwise("Long")
)
df_with_type = df.withColumn("Trip_Type", trip_type_expr)

display(df_with_type.limit(10))

In [0]:
# Try the same thing with SQL.
# Register `df` as a temporary view so the query can refer to it by name.
df.createOrReplaceTempView("trips_table")

# Average fare (rounded to 2 decimals) per Trip_Type bucket, highest first.
# GROUP BY 1 groups by the first SELECT expression, i.e. the CASE result.
trip_type_query = """
    SELECT 
        CASE 
            WHEN trip_distance <= 2 THEN 'Short'
            WHEN trip_distance <= 10 THEN 'Medium'
            ELSE 'Long'
        END AS Trip_Type,
        ROUND(AVG(fare_amount), 2) AS Avg_Fare
    FROM trips_table
    GROUP BY 1
    ORDER BY Avg_Fare DESC
"""

sql_result = spark.sql(trip_type_query)

display(sql_result)