In [84]:
# Stop the session held in `spark` before a new SparkContext is created below.
# NOTE(review): assumes `spark` was defined by an earlier cell; on a fresh kernel
# this line would raise NameError — confirm or guard it.
spark.stop()

In [85]:
from pyspark import SparkConf, SparkContext

# Configure a local-mode application named "spark_sql_basic2" and start its SparkContext.
conf = SparkConf().setAppName("spark_sql_basic2").setMaster("local")
sc = SparkContext(conf=conf)

In [86]:
# RDD만을 이용한 데이터 추출

In [87]:

# Pair RDD of movies: key = movie id, value = (title, studio).
movie_pairs = [
    (1, ("어벤져스", "마블")),
    (2, ("슈퍼맨", "DC")),
    (3, ("배트맨", "DC")),
    (4, ("겨울왕국", "디즈니")),
    (5, ("아이언맨", "마블")),
]
movies_rdd = sc.parallelize(movie_pairs)

# Pair RDD of attendance: key = movie id, value = (attendance, country code).
attendance_pairs = [
    (1, (13934592, "KR")),
    (2, (2182227, "KR")),
    (3, (4226242, "KR")),
    (4, (10303058, "KR")),
    (5, (4300365, "KR")),
]
attendances_rdd = sc.parallelize(attendance_pairs)

In [88]:
# 마블 영화 중 관객 수가 500만 이상인 영화를 가져오기

In [89]:
# CASE 1: join first, filter afterwards.
# After the join each record is (id, ((title, studio), (attendance, country))).
movie_att = movies_rdd.join(attendances_rdd)
joined_then_filtered = movie_att.filter(
    lambda record: record[1][0][1] == "마블" and record[1][1][0] > 5000000
)
joined_then_filtered.collect()

[(1, (('어벤져스', '마블'), (13934592, 'KR')))]

In [90]:
# CASE 2: filter each RDD first, then join only the surviving records.
filtered_att = attendances_rdd.filter(lambda att: att[1][0] > 5000000)
filtered_movies = movies_rdd.filter(lambda movie: movie[1][1] == '마블')

filtered_then_joined = filtered_movies.join(filtered_att)
filtered_then_joined.collect()

[(1, (('어벤져스', '마블'), (13934592, 'KR')))]

In [91]:
# Spark SQL 사용해 보기

In [92]:
from pyspark.sql import SparkSession
# Obtain a SparkSession for the DataFrame / SQL examples.
# NOTE(review): a SparkContext (`sc`) is already running at this point, so getOrCreate()
# presumably builds the session on top of it rather than starting a new one — confirm.
spark = SparkSession.builder.master("local").appName("spark-sql").getOrCreate()

In [93]:
# Movie rows with extra columns: (id, name, company, year, month, day).
movies = [
    (1, "어벤져스", "마블", 2012, 4, 26),
    (2, "슈퍼맨", "DC", 2013, 6, 13),
    (3, "배트맨", "DC", 2008, 8, 6),
    (4, "겨울왕국", "디즈니", 2014, 1, 16),
    (5, "아이언맨", "마블", 2008, 4, 30)
]

In [94]:
# The schema must be known up front; here it is given as a list of column names only.
movie_schema = ["id", "name", "company", "year", "month", "day"]

In [95]:
# 2. 데이터 프레임 만들기

In [96]:
# Build the movies DataFrame from the row tuples and the column-name list.
df = spark.createDataFrame(data=movies, schema=movie_schema)

In [97]:
# Project only the "name" column and print it.
df.select("name").show()

+--------+
|    name|
+--------+
|어벤져스|
|  슈퍼맨|
|  배트맨|
|겨울왕국|
|아이언맨|
+--------+



In [98]:
# Keep the rows whose year is 2010 or later and print them.
df.filter(df.year >= 2010).show()

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  1|어벤져스|   마블|2012|    4| 26|
|  2|  슈퍼맨|     DC|2013|    6| 13|
|  4|겨울왕국| 디즈니|2014|    1| 16|
+---+--------+-------+----+-----+---+



In [99]:
# Register the DataFrame as the temporary view "movies" so it can be queried with SQL.
df.createOrReplaceTempView("movies")

In [100]:
# Fetch only the movie names.

query = """

SELECT name
  FROM movies

"""
spark.sql(query).show()

+--------+
|    name|
+--------+
|어벤져스|
|  슈퍼맨|
|  배트맨|
|겨울왕국|
|아이언맨|
+--------+



In [101]:
# Print the full movies DataFrame for reference.
df.show()

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  1|어벤져스|   마블|2012|    4| 26|
|  2|  슈퍼맨|     DC|2013|    6| 13|
|  3|  배트맨|     DC|2008|    8|  6|
|  4|겨울왕국| 디즈니|2014|    1| 16|
|  5|아이언맨|   마블|2008|    4| 30|
+---+--------+-------+----+-----+---+



In [102]:
# Movies released after 2010 (the predicate is year > 2010, so 2010 itself is excluded).

query = """
SELECT name, year
FROM movies
WHERE year > 2010
"""
spark.sql(query).show()

+--------+----+
|    name|year|
+--------+----+
|어벤져스|2012|
|  슈퍼맨|2013|
|겨울왕국|2014|
+--------+----+



In [103]:
# Name and company of the movies released in 2012 or earlier.
query = """
SELECT name, company
FROM movies
WHERE year <= 2012
"""
spark.sql(query).show()

+--------+-------+
|    name|company|
+--------+-------+
|어벤져스|   마블|
|  배트맨|     DC|
|아이언맨|   마블|
+--------+-------+



In [104]:
# LIKE finds string values that contain a given word or phrase.
# The % symbol is used as a wildcard when matching the text.
# Select every column of the movies whose title ends with "맨".
query = """
SELECT *
FROM movies
WHERE name LIKE '%맨'
"""
spark.sql(query).show()

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  2|  슈퍼맨|     DC|2013|    6| 13|
|  3|  배트맨|     DC|2008|    8|  6|
|  5|아이언맨|   마블|2008|    4| 30|
+---+--------+-------+----+-----+---+



In [105]:

# BETWEEN 특정 데이터와 데이터 사이를 조회

# 개봉 월이 4 ~ 8월 사이. 4 <= 개봉월 <= 8


In [106]:
# Movies whose release month lies between 4 and 8 (both 4 and 8 appear in the result).
query = """
SELECT *
FROM movies
WHERE month BETWEEN 4 AND 8
"""
spark.sql(query).show()

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  1|어벤져스|   마블|2012|    4| 26|
|  2|  슈퍼맨|     DC|2013|    6| 13|
|  3|  배트맨|     DC|2008|    8|  6|
|  5|아이언맨|   마블|2008|    4| 30|
+---+--------+-------+----+-----+---+



In [None]:
# Join 구현하기

In [107]:

# Attendance rows as (id, att, theater_country); att is written as a float literal.
attendances = [
    (1, 13934592., "KR"),
    (2, 2182227.,"KR"),
    (3, 4226242., "KR"),
    (4, 10303058., "KR"),
    (5, 4300365., "KR")
]

In [108]:
# 직접 스키마 지정해 보기
from pyspark.sql.types import StringType, FloatType\
    , IntegerType\
    , StructType, StructField

In [109]:
# Explicit schema for the attendance rows.
# NOTE(review): FloatType is single precision, so head counts above 2**24 would presumably
# not be stored exactly — consider DoubleType or an integer type; confirm before changing.
att_schema = StructType([ # StructType: the collection of all column definitions
    StructField("id", IntegerType(), True), # StructField: a single column
    StructField("att", FloatType(), True),
    StructField("theater_country", StringType(), True)
])

In [110]:

# Build the attendance DataFrame using the explicit schema.
att_df = spark.createDataFrame(
    data=attendances,
    schema=att_schema
)

# Inspect the resulting (column name, type) pairs.
att_df.dtypes

[('id', 'int'), ('att', 'float'), ('theater_country', 'string')]

In [111]:
# Register the attendance DataFrame as the temporary view "att".
att_df.createOrReplaceTempView("att")

In [112]:
# Print the attendance DataFrame.
att_df.show()

+---+-----------+---------------+
| id|        att|theater_country|
+---+-----------+---------------+
|  1|1.3934592E7|             KR|
|  2|  2182227.0|             KR|
|  3|  4226242.0|             KR|
|  4|1.0303058E7|             KR|
|  5|  4300365.0|             KR|
+---+-----------+---------------+



In [114]:
# Join df and att_df (through their temp views "movies" and "att") on id.
query = """
SELECT 
    m.id, m.name, m.company, a.att
FROM movies m
JOIN att a
ON m.id = a.id
"""

spark.sql(query).show()

+---+--------+-------+-----------+
| id|    name|company|        att|
+---+--------+-------+-----------+
|  1|어벤져스|   마블|1.3934592E7|
|  2|  슈퍼맨|     DC|  2182227.0|
|  3|  배트맨|     DC|  4226242.0|
|  4|겨울왕국| 디즈니|1.0303058E7|
|  5|아이언맨|   마블|  4300365.0|
+---+--------+-------+-----------+



In [None]:
# 데이터 프레임 API

In [20]:
# select: fetch every column and collect the rows as a list of Row objects.
df.select("*").collect()

[Row(id=1, name='어벤져스', company='마블', year=2012, month=4, day=26),
 Row(id=2, name='슈퍼맨', company='DC', year=2013, month=6, day=13),
 Row(id=3, name='배트맨', company='DC', year=2008, month=8, day=6),
 Row(id=4, name='겨울왕국', company='디즈니', year=2014, month=1, day=16),
 Row(id=5, name='아이언맨', company='마블', year=2008, month=4, day=30)]

In [21]:
# Select two columns by name and collect the rows.
df.select("name", "company").collect()

[Row(name='어벤져스', company='마블'),
 Row(name='슈퍼맨', company='DC'),
 Row(name='배트맨', company='DC'),
 Row(name='겨울왕국', company='디즈니'),
 Row(name='아이언맨', company='마블')]

In [22]:
# Columns can be expressions: subtract 2000 from year and alias the result back to "year".
df.select(df.name, (df.year-2000).alias("year")).show()

+--------+----+
|    name|year|
+--------+----+
|어벤져스|  12|
|  슈퍼맨|  13|
|  배트맨|   8|
|겨울왕국|  14|
|아이언맨|   8|
+--------+----+



In [23]:
# agg: short for "aggregate"; combines the (grouped) data into a single value.
# Here it counts the id column over the whole DataFrame.
df.agg({"id": "count"}).collect()

[Row(count(id)=5)]

In [24]:
from pyspark.sql import functions as F
# Same idea using the functions module: minimum of the year column.
df.agg(F.min(df.year)).collect()

[Row(min(year)=2008)]

In [25]:
# groupBy() with no keys followed by avg(): averages over the whole DataFrame
# (the result contains id, year, month and day).
df.groupBy().avg().collect()

[Row(avg(id)=3.0, avg(year)=2011.0, avg(month)=4.6, avg(day)=18.2)]

In [26]:
# Average release month per company.
df.groupBy('company').agg({"month": "mean"}).collect()

[Row(company='디즈니', avg(month)=1.0),
 Row(company='마블', avg(month)=4.0),
 Row(company='DC', avg(month)=7.0)]

In [None]:
# 회사 별 월 별 영화 개수 정보


In [27]:
# join: combine with another DataFrame on a user-specified column (here "id").
df.join(att_df, 'id').select(df.name, att_df.att).show()

+--------+-----------+
|    name|        att|
+--------+-----------+
|어벤져스|1.3934592E7|
|  슈퍼맨|  2182227.0|
|  배트맨|  4226242.0|
|겨울왕국|1.0303058E7|
|아이언맨|  4300365.0|
+--------+-----------+



In [28]:
# Combine where, orderBy and select: keep the 마블 rows, sort them by id,
# then project the three columns of interest.
marvel_df = (
    df.where("company=='마블'")
    .orderBy("id")
    .select("name", "company", "year")
)
marvel_df.collect()

[Row(name='어벤져스', company='마블', year=2012),
 Row(name='아이언맨', company='마블', year=2008)]

In [127]:
# Shut down the session and the context used by the examples above.
spark.stop()
sc.stop()

# fhvhv_tripdata

In [None]:
# SQL 최적화

In [144]:
from pyspark.sql import SparkSession

# Start a session named "trip_count_sql" for the trip-data section.
spark = SparkSession.builder.appName("trip_count_sql").getOrCreate()

In [145]:
# Relative path of the trip CSV that is read below.
trip_file = "learning_spark_data/fhvhv_tripdata_2020-03.csv"

In [146]:
# inferSchema=True: have Spark infer the schema automatically.
data = spark.read.csv(trip_file, inferSchema=True, header=True)

In [147]:
# Register the trip DataFrame as the temporary view "mobility_data".
data.createOrReplaceTempView("mobility_data")

In [149]:
# Peek at five rows of the view.
query = """
select *
from mobility_data
limit 5
"""
spark.sql(query).show()

+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|           HV0005|              B02510|2020-03-01 00:03:40|2020-03-01 00:23:39|          81|         159|   NULL|
|           HV0005|              B02510|2020-03-01 00:28:05|2020-03-01 00:38:57|         168|         119|   NULL|
|           HV0003|              B02764|2020-03-01 00:03:07|2020-03-01 00:15:04|         137|         209|      1|
|           HV0003|              B02764|2020-03-01 00:18:42|2020-03-01 00:38:42|         209|          80|   NULL|
|           HV0003|              B02764|2020-03-01 00:44:24|2020-03-01 00:58:44|         256|         226|   NULL|
+-----------------+--------------------+-------------------+-------------------+

# 스파크 SQL을 사용하는 이유

In [150]:
# Trips per pickup date: the part of pickup_datetime before the first space is used
# as the date, and rows are counted per date. `query` is reused by the explain() cell.
query = """

select split(pickup_datetime, ' ')[0] as pickup_date, count(*) as trips
from mobility_data

group by pickup_date
"""

spark.sql(query).show()

+-----------+------+
|pickup_date| trips|
+-----------+------+
| 2020-03-03|697880|
| 2020-03-02|648986|
| 2020-03-01|784246|
| 2020-03-06|872012|
| 2020-03-05|731165|
| 2020-03-04|707879|
| 2020-03-09|628940|
| 2020-03-08|731222|
| 2020-03-07|886071|
| 2020-03-10|626474|
| 2020-03-12|643257|
| 2020-03-11|628601|
| 2020-03-16|391518|
| 2020-03-13|660914|
| 2020-03-15|448125|
| 2020-03-14|569397|
| 2020-03-26|141607|
| 2020-03-25|141088|
| 2020-03-20|261900|
| 2020-03-24|141686|
+-----------+------+
only showing top 20 rows



In [151]:
# Look at the execution plan of the query defined above.
spark.sql(query).explain(True)

== Parsed Logical Plan ==
'Aggregate ['pickup_date], ['split('pickup_datetime,  )[0] AS pickup_date#1154, 'count(1) AS trips#1155]
+- 'UnresolvedRelation [mobility_data], [], false

== Analyzed Logical Plan ==
pickup_date: string, trips: bigint
Aggregate [split(cast(pickup_datetime#1038 as string),  , -1)[0]], [split(cast(pickup_datetime#1038 as string),  , -1)[0] AS pickup_date#1154, count(1) AS trips#1155L]
+- SubqueryAlias mobility_data
   +- View (`mobility_data`, [hvfhs_license_num#1036,dispatching_base_num#1037,pickup_datetime#1038,dropoff_datetime#1039,PULocationID#1040,DOLocationID#1041,SR_Flag#1042])
      +- Relation [hvfhs_license_num#1036,dispatching_base_num#1037,pickup_datetime#1038,dropoff_datetime#1039,PULocationID#1040,DOLocationID#1041,SR_Flag#1042] csv

== Optimized Logical Plan ==
Aggregate [_groupingexpression#1159], [_groupingexpression#1159 AS pickup_date#1154, count(1) AS trips#1155L]
+- Project [split(cast(pickup_datetime#1038 as string),  , -1)[0] AS _grouping

In [152]:
# Second query: the same aggregation, but pickup_date is derived in a subquery first.
spark.sql("""select 
                pickup_date, 
                count(*) as trips
             from ( select
                          split(pickup_datetime, ' ')[0] as pickup_date
                          from mobility_data )
             group by pickup_date""").explain(True)

== Parsed Logical Plan ==
'Aggregate ['pickup_date], ['pickup_date, 'count(1) AS trips#1163]
+- 'SubqueryAlias __auto_generated_subquery_name
   +- 'Project ['split('pickup_datetime,  )[0] AS pickup_date#1162]
      +- 'UnresolvedRelation [mobility_data], [], false

== Analyzed Logical Plan ==
pickup_date: string, trips: bigint
Aggregate [pickup_date#1162], [pickup_date#1162, count(1) AS trips#1163L]
+- SubqueryAlias __auto_generated_subquery_name
   +- Project [split(cast(pickup_datetime#1038 as string),  , -1)[0] AS pickup_date#1162]
      +- SubqueryAlias mobility_data
         +- View (`mobility_data`, [hvfhs_license_num#1036,dispatching_base_num#1037,pickup_datetime#1038,dropoff_datetime#1039,PULocationID#1040,DOLocationID#1041,SR_Flag#1042])
            +- Relation [hvfhs_license_num#1036,dispatching_base_num#1037,pickup_datetime#1038,dropoff_datetime#1039,PULocationID#1040,DOLocationID#1041,SR_Flag#1042] csv

== Optimized Logical Plan ==
Aggregate [pickup_date#1162], [pickup_dat

| 비교 항목         | 첫 번째 방식        | 두 번째 방식                        |
| ------------- | -------------- | ------------------------------ |
| 코드 길이         | 짧고 간단함         | 구조적으로 명확함                      |
| 중첩 쿼리         | 없음             | 있음 (가독성 및 재사용 ↑)               |
| Spark Plan 관점 | 내부적으로는 거의 동일   | 동일 (결과와 성능 차이 거의 없음)           |
| 추천 상황         | 빠른 실습, 간단한 분석용 | 복잡한 파생 컬럼 처리 시, 명확한 로직 표현 필요 시 |


# Spark를 사용하는 이유
| 이유                         | 설명                                           |
| -------------------------- | -------------------------------------------- |
| **쿼리 작성 방식이 달라도 성능은 동일** | Catalyst가 의미적으로 동일한 쿼리를 같은 방식으로 실행           |
| **최적화는 Spark가 자동으로**     | 사용자는 로직에 집중, Spark는 내부에서 최적 실행 계획 생성         |
| **생산성과 성능을 모두 보장**       | SQL, DataFrame 등 다양한 인터페이스를 통해 효율적 데이터 분석 가능 |


# 결론
위 두 방식의 비교는 Spark Catalyst Optimizer가 의미적으로 같은 쿼리를 동일한 방식으로 최적화해서 실행한다는 강력한 증거이다.
따라서 Spark를 쓰면 "내가 쿼리를 복잡하게 최적화하지 않아도 Spark가 내부적으로 해준다"는 것을 확인할 수 있다.

In [156]:
# End the session used for the plan-comparison section.
spark.stop()

In [None]:
# 연습 eda

In [162]:
# Input files for the practice EDA: the trip records and the zone lookup table.
trip_file = "learning_spark_data/fhvhv_tripdata_2020-03.csv"
zone_file = "learning_spark_data/taxi+_zone_lookup.csv"

In [158]:
from pyspark.sql import SparkSession

# Start (or reuse) a session named "trip_count_sql" for the practice EDA.
spark = SparkSession.builder.appName("trip_count_sql").getOrCreate()

In [163]:
# Build the trip DataFrame and the zone DataFrame.
# The paths come from `trip_file` / `zone_file` (defined in the cell above) instead of
# repeated string literals, so the input location only has to be changed in one place.
# Both files are read as CSV with a header row and with schema inference enabled.
trip_data = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(trip_file)
)
zone_data = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(zone_file)
)

In [164]:
# Show the inferred column types of the trip data.
trip_data.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: integer (nullable = true)



In [165]:
# Show the inferred column types of the zone lookup data.
zone_data.printSchema()

root
 |-- LocationID: integer (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)



In [166]:
# Preview the first five trip rows.
trip_data.show(5)

+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|           HV0005|              B02510|2020-03-01 00:03:40|2020-03-01 00:23:39|          81|         159|   NULL|
|           HV0005|              B02510|2020-03-01 00:28:05|2020-03-01 00:38:57|         168|         119|   NULL|
|           HV0003|              B02764|2020-03-01 00:03:07|2020-03-01 00:15:04|         137|         209|      1|
|           HV0003|              B02764|2020-03-01 00:18:42|2020-03-01 00:38:42|         209|          80|   NULL|
|           HV0003|              B02764|2020-03-01 00:44:24|2020-03-01 00:58:44|         256|         226|   NULL|
+-----------------+--------------------+-------------------+-------------------+

| 컬럼명                    | 한글 설명               | 예시                  |
| ---------------------- | ------------------- | ------------------- |
| `hvfhs_license_num`    | 운송사업자 코드            | HV0003, HV0005 등    |
| `dispatching_base_num` | 배차된 기지 번호           | B02764 등            |
| `pickup_datetime`      | 승차 시각               | 2020-03-01 00:03:40 |
| `dropoff_datetime`     | 하차 시각               | 2020-03-01 00:23:39 |
| `PULocationID`         | 승차 지점의 Location ID  | 81, 168 등           |
| `DOLocationID`         | 하차 지점의 Location ID  | 159, 119 등          |
| `SR_Flag`              | 공유 승차 여부 (값이 없거나 1) | NULL 또는 1           |


In [167]:
# Preview the first five zone rows.
zone_data.show(5)

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
+----------+-------------+--------------------+------------+
only showing top 5 rows



| 컬럼명            | 한글 설명             | 예시                       |
| -------------- | ----------------- | ------------------------ |
| `LocationID`   | 지역 고유 ID          | 1, 2, 3 등                |
| `Borough`      | 자치구(행정구역)         | Manhattan, Bronx 등       |
| `Zone`         | 세부 지역명            | Alphabet City 등          |
| `service_zone` | 서비스 존 (택시 서비스 구분) | Yellow Zone, Boro Zone 등 |


In [168]:
# Register both DataFrames as temp views so the queries below can reference them by name.
trip_data.createOrReplaceTempView("trip_data")
zone_data.createOrReplaceTempView("zone_data")

In [None]:
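# Analysis tasks:
# - Count trips per pickup location (PULocationID)
# - Count trips per drop-off location (DOLocationID)
# - For carrier HV0003 (hvfhs_license_num), count trips per pickup location,
#   sorted by trip count in descending order
# - Compare trip counts across carriers
# - Trip counts per pickup Borough
# - Pickup / drop-off counts per service zone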

In [169]:
# Count trips per pickup location (PULocationID), largest count first.
result = spark.sql("""
    SELECT PULocationID, COUNT(*) AS trip_count
    FROM trip_data
    GROUP BY PULocationID
    ORDER BY trip_count DESC
""")

result.show()

+------------+----------+
|PULocationID|trip_count|
+------------+----------+
|          61|    222094|
|          79|    183821|
|          76|    168311|
|         132|    163734|
|         138|    155876|
|          37|    155388|
|          42|    143389|
|         231|    135712|
|         234|    132693|
|         161|    128751|
|         244|    126621|
|          17|    126228|
|           7|    125458|
|          89|    124289|
|         225|    124001|
|         170|    122731|
|         230|    121628|
|         181|    121576|
|          48|    120732|
|          39|    116669|
+------------+----------+
only showing top 20 rows



In [170]:
# Count trips per drop-off location (DOLocationID), largest count first.
result = spark.sql("""
    SELECT DOLocationID, COUNT(*) AS trip_count
    FROM trip_data
    GROUP BY DOLocationID
    ORDER BY trip_count DESC
""")

result.show()


+------------+----------+
|DOLocationID|trip_count|
+------------+----------+
|         265|    387758|
|          61|    224476|
|         132|    216213|
|         138|    194943|
|          76|    172649|
|          37|    158832|
|          79|    147732|
|          42|    136729|
|         225|    128945|
|          17|    128152|
|         244|    127797|
|          89|    126958|
|           7|    124255|
|         161|    121771|
|         234|    118345|
|         181|    116900|
|         231|    116345|
|          39|    113966|
|         188|    113280|
|         170|    109306|
+------------+----------+
only showing top 20 rows



In [174]:
# For carrier HV0003 (hvfhs_license_num), count trips per pickup location
# and sort by trip count in descending order.

result = spark.sql("""
    SELECT PULocationID, COUNT(*) AS trip_count
    FROM trip_data
    WHERE hvfhs_license_num = 'HV0003'
    GROUP BY PULocationID
    ORDER BY trip_count DESC
""")

result.show()

+------------+----------+
|PULocationID|trip_count|
+------------+----------+
|          61|    163091|
|          76|    134198|
|         132|    114179|
|          79|    112017|
|          37|    110150|
|          42|    108070|
|         138|    104119|
|         244|     97324|
|          89|     95724|
|          39|     94484|
|         231|     94155|
|           7|     92676|
|          17|     90352|
|         161|     90261|
|         225|     88749|
|         234|     88372|
|         230|     86870|
|         188|     84347|
|          35|     82764|
|         168|     82396|
+------------+----------+
only showing top 20 rows



In [176]:
# Compare trip counts across carriers (hvfhs_license_num), largest count first.
result = spark.sql("""
    SELECT hvfhs_license_num, COUNT(*) AS trip_count
    FROM trip_data
    GROUP BY hvfhs_license_num
    ORDER BY trip_count DESC
""")
result.show()

+-----------------+----------+
|hvfhs_license_num|trip_count|
+-----------------+----------+
|           HV0003|   9836763|
|           HV0005|   3219535|
|           HV0004|    336606|
+-----------------+----------+



In [None]:
# 조인

In [178]:
# Attach Borough and Zone to every trip twice: z1 is matched on the pickup location,
# z2 on the drop-off location. LEFT JOINs are used on both sides.
joined_result = spark.sql("""
    SELECT 
        t.*, 
        z1.Borough AS PU_Borough, 
        z1.Zone AS PU_Zone, 
        z2.Borough AS DO_Borough, 
        z2.Zone AS DO_Zone
    FROM trip_data t
    LEFT JOIN zone_data z1 
        ON t.PULocationID = z1.LocationID
    LEFT JOIN zone_data z2 
        ON t.DOLocationID = z2.LocationID
""")
# truncate=False prints full cell values instead of shortening them.
joined_result.show(5, truncate=False)

+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+----------+-------------------------+----------+-----------------+
|hvfhs_license_num|dispatching_base_num|pickup_datetime    |dropoff_datetime   |PULocationID|DOLocationID|SR_Flag|PU_Borough|PU_Zone                  |DO_Borough|DO_Zone          |
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+----------+-------------------------+----------+-----------------+
|HV0005           |B02510              |2020-03-01 00:03:40|2020-03-01 00:23:39|81          |159         |NULL   |Bronx     |Eastchester              |Bronx     |Melrose South    |
|HV0005           |B02510              |2020-03-01 00:28:05|2020-03-01 00:38:57|168         |119         |NULL   |Bronx     |Mott Haven/Port Morris   |Bronx     |Highbridge       |
|HV0003           |B02764              |2020-03-01 00:03:07|2020-03-01 00:15:04|137         |20

In [181]:
# Register the joined DataFrame as the temp view "join_data" for the queries below.
joined_result.createOrReplaceTempView("join_data")

In [182]:
# Trip counts per pickup Borough, largest count first.
result = spark.sql("""
    SELECT PU_Borough, COUNT(*) AS PU_Borough_count
    FROM join_data
    GROUP BY PU_Borough
    ORDER BY PU_Borough_count DESC
""")
result.show()

+-------------+----------------+
|   PU_Borough|PU_Borough_count|
+-------------+----------------+
|    Manhattan|         4953140|
|     Brooklyn|         3735764|
|       Queens|         2437383|
|        Bronx|         2086592|
|Staten Island|          178818|
|      Unknown|             845|
|          EWR|             362|
+-------------+----------------+



In [183]:
# Pickup counts per zone, largest count first.
# NOTE(review): the task was worded as counts per *service zone*, but this query groups by
# PU_Zone (the Zone name); join_data does not select service_zone — confirm which is intended.

result = spark.sql("""
    SELECT PU_Zone, COUNT(*) AS PU_Zone_count
    FROM join_data
    GROUP BY PU_Zone
    ORDER BY PU_Zone_count DESC
""")
result.show()

+--------------------+-------------+
|             PU_Zone|PU_Zone_count|
+--------------------+-------------+
| Crown Heights North|       222094|
|        East Village|       183821|
|       East New York|       168311|
|         JFK Airport|       163734|
|   LaGuardia Airport|       155876|
|      Bushwick South|       155388|
|Central Harlem North|       143389|
|TriBeCa/Civic Center|       135712|
|            Union Sq|       132693|
|      Midtown Center|       128751|
|Washington Height...|       126621|
|             Bedford|       126228|
|             Astoria|       125458|
|Flatbush/Ditmas Park|       124289|
|  Stuyvesant Heights|       124001|
|         Murray Hill|       122731|
|Times Sq/Theatre ...|       121628|
|          Park Slope|       121576|
|        Clinton East|       120732|
|            Canarsie|       116669|
+--------------------+-------------+
only showing top 20 rows



In [184]:
# Drop-off counts per zone, largest count first.
# NOTE(review): the task was worded as counts per *service zone*, but this query groups by
# DO_Zone (the Zone name); join_data does not select service_zone — confirm which is intended.
result = spark.sql("""
    SELECT DO_Zone, COUNT(*) AS DO_Zone_count
    FROM join_data
    GROUP BY DO_Zone
    ORDER BY DO_Zone_count DESC
""")
result.show()

+--------------------+-------------+
|             DO_Zone|DO_Zone_count|
+--------------------+-------------+
|                  NA|       387758|
| Crown Heights North|       224476|
|         JFK Airport|       216213|
|   LaGuardia Airport|       194943|
|       East New York|       172649|
|      Bushwick South|       158832|
|        East Village|       147732|
|Central Harlem North|       136729|
|  Stuyvesant Heights|       128945|
|             Bedford|       128152|
|Washington Height...|       127797|
|Flatbush/Ditmas Park|       126958|
|             Astoria|       124255|
|      Midtown Center|       121771|
|            Union Sq|       118345|
|          Park Slope|       116900|
|TriBeCa/Civic Center|       116345|
|            Canarsie|       113966|
|Prospect-Lefferts...|       113280|
|         Murray Hill|       109306|
+--------------------+-------------+
only showing top 20 rows



In [185]:
# End the session used for the practice EDA.
spark.stop()