In [1]:
from pyspark.sql import SparkSession

# Build the SparkSession for this notebook (or reuse one that already exists).
spark = (
    SparkSession.builder
    .appName('sub-query1')
    # Pin the driver's advertised host and its bind address to the loopback
    # interface so the session runs purely on the local machine.
    .config("spark.driver.host", "127.0.0.1")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/09 18:07:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Sample rows for the demo: one (name, age) tuple per person.
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 35),
]
df = spark.createDataFrame(data, schema=["name", "age"])

# Expose the DataFrame to Spark SQL under the view name "people"
# (replaces any existing temporary view with the same name).
df.createOrReplaceTempView("people")


## CTE(공통 테이블 표현식) 

- WITH 구문으로 서브쿼리에 이름을 붙여 공통으로 정의
- 본 쿼리에서 이를 임시 테이블처럼 사용 

In [3]:
# Run the average-age aggregate on its own and print the single-row result.
spark.sql(
    "SELECT AVG(age) AS average_age FROM people"
).show()

[Stage 0:>                                                        (0 + 10) / 10]

+-----------+
|average_age|
+-----------+
|       30.0|
+-----------+



                                                                                

In [6]:
# Use a CTE (WITH clause) to give the average-age subquery a name.
# `avg_age` is defined as the aggregate AVG(age) over `people`, exposed as the
# column `average_age`; the outer query reads it through a scalar subquery in
# the WHERE clause and keeps only the names whose age exceeds that value.
result = spark.sql("""
    WITH avg_age AS (
        SELECT AVG(age) AS average_age FROM people
    )
    SELECT name FROM people WHERE age > (SELECT average_age FROM avg_age)
""")

# Show the names of the people older than the average age
result.show()

+-------+
|   name|
+-------+
|Charlie|
+-------+



## select 절에 서브쿼리

In [11]:
# Use a scalar subquery in the SELECT clause: the inner query computes the
# average age over all of `people`, and that value is attached to every output
# row as the column `avg_age`.
result3 = spark.sql("""
    SELECT name, age, (SELECT AVG(age) FROM people) AS avg_age
    FROM people
""")

# Show each person's name and age alongside the overall average
result3.show()

+-------+---+-------+
|   name|age|avg_age|
+-------+---+-------+
|  Alice| 25|   30.0|
|    Bob| 30|   30.0|
|Charlie| 35|   30.0|
+-------+---+-------+



## from 절에 서브쿼리 

In [10]:
# Use a subquery in the FROM clause (a derived table): the inner query keeps
# only the rows of `people` with age > 30 and is aliased as `subquery`; the
# outer query then selects name and age from that alias.
result2 = spark.sql("""
    SELECT subquery.name, subquery.age
    FROM (SELECT name, age FROM people WHERE age > 30) AS subquery
""")

# Show the rows that passed the inner filter
result2.show()

+-------+---+
|   name|age|
+-------+---+
|Charlie| 35|
+-------+---+



## where 절 서브쿼리 
- 서브쿼리의 결과가 하나의 값일 경우는 비교 연산자로 처리
- 서브쿼리의 결과가 여러 개의 값일 경우는 in 연산자로 처리 

In [8]:
# Compute the overall average age on its own and print it.
spark.sql(
    "SELECT AVG(age) FROM people"
).show()

+--------+
|avg(age)|
+--------+
|    30.0|
+--------+



In [7]:
# Filter with a scalar subquery in the WHERE clause: keep only the people
# whose age is greater than the average age of the whole table.
result1 = spark.sql(
    "SELECT name FROM people "
    "WHERE age > (SELECT AVG(age) FROM people)"
)

# Display the matching names
result1.show()

+-------+
|   name|
+-------+
|Charlie|
+-------+



In [None]:
# Stop the SparkSession once the notebook is finished; `spark` cannot run
# further queries after this call.
spark.stop()