In [1]:
import findspark

In [2]:
# Initialise findspark first; pyspark itself is imported in a later cell.
findspark.init()

In [3]:
# Display the pyspark installation directory that findspark resolved.
findspark.find()

'/Users/a06411/opt/anaconda3/envs/fluentPython/lib/python3.11/site-packages/pyspark'

In [4]:
import sys

from pyspark.sql import SparkSession

In [5]:
# Build (or reuse) the Spark session step by step; the driver host and the
# bind address are both pinned to the loopback interface.
session_builder = SparkSession.builder.appName('ml-bank')
session_builder = session_builder.config("spark.driver.host", "127.0.0.1")
session_builder = session_builder.config("spark.driver.bindAddress", "127.0.0.1")
spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/07 16:31:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Relative path to the M&M dataset CSV file.
mnm_file = "../data/mnm_dataset.csv"

In [7]:
# Load the CSV into a Spark DataFrame: the first line is treated as the
# header and column types are inferred from the data.
csv_reader = spark.read.format("csv").options(header="true", inferSchema="true")
mnm_df = csv_reader.load(mnm_file)

## 데이터프레임 스키마 처리

In [8]:
# Print the column names and the types inferred while reading the CSV.
mnm_df.printSchema()

root
 |-- State: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Count: integer (nullable = true)



## 데이터프레임의 정보확인 

In [6]:
# Number of rows in the DataFrame.
mnm_df.count()

99999

In [6]:
# Preview the first 5 rows without truncating cell values.
mnm_df.show(n=5, truncate=False)

+-----+------+-----+
|State|Color |Count|
+-----+------+-----+
|TX   |Red   |20   |
|NV   |Blue  |66   |
|CO   |Blue  |79   |
|OR   |Blue  |71   |
|WA   |Yellow|93   |
+-----+------+-----+
only showing top 5 rows



## 하이브 테이블과 뷰 생성하기

In [11]:
# Register the DataFrame as a temporary view named "mnmtable" so it can be
# referenced from spark.sql queries.
mnm_df.createOrReplaceTempView("mnmtable")

In [10]:
# Persist the DataFrame as a table named "my_mnmtable"; mode("overwrite")
# replaces an existing table of the same name.
# NOTE(review): the session in this notebook is built without
# enableHiveSupport(), so this is presumably stored through Spark's default
# catalog/warehouse rather than a real Hive metastore -- confirm.
table_name = "my_mnmtable"
mnm_df.write.mode("overwrite").saveAsTable(table_name)

                                                                                

## 하이브 내의 메타 정보 확인

- 기본 테이블은 default 내에 저장된다
- 로컬 환경이므로 데이터는 노트북을 실행한 현재 디렉토리 아래의 spark-warehouse 디렉토리에 보관된다

In [12]:
# List the databases (namespaces) known to the session catalog.
spark.sql("Show databases").show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [13]:
# List the views; the temporary view "mnmtable" is expected to appear here.
spark.sql("Show views").show()

+---------+--------+-----------+
|namespace|viewName|isTemporary|
+---------+--------+-----------+
|         |mnmtable|       true|
+---------+--------+-----------+



In [14]:
# List tables; the output includes both the saved table and the temporary view.
spark.sql("Show tables").show()

+---------+-----------+-----------+
|namespace|  tableName|isTemporary|
+---------+-----------+-----------+
|  default|my_mnmtable|      false|
|         |   mnmtable|       true|
+---------+-----------+-----------+



## 1.  칼럼 선택하기 

## 1-1 모든 칼럼 

In [16]:
# SQL: every column of the temporary view, limited to 20 rows.
spark.sql("SELECT * FROM mnmtable limit 20").show()

+-----+------+-----+
|State| Color|Count|
+-----+------+-----+
|   TX|   Red|   20|
|   NV|  Blue|   66|
|   CO|  Blue|   79|
|   OR|  Blue|   71|
|   WA|Yellow|   93|
|   WY|  Blue|   16|
|   CA|Yellow|   53|
|   WA| Green|   60|
|   OR| Green|   71|
|   TX| Green|   68|
|   NV| Green|   59|
|   AZ| Brown|   95|
|   WA|Yellow|   20|
|   AZ|  Blue|   75|
|   OR| Brown|   72|
|   NV|   Red|   98|
|   WY|Orange|   45|
|   CO|  Blue|   52|
|   TX| Brown|   94|
|   CO|   Red|   82|
+-----+------+-----+



In [9]:
# DataFrame API counterpart of SELECT *; show() prints only the top 20 rows here.
mnm_df.select("*").show()

+-----+------+-----+
|State| Color|Count|
+-----+------+-----+
|   TX|   Red|   20|
|   NV|  Blue|   66|
|   CO|  Blue|   79|
|   OR|  Blue|   71|
|   WA|Yellow|   93|
|   WY|  Blue|   16|
|   CA|Yellow|   53|
|   WA| Green|   60|
|   OR| Green|   71|
|   TX| Green|   68|
|   NV| Green|   59|
|   AZ| Brown|   95|
|   WA|Yellow|   20|
|   AZ|  Blue|   75|
|   OR| Brown|   72|
|   NV|   Red|   98|
|   WY|Orange|   45|
|   CO|  Blue|   52|
|   TX| Brown|   94|
|   CO|   Red|   82|
+-----+------+-----+
only showing top 20 rows



## 1-2 모든 행의 개수 확인

In [18]:
# SQL: total row count of the view, aliased as sum_count.
spark.sql("SELECT count(*) as sum_count FROM mnmtable").show()

+---------+
|sum_count|
+---------+
|    99999|
+---------+



In [13]:
# DataFrame API counterpart: the same count written as a SQL expression string.
mnm_df.selectExpr("count(*) as sum_count").show()

+---------+
|sum_count|
+---------+
|    99999|
+---------+



## 테이블의 스키마 확인하기

In [19]:
# SQL: describe the columns and data types of the temporary view.
spark.sql("DESC Table mnmtable").show()

+--------+---------+-------+
|col_name|data_type|comment|
+--------+---------+-------+
|   State|   string|   null|
|   Color|   string|   null|
|   Count|      int|   null|
+--------+---------+-------+



In [21]:
# DataFrame API counterpart: print the schema as a tree.
mnm_df.printSchema()

root
 |-- State: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Count: integer (nullable = true)



In [7]:
# Column names of the DataFrame as a plain Python list.
mnm_df.columns

['State', 'Color', 'Count']

In [22]:
# SQL: select only the State column.
spark.sql("Select State from mnmtable").show()

+-----+
|State|
+-----+
|   TX|
|   NV|
|   CO|
|   OR|
|   WA|
|   WY|
|   CA|
|   WA|
|   OR|
|   TX|
|   NV|
|   AZ|
|   WA|
|   AZ|
|   OR|
|   NV|
|   WY|
|   CO|
|   TX|
|   CO|
+-----+
only showing top 20 rows



In [10]:
# DataFrame API counterpart: select only the State column.
mnm_df.select("State").show()

+-----+
|State|
+-----+
|   TX|
|   NV|
|   CO|
|   OR|
|   WA|
|   WY|
|   CA|
|   WA|
|   OR|
|   TX|
|   NV|
|   AZ|
|   WA|
|   AZ|
|   OR|
|   NV|
|   WY|
|   CO|
|   TX|
|   CO|
+-----+
only showing top 20 rows



In [26]:
# SQL: total Count per (State, Color) pair. ORDER BY carries no DESC, so the
# result is sorted ascending, whereas the DataFrame version of this
# aggregation in this notebook sorts with ascending=False.
query = """
select State, Color, sum(Count) 
from mnmtable
group by State, Color
order by sum(Count)"""

In [27]:
# Run the aggregation query; show() prints only the top 20 rows.
spark.sql(query).show()

+-----+------+----------+
|State| Color|sum(Count)|
+-----+------+----------+
|   WY| Brown|     86110|
|   WY|Yellow|     87800|
|   WY|Orange|     87956|
|   OR|Yellow|     88129|
|   UT| Green|     88392|
|   TX|  Blue|     88466|
|   UT| Brown|     88973|
|   CA|  Blue|     89123|
|   OR| Brown|     89136|
|   UT|Yellow|     89264|
|   NV|   Red|     89346|
|   CO|   Red|     89465|
|   OR| Green|     89578|
|   WA|  Blue|     89886|
|   AZ|  Blue|     89971|
|   UT|  Blue|     89977|
|   NV|  Blue|     90003|
|   AZ|   Red|     90042|
|   NM|  Blue|     90150|
|   OR|   Red|     90286|
+-----+------+----------+
only showing top 20 rows



In [7]:
# Total the Count column for every (State, Color) pair, then sort the totals
# from largest to smallest.
grouped_by_state_color = (
    mnm_df.select("State", "Color", "Count").groupBy("State", "Color")
)
count_mnm_df = grouped_by_state_color.sum("Count").sort(
    "sum(Count)", ascending=False
)


In [8]:
# Display up to 60 rows of the per-state, per-color totals (untruncated),
# then report how many aggregated rows there are.
count_mnm_df.show(n=60, truncate=False)
total_rows = count_mnm_df.count()
print("Total Rows = %d" % total_rows)

+-----+------+----------+
|State|Color |sum(Count)|
+-----+------+----------+
|CA   |Yellow|100956    |
|WA   |Green |96486     |
|CA   |Brown |95762     |
|TX   |Green |95753     |
|TX   |Red   |95404     |
|CO   |Yellow|95038     |
|NM   |Red   |94699     |
|OR   |Orange|94514     |
|WY   |Green |94339     |
|NV   |Orange|93929     |
|TX   |Yellow|93819     |
|CO   |Green |93724     |
|CO   |Brown |93692     |
|CA   |Green |93505     |
|NM   |Brown |93447     |
|CO   |Blue  |93412     |
|WA   |Red   |93332     |
|WA   |Brown |93082     |
|WA   |Yellow|92920     |
|NM   |Yellow|92747     |
|NV   |Brown |92478     |
|TX   |Orange|92315     |
|AZ   |Brown |92287     |
|AZ   |Green |91882     |
|WY   |Red   |91768     |
|AZ   |Orange|91684     |
|CA   |Red   |91527     |
|WA   |Orange|91521     |
|NV   |Yellow|91390     |
|UT   |Orange|91341     |
|NV   |Green |91331     |
|NM   |Orange|91251     |
|NM   |Green |91160     |
|WY   |Blue  |91002     |
|UT   |Red   |90995     |
|CO   |Orang

In [29]:
# SQL: the same per-(State, Color) aggregation restricted to rows where
# State = 'CA'; ORDER BY without DESC sorts the totals ascending.
query = """
select State, Color, sum(Count)
from mnmtable
where State = 'CA'
group by State, Color
order by sum(Count)"""

In [30]:
# Run the California query and print the result.
spark.sql(query).show()

+-----+------+----------+
|State| Color|sum(Count)|
+-----+------+----------+
|   CA|  Blue|     89123|
|   CA|Orange|     90311|
|   CA|   Red|     91527|
|   CA| Green|     93505|
|   CA| Brown|     95762|
|   CA|Yellow|    100956|
+-----+------+----------+



In [9]:
# California only: keep the rows whose State is 'CA', total Count per
# (State, Color) pair, and sort the totals from largest to smallest.
ca_rows = mnm_df.select("*").filter(mnm_df["State"] == "CA")
ca_count_mnm_df = (
    ca_rows.groupBy("State", "Color")
    .sum("Count")
    .sort("sum(Count)", ascending=False)
)

# Print the California totals (at most 10 rows, untruncated).
ca_count_mnm_df.show(n=10, truncate=False)


+-----+------+----------+
|State|Color |sum(Count)|
+-----+------+----------+
|CA   |Yellow|100956    |
|CA   |Brown |95762     |
|CA   |Green |93505     |
|CA   |Red   |91527     |
|CA   |Orange|90311     |
|CA   |Blue  |89123     |
+-----+------+----------+

