In [1]:
import numpy as np
import pandas as pd
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SQLContext

In [2]:
# Reuse the already-running SparkContext when this cell is executed again.
# Calling SparkContext('local') a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once"; getOrCreate() only builds a
# new context (with the same 'local' master) when none exists yet.
sc     = SparkContext.getOrCreate(SparkConf().setMaster('local'))
sqlCtx = SQLContext( sc )

# Sample rows. emp is (name, deptid) -- these are the column names given to it
# further down; dept is presumably (department name, department id).
emp    = [('홍길동',1),('이순신',2),('임꺽정',3),('김철수',3),('김철수1',5)]
dept   = [('개발',1), ('연구',2), ('영업',3),('기획',4) ]

# Distribute the local Python lists as RDDs.
empRdd  = sc.parallelize(emp)
deptRdd = sc.parallelize(dept)

In [3]:
# Convert the RDD into a Spark DataFrame. No column names are supplied,
# so the columns get the default names _1 and _2 (visible in the output).
empDF = empRdd.toDF() 
empDF

DataFrame[_1: string, _2: bigint]

In [4]:
# show(): print the DataFrame's rows as a text table.
empDF.show()

+-------+---+
|     _1| _2|
+-------+---+
| 홍길동|  1|
| 이순신|  2|
| 임꺽정|  3|
| 김철수|  3|
|김철수1|  5|
+-------+---+



In [5]:
# Python list => DataFrame: createDataFrame() is given the local list `emp`
# directly here (not an RDD); the result matches empRdd.toDF() above.
empDF1 = sqlCtx.createDataFrame( emp )
empDF1.show()

+-------+---+
|     _1| _2|
+-------+---+
| 홍길동|  1|
| 이순신|  2|
| 임꺽정|  3|
| 김철수|  3|
|김철수1|  5|
+-------+---+



In [6]:
# toPandas(): turn the distributed Spark DataFrame into a local pandas DataFrame
# (the DataFrame counterpart of RDD.collect()).
df = empDF1.toPandas()
df

Unnamed: 0,_1,_2
0,홍길동,1
1,이순신,2
2,임꺽정,3
3,김철수,3
4,김철수1,5


In [7]:
# Same rows as before, but with explicit column names instead of the default _1/_2.
empDF2 = sqlCtx.createDataFrame(emp, schema=['name', 'deptid'])
empDF2.show()

+-------+------+
|   name|deptid|
+-------+------+
| 홍길동|     1|
| 이순신|     2|
| 임꺽정|     3|
| 김철수|     3|
|김철수1|     5|
+-------+------+



In [8]:
# printSchema(): print each column's name, type and nullability
# (similar in role to pandas' DataFrame.info()).
empDF2.printSchema() 

root
 |-- name: string (nullable = true)
 |-- deptid: long (nullable = true)



In [9]:
# Register empDF2 as the temporary view 'my' so SQL statements can query it by name.
empDF2.createOrReplaceTempView('my')

In [10]:
# < SQL statements >
# Example queries against the 'my' view. Only the uncommented assignment to
# `sql` is executed; uncomment a different line to try another query.

# sql = " select * from my "
# sql = " select name, deptid from my where deptid > 2 "
# sql = " select name, deptid from my where name like '%김%'"
# sql = " select name from my "
# sql = " select name, deptid from my where name rlike '수$'"
# sql = " select name, deptid from my where name rlike '[김정]'"
# sql = " select name, deptid from my order by name desc"
# sql = " select name, deptid from my order by deptid desc limit 3"
# sql = " select max(deptid) from my"
# sql = " select sum(deptid) from my "
# sql = " select mean(deptid) from my "
# sql = " select avg(deptid) from my "
# sql = " select count(*) from my "  # total number of rows (not columns)
# sql = " select sum(deptid), avg(deptid) from my "
# sql = " select name, deptid from my where name rlike '이'"
sql = " select deptid*2 as s from my "

sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+---+
|  s|
+---+
|  2|
|  4|
|  6|
|  6|
| 10|
+---+



#### 문제 1.

- 0. name, salary 컬럼명을 가지는 데이터프레임을 만드시오.

- 1. 급여가 가장 높은 name과 salary를 출력하시오.

- 2. name, salary, tax를 출력하시오. (tax는 급여에서 세금 3.3%를 제한 값)

- 3. name에 '철'이 포함된 name, salary를 출력하시오.

- 4. salary 상위 5명의 name, salary를 출력하시오.

- 5. salary가 2000과 4000 사이인 데이터를 출력하시오.

0. name, salary 컬럼명을 가지는 데이터프레임을 만드시오.

In [11]:
# Raw records as comma-separated 'name,salary' strings.
data1 = ['홍길동,1000','이순신,2000','임꺽정,3000','김철수,4000','이황,5000','이이,6000']

In [12]:
# Distribute the raw strings as an RDD.
dataRdd = sc.parallelize( data1 )

In [13]:
def _to_name_salary(line):
    """Split one 'name,salary' string into a (name, int salary) tuple."""
    fields = line.split(',')
    return (fields[0], int(fields[1]))


# Parse every raw record in a single pass over the RDD.
data2 = dataRdd.map(_to_name_salary)

In [14]:
# Build the DataFrame from the parsed (name, salary) tuples, naming the columns explicitly.
df1 = sqlCtx.createDataFrame(data2, schema=['name', 'salary'])
df1

DataFrame[name: string, salary: bigint]

In [15]:
# Register df1 as the temporary view 'ya' for the exercise queries.
df1.createOrReplaceTempView('ya')

1. 급여가 가장 높은 name과 salary를 출력하시오.

In [16]:
# Sort by salary descending and keep only the first row: the highest-paid employee.
# Note that LIMIT 1 returns a single row even if several rows tie for the maximum.
sql = " select name, salary from ya order by salary desc limit 1" 
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+----+------+
|name|salary|
+----+------+
|이이|  6000|
+----+------+



In [17]:
# Alternative to ORDER BY ... LIMIT 1: compare each salary against a scalar
# subquery, which returns every row that ties for the maximum salary.
sql = " select * from ya where salary==(select max(salary) from ya) "
# Actually run the new query; without this assignment show() would just
# re-display the DataFrame left over from the previous cell.
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+----+------+
|name|salary|
+----+------+
|이이|  6000|
+----+------+



2. name, salary, tax를 출력하시오. (tax는 급여에서 세금 3.3%를 제한 값)

In [18]:
# As the exercise defines it, the 'tax' column is the salary after deducting
# 3.3%, i.e. salary * (1 - 0.033) -- not the tax amount itself.
sql = " select name, salary, salary*(1-0.033) as tax from ya" 
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+------+------+--------+
|  name|salary|     tax|
+------+------+--------+
|홍길동|  1000| 967.000|
|이순신|  2000|1934.000|
|임꺽정|  3000|2901.000|
|김철수|  4000|3868.000|
|  이황|  5000|4835.000|
|  이이|  6000|5802.000|
+------+------+--------+



3. name에 '철'이 포함된 name, salary를 출력하시오.

In [19]:
# rlike is a regular-expression match; the unanchored pattern '철' matches
# any name that contains that character.
sql = " select name, salary from ya where name rlike '철' "
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+------+------+
|  name|salary|
+------+------+
|김철수|  4000|
+------+------+



4. salary 상위 5명의 name, salary를 출력하시오.

In [20]:
# Top 5 by salary: sort descending, then keep the first five rows.
sql = " select name, salary from ya order by salary desc limit 5" 
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+------+------+
|  name|salary|
+------+------+
|  이이|  6000|
|  이황|  5000|
|김철수|  4000|
|임꺽정|  3000|
|이순신|  2000|
+------+------+



5. salary가 2000과 4000 사이인 데이터를 출력하시오.

In [21]:
# BETWEEN is inclusive on both ends, so the 2000 and 4000 rows are returned too.
sql = " select name, salary from ya where salary between 2000 and 4000" 
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+------+------+
|  name|salary|
+------+------+
|이순신|  2000|
|임꺽정|  3000|
|김철수|  4000|
+------+------+



In [22]:
# createOrReplaceTempView(): register a DataFrame as a temporary SQL view.
# Here the same df1 is registered again, this time under the name 'emp'.
df1.createOrReplaceTempView('emp')

#### subquery

In [23]:
# The commented-out query below is the inner subquery on its own; the active
# query uses it to select the row(s) whose salary equals the maximum.
# sql = " select max(salary) from emp" 
sql = " select * from emp where salary==(select max(salary) from emp)" 
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+----+------+
|name|salary|
+----+------+
|이이|  6000|
+----+------+



In [24]:
# CASE WHEN: add a label column 'sal' based on salary --
# '많음' (high) for >= 4000, '보통' (medium) for >= 2000, otherwise '적음' (low).
# Branches are evaluated top to bottom, so the >= 4000 test must come first.
sql = ''' select name,salary,
        case
            when salary>=4000 then '많음'
            when salary>=2000 then '보통'
            else '적음'
            end as sal
            from emp
            ''' 
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+------+------+----+
|  name|salary| sal|
+------+------+----+
|홍길동|  1000|적음|
|이순신|  2000|보통|
|임꺽정|  3000|보통|
|김철수|  4000|많음|
|  이황|  5000|많음|
|  이이|  6000|많음|
+------+------+----+



#### Hive QL 함수
- spark sql : hive ql 의 sql문법과 함수를 따른다
- 표준 sql : https://www.w3schools.com/sql/
- spark sql 함수: https://rfriend.tistory.com/213
- https://spark.apache.org/docs/latest/api/sql/index.html

In [25]:
# round() with no scale argument rounds to zero decimal places.
sql = " select name, salary, round(salary*(1-0.033)) as tax from ya" 
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+------+------+----+
|  name|salary| tax|
+------+------+----+
|홍길동|  1000| 967|
|이순신|  2000|1934|
|임꺽정|  3000|2901|
|김철수|  4000|3868|
|  이황|  5000|4835|
|  이이|  6000|5802|
+------+------+----+



In [26]:
# substr(name, 2) returns the name from its 2nd character onward (positions are
# 1-based), dropping the first character; round(..., 2) keeps two decimal places.
sql = " select substr(name,2) as n , salary, round(salary*(1-0.033),2 ) as tax from ya" 
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+----+------+-------+
|   n|salary|    tax|
+----+------+-------+
|길동|  1000| 967.00|
|순신|  2000|1934.00|
|꺽정|  3000|2901.00|
|철수|  4000|3868.00|
|  황|  5000|4835.00|
|  이|  6000|5802.00|
+----+------+-------+



In [27]:
# percentile(salary, 0.5) is the 50th percentile, i.e. the median salary
# (3500.0 for the six rows here, halfway between 3000 and 4000).
sql = " select percentile(salary , 0.5 )  from ya" 
sqlDF = sqlCtx.sql(sql)
sqlDF.show()

+------------------------------------------+
|percentile(salary, CAST(0.5 AS DOUBLE), 1)|
+------------------------------------------+
|                                    3500.0|
+------------------------------------------+

