In [2]:
# %load requirements.txt

# Session setup: locate the local Spark installation, then create (or reuse)
# the SparkSession `spark` and its SparkContext `sc` used by every cell below.

import findspark
# Keep this call ahead of the pyspark imports: findspark.init() is what makes
# pyspark importable when it is not installed as a regular package.
findspark.init()

import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Show which Spark installation was picked up.
print(findspark.find())

conf = SparkConf().setMaster("local").setAppName("Spark Practice")

# getOrCreate() returns the already-running session when this cell is executed
# again, whereas constructing SparkContext(conf=conf) a second time in the same
# kernel raises an error because only one SparkContext may be active.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext


/opt/spark


# 1. JSON 문자열로 DataFrame 만들어서 처리하기

In [96]:
# One JSON document, kept as raw text, wrapped in a single-element list.
# The nested "address" object carries the city and state.
employee = [
    '{"name":"Yin","address":{"city":"Columbus","state":"Ohio"}}',
]

In [97]:
# Hand the list of JSON text to the SparkContext to build an RDD from it.
employee_RDD = sc.parallelize(employee)

In [98]:
# Parse the RDD of JSON strings into a DataFrame and print it as a table.
# NOTE(review): this rebinds `employee` from the Python list defined earlier
# to a DataFrame, so re-running the parallelize cell after this one would pass
# the DataFrame instead of the list. Consider a distinct name (e.g.
# `employee_df`) together with the cells that use it.
employee = spark.read.json(employee_RDD)
employee.show()

+----------------+----+
|         address|name|
+----------------+----+
|[Columbus, Ohio]| Yin|
+----------------+----+



In [119]:
# Print the schema tree; the nested "address" object shows up as a struct
# with `city` and `state` string fields.
employee.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |-- name: string (nullable = true)



# 2. JSON 파일 만들어서 처리하기

In [113]:
%%writefile employee.json
{"id": "1201", "name": "satish", "age": "25"}
{"id": "1202", "name": "krishna", "age": "28"}
{"id": "1203", "name": "amith", "age": "39"}
{"id": "1204", "name": "javed", "age": "23"}
{"id": "1205", "name": "prudvi", "age": "23"}

Overwriting employee.json


In [114]:
# Load employee.json into a DataFrame. No schema is passed here, so the
# column types are whatever the reader derives from the file contents.
dfs = spark.read.json("employee.json")

In [115]:
# Bare expression: display the DataFrame's repr (column names and types).
dfs

DataFrame[age: string, id: string, name: string]

In [118]:
# Print the schema tree. All three columns are strings because every value in
# employee.json is written as a quoted string.
dfs.printSchema()

root
 |-- age: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)



In [116]:
# Number of rows in the DataFrame (one per record in employee.json).
dfs.count()

5

In [117]:
# Print the DataFrame contents as a text table.
dfs.show()

+---+----+-------+
|age|  id|   name|
+---+----+-------+
| 25|1201| satish|
| 28|1202|krishna|
| 39|1203|  amith|
| 23|1204|  javed|
| 23|1205| prudvi|
+---+----+-------+



## 열 이름으로 조회하기

In [120]:
# Project a single column by its name.
dfs.select("name").show()

+-------+
|   name|
+-------+
| satish|
|krishna|
|  amith|
|  javed|
| prudvi|
+-------+



In [121]:
# Project several columns at once; the output keeps the order given here.
dfs.select("name", "age").show()

+-------+---+
|   name|age|
+-------+---+
| satish| 25|
|krishna| 28|
|  amith| 39|
|  javed| 23|
| prudvi| 23|
+-------+---+



In [122]:
# "ID" is spelled in upper case on purpose: in this session it still resolves
# to the `id` column, and the output header uses the spelling passed here.
dfs.select("ID", "name", "age").show()

+----+-------+---+
|  ID|   name|age|
+----+-------+---+
|1201| satish| 25|
|1202|krishna| 28|
|1203|  amith| 39|
|1204|  javed| 23|
|1205| prudvi| 23|
+----+-------+---+



## 필터링하기

In [125]:
# Keep employees older than 23 (Column-expression form).
# `age` is a string column, so cast it explicitly instead of relying on
# implicit string-to-number coercion inside the comparison.
dfs.filter(dfs.age.cast("int") > 23).show()

+---+----+-------+
|age|  id|   name|
+---+----+-------+
| 25|1201| satish|
| 28|1202|krishna|
| 39|1203|  amith|
+---+----+-------+



In [127]:
# Keep employees older than 23 (SQL-expression-string form).
# `age` is a string column, so cast it explicitly instead of relying on
# implicit string-to-number coercion inside the comparison.
dfs.filter("CAST(age AS INT) > 23").show()

+---+----+-------+
|age|  id|   name|
+---+----+-------+
| 25|1201| satish|
| 28|1202|krishna|
| 39|1203|  amith|
+---+----+-------+



## 그룹화하기

In [126]:
# Group rows by age and count how many employees share each value.
dfs.groupBy("age").count().show()

+---+-----+
|age|count|
+---+-----+
| 28|    1|
| 23|    2|
| 25|    1|
| 39|    1|
+---+-----+



# 3. parquet 파일 만들기

In [129]:
# Persist the DataFrame as a Parquet dataset.
# mode("overwrite") keeps this cell re-runnable: with the writer's default
# save mode the call raises once "employee.parquet" already exists.
dfs.write.mode("overwrite").parquet("employee.parquet")

In [131]:
# Read the Parquet dataset back into a new DataFrame.
parqfile = spark.read.parquet("employee.parquet")

In [132]:
# Bare expression: display the DataFrame's repr (column names and types).
parqfile

DataFrame[age: string, id: string, name: string]

In [133]:
# Print the schema tree of the DataFrame loaded from Parquet.
parqfile.printSchema()

root
 |-- age: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)



In [134]:
# Number of rows read back from the Parquet dataset.
parqfile.count()

5

## 임시 뷰로 등록해서 SQL로 처리하기

In [136]:
# Register the DataFrame as a temporary view named "employee" so that it can
# be referenced from spark.sql() queries.
parqfile.createOrReplaceTempView("employee")

In [140]:
# Query the "employee" temp view with SQL and print every row. The mixed-case
# keyword spelling is accepted, as the output below shows.
spark.sql("SeleCT * FROM employee").show()

+---+----+-------+
|age|  id|   name|
+---+----+-------+
| 25|1201| satish|
| 28|1202|krishna|
| 39|1203|  amith|
| 23|1204|  javed|
| 23|1205| prudvi|
+---+----+-------+



In [141]:
# DataFrame-API counterpart of the SQL query: list every column explicitly.
parqfile.select("age", "id", "name").show()

+---+----+-------+
|age|  id|   name|
+---+----+-------+
| 25|1201| satish|
| 28|1202|krishna|
| 39|1203|  amith|
| 23|1204|  javed|
| 23|1205| prudvi|
+---+----+-------+



In [142]:
# "*" projects every column of the DataFrame.
parqfile.select("*").show()

+---+----+-------+
|age|  id|   name|
+---+----+-------+
| 25|1201| satish|
| 28|1202|krishna|
| 39|1203|  amith|
| 23|1204|  javed|
| 23|1205| prudvi|
+---+----+-------+



### 기술통계 요약표

In [144]:
# Descriptive statistics per column: count, mean, stddev, min, quartiles, max.
# The text column `name` only gets count/min/max; the other rows are null.
parqfile.summary().show()

+-------+------------------+------------------+------+
|summary|               age|                id|  name|
+-------+------------------+------------------+------+
|  count|                 5|                 5|     5|
|   mean|              27.6|            1203.0|  null|
| stddev|6.6932802122726045|1.5811388300841898|  null|
|    min|                23|              1201| amith|
|    25%|              23.0|            1202.0|  null|
|    50%|              25.0|            1203.0|  null|
|    75%|              28.0|            1204.0|  null|
|    max|                39|              1205|satish|
+-------+------------------+------------------+------+



### 첫 번째 행 읽어오기

In [145]:
# head() with no argument returns the first row as a single Row object.
parqfile.head()

Row(age='25', id='1201', name='satish')

In [147]:
# take(1) returns a list containing the first row (compare with head()).
parqfile.take(1)

[Row(age='25', id='1201', name='satish')]