# Belajar PySpark - Membaca File JSON

Dalam notebook ini kita akan belajar cara membaca file JSON ke dalam DataFrame, beserta penerapan beberapa opsi pembacaannya.

In [1]:
%pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=df9a8fdc4dfb30484f66ef78ff226db822eb93a162410aabefe80f1d9a46f82e
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [3]:
# Create (or reuse) the SparkSession that the reading cells below rely on.
# The application name describes this notebook's topic: reading JSON files.
spark = SparkSession.builder.appName("Belajar PySpark - Membaca file JSON").getOrCreate()

## Membaca File JSON

In [4]:
!wget https://raw.githubusercontent.com/urfie/Seri-Belajar-PySpark/main/dataset/people.json

--2023-10-25 10:13:37--  https://raw.githubusercontent.com/urfie/Seri-Belajar-PySpark/main/dataset/people.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 152 [text/plain]
Saving to: ‘people.json’


2023-10-25 10:13:38 (5.60 MB/s) - ‘people.json’ saved [152/152]



In [5]:
# Print the raw file to inspect its layout: each line holds one JSON object.
!cat people.json

{ "name": "Chris", "age": 23, "city": "New York" },
{ "name": "Emily", "age": 19, "city": "Atlanta" },
{ "name": "Joe", "age": 32, "city": "New York" }


### Menggunakan fungsi `spark.read.json`

In [6]:
# Load the one-object-per-line file with the JSON shortcut of the reader.
# No schema is passed, so the column types printed below are inferred by Spark.
df = spark.read.json(path="people.json")

# Show the inferred schema first, then the rows.
df.printSchema()
df.show()

root
 |-- age: long (nullable = true)
 |-- city: string (nullable = true)
 |-- name: string (nullable = true)

+---+--------+-----+
|age|    city| name|
+---+--------+-----+
| 23|New York|Chris|
| 19| Atlanta|Emily|
| 32|New York|  Joe|
+---+--------+-----+



### Menggunakan fungsi general reader `spark.read.format("json")`

In [8]:
# Same file, read through the generic reader: choose the data source with
# format(), then point load() at the file.
df = (
    spark.read
    .format("json")
    .load("people.json")
)

# Show the inferred schema first, then the rows.
df.printSchema()
df.show()

root
 |-- age: long (nullable = true)
 |-- city: string (nullable = true)
 |-- name: string (nullable = true)

+---+--------+-----+
|age|    city| name|
+---+--------+-----+
| 23|New York|Chris|
| 19| Atlanta|Emily|
| 32|New York|  Joe|
+---+--------+-----+



## Membaca File Multiline JSON

In [None]:
!wget https://raw.githubusercontent.com/urfie/Seri-Belajar-PySpark/main/dataset/people_multi.json

--2023-10-25 01:12:36--  https://raw.githubusercontent.com/urfie/Seri-Belajar-PySpark/main/dataset/people_multi.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 216 [text/plain]
Saving to: ‘people_multi.json’


2023-10-25 01:12:36 (10.5 MB/s) - ‘people_multi.json’ saved [216/216]



In [None]:
# Print the raw file to inspect its layout: a single JSON array whose objects
# each span several lines.
!cat people_multi.json

[
  { 
    "name": "Chris", 
    "age": 23, 
    "city": "New York" 
  },
  { 
    "name": "Emily", 
    "age": 19, 
    "city": "Atlanta" 
  },
  { 
    "name": "Joe", 
    "age": 32, 
    "city": "New York" 
  }
]


### Menggunakan fungsi `spark.read.json`

In [None]:
# multiLine=True tells the JSON reader to parse the file as one document that
# may span many lines, instead of treating every line as a separate record.
df = spark.read.json(
    path="people_multi.json",
    multiLine=True,
)

# Show the inferred schema first, then the rows.
df.printSchema()
df.show()

root
 |-- age: long (nullable = true)
 |-- city: string (nullable = true)
 |-- name: string (nullable = true)

+---+--------+-----+
|age|    city| name|
+---+--------+-----+
| 23|New York|Chris|
| 19| Atlanta|Emily|
| 32|New York|  Joe|
+---+--------+-----+



### Menggunakan fungsi general reader `spark.read.format("json")`

In [None]:
# Generic-reader form of the multi-line read: the multiLine flag is supplied
# through options() before load() is called.
df = (
    spark.read
    .format("json")
    .options(multiLine=True)
    .load("people_multi.json")
)

# Show the inferred schema first, then the rows.
df.printSchema()
df.show()

root
 |-- age: long (nullable = true)
 |-- city: string (nullable = true)
 |-- name: string (nullable = true)

+---+--------+-----+
|age|    city| name|
+---+--------+-----+
| 23|New York|Chris|
| 19| Atlanta|Emily|
| 32|New York|  Joe|
+---+--------+-----+



## Membaca File JSON dengan perintah SQL `CREATE TEMPORARY VIEW`

In [None]:
# Register people_multi.json as the temporary view `people` with a SQL DDL
# statement. Adjacent string literals are joined by Python into one statement.
create_view_sql = (
    "CREATE OR REPLACE TEMPORARY VIEW people"
    " USING json"
    " OPTIONS"
    "   (path 'people_multi.json',"
    "     multiline 'True')"
)
spark.sql(create_view_sql)

# Query the view like any other table and display the rows.
spark.sql("select * from people").show()

+---+--------+-----+
|age|    city| name|
+---+--------+-----+
| 23|New York|Chris|
| 19| Atlanta|Emily|
| 32|New York|  Joe|
+---+--------+-----+



## Menentukan Urutan Kolom

In [None]:
# The reader returns the inferred columns in the order age, city, name;
# select() lists them explicitly to get the order name, age, city instead.
df = (
    spark.read
    .json("people_multi.json", multiLine=True)
    .select("name", "age", "city")
)
df.show()

+-----+---+--------+
| name|age|    city|
+-----+---+--------+
|Chris| 23|New York|
|Emily| 19| Atlanta|
|  Joe| 32|New York|
+-----+---+--------+



## Opsi Pada JSON Reader

In [None]:
!wget https://raw.githubusercontent.com/urfie/Seri-Belajar-PySpark/main/dataset/people_nonquote.json

--2023-10-25 03:22:21--  https://raw.githubusercontent.com/urfie/Seri-Belajar-PySpark/main/dataset/people_nonquote.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 88 [text/plain]
Saving to: ‘people_nonquote.json’


2023-10-25 03:22:21 (6.68 MB/s) - ‘people_nonquote.json’ saved [88/88]



In [None]:
# Print the raw file to inspect its layout: a multi-line JSON array in which
# the field names (name, age) are not wrapped in quotes.
!cat people_nonquote.json

[
  { 
    name: "Chris", 
    age: 30 
  },
  { 
    name: "Emily", 
    age: 19
  }
]


### Menggunakan fungsi `spark.read.json`

In [None]:
# Reader options for this file: it spans several lines (multiLine) and its
# field names are written without quotes (allowUnquotedFieldNames).
reader_options = {
    "multiLine": True,
    "allowUnquotedFieldNames": True,
}
df = spark.read.json("people_nonquote.json", **reader_options)

# Show the inferred schema first, then the rows.
df.printSchema()
df.show()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)

+---+-----+
|age| name|
+---+-----+
| 30|Chris|
| 19|Emily|
+---+-----+



### Menggunakan fungsi general reader `spark.read.format("json")`

In [None]:
# Generic-reader form: both reader options are handed over in a single
# options() call before load() reads the file.
df = (
    spark.read
    .format("json")
    .options(multiLine=True, allowUnquotedFieldNames=True)
    .load("people_nonquote.json")
)

# Show the inferred schema first, then the rows.
df.printSchema()
df.show()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)

+---+-----+
|age| name|
+---+-----+
| 30|Chris|
| 19|Emily|
+---+-----+



### Menggunakan SQL Statement

In [None]:
# Register people_nonquote.json as the temporary view `people`; CREATE OR
# REPLACE overwrites a view of the same name if one already exists.
# The statement is built from adjacent string literals only, so every fragment
# is joined the same way and no line depends on a stray missing operator.
create_view_sql = (
    "CREATE OR REPLACE TEMPORARY VIEW people"
    " USING json"
    " OPTIONS"
    "   (path 'people_nonquote.json',"
    "     multiline 'True',"
    "     allowUnquotedFieldNames='True')"
)
spark.sql(create_view_sql)

# Query the view like any other table and display the rows.
spark.sql("select * from people").show()

+---+-----+
|age| name|
+---+-----+
| 30|Chris|
| 19|Emily|
+---+-----+

