# Belajar PySpark - Nested Schema pada DataFrame

...

In [None]:
%pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=9f6b212895ddab09650eefb388f1c70fed20c77e38670fb93458641a67d3fb66
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [None]:
# Start a Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Belajar PySpark - Skema DataFrame")
    .getOrCreate()
)

## Mendefinisikan Nested Schema

In [None]:
# Schema with two nullable string columns and one nullable struct column
# ("nilai") that groups three nullable integer fields.
mySchema = (
    StructType()
    .add("nama", StringType(), True)
    .add("jurusan", StringType(), True)
    .add(
        "nilai",
        StructType()
        .add("nilai1", IntegerType(), True)
        .add("nilai2", IntegerType(), True)
        .add("nilai3", IntegerType(), True),
        True,
    )
)


### Menggunakan Nested Schema pada DataFrame

In [None]:
# Each row is [nama, jurusan, [nilai1, nilai2, nilai3]]; the inner list
# supplies the values of the "nilai" struct column.
data = [
    ['Agus', 'F', [100, 150, 150]],
    ['Budi', 'B', [200, 100, 150]],
    ['Dina', 'F', [150, 150, 130]],
    ['Dedi', 'B', [50, 100, 100]],
]

# Build the DataFrame with the explicit nested schema, then inspect
# both its rows and its schema tree.
df = spark.createDataFrame(data, schema=mySchema)
df.show()
df.printSchema()

+----+-------+---------------+
|nama|jurusan|          nilai|
+----+-------+---------------+
|Agus|      F|{100, 150, 150}|
|Budi|      B|{200, 100, 150}|
|Dina|      F|{150, 150, 130}|
|Dedi|      B| {50, 100, 100}|
+----+-------+---------------+

root
 |-- nama: string (nullable = true)
 |-- jurusan: string (nullable = true)
 |-- nilai: struct (nullable = true)
 |    |-- nilai1: integer (nullable = true)
 |    |-- nilai2: integer (nullable = true)
 |    |-- nilai3: integer (nullable = true)



## Mengakses Kolom pada Nested Schema

In [None]:
# Dot notation ("struct.field") selects a single field of the struct column.
(
    df
    .select("nama", "nilai.nilai2")
    .show()
)

+----+------+
|nama|nilai2|
+----+------+
|Agus|   150|
|Budi|   100|
|Dina|   150|
|Dedi|   100|
+----+------+



In [None]:
import pyspark.sql.functions as F

# Average of the nested field nilai3, computed per "jurusan" group.
(
    df
    .groupby("jurusan")
    .agg(F.avg("nilai.nilai3").alias("rerata_nilai3"))
    .show()
)

+-------+-------------+
|jurusan|rerata_nilai3|
+-------+-------------+
|      F|        140.0|
|      B|        125.0|
+-------+-------------+



In [None]:
# Keep only the rows whose nested field nilai2 is below 150.
(
    df
    .filter(df["nilai.nilai2"] < 150)
    .show()
)

+----+-------+---------------+
|nama|jurusan|          nilai|
+----+-------+---------------+
|Budi|      B|{200, 100, 150}|
|Dedi|      B| {50, 100, 100}|
+----+-------+---------------+



## Mengubah Nested Schema Menjadi Flat Schema

In [None]:
# "nilai.*" expands every field of the struct into its own top-level column.
(
    df
    .select("nama", "jurusan", "nilai.*")
    .show()
)

+----+-------+------+------+------+
|nama|jurusan|nilai1|nilai2|nilai3|
+----+-------+------+------+------+
|Agus|      F|   100|   150|   150|
|Budi|      B|   200|   100|   150|
|Dina|      F|   150|   150|   130|
|Dedi|      B|    50|   100|   100|
+----+-------+------+------+------+



## Menyimpan dan Membaca Nested Schema ke JSON File

### Menyimpan Skema ke File JSON

Buat string JSON dari skema DataFrame

In [None]:
# Serialize the DataFrame's schema (including the nested struct) to a
# JSON string and print it for inspection.
schema_of_df = df.schema
json_string = schema_of_df.json()
print(json_string)

{"fields":[{"metadata":{},"name":"nama","nullable":true,"type":"string"},{"metadata":{},"name":"jurusan","nullable":true,"type":"string"},{"metadata":{},"name":"nilai","nullable":true,"type":{"fields":[{"metadata":{},"name":"nilai1","nullable":true,"type":"integer"},{"metadata":{},"name":"nilai2","nullable":true,"type":"integer"},{"metadata":{},"name":"nilai3","nullable":true,"type":"integer"}],"type":"struct"}}],"type":"struct"}


Simpan skema ke file `schema.json`

In [None]:
# Write the schema JSON to disk. The context manager guarantees the file
# is flushed and closed even if the write raises, and the explicit
# encoding keeps the file independent of the platform's default.
with open("schema.json", "w", encoding="utf-8") as text_file:
    text_file.write(json_string)

In [None]:
# Print the saved file with the shell's `cat` to confirm what was written.
!cat schema.json

{"fields":[{"metadata":{},"name":"nama","nullable":true,"type":"string"},{"metadata":{},"name":"jurusan","nullable":true,"type":"string"},{"metadata":{},"name":"nilai","nullable":true,"type":{"fields":[{"metadata":{},"name":"nilai1","nullable":true,"type":"integer"},{"metadata":{},"name":"nilai2","nullable":true,"type":"integer"},{"metadata":{},"name":"nilai3","nullable":true,"type":"integer"}],"type":"struct"}}],"type":"struct"}

### Membaca Skema dari File JSON

In [None]:
import json

# Read the schema definition back from disk. The context manager closes
# the file even if JSON parsing raises, and the explicit encoding keeps
# the read independent of the platform's default.
with open("schema.json", encoding="utf-8") as f:
    json_dict = json.load(f)

# Display the parsed dictionary as the cell output.
json_dict

{'fields': [{'metadata': {},
   'name': 'nama',
   'nullable': True,
   'type': 'string'},
  {'metadata': {}, 'name': 'jurusan', 'nullable': True, 'type': 'string'},
  {'metadata': {},
   'name': 'nilai',
   'nullable': True,
   'type': {'fields': [{'metadata': {},
      'name': 'nilai1',
      'nullable': True,
      'type': 'integer'},
     {'metadata': {}, 'name': 'nilai2', 'nullable': True, 'type': 'integer'},
     {'metadata': {}, 'name': 'nilai3', 'nullable': True, 'type': 'integer'}],
    'type': 'struct'}}],
 'type': 'struct'}

In [17]:
# Rebuild the StructType from the dictionary parsed out of schema.json.
schemaFromJson = StructType.fromJson(json_dict)

# Create a DataFrame with the restored schema and inspect its rows
# and its schema tree.
df3 = spark.createDataFrame(data, schema=schemaFromJson)
df3.show()
df3.printSchema()


+----+-------+---------------+
|nama|jurusan|          nilai|
+----+-------+---------------+
|Agus|      F|{100, 150, 150}|
|Budi|      B|{200, 100, 150}|
|Dina|      F|{150, 150, 130}|
|Dedi|      B| {50, 100, 100}|
+----+-------+---------------+

root
 |-- nama: string (nullable = true)
 |-- jurusan: string (nullable = true)
 |-- nilai: struct (nullable = true)
 |    |-- nilai1: integer (nullable = true)
 |    |-- nilai2: integer (nullable = true)
 |    |-- nilai3: integer (nullable = true)



### Menggunakan JSON Schema untuk Definisi Skema DataFrame

In [19]:
# Sample rows: [nama, jurusan, [nilai1, nilai2, nilai3]].
data = [
    ['Agus', 'F', [100, 150, 150]],
    ['Budi', 'B', [200, 100, 150]],
    ['Dina', 'F', [150, 150, 130]],
    ['Dedi', 'B', [50, 100, 100]],
]

# The schema comes from the JSON dictionary instead of being declared in code.
schemaFromJson = StructType.fromJson(json_dict)

df3 = spark.createDataFrame(data, schema=schemaFromJson)
df3.show()
df3.printSchema()

+----+-------+---------------+
|nama|jurusan|          nilai|
+----+-------+---------------+
|Agus|      F|{100, 150, 150}|
|Budi|      B|{200, 100, 150}|
|Dina|      F|{150, 150, 130}|
|Dedi|      B| {50, 100, 100}|
+----+-------+---------------+

root
 |-- nama: string (nullable = true)
 |-- jurusan: string (nullable = true)
 |-- nilai: struct (nullable = true)
 |    |-- nilai1: integer (nullable = true)
 |    |-- nilai2: integer (nullable = true)
 |    |-- nilai3: integer (nullable = true)

