In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

# Create the SparkSession that every later cell uses to build DataFrames.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('SparkStructExample.com')
    .getOrCreate()
)

In [0]:
# Flat example: one tuple per person, in the same column order as `schema`.
# Two rows carry an empty id string together with a salary of -1.
data = [
    ("Agus", "", "Rohmawan", "36636", "M", 3000),
    ("Dilan", "Cepmek", "", "40288", "M", 4000),
    ("Fajar", "", "Sadboy", "42114", "M", 4000),
    ("Kekeyi", "Doll", "Jones", "39192", "F", 4000),
    ("Jenjen", "Maryam", "Pink", "", "F", -1),
    ("Jeni", "Jeno", "Jojo", "", "F", -1),
    ("Tan", "Tin", "Tun", "42113", "M", 10000),
]

# Five nullable string columns followed by one nullable integer column.
schema = StructType(
    [
        StructField(column, StringType(), True)
        for column in ("firstname", "middlename", "lastname", "id", "gender")
    ]
    + [StructField("salary", IntegerType(), True)]
)

# Build the DataFrame with the explicit schema, then print its schema and rows.
df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()
df.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|salary|
+---------+----------+--------+-----+------+------+
|Agus     |          |Rohmawan|36636|M     |3000  |
|Dilan    |Cepmek    |        |40288|M     |4000  |
|Fajar    |          |Sadboy  |42114|M     |4000  |
|Kekeyi   |Doll      |Jones   |39192|F     |4000  |
|Jenjen   |Maryam    |Pink    |     |F     |-1    |
|Jeni     |Jeno      |Jojo    |     |F     |-1    |
|Tan      |Tin       |Tun     |42113|M     |10000 |
+---------+----------+--------+-----+------+------+



In [0]:
# Defining a nested StructType: the three name parts live in one `name` struct.
# Each row is (name_tuple, id, gender, salary); the inner tuple maps onto the
# fields of the nested struct in order.
data2 = [
    (("Agus", "", "Rohmawan"), "36636", "M", 7000),
    (("Dilan", "Cepmek", ""), "40288", "M", 8000),
    (("Fajar", "", "Sadboy"), "42114", "M", 7000),
    (("Kekeyi", "Doll", "Jones"), "39192", "F", 7000),
    (("Jenjen", "Maryam", "Pink"), "", "F", -1),
    (("Jeni", "Jeno", "Jojo"), "", "F", -1),
    (("Tan", "Tin", "Tun"), "42113", "M", 10000),
]

structureSchema = StructType([
    StructField(
        'name',
        StructType([
            StructField(part, StringType(), True)
            for part in ('firstname', 'middlename', 'lastname')
        ]),
    ),
    StructField('id', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', IntegerType(), True),
])

# Pass the rows via `data=` and the nested schema via `schema=`.
df2 = spark.createDataFrame(data=data2, schema=structureSchema)
df2.printSchema()
df2.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+----------------------+-----+------+------+
|name                  |id   |gender|salary|
+----------------------+-----+------+------+
|{Agus, , Rohmawan}    |36636|M     |7000  |
|{Dilan, Cepmek, }     |40288|M     |8000  |
|{Fajar, , Sadboy}     |42114|M     |7000  |
|{Kekeyi, Doll, Jones} |39192|F     |7000  |
|{Jenjen, Maryam, Pink}|     |F     |-1    |
|{Jeni, Jeno, Jojo}    |     |F     |-1    |
|{Tan, Tin, Tun}       |42113|M     |10000 |
+----------------------+-----+------+------+



In [0]:
# Adding & changing a struct column of the DataFrame.
from pyspark.sql.functions import col, struct, when

# Fold id / gender / salary into a single `OtherInfo` struct, add a derived
# `Salary_Grade` field, then drop the now-duplicated top-level columns.
#
# Salary_Grade bands: salary < 2000 -> "Low", salary < 7000 -> "Medium",
# everything else -> "High".
# `salary` is already declared IntegerType in structureSchema, so it is
# compared directly without an extra cast.
# Note: the -1 salaries in data2 are below 2000 and are therefore graded "Low".
updatedDF = df2.withColumn(
    "OtherInfo",
    struct(
        col("id").alias("identifier"),
        col("gender").alias("gender"),
        col("salary").alias("salary"),
        when(col("salary") < 2000, "Low")
        .when(col("salary") < 7000, "Medium")
        .otherwise("High")
        .alias("Salary_Grade"),
    ),
).drop("id", "gender", "salary")

updatedDF.printSchema()
updatedDF.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- OtherInfo: struct (nullable = false)
 |    |-- identifier: string (nullable = true)
 |    |-- gender: string (nullable = true)
 |    |-- salary: integer (nullable = true)
 |    |-- Salary_Grade: string (nullable = false)

+----------------------+-----------------------+
|name                  |OtherInfo              |
+----------------------+-----------------------+
|{Agus, , Rohmawan}    |{36636, M, 7000, High} |
|{Dilan, Cepmek, }     |{40288, M, 8000, High} |
|{Fajar, , Sadboy}     |{42114, M, 7000, High} |
|{Kekeyi, Doll, Jones} |{39192, F, 7000, High} |
|{Jenjen, Maryam, Pink}|{, F, -1, Low}         |
|{Jeni, Jeno, Jojo}    |{, F, -1, Low}         |
|{Tan, Tin, Tun}       |{42113, M, 10000, High}|
+----------------------+-----------------------+



In [0]:
#Using SQL ArrayType and MapType
from pyspark.sql.types import ArrayType,MapType
from pyspark.sql.functions import col,struct,when


In [0]:
# Schema combining a nested `name` struct with collection-typed columns:
# `hobbies` is an array of strings and `properties` is a string-to-string map.
arrayStructureSchema = StructType([
    StructField(
        'name',
        StructType([
            StructField(part, StringType(), True)
            for part in ('firstname', 'middlename', 'lastname')
        ]),
    ),
    StructField('hobbies', ArrayType(StringType()), True),
    StructField('properties', MapType(StringType(), StringType()), True),
])

In [0]:
# Print the schema of updatedDF serialized as a JSON string.
# (This cell only exports the schema; it does not rebuild a StructType from JSON.)
print(updatedDF.schema.json())

{"fields":[{"metadata":{},"name":"name","nullable":true,"type":{"fields":[{"metadata":{},"name":"firstname","nullable":true,"type":"string"},{"metadata":{},"name":"middlename","nullable":true,"type":"string"},{"metadata":{},"name":"lastname","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"OtherInfo","nullable":false,"type":{"fields":[{"metadata":{},"name":"identifier","nullable":true,"type":"string"},{"metadata":{},"name":"gender","nullable":true,"type":"string"},{"metadata":{},"name":"salary","nullable":true,"type":"integer"},{"metadata":{},"name":"Salary_Grade","nullable":false,"type":"string"}],"type":"struct"}}],"type":"struct"}


In [0]:
# Alternatively, get the schema as a compact one-line `struct<...>` string;
# as the last expression in the cell, its value is shown as the cell output.
updatedDF.schema.simpleString()

Out[37]: 'struct<name:struct<firstname:string,middlename:string,lastname:string>,OtherInfo:struct<identifier:string,gender:string,salary:int,Salary_Grade:string>>'