In [6]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

spark = SparkSession.builder.master("local[1]") \
                    .appName('SparkByExamples.com') \
                    .getOrCreate()


structureData = [
    (("James","","Smith"),"36636","M",3100),
    (("Michael","Rose",""),"40288","M",4300),
    (("Robert","","Williams"),"42114","M",1400),
    (("Maria","Anne","Jones"),"39192","F",5500),
    (("Jen","Mary","Brown"),"","F",-1)
  ]

structureSchema = StructType([
  StructField('name', StructType([
    StructField('fisrtname', StringType(), True),
    StructField('midname', StringType(), True), 
    StructField('lastname', StringType(), True)
  ])),
  StructField('id', StringType(), True),
  StructField('gender', StringType(), True),
  StructField('salary', IntegerType(), True)
])

df2 = spark.createDataFrame(data= structureData, schema = structureSchema)
df2.printSchema()
df2.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- fisrtname: string (nullable = true)
 |    |-- midname: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+--------------------+-----+------+------+
|name                |id   |gender|salary|
+--------------------+-----+------+------+
|{James, , Smith}    |36636|M     |3100  |
|{Michael, Rose, }   |40288|M     |4300  |
|{Robert, , Williams}|42114|M     |1400  |
|{Maria, Anne, Jones}|39192|F     |5500  |
|{Jen, Mary, Brown}  |     |F     |-1    |
+--------------------+-----+------+------+



In [7]:
from pyspark.sql.functions import col, struct, when
updatedDF = df2.withColumn('OtherInfo', 
    struct(col("id").alias("identifier"),
    col("gender").alias("gender"),
    col("salary").alias("salary"),
    when(col("salary").cast(IntegerType()) < 2000,"Low")
      .when(col("salary").cast(IntegerType()) < 4000,"Medium")
      .otherwise("High").alias("Salary_Grade")))


updatedDF.printSchema()
updatedDF.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- fisrtname: string (nullable = true)
 |    |-- midname: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- OtherInfo: struct (nullable = false)
 |    |-- identifier: string (nullable = true)
 |    |-- gender: string (nullable = true)
 |    |-- salary: integer (nullable = true)
 |    |-- Salary_Grade: string (nullable = false)

+--------------------+-----+------+------+------------------------+
|name                |id   |gender|salary|OtherInfo               |
+--------------------+-----+------+------+------------------------+
|{James, , Smith}    |36636|M     |3100  |{36636, M, 3100, Medium}|
|{Michael, Rose, }   |40288|M     |4300  |{40288, M, 4300, High}  |
|{Robert, , Williams}|42114|M     |1400  |{42114, M, 1400, Low}   |
|{Maria, Anne, Jones}|39192|F     |5500  |{39192, F, 5500, High}  |
|{Jen, Mar

In [10]:
print(df2.schema.fieldNames.contains("firstname"))
# print(df2.schema.contains(StructField("firstname",StringType,true)))

AttributeError: 'function' object has no attribute 'contains'