In [1]:
import os
import sys
from pyspark.sql import SparkSession
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
spark = SparkSession.builder.appName("Show").getOrCreate()

In [28]:
from pyspark.sql.types import StructType,StructField,StringType, IntegerType

In [7]:
Schema = StructType([
    StructField("name",StructType([
        StructField("FirstName",StringType() ,True),
        StructField("MiddleName",StringType(),True),
        StructField("LastName",StringType(),True),
    ])),
    StructField("age",StringType(),True),
    StructField("PHoneNumber",StringType(),True)
])

In [16]:
Data = [(("Viswanadh","","Gandimenu"),23,9390981330),
        (("Harsha","Vardhan","Gandimenu"),26,9390981330),
        (("Suraya","Lakshmi","Gandimenu"),45,9390981330),
        (("Simhachalam","","Gandimenu"),50,9390981330),
       ]

In [17]:
DF = spark.createDataFrame(data = Data,schema = Schema)

In [21]:
DF.printSchema()
DF.show(20,truncate = False)

root
 |-- name: struct (nullable = true)
 |    |-- FirstName: string (nullable = true)
 |    |-- MiddleName: string (nullable = true)
 |    |-- LastName: string (nullable = true)
 |-- age: string (nullable = true)
 |-- PHoneNumber: string (nullable = true)

+----------------------------+---+-----------+
|name                        |age|PHoneNumber|
+----------------------------+---+-----------+
|{Viswanadh, , Gandimenu}    |23 |9390981330 |
|{Harsha, Vardhan, Gandimenu}|26 |9390981330 |
|{Suraya, Lakshmi, Gandimenu}|45 |9390981330 |
|{Simhachalam, , Gandimenu}  |50 |9390981330 |
+----------------------------+---+-----------+



In [24]:
Schema2 = StructType([
    StructField("name",StructType([
        StructField("FirstName",StringType() ,True),
        StructField("MiddleName",StringType(),True),
        StructField("LastName",StringType(),True),
    ])),
    StructField("age",StringType(),False),
    StructField("PHoneNumber",StringType(),True)
])
Data2 = [(("Viswanadh","","Gandimenu"),23,9390981330),
        (("Harsha","Vardhan","Gandimenu"),26,9390981330),
        (("Suraya","Lakshmi","Gandimenu"),"",9390981330),
        (("Simhachalam","","Gandimenu"),50,9390981330),
       ]

In [26]:
DF = spark.createDataFrame(data = Data2,schema = Schema2)
DF.printSchema()
DF.show(20,truncate = False)

root
 |-- name: struct (nullable = true)
 |    |-- FirstName: string (nullable = true)
 |    |-- MiddleName: string (nullable = true)
 |    |-- LastName: string (nullable = true)
 |-- age: string (nullable = false)
 |-- PHoneNumber: string (nullable = true)

+----------------------------+---+-----------+
|name                        |age|PHoneNumber|
+----------------------------+---+-----------+
|{Viswanadh, , Gandimenu}    |23 |9390981330 |
|{Harsha, Vardhan, Gandimenu}|26 |9390981330 |
|{Suraya, Lakshmi, Gandimenu}|   |9390981330 |
|{Simhachalam, , Gandimenu}  |50 |9390981330 |
+----------------------------+---+-----------+



In [29]:
structureData = [
    (("James","","Smith"),"36636","M",3100),
    (("Michael","Rose",""),"40288","M",4300),
    (("Robert","","Williams"),"42114","M",1400),
    (("Maria","Anne","Jones"),"39192","F",5500),
    (("Jen","Mary","Brown"),"","F",-1)
  ]
structureSchema = StructType([
        StructField('name', StructType([
             StructField('firstname', StringType(), True),
             StructField('middlename', StringType(), True),
             StructField('lastname', StringType(), True)
             ])),
         StructField('id', StringType(), True),
         StructField('gender', StringType(), True),
         StructField('salary', IntegerType(), True)
         ])

df2 = spark.createDataFrame(data=structureData,schema=structureSchema)
df2.printSchema()
df2.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+--------------------+-----+------+------+
|name                |id   |gender|salary|
+--------------------+-----+------+------+
|{James, , Smith}    |36636|M     |3100  |
|{Michael, Rose, }   |40288|M     |4300  |
|{Robert, , Williams}|42114|M     |1400  |
|{Maria, Anne, Jones}|39192|F     |5500  |
|{Jen, Mary, Brown}  |     |F     |-1    |
+--------------------+-----+------+------+



In [30]:
from pyspark.sql.functions import col,struct,when
updatedDF = df2.withColumn("OtherInfo", 
    struct(col("id").alias("identifier"),
    col("gender").alias("gender"),
    col("salary").alias("salary"),
    when(col("salary").cast(IntegerType()) < 2000,"Low")
      .when(col("salary").cast(IntegerType()) < 4000,"Medium")
      .otherwise("High").alias("Salary_Grade")
  )).drop("id","gender","salary")

updatedDF.printSchema()
updatedDF.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- OtherInfo: struct (nullable = false)
 |    |-- identifier: string (nullable = true)
 |    |-- gender: string (nullable = true)
 |    |-- salary: integer (nullable = true)
 |    |-- Salary_Grade: string (nullable = false)

+--------------------+------------------------+
|name                |OtherInfo               |
+--------------------+------------------------+
|{James, , Smith}    |{36636, M, 3100, Medium}|
|{Michael, Rose, }   |{40288, M, 4300, High}  |
|{Robert, , Williams}|{42114, M, 1400, Low}   |
|{Maria, Anne, Jones}|{39192, F, 5500, High}  |
|{Jen, Mary, Brown}  |{, F, -1, Low}          |
+--------------------+------------------------+



In [48]:
print(updatedDF.schema.json())

{"fields":[{"metadata":{},"name":"name","nullable":true,"type":{"fields":[{"metadata":{},"name":"firstname","nullable":true,"type":"string"},{"metadata":{},"name":"middlename","nullable":true,"type":"string"},{"metadata":{},"name":"lastname","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"OtherInfo","nullable":false,"type":{"fields":[{"metadata":{},"name":"identifier","nullable":true,"type":"string"},{"metadata":{},"name":"gender","nullable":true,"type":"string"},{"metadata":{},"name":"salary","nullable":true,"type":"integer"},{"metadata":{},"name":"Salary_Grade","nullable":false,"type":"string"}],"type":"struct"}}],"type":"struct"}


In [32]:
# Using SQL ArrayType and MapType
from pyspark.sql.types import ArrayType,MapType
arrayDataSchema = StructType([
    StructField("names",StructType([
        StructField("FirstName",StringType(),True),
        StructField("MiddleName",StringType(),True),
        StructField("LastName",StringType(),True),
    ])),
    StructField("Hobbies",ArrayType(StringType()),True),
    StructField("Propeties",MapType(StringType(),StringType()),True)
])

In [51]:
display(arrayDataSchema)

StructType([StructField('names', StructType([StructField('FirstName', StringType(), True), StructField('MiddleName', StringType(), True), StructField('LastName', StringType(), True)]), True), StructField('Hobbies', ArrayType(StringType(), True), True), StructField('Propeties', MapType(StringType(), StringType(), True), True)])

In [68]:
import json

path = r"C:\Users\viswa\OneDrive\Documents\schema.json"
with open(path) as jf:
    schemaJson = jf.read()
schemaFromJson = StructType.fromJson(json.loads(schemaJson))

In [73]:
schemaFromJson

StructType([StructField('name', StructType([StructField('firstname', StringType(), True), StructField('middlename', StringType(), True), StructField('lastname', StringType(), True)]), True), StructField('dob', StringType(), True), StructField('gender', StringType(), True), StructField('salary', IntegerType(), True)])

In [80]:

print(DF.schema.fieldNames.contains("firstname"))


AttributeError: 'function' object has no attribute 'contains'