In [1]:
from pyspark.sql import SparkSession
import getpass
# The Hive warehouse directory is placed under /user/<current OS user>/warehouse.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession that runs on YARN.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Evaluate the session object so the notebook displays it as this cell's output.
spark

In [3]:
# DDL-style schema string: a long id, a nested struct holding the first and
# last name, and a string city. Adjacent literals are joined into one string.
ddlSchema = (
    "customer_id long, "
    "fullname struct<firstname:string, lastname:string>, "
    "city string"
)

In [4]:
# Read everything matching the customer_nested glob as JSON, applying the
# DDL schema string explicitly.
df = (
    spark.read
    .format("json")
    .schema(ddlSchema)
    .load("/public/trendytech/datasets/customer_nested/*")
)

In [5]:
# Print the loaded rows as a text table to check that the nested fullname column was parsed.
df.show()

+-----------+----------------+---------+
|customer_id|        fullname|     city|
+-----------+----------------+---------+
|          2|    {ram, kumar}|hyderabad|
|          3|{vijay, shankar}|     pune|
|          1| {sumit, mittal}|bangalore|
+-----------+----------------+---------+



In [6]:
# Print the column tree to confirm the schema that was applied, including the nested struct fields.
df.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- fullname: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- city: string (nullable = true)



In [10]:
# Brings StructType, StructField, LongType and StringType into scope for the schema cells that follow.
# NOTE(review): wildcard import — consider importing these names explicitly and
# moving the import into the first cell alongside the other imports.
from pyspark.sql.types import *

In [22]:
customer_schema = StructType([
StructField("customer_id", LongType()),
StructField("fullname", StructType([StructField("firstname", StringType()), StructField("lastname", StringType())])),
StructField("city", LongType()),
])

In [23]:
df = spark.read.format("json").schema(customer_schema).load("/public/trendytech/datasets/customer_nested/*")

In [24]:
df.show()

+-----------+----------------+----+
|customer_id|        fullname|city|
+-----------+----------------+----+
|          2|    {ram, kumar}|null|
|          3|{vijay, shankar}|null|
|          1| {sumit, mittal}|null|
+-----------+----------------+----+



In [25]:
df.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- fullname: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- city: long (nullable = true)



In [26]:
# Hand-built sample rows, each shaped as (customer_id, (firstname, lastname), city).
# NOTE(review): customer_id 1 is used for both "sumit mittal" and "vijay shankar";
# the JSON dataset shown earlier lists vijay shankar as customer 3 — confirm
# whether the duplicate id is intentional.
customer_list = [
    (1, ("sumit", "mittal"), "bangalore"),
    (2, ("ram", "kumar"), "hyderabad"),
    (1, ("vijay", "shankar"), "pune"),
]

In [27]:
# Same DDL schema string as used for the JSON read, re-declared here for the
# in-memory DataFrame. Adjacent literals are joined into one string.
ddlSchema = (
    "customer_id long, "
    "fullname struct<firstname:string, lastname:string>, "
    "city string"
)

In [28]:
# Build a DataFrame from the in-memory rows, typed by the DDL schema string.
df = spark.createDataFrame(data=customer_list, schema=ddlSchema)

In [29]:
# Print the rows built from customer_list to check that the tuples map onto the nested fullname struct.
df.show()

+-----------+----------------+---------+
|customer_id|        fullname|     city|
+-----------+----------------+---------+
|          1| {sumit, mittal}|bangalore|
|          2|    {ram, kumar}|hyderabad|
|          1|{vijay, shankar}|     pune|
+-----------+----------------+---------+



In [32]:
# StructType version of the schema, assembled field by field:
# a long id, a nested first/last-name struct, and a string city.
customer_schema = (
    StructType()
    .add("customer_id", LongType())
    .add(
        "fullname",
        StructType()
        .add("firstname", StringType())
        .add("lastname", StringType()),
    )
    .add("city", StringType())
)

In [33]:
# Build the same in-memory DataFrame, this time typed by the StructType schema.
df = spark.createDataFrame(data=customer_list, schema=customer_schema)

In [34]:
# Print the column tree to confirm the StructType schema was applied to the in-memory DataFrame.
df.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- fullname: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- city: string (nullable = true)

