In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .master("local[*]") \
        .appName("SparkTest") \
        .getOrCreate()

spark

# Agenda

### 2 Ways to tackle the nested schema 
#### String Method 
#### StructType()

{
    "patient_id": 101,
    "name": {
      "first_name": "Amit",
      "last_name": "Sharma"
    },
    "gender": "Male",
    "age": 45,
    "diagnosis": "Diabetes",
    "admission_date": "2024-01-15",
    "city": "Delhi"
  },

In [3]:
patient = spark.read.format('json'). \
     option('Multiline',True). \
     option('InferSchema',True). \
     load("D:\Datasets\patient_data_nested.json")

In [5]:
patient.show()

+--------------+---+---------+-------------+------+--------------+----------+
|admission_date|age|     city|    diagnosis|gender|          name|patient_id|
+--------------+---+---------+-------------+------+--------------+----------+
|    2024-01-15| 45|    Delhi|     Diabetes|  Male|{Amit, Sharma}|       101|
|    2024-02-10| 32|   Mumbai|       Asthma|Female| {Neha, Verma}|       102|
|    2024-03-05| 29|Bangalore|     Fracture|  Male|{Rahul, Mehta}|       103|
|    2024-03-18| 38|     Pune| Hypertension|Female|{Priya, Singh}|       104|
|    2024-04-02| 41|Ahmedabad|Cardiac Issue|  Male|{Karan, Patel}|       105|
+--------------+---+---------+-------------+------+--------------+----------+



In [6]:
patient.printSchema()

root
 |-- admission_date: string (nullable = true)
 |-- age: long (nullable = true)
 |-- city: string (nullable = true)
 |-- diagnosis: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |-- patient_id: long (nullable = true)



### 1st method to define the Schema

In [17]:
schema = 'admission_date date, age long , city string, diagnosis string,gender string, name struct<first_name:string,last_name:string>, patient_id long'

In [22]:
patient_schema = spark.read.format('json'). \
    option('Multiline',True).\
     schema(schema). \
     load("D:\Datasets\patient_data_nested.json")

In [23]:
patient_schema.printSchema()

root
 |-- admission_date: date (nullable = true)
 |-- age: long (nullable = true)
 |-- city: string (nullable = true)
 |-- diagnosis: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |-- patient_id: long (nullable = true)



In [24]:
patient_schema.show(2)

+--------------+---+------+---------+------+--------------+----------+
|admission_date|age|  city|diagnosis|gender|          name|patient_id|
+--------------+---+------+---------+------+--------------+----------+
|    2024-01-15| 45| Delhi| Diabetes|  Male|{Amit, Sharma}|       101|
|    2024-02-10| 32|Mumbai|   Asthma|Female| {Neha, Verma}|       102|
+--------------+---+------+---------+------+--------------+----------+
only showing top 2 rows


## Second method for defining the Schema

In [29]:
from pyspark.sql.types import *
schema_struct = StructType([
    StructField('admission_date', DateType()),
    StructField('city', StringType()),
    StructField('diagnosis', StringType()),
    StructField('gender', StringType()),
    StructField('name',
                StructType([
       StructField('first_name', StringType()),
        StructField('last_name', StringType())
    ])),
    StructField('patient_id', LongType())
])

In [30]:
patient_schema_struct = spark.read.format('json'). \
    option('Multiline',True).\
     schema(schema_struct). \
     load("D:\Datasets\patient_data_nested.json")

In [31]:
patient_schema_struct.printSchema()

root
 |-- admission_date: date (nullable = true)
 |-- city: string (nullable = true)
 |-- diagnosis: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |-- patient_id: long (nullable = true)



In [32]:
patient_schema_struct.show()

+--------------+---------+-------------+------+--------------+----------+
|admission_date|     city|    diagnosis|gender|          name|patient_id|
+--------------+---------+-------------+------+--------------+----------+
|    2024-01-15|    Delhi|     Diabetes|  Male|{Amit, Sharma}|       101|
|    2024-02-10|   Mumbai|       Asthma|Female| {Neha, Verma}|       102|
|    2024-03-05|Bangalore|     Fracture|  Male|{Rahul, Mehta}|       103|
|    2024-03-18|     Pune| Hypertension|Female|{Priya, Singh}|       104|
|    2024-04-02|Ahmedabad|Cardiac Issue|  Male|{Karan, Patel}|       105|
+--------------+---------+-------------+------+--------------+----------+



In [1]:
spark.stop()

NameError: name 'spark' is not defined