### PySpark Read JSON File into DataFrame
- Reading JSON File in PySpark
- Reading from Multiline JSON File
- Reading Multiple Files at a time
- Reading all Files in a Folder
- Reading files with a user-specified custom schema
- Reading file using PySpark SQL

Syntax: <br>
spark.read.**json**("/path/file.json") <br>
spark.read.**format**("json").**load**("/path/file.json")

In [0]:
# Folder holding the sample JSON data, and the three files the examples read.
file_path = "/Volumes/workspace/training/test_data/json"
file_name1, file_name2, file_name3 = (
    "employee.json",
    "employee_updated1.json",
    "employee_updated2.json",
)

In [0]:
# Load one JSON file into a DataFrame, then print its schema and show its rows.
# `full_path` is reused by later cells, so it is kept as a notebook-level name.
full_path = f"{file_path}/{file_name1}"

# multiLine is enabled so a JSON record may span several lines in the file.
df = (
    spark.read
    .option("multiLine", True)
    .json(full_path)
)

df.printSchema()
display(df)

In [0]:
# Load several JSON files in a single call by handing the reader a list of paths.
df = (
    spark.read
    .option("multiLine", True)
    .json([
        f"{file_path}/{file_name1}",
        f"{file_path}/{file_name2}",
        f"{file_path}/{file_name3}",
    ])
)
display(df)

In [0]:
# Load every file in the folder whose name matches the *.json glob pattern.
df = (
    spark.read
    .options(multiLine="true")
    .json(f"{file_path}/*.json")
)
display(df)

In [0]:
# Read the JSON file with an explicit schema instead of letting Spark infer one.

from pyspark.sql.types import StructType, StructField, StringType, LongType, DateType

# Every column is declared nullable (third StructField argument is True).
employee_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("emp_id", LongType()),
        ("first_name", StringType()),
        ("country", StringType()),
        ("salary", LongType()),
        ("hire_date", DateType()),
    )
])

# Apply the schema on the reader before loading the file.
df = (
    spark.read
    .schema(employee_schema)
    .option("multiLine", True)
    .json(full_path)
)

display(df)


In [0]:
# Read the JSON file purely through Spark SQL: define a temporary view named
# `employee` over the file path, then select every row from that view.
create_view_sql = f"""
    CREATE OR REPLACE TEMPORARY VIEW employee
    USING json
    OPTIONS (path "{full_path}", multiLine "true")
"""
spark.sql(create_view_sql)

employee_rows = spark.sql("SELECT * FROM employee")
display(employee_rows)


In [0]:
# Query the JSON data with SQL by first loading it into a DataFrame and then
# registering that DataFrame as a temporary view.

df = (
    spark.read
    .option("multiLine", True)
    .json(full_path)
)

# The SQL statement below refers to the DataFrame by this view name.
df.createOrReplaceTempView("vw_employees")

# NOTE(review): the filter relies on an `age` column — confirm the JSON file
# actually contains that field, otherwise the query cannot resolve it.
result = spark.sql("SELECT emp_id, first_name, salary FROM vw_employees WHERE age < 30")
display(result)
