In [6]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Sample Spark').getOrCreate()

## Read single liner json

In [87]:
# Read JSON

df = spark.read.format('json')\
    .option('mode', 'PERMISSIVE')\
    .option('inferSchema', 'true')\
    .load('./data/single_line.json')

df.show()

+-------------------+-------+-------------+-------------------+-----+--------------------+--------------------+--------------+-------+-------------+------------+-----+---------------+----------+-----------+-----+-----+-----+-----------+-------+
|               City|Country|Decommisioned|EstimatedPopulation|  Lat|            Location|        LocationText|  LocationType|   Long|        Notes|RecordNumber|State|TaxReturnsFiled|TotalWages|WorldRegion|Xaxis|Yaxis|Zaxis|ZipCodeType|Zipcode|
+-------------------+-------+-------------+-------------------+-----+--------------------+--------------------+--------------+-------+-------------+------------+-----+---------------+----------+-----------+-----+-----+-----+-----------+-------+
|        PARC PARQUE|     US|        false|               null|17.96|NA-US-PR-PARC PARQUE|     Parc Parque, PR|NOT ACCEPTABLE| -66.22|         null|           1|   PR|           null|      null|         NA| 0.38|-0.87|  0.3|   STANDARD|    704|
|PASEO COSTA DEL SUR

## Read Multiline JSON


In [86]:
# Read Multiline JSON

df = spark.read.format('json')\
    .option('mode', 'PERMISSIVE')\
    .option('inferSchema', 'true')\
    .option('multiline', 'true')\
    .load('./data/multiline_json.json')

df.show()

+---+--------+------+
|age|    name|salary|
+---+--------+------+
| 20|  Manish| 20000|
| 25|  Nikita| 21000|
| 16|  Pritam| 22000|
| 35|Prantosh| 25000|
| 67|  Vikash| 40000|
+---+--------+------+



## Simple CSV

In [85]:
# Read CSV

df = spark.read.format('csv')\
    .option('inferSchema', True)\
    .option('header', True)\
    .load('./data/simple_csv.csv')

df.show()

+----+--------------------+--------------------+------------+--------------------+-----+-----------------+
|year|industry_code_ANZSIC|industry_name_ANZSIC|rme_size_grp|            variable|value|             unit|
+----+--------------------+--------------------+------------+--------------------+-----+-----------------+
|2011|                   A|Agriculture, Fore...|         a_0|       Activity unit|46134|            COUNT|
|2011|                   A|Agriculture, Fore...|         a_0|Rolling mean empl...|    0|            COUNT|
|2011|                   A|Agriculture, Fore...|         a_0|Salaries and wage...|  279|DOLLARS(millions)|
|2011|                   A|Agriculture, Fore...|         a_0|Sales, government...| 8187|DOLLARS(millions)|
|2011|                   A|Agriculture, Fore...|         a_0|        Total income| 8866|DOLLARS(millions)|
|2011|                   A|Agriculture, Fore...|         a_0|   Total expenditure| 7618|DOLLARS(millions)|
|2011|                   A|Agricultur

## CSV with DDL Schema

In [82]:
#  don't consider space in the column names

schema = """
Index int,
CustomerId string,
FirstName string,
LastName string,
Company	string,
City string,
Country string,
Phone1 double,
Phone2 double,
Email string,
SubscriptionDate date,
Website string
"""

spark.read.format('csv')\
    .schema(schema)\
    .option('header', 'true')\
    .load('./data/for_csv_ddl.csv').show()

+-----+---------------+---------+---------+--------------------+-----------------+--------------------+-------------+------+--------------------+----------------+--------------------+
|Index|     CustomerId|FirstName| LastName|             Company|             City|             Country|       Phone1|Phone2|               Email|SubscriptionDate|             Website|
+-----+---------------+---------+---------+--------------------+-----------------+--------------------+-------------+------+--------------------+----------------+--------------------+
|    1|DD37Cf93aecA6Dc|   Sheryl|   Baxter|     Rasmussen Group|     East Leonard|               Chile|         null|  null|zunigavanessa@smi...|      2020-08-24|http://www.stephe...|
|    2|1Ef7b82A4CAAD10|  Preston|   Lozano|         Vega-Gentry|East Jimmychester|            Djibouti|5.153435776E9|  null|     vmata@colon.com|      2021-04-23|http://www.hobbs....|
|    3|6F94879bDAfE5a6|      Roy|    Berry|       Murillo-Perry|    Isabelboroug

## JSON with DDL Schema

In [113]:
# Simple JSON

schema = """
    id INT,
    name STRING
"""
spark.read.format('json')\
    .option('multiline', 'true')\
    .schema(schema)\
    .load('./data/for_ddl1.json').show()



+---+------+
| id|  name|
+---+------+
|  1|Laptop|
+---+------+



In [129]:
# NESTED JSON

schema = """
    id INT,
    name STRING,
    features STRUCT<
    RAM: STRING,
    storage: STRING
    >
"""

df = spark.read.format('json')\
    .option('multiline', 'true')\
    .schema(schema)\
    .load('./data/for_ddl2.json')

df.select('features.RAM').show()

+----+
| RAM|
+----+
|16GB|
+----+



In [140]:
# COMPLEX JSON

schema = """
    products ARRAY<STRUCT<
    id INT,
    name STRING,
    features STRUCT<
    RAM: STRING,
    storage: STRING
    >,
    tags ARRAY<STRING>
    >>
"""

df = spark.read.format('json')\
    .option('multiline', 'true')\
    .schema(schema)\
    .load('./data/for_ddl3.json')

df.select('products.features.RAM').show()


+-----------+
|        RAM|
+-----------+
|[16GB, 8GB]|
+-----------+



In [146]:
# What if I want one row per product?
#  Explode
from pyspark.sql.functions import explode, col
# Explode the products array to get each product in a separate row
df_exploded = df.select(explode(col("products")).alias("product"))

df_exploded.show(truncate=False)
# Filter for the product with 8GB RAM and select the RAM
df_exploded.select("product.features.RAM").filter(col("product.features.RAM") == "8GB").show()

df_exploded.select("product.features.RAM").collect()[0]

+--------------------------------------------------------+
|product                                                 |
+--------------------------------------------------------+
|{1, Laptop, {16GB, 512GB SSD}, [electronics, computers]}|
|{2, Smartphone, {8GB, 128GB}, [electronics, mobile]}    |
+--------------------------------------------------------+

+---+
|RAM|
+---+
|8GB|
+---+



Row(RAM='16GB')