In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import Window as WN

In [2]:
spark = SparkSession.builder.master("local[2]").appName("Spark-Query").getOrCreate()

In [3]:
file_path ="E:\\workforce\\spark\\data\\emp_json.json"
# dataframe = spark.read.load(file_path, format='json',multiLine=True,schema=None)
dataframe = spark.read.format("json").option("path",file_path).option("multiLine",True).option("schema",None).load()

In [4]:
dataframe.show(truncate = False)

+-------------------+------+----------------------+------+------+
|address            |fname |languages             |lname |salary|
+-------------------+------+----------------------+------+------+
|[{India}, {Canada}]|Taukir|[english, hindi, urdu]|khan  |30    |
|[{India}, {Canada}]|Zeenat|[english, urdu]       |khan  |50    |
|[{germany}]        |Banana|[tamil, english]      |haider|25    |
+-------------------+------+----------------------+------+------+



In [5]:
dataframe.select(col("fname"),col("address")[0].getItem('country')).show()

+------+------------------+
| fname|address[0].country|
+------+------------------+
|Taukir|             India|
|Zeenat|             India|
|Banana|           germany|
+------+------------------+



In [6]:
dataframe.select(col("fname"),col("languages")[2]).show()

+------+------------+
| fname|languages[2]|
+------+------------+
|Taukir|        urdu|
|Zeenat|        NULL|
|Banana|        NULL|
+------+------------+



In [7]:
dataframe.select(concat(col("fname"),col("lname")).alias("name")).\
filter(col("address")[0].getItem('country') == 'India').show()

+----------+
|      name|
+----------+
|Taukirkhan|
|Zeenatkhan|
+----------+



In [8]:
# dataframe.select(dataframe.fname).show()
# dataframe.select(dataframe.address[0].country).show()

%md
### Creating a new column as Json type

In [95]:
from pyspark.sql.types import IntegerType,StringType,StructField,MapType,StructType,ArrayType

In [34]:
data = [
    {"fname": "Rahul", "lname": "Kumar", "gender": "M", "salary": 10000, "bonus": 1000, "deduction": 500},
    {"fname": "Taukir", "lname": "Khan", "gender": "M", "salary": 12000, "bonus": 2000},
    {"fname": "Pooja", "lname": "Sharma", "gender": "F", "salary": 15000, "bonus": 3000, "deduction": 2000},
     {"fname": "Ravi", "lname": "Kumari", "gender": "other", "salary": 15000, "bonus": 3000, "deduction": 2000}
]
dataframe = spark.createDataFrame(data) 
dataframe.show()

+-----+---------+------+------+------+------+
|bonus|deduction| fname|gender| lname|salary|
+-----+---------+------+------+------+------+
| 1000|      500| Rahul|     M| Kumar| 10000|
| 2000|     NULL|Taukir|     M|  Khan| 12000|
| 3000|     2000| Pooja|     F|Sharma| 15000|
| 3000|     2000|  Ravi| other|Kumari| 15000|
+-----+---------+------+------+------+------+



In [91]:
## Make json type object
import json
d1 = {"device":"mobile","country" : "India",'languages' : ['English','Hindi']}
json_col = lit(json.dumps(d1))
json_col

Column<'{"device": "mobile", "country": "India", "languages": ["English", "Hindi"]}'>

In [92]:
dataframe2 = dataframe.withColumn('src_rec',lit(json_col))
dataframe2.show(truncate = False)

+-----+---------+------+------+------+------+---------------------------------------------------------------------------+
|bonus|deduction|fname |gender|lname |salary|src_rec                                                                    |
+-----+---------+------+------+------+------+---------------------------------------------------------------------------+
|1000 |500      |Rahul |M     |Kumar |10000 |{"device": "mobile", "country": "India", "languages": ["English", "Hindi"]}|
|2000 |NULL     |Taukir|M     |Khan  |12000 |{"device": "mobile", "country": "India", "languages": ["English", "Hindi"]}|
|3000 |2000     |Pooja |F     |Sharma|15000 |{"device": "mobile", "country": "India", "languages": ["English", "Hindi"]}|
|3000 |2000     |Ravi  |other |Kumari|15000 |{"device": "mobile", "country": "India", "languages": ["English", "Hindi"]}|
+-----+---------+------+------+------+------+---------------------------------------------------------------------------+



In [41]:
## We need to conver json string data to Map type in order to access the key and value
##dataframe2.select(col('src_rec.divice')).show()

In [93]:
dataframe2.printSchema()

root
 |-- bonus: long (nullable = true)
 |-- deduction: long (nullable = true)
 |-- fname: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- src_rec: string (nullable = false)



In [99]:
# src_rec_type = spark.read.json(dataframe2.rdd.map(lambda row: row.src_rec)).schema
#json_schema = spark.read.json(df.rdd.map(lambda row: row.json)).schema
src_rec_type = StructType(
  [
    StructField('device', StringType(), True),
    StructField('country', StringType(), True),
    StructField('languages', ArrayType(StringType()), True)
  ]
)

dataframe3 = dataframe2.withColumn('src_rec',from_json(col('src_rec'), schema))
dataframe3.show(truncate = False)

+-----+---------+------+------+------+------+---------------------------------+
|bonus|deduction|fname |gender|lname |salary|src_rec                          |
+-----+---------+------+------+------+------+---------------------------------+
|1000 |500      |Rahul |M     |Kumar |10000 |{mobile, India, [English, Hindi]}|
|2000 |NULL     |Taukir|M     |Khan  |12000 |{mobile, India, [English, Hindi]}|
|3000 |2000     |Pooja |F     |Sharma|15000 |{mobile, India, [English, Hindi]}|
|3000 |2000     |Ravi  |other |Kumari|15000 |{mobile, India, [English, Hindi]}|
+-----+---------+------+------+------+------+---------------------------------+



In [100]:
dataframe3.printSchema()

root
 |-- bonus: long (nullable = true)
 |-- deduction: long (nullable = true)
 |-- fname: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- src_rec: struct (nullable = true)
 |    |-- device: string (nullable = true)
 |    |-- country: string (nullable = true)
 |    |-- languages: array (nullable = true)
 |    |    |-- element: string (containsNull = true)



In [102]:
dataframe3.select(col('fname'),col('src_rec.country')).show()

+------+-------+
| fname|country|
+------+-------+
| Rahul|  India|
|Taukir|  India|
| Pooja|  India|
|  Ravi|  India|
+------+-------+



In [105]:
json_schema = spark.read.json(dataframe2.rdd.map(lambda row: row.fname)).schema
json_schema

StructType([StructField('_corrupt_record', StringType(), True)])

Column<'fname[schema]'>