In [1]:
# Initialise findspark first so that it runs before the pyspark imports in the next cell.
# NOTE(review): presumably this locates the local Spark install and makes pyspark importable — confirm.
import findspark
findspark.init()

In [2]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.streaming import *
from pyspark.sql.types import *

# Create the notebook's Spark session (or reuse one that already exists) on the
# "local" master under the application name "FromJson".
spark = (
    SparkSession.builder
    .master("local")
    .appName("FromJson")
    .getOrCreate()
)
# Handle to the underlying SparkContext of that session.
sc = spark.sparkContext

In [8]:
# Limit Spark's log output to the Error level.
sc.setLogLevel("Error")

# Use a fixed number of shuffle partitions.
spark.conf.set("spark.sql.shuffle.partitions", 3)

# Adaptive query execution and its coalescePartitions feature would make the
# number of partitions dynamic, so both are switched off here.
spark.conf.set("spark.sql.adaptive.enabled", "false")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "false")

# Background: for IoT data we often have a String column that holds JSON.
# Its fields can be extracted in the following three ways, all imported from
# the sql functions module:
#   json_tuple - disadvantage: the field names must be known in advance and
#                are matched case-sensitively
#   from_json  - the preferable method; it needs two things: the schema of the
#                JSON column and the column data itself
#   to_json    - used when we have a struct type and want to convert it to JSON
# Dataset folder. Defaults to the original local folder; set the DATASET_DIR
# environment variable to run the notebook against a different location.
# The value is normalised to end with exactly one "/" because file names are
# appended to it by plain string concatenation.
filepath = (
    os.environ.get("DATASET_DIR")
    or "file:///C:/Users/venka/PycharmProjects/pythonProject/dataset/"
).rstrip("/") + "/"

# Read the CSV with a header row and inferred column types. Multi-line parsing
# and '"' as the escape character are enabled.
# NOTE(review): presumably needed for the quoted JSON in the request column — confirm.
df = spark.read.csv(
    filepath + "IntFromJson.csv",
    header=True,
    multiLine=True,
    escape="\"",
    inferSchema=True,
)
df.show()

+-------------+--------------+--------------------+
|PartitionDate|        Status|             request|
+-------------+--------------+--------------------+
|   2020-06-30|Internal Error|{"Response":{"Mes...|
|   2020-06-30|       Success|{"Response":{"Mes...|
+-------------+--------------+--------------------+



In [9]:
# Print the column names and types of the loaded CSV (read with inferSchema=True).
df.printSchema()

root
 |-- PartitionDate: date (nullable = true)
 |-- Status: string (nullable = true)
 |-- request: string (nullable = true)



In [10]:
# Pull the top-level "Response" object out of the JSON string column and drop
# the raw string. No alias is given, so the extracted column shows up as "c0".
(
    df.select("*", json_tuple(col("request"), "Response"))
    .drop("request")
    .show()
)

+-------------+--------------+--------------------+
|PartitionDate|        Status|                  c0|
+-------------+--------------+--------------------+
|   2020-06-30|Internal Error| {"MessageId":15432}|
|   2020-06-30|       Success|{"MessageId":1543...|
+-------------+--------------+--------------------+



In [16]:
# Method 1 - json_tuple.
# Field names are case sensitive: use exactly the same name as the JSON key
# (note the lower-case "longitude"), otherwise the column values will be null.
# The intermediate column is given an explicit alias so the second extraction
# does not depend on the auto-generated column name "c0".
(
    df.select("*", json_tuple("request", "Response").alias("response_json"))
    .drop("request")
    .select(
        "*",
        json_tuple("response_json", "MessageId", "Latitude", "longitude")
        .alias("MessageId", "Latitude", "Longitude"),
    )
    .drop("response_json")
    .show()
)

+-------------+--------------+---------+---------+---------+
|PartitionDate|        Status|MessageId| Latitude|Longitude|
+-------------+--------------+---------+---------+---------+
|   2020-06-30|Internal Error|    15432|     NULL|     NULL|
|   2020-06-30|       Success|    15432|-176.2989|   7.3614|
+-------------+--------------+---------+---------+---------+



In [19]:
# Method 2 - from_json.
# Step 1: go through the DataFrame's RDD to obtain the raw JSON string of the
# request column for every row, and collect them for inspection.
(
    df.select(col("request").alias("jsoncol"))
    .rdd
    .map(lambda row: row["jsoncol"])
    .collect()
)

['{"Response":{"MessageId" : 15432 }}',
 '{"Response":{"MessageId" : 15432,"Latitude":"-176.2989","longitude":"7.3614" }}']

In [25]:
# Step 2: infer the schema of the JSON payload. spark.read.json is handed the
# RDD of JSON strings, and only the schema of the resulting DataFrame is kept.
df_jsonsch = spark.read.json(
    df.select(col("request").alias("jsoncol")).rdd.map(lambda row: row["jsoncol"])
).schema

In [26]:
# Display the schema inferred from the JSON strings.
df_jsonsch

StructType([StructField('Response', StructType([StructField('Latitude', StringType(), True), StructField('MessageId', LongType(), True), StructField('longitude', StringType(), True)]), True)])

In [27]:
# Step 3: parse the JSON string in "request" with the inferred schema and
# append the result as a struct column named "jsonstr".
df_fromjson = df.withColumn("jsonstr", from_json(col("request"), df_jsonsch))

In [48]:
# Print the schema of the frame that now includes the parsed "jsonstr" struct column.
df_fromjson.printSchema()

root
 |-- PartitionDate: date (nullable = true)
 |-- Status: string (nullable = true)
 |-- request: string (nullable = true)
 |-- jsonstr: struct (nullable = true)
 |    |-- Response: struct (nullable = true)
 |    |    |-- Latitude: string (nullable = true)
 |    |    |-- MessageId: long (nullable = true)
 |    |    |-- longitude: string (nullable = true)



In [49]:
# Show the rows with truncation turned off so the complete JSON strings are visible.
df_fromjson.show(truncate=False)

+-------------+--------------+-------------------------------------------------------------------------------+----------------------------+
|PartitionDate|Status        |request                                                                        |jsonstr                     |
+-------------+--------------+-------------------------------------------------------------------------------+----------------------------+
|2020-06-30   |Internal Error|{"Response":{"MessageId" : 15432 }}                                            |{{NULL, 15432, NULL}}       |
|2020-06-30   |Success       |{"Response":{"MessageId" : 15432,"Latitude":"-176.2989","longitude":"7.3614" }}|{{-176.2989, 15432, 7.3614}}|
+-------------+--------------+-------------------------------------------------------------------------------+----------------------------+



In [47]:
# Name of the first field nested inside the "jsonstr" struct ...
col1 = df_fromjson.schema["jsonstr"].dataType.fields[0].name
# ... turned into a star-expansion path that selects every field beneath it.
chk = f"jsonstr.{col1}.*"
chk

'jsonstr.Response.*'

In [57]:
# Flatten the nested fields into top-level columns alongside the original
# ones, then drop the raw JSON string and the parsed struct.
(
    df_fromjson
    .select("*", col(chk))
    .drop("request", "jsonstr")
    .show()
)

+-------------+--------------+---------+---------+---------+
|PartitionDate|        Status| Latitude|MessageId|longitude|
+-------------+--------------+---------+---------+---------+
|   2020-06-30|Internal Error|     NULL|    15432|     NULL|
|   2020-06-30|       Success|-176.2989|    15432|   7.3614|
+-------------+--------------+---------+---------+---------+



In [58]:
# Show only the flattened nested fields.
df_fromjson.select(chk).show()

+---------+---------+---------+
| Latitude|MessageId|longitude|
+---------+---------+---------+
|     NULL|    15432|     NULL|
|-176.2989|    15432|   7.3614|
+---------+---------+---------+



In [55]:
# Method 3 - to_json.
# Expand "jsonstr" to expose its Response struct, then serialise that struct
# back into a JSON string.
(
    df_fromjson
    .select("jsonstr.*")
    .select(to_json("Response"))
    .show()
)

+--------------------+
|   to_json(Response)|
+--------------------+
| {"MessageId":15432}|
|{"Latitude":"-176...|
+--------------------+

