# Variables holding the paths to the JSON files

In [None]:
# Folder that holds all of the sample JSON files; kept in one place so the
# individual paths below cannot drift apart.
UPLOAD_DIR = "dbfs:/FileStore/shared_uploads/yateed1437@gmail.com"

json_file_mz = f"{UPLOAD_DIR}/multiline_zipcode.json"  # read below with the multiline option
json_file = f"{UPLOAD_DIR}/zipcodes.json"
# NOTE(review): json_zc1 points at zipcode2.json and json_zc2 at zipcode1.json;
# kept exactly as in the original -- confirm the crossed numbering is intentional.
json_zc1 = f"{UPLOAD_DIR}/zipcode2.json"
json_zc2 = f"{UPLOAD_DIR}/zipcode1.json"

# Reading with json()

In [None]:
# Read the zipcode file with DataFrameReader.json(); the schema is inferred
# from the data. Uses the json_file path variable defined in the first cell
# rather than repeating the literal path.
df = spark.read.json(json_file)
df.printSchema()
df.show()

root
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Decommisioned: boolean (nullable = true)
 |-- EstimatedPopulation: long (nullable = true)
 |-- Lat: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- LocationText: string (nullable = true)
 |-- LocationType: string (nullable = true)
 |-- Long: double (nullable = true)
 |-- Notes: string (nullable = true)
 |-- RecordNumber: long (nullable = true)
 |-- State: string (nullable = true)
 |-- TaxReturnsFiled: long (nullable = true)
 |-- TotalWages: long (nullable = true)
 |-- WorldRegion: string (nullable = true)
 |-- Xaxis: double (nullable = true)
 |-- Yaxis: double (nullable = true)
 |-- Zaxis: double (nullable = true)
 |-- ZipCodeType: string (nullable = true)
 |-- Zipcode: long (nullable = true)

+-------------------+-------+-------------+-------------------+-----+--------------------+--------------------+--------------+-------+-------------+------------+-----+---------------+-------

# Reading with format()

In [None]:
# Same read expressed through the generic format()/load() API. The built-in
# JSON source is selected by its short name "json" instead of the legacy
# fully qualified name.
df = spark.read.format("json").load(json_file)

# Reading with the multiline option

In [None]:
# Enable the reader's multiline option so that a single JSON record may
# span several lines of the file, then show the parsed rows.
multiline_df = (
    spark.read
    .option("multiline", "true")
    .json(json_file_mz)
)
multiline_df.show()

+-------------------+------------+-----+-----------+-------+
|               City|RecordNumber|State|ZipCodeType|Zipcode|
+-------------------+------------+-----+-----------+-------+
|PASEO COSTA DEL SUR|           2|   PR|   STANDARD|    704|
|       BDA SAN LUIS|          10|   PR|   STANDARD|    709|
+-------------------+------------+-----+-----------+-------+



# Reading multiple files at once

In [None]:
# json() also accepts a list of paths; both files are loaded into one DataFrame.
zipcode_paths = [json_zc1, json_zc2]
mul_df = spark.read.json(zipcode_paths)
mul_df.show()

+-------------------+-------+-------------+-----+--------------------+--------------------+--------------+------+------------+-----+-----------+-----+-----+-----+-----------+-------+
|               City|Country|Decommisioned|  Lat|            Location|        LocationText|  LocationType|  Long|RecordNumber|State|WorldRegion|Xaxis|Yaxis|Zaxis|ZipCodeType|Zipcode|
+-------------------+-------+-------------+-----+--------------------+--------------------+--------------+------+------------+-----+-----------+-----+-----+-----+-----------+-------+
|PASEO COSTA DEL SUR|     US|        false|17.96|NA-US-PR-PASEO CO...|Paseo Costa Del S...|NOT ACCEPTABLE|-66.22|           2|   PR|         NA| 0.38|-0.87|  0.3|   STANDARD|    704|
|       BDA SAN LUIS|     US|        false|18.14|NA-US-PR-BDA SAN ...|    Bda San Luis, PR|NOT ACCEPTABLE|-66.26|          10|   PR|         NA| 0.38|-0.86| 0.31|   STANDARD|    709|
|        PARC PARQUE|     US|        false|17.96|NA-US-PR-PARC PARQUE|     Parc Parqu

# Reading all files in a folder

In [None]:
# A glob pattern loads every *.json file in the upload folder at once.
# NOTE(review): the result contains a _corrupt_record column -- presumably
# because the glob also matches multiline_zipcode.json, which is read here
# without the multiline option. Confirm and exclude that file if unwanted.
all_json = spark.read.json(
    "dbfs:/FileStore/shared_uploads/yateed1437@gmail.com/*.json"
)
all_json.show()

+-------------------+-------+-------------+-------------------+-----+--------------------+--------------------+--------------+-------+-------------+------------+-----+---------------+----------+-----------+-----+-----+-----+-----------+-------+---------------+
|               City|Country|Decommisioned|EstimatedPopulation|  Lat|            Location|        LocationText|  LocationType|   Long|        Notes|RecordNumber|State|TaxReturnsFiled|TotalWages|WorldRegion|Xaxis|Yaxis|Zaxis|ZipCodeType|Zipcode|_corrupt_record|
+-------------------+-------+-------------+-------------------+-----+--------------------+--------------------+--------------+-------+-------------+------------+-----+---------------+----------+-----------+-----+-----+-----+-----------+-------+---------------+
|        PARC PARQUE|     US|        false|               NULL|17.96|NA-US-PR-PARC PARQUE|     Parc Parque, PR|NOT ACCEPTABLE| -66.22|         NULL|           1|   PR|           NULL|      NULL|         NA| 0.38|-0.87

# Defining our own schema

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType, BooleanType

# Explicit schema for the zipcode records, passed to the reader in place of
# schema inference. Every field is nullable.
schema = StructType([
    StructField("RecordNumber", IntegerType(), True),
    StructField("Zipcode", IntegerType(), True),
    StructField("ZipCodeType", StringType(), True),
    StructField("City", StringType(), True),
    StructField("State", StringType(), True),
    StructField("LocationType", StringType(), True),
    StructField("Lat", DoubleType(), True),
    StructField("Long", DoubleType(), True),
    # Xaxis holds fractional values (e.g. 0.38). Declaring it as IntegerType
    # made every value come back as null, so it is a double like Yaxis/Zaxis.
    StructField("Xaxis", DoubleType(), True),
    StructField("Yaxis", DoubleType(), True),
    StructField("Zaxis", DoubleType(), True),
    StructField("WorldRegion", StringType(), True),
    StructField("Country", StringType(), True),
    StructField("LocationText", StringType(), True),
    StructField("Location", StringType(), True),
    StructField("Decommisioned", BooleanType(), True),
    # TaxReturnsFiled is a numeric count in the data (inferred as long when no
    # schema is given), so it is typed as an integer rather than a string.
    StructField("TaxReturnsFiled", IntegerType(), True),
    StructField("EstimatedPopulation", IntegerType(), True),
    # NOTE(review): TotalWages fits in 32 bits for the sample rows shown;
    # consider LongType if larger totals can occur.
    StructField("TotalWages", IntegerType(), True),
    StructField("Notes", StringType(), True),
])

# Read the file with the explicit schema and preview the result.
df1 = spark.read.schema(schema).json(json_file)
df1.show()

+------------+-------+-----------+-------------------+-----+--------------+-----+-------+-----+-----+-----+-----------+-------+--------------------+--------------------+-------------+---------------+-------------------+----------+-------------+
|RecordNumber|Zipcode|ZipCodeType|               City|State|  LocationType|  Lat|   Long|Xaxis|Yaxis|Zaxis|WorldRegion|Country|        LocationText|            Location|Decommisioned|TaxReturnsFiled|EstimatedPopulation|TotalWages|        Notes|
+------------+-------+-----------+-------------------+-----+--------------+-----+-------+-----+-----+-----+-----------+-------+--------------------+--------------------+-------------+---------------+-------------------+----------+-------------+
|           1|    704|   STANDARD|        PARC PARQUE|   PR|NOT ACCEPTABLE|17.96| -66.22| null|-0.87|  0.3|         NA|     US|     Parc Parque, PR|NA-US-PR-PARC PARQUE|        false|           null|               null|      null|         null|
|           2|    70

In [None]:
# Render df1 as a rich table. NOTE(review): display() appears to be provided by
# the Databricks notebook environment (the paths use dbfs:/) -- confirm before
# running this notebook elsewhere.
df1.display()

RecordNumber,Zipcode,ZipCodeType,City,State,LocationType,Lat,Long,Xaxis,Yaxis,Zaxis,WorldRegion,Country,LocationText,Location,Decommisioned,TaxReturnsFiled,EstimatedPopulation,TotalWages,Notes
1,704,STANDARD,PARC PARQUE,PR,NOT ACCEPTABLE,17.96,-66.22,,-0.87,0.3,,US,"Parc Parque, PR",NA-US-PR-PARC PARQUE,False,,,,
2,704,STANDARD,PASEO COSTA DEL SUR,PR,NOT ACCEPTABLE,17.96,-66.22,,-0.87,0.3,,US,"Paseo Costa Del Sur, PR",NA-US-PR-PASEO COSTA DEL SUR,False,,,,
10,709,STANDARD,BDA SAN LUIS,PR,NOT ACCEPTABLE,18.14,-66.26,,-0.86,0.31,,US,"Bda San Luis, PR",NA-US-PR-BDA SAN LUIS,False,,,,
61391,76166,UNIQUE,CINGULAR WIRELESS,TX,NOT ACCEPTABLE,32.72,-97.31,,-0.83,0.54,,US,"Cingular Wireless, TX",NA-US-TX-CINGULAR WIRELESS,False,,,,
61392,76177,STANDARD,FORT WORTH,TX,PRIMARY,32.75,-97.33,,-0.83,0.54,,US,"Fort Worth, TX",NA-US-TX-FORT WORTH,False,2126.0,4053.0,122396986.0,
61393,76177,STANDARD,FT WORTH,TX,ACCEPTABLE,32.75,-97.33,,-0.83,0.54,,US,"Ft Worth, TX",NA-US-TX-FT WORTH,False,2126.0,4053.0,122396986.0,
4,704,STANDARD,URB EUGENE RICE,PR,NOT ACCEPTABLE,17.96,-66.22,,-0.87,0.3,,US,"Urb Eugene Rice, PR",NA-US-PR-URB EUGENE RICE,False,,,,
39827,85209,STANDARD,MESA,AZ,PRIMARY,33.37,-111.64,,-0.77,0.55,,US,"Mesa, AZ",NA-US-AZ-MESA,False,14962.0,26883.0,563792730.0,"no NWS data,"
39828,85210,STANDARD,MESA,AZ,PRIMARY,33.38,-111.84,,-0.77,0.55,,US,"Mesa, AZ",NA-US-AZ-MESA,False,14374.0,25446.0,471000465.0,
49345,32046,STANDARD,HILLIARD,FL,PRIMARY,30.69,-81.92,,-0.85,0.51,,US,"Hilliard, FL",NA-US-FL-HILLIARD,False,3922.0,7443.0,133112149.0,


In [None]:
# Register the JSON file as a temporary SQL view named "zipcode",
# then query the view with plain SQL.
create_view_sql = (
    "CREATE OR REPLACE TEMPORARY VIEW zipcode USING json OPTIONS"
    f" (path '{json_file}')"
)
spark.sql(create_view_sql)
spark.sql("SELECT * FROM zipcode").show()

+-------------------+-------+-------------+-------------------+-----+--------------------+--------------------+--------------+-------+-------------+------------+-----+---------------+----------+-----------+-----+-----+-----+-----------+-------+
|               City|Country|Decommisioned|EstimatedPopulation|  Lat|            Location|        LocationText|  LocationType|   Long|        Notes|RecordNumber|State|TaxReturnsFiled|TotalWages|WorldRegion|Xaxis|Yaxis|Zaxis|ZipCodeType|Zipcode|
+-------------------+-------+-------------+-------------------+-----+--------------------+--------------------+--------------+-------+-------------+------------+-----+---------------+----------+-----------+-----+-----+-----+-----------+-------+
|        PARC PARQUE|     US|        false|               null|17.96|NA-US-PR-PARC PARQUE|     Parc Parque, PR|NOT ACCEPTABLE| -66.22|         null|           1|   PR|           null|      null|         NA| 0.38|-0.87|  0.3|   STANDARD|    704|
|PASEO COSTA DEL SUR

# Writing into JSON format

In [None]:
# Write df1 out in JSON format. The writer's default save mode raises an error
# when the target path already exists, which breaks this cell on a second run;
# "overwrite" keeps the notebook re-runnable from a fresh kernel.
df1.write.mode("overwrite").json("/temp/spark_output/sample.json")

In [None]:
# Save modes accepted by DataFrameWriter.mode():
#   overwrite             - replace whatever already exists at the target path
#   append                - add the new data to what is already there
#   ignore                - skip the write when the target already exists
#   errorifexists / error - the default; raise an error when the target already exists
(
    df1.write
    .mode("overwrite")
    .json("/temp/spark_output/temp.json")
)

In [None]:
# Read back the JSON output written by the previous cell (the schema is
# inferred again from the files) and render it as a table.
roundtrip_path = "/temp/spark_output/temp.json"
df2 = spark.read.json(roundtrip_path)
df2.display()

City,Country,Decommisioned,EstimatedPopulation,Lat,Location,LocationText,LocationType,Long,Notes,RecordNumber,State,TaxReturnsFiled,TotalWages,WorldRegion,Yaxis,Zaxis,ZipCodeType,Zipcode
PARC PARQUE,US,False,,17.96,NA-US-PR-PARC PARQUE,"Parc Parque, PR",NOT ACCEPTABLE,-66.22,,1,PR,,,,-0.87,0.3,STANDARD,704
PASEO COSTA DEL SUR,US,False,,17.96,NA-US-PR-PASEO COSTA DEL SUR,"Paseo Costa Del Sur, PR",NOT ACCEPTABLE,-66.22,,2,PR,,,,-0.87,0.3,STANDARD,704
BDA SAN LUIS,US,False,,18.14,NA-US-PR-BDA SAN LUIS,"Bda San Luis, PR",NOT ACCEPTABLE,-66.26,,10,PR,,,,-0.86,0.31,STANDARD,709
CINGULAR WIRELESS,US,False,,32.72,NA-US-TX-CINGULAR WIRELESS,"Cingular Wireless, TX",NOT ACCEPTABLE,-97.31,,61391,TX,,,,-0.83,0.54,UNIQUE,76166
FORT WORTH,US,False,4053.0,32.75,NA-US-TX-FORT WORTH,"Fort Worth, TX",PRIMARY,-97.33,,61392,TX,2126.0,122396986.0,,-0.83,0.54,STANDARD,76177
FT WORTH,US,False,4053.0,32.75,NA-US-TX-FT WORTH,"Ft Worth, TX",ACCEPTABLE,-97.33,,61393,TX,2126.0,122396986.0,,-0.83,0.54,STANDARD,76177
URB EUGENE RICE,US,False,,17.96,NA-US-PR-URB EUGENE RICE,"Urb Eugene Rice, PR",NOT ACCEPTABLE,-66.22,,4,PR,,,,-0.87,0.3,STANDARD,704
MESA,US,False,26883.0,33.37,NA-US-AZ-MESA,"Mesa, AZ",PRIMARY,-111.64,"no NWS data,",39827,AZ,14962.0,563792730.0,,-0.77,0.55,STANDARD,85209
MESA,US,False,25446.0,33.38,NA-US-AZ-MESA,"Mesa, AZ",PRIMARY,-111.84,,39828,AZ,14374.0,471000465.0,,-0.77,0.55,STANDARD,85210
HILLIARD,US,False,7443.0,30.69,NA-US-FL-HILLIARD,"Hilliard, FL",PRIMARY,-81.92,,49345,FL,3922.0,133112149.0,,-0.85,0.51,STANDARD,32046
