In [0]:
spark

# Read CSV File in Spark 

In [0]:
flight_data = spark.read.format("csv")\
    .option("header",False)\
    .option("inferSchema",False)\
    .option("mode","FAILFAST")\
    .load("/FileStore/tables/flight_data.csv")
    
    

In [0]:
flight_data.show(5)

+-----------------+-------------------+-----+
|              _c0|                _c1|  _c2|
+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
+-----------------+-------------------+-----+
only showing top 5 rows



In [0]:
flight_data_header = spark.read.format("csv")\
    .option("header",True)\
    .option("inferSchema",False)\
    .option("mode","FAILFAST")\
    .load("/FileStore/tables/flight_data.csv")

flight_data_header.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [0]:
flight_data_header.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: string (nullable = true)



In [0]:
flight_data_header_schema = spark.read.format("csv")\
    .option("header",True)\
    .option("inferSchema",True)\
    .option("mode","FAILFAST")\
    .load("/FileStore/tables/flight_data.csv")

In [0]:
flight_data_header_schema.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [0]:
flight_data_header_schema.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



# schema in spark

In [0]:
flight_data = spark.read.format("csv")\
    .option("header",False)\
    .option("inferSchema",False)\
    .schema(my_schema)\
    .option("mode","FAILFAST")\
    .load("/FileStore/tables/flight_data.csv")

flight_data.show(5)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/databricks/python/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3378, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<command-184348899637104>", line 8, in <module>
    flight_data.show(5)
  File "/databricks/spark/python/pyspark/instrumentation_utils.py", line 48, in wrapper
    res = func(*args, **kwargs)
  File "/databricks/spark/python/pyspark/sql/dataframe.py", line 920, in show
    print(self._jdf.showString(n, 20, vertical))
  File "/databricks/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1321, in __call__
    return_value = get_return_value(
  File "/databricks/spark/python/pyspark/errors/exceptions.py", line 228, in deco
    return f(*a, **kw)
  File "/databricks/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/protocol.py", line 326, in get_return_value
    raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o2730.showString.
: org.apache



In [0]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

In [0]:
my_schema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(),True),  # true means it can be null value
    StructField("ORIGIN_COUNTRY_NAME", StringType(),True),
    StructField("count", IntegerType(),True),
])

In [0]:
flight_data = spark.read.format("csv")\
    .option("header",False)\
    .option("inferSchema",False)\
    .schema(my_schema)\
    .option("mode","FAILFAST")\ #failed because it count column has null value
    .load("/FileStore/tables/flight_data.csv")

flight_data.show(5)

[0;36m  File [0;32m<command-3189809129906557>:5[0;36m[0m
[0;31m    .option("mode","FAILFAST")\ #failed because it count column has null value[0m
[0m                                                                              
^[0m
[0;31mSyntaxError[0m[0;31m:[0m unexpected character after line continuation character


In [0]:
%fs 
ls dbfs:/FileStore/tables

path,name,size,modificationTime
dbfs:/FileStore/tables/Sample_Spreadsheet_100_rows.csv,Sample_Spreadsheet_100_rows.csv,10998,1698776214000
dbfs:/FileStore/tables/employee_data_csv-1.csv,employee_data_csv-1.csv,225,1728911978000
dbfs:/FileStore/tables/employee_data_csv.csv,employee_data_csv.csv,225,1728910862000
dbfs:/FileStore/tables/employees.csv,employees.csv,3777,1698809475000
dbfs:/FileStore/tables/employees1.csv,employees1.csv,269,1698948156000
dbfs:/FileStore/tables/flight_data.csv,flight_data.csv,7120,1728836371000
dbfs:/FileStore/tables/movies.dat,movies.dat,171370,1704036093000
dbfs:/FileStore/tables/ratings.dat,ratings.dat,24594131,1704036116000
dbfs:/FileStore/tables/users.dat,users.dat,134368,1704048652000


In [0]:
flight_data = spark.read.format("csv")\
    .option("header",False)\
    .option("inferSchema",False)\
    .schema(my_schema)\
    .option("mode","PERMISSIVE")\
    .load("/FileStore/tables/flight_data.csv")

flight_data.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| null|
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
+-----------------+-------------------+-----+
only showing top 5 rows



In [0]:
# remove header which is coming from flight_data.csv we have alredy defined schema -.option("skipRows", 1) removed first 1 rows

flight_data = spark.read.format("csv")\
    .option("header",False)\
    .option("skipRows", 1)\
    .option("inferSchema",False)\
    .schema(my_schema)\
    .option("mode","PERMISSIVE")\
    .load("/FileStore/tables/flight_data.csv")

flight_data.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [0]:
# remove header which is coming from flight_data.csv we have alredy defined schema -.option("skipRows", 3) removed first 3 rows

flight_data1 = spark.read.format("csv")\
    .option("header",False)\
    .option("skipRows", 3)\
    .option("inferSchema",False)\
    .schema(my_schema)\
    .option("mode","PERMISSIVE")\
    .load("/FileStore/tables/flight_data.csv")

flight_data1.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
|    United States|          Singapore|   25|
|    United States|            Grenada|   54|
+-----------------+-------------------+-----+
only showing top 5 rows



# Handling corrupted records in spark

In [0]:
employee_data1 = spark.read.format("csv")\
    .option("header",True)\
    .option("inferSchema",False)\
    .option("mode","PERMISSIVE")\
    .load("/FileStore/tables/employee_data_csv-1.csv")

employee_data1.show(5)



+---+--------+---+------+------------+--------+
| id|    name|age|salary|     address| nominee|
+---+--------+---+------+------------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|
|  3|  Pritam| 22|150000|   Bangalore|   India|
|  4|Prantosh| 17|200000|     Kolkata|   India|
|  5|  Vikash| 31|300000|        null|nominee5|
+---+--------+---+------+------------+--------+



In [0]:
employee_data1 = spark.read.format("csv")\
    .option("header",True)\
    .option("inferSchema",False)\
    .option("mode","PERMISSIVE")\
    .load("/FileStore/tables/employee_data_csv-1.csv")

employee_data1.show(5)

In [0]:
employee_data1 = spark.read.format("csv")\
    .option("header",True)\
    .option("inferSchema",False)\
    .option("mode","DROPMALFORMED")\
    .load("/FileStore/tables/employee_data_csv-1.csv")

employee_data1.show(5)

+---+------+---+------+------------+--------+
| id|  name|age|salary|     address| nominee|
+---+------+---+------+------------+--------+
|  1|Manish| 26| 75000|       bihar|nominee1|
|  2|Nikita| 23|100000|uttarpradesh|nominee2|
|  5|Vikash| 31|300000|        null|nominee5|
+---+------+---+------+------------+--------+



In [0]:
employee_data1 = spark.read.format("csv")\
    .option("header",True)\
    .option("inferSchema",False)\
    .option("mode","FAILFAST")\
    .load("/FileStore/tables/employee_data_csv-1.csv")

employee_data1.show(5)

[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
File [0;32m<command-209729011843705>:7[0m
[1;32m      1[0m employee_data1 [38;5;241m=[39m spark[38;5;241m.[39mread[38;5;241m.[39mformat([38;5;124m"[39m[38;5;124mcsv[39m[38;5;124m"[39m)\
[1;32m      2[0m     [38;5;241m.[39moption([38;5;124m"[39m[38;5;124mheader[39m[38;5;124m"[39m,[38;5;28;01mTrue[39;00m)\
[1;32m      3[0m     [38;5;241m.[39moption([38;5;124m"[39m[38;5;124minferSchema[39m[38;5;124m"[39m,[38;5;28;01mFalse[39;00m)\
[1;32m      4[0m     [38;5;241m.[39moption([38;5;124m"[39m[38;5;124mmode[39m[38;5;124m"[39m,[38;5;124m"[39m[38;5;124mFAILFAST[39m[38;5;124m"[39m)\
[1;32m      5[0m     [38;5;241m.[39mload([38;5;124m"[39m[38;5;124m/FileStore/tables/employee_data_csv-1.csv[39m[38;5;124m"[39m)
[0;32m----> 7[0m employee_data1[38;5;241m.[39msho

In [0]:
emp_schema = StructType([
    StructField("id", IntegerType(),True),
    StructField("name", StringType(),True),
    StructField("age", IntegerType(),True),
    StructField("salary", IntegerType(),True),
    StructField("address", StringType(),True),
    StructField("nominee", StringType(),True),
    StructField("_corrupt_record", StringType(),True),
])

In [0]:
employee_data1 = spark.read.format("csv")\
    .option("header",True)\
    .option("inferSchema",False)\
    .schema(emp_schema)\
    .option("mode","PERMISSIVE")\
    .load("/FileStore/tables/employee_data_csv-1.csv")

employee_data1.show(5)

+---+--------+---+------+------------+--------+--------------------+
| id|    name|age|salary|     address| nominee|     _corrupt_record|
+---+--------+---+------+------------+--------+--------------------+
|  1|  Manish| 26| 75000|       bihar|nominee1|                null|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|                null|
|  3|  Pritam| 22|150000|   Bangalore|   India|3,Pritam,22,15000...|
|  4|Prantosh| 17|200000|     Kolkata|   India|4,Prantosh,17,200...|
|  5|  Vikash| 31|300000|        null|nominee5|                null|
+---+--------+---+------+------------+--------+--------------------+



In [0]:
employee_data1.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)
 |-- _corrupt_record: string (nullable = true)



In [0]:
employee_data1 = spark.read.format("csv")\
    .option("header",True)\
    .option("inferSchema",False)\
    .schema(emp_schema)\
    .option("mode","PERMISSIVE")\
    .load("/FileStore/tables/employee_data_csv-1.csv")

employee_data1.show(truncate=False)

+---+--------+---+------+------------+--------+-------------------------------------------+
|id |name    |age|salary|address     |nominee |_corrupt_record                            |
+---+--------+---+------+------------+--------+-------------------------------------------+
|1  |Manish  |26 |75000 |bihar       |nominee1|null                                       |
|2  |Nikita  |23 |100000|uttarpradesh|nominee2|null                                       |
|3  |Pritam  |22 |150000|Bangalore   |India   |3,Pritam,22,150000,Bangalore,India,nominee3|
|4  |Prantosh|17 |200000|Kolkata     |India   |4,Prantosh,17,200000,Kolkata,India,nominee4|
|5  |Vikash  |31 |300000|null        |nominee5|null                                       |
+---+--------+---+------+------------+--------+-------------------------------------------+



In [0]:
employee_data1 = spark.read.format("csv")\
    .option("header",True)\
    .option("inferSchema",False)\
    .schema(emp_schema)\
    .option("badRecordsPath","FileStores/tables/bad_records")\
    .load("/FileStore/tables/employee_data_csv-1.csv")
employee_data1.show()


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/databricks/python/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3378, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<command-209729011843709>", line 7, in <module>
    employee_data1.show()
  File "/databricks/spark/python/pyspark/instrumentation_utils.py", line 48, in wrapper
    res = func(*args, **kwargs)
  File "/databricks/spark/python/pyspark/sql/dataframe.py", line 920, in show
    print(self._jdf.showString(n, 20, vertical))
  File "/databricks/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1321, in __call__
    return_value = get_return_value(
  File "/databricks/spark/python/pyspark/errors/exceptions.py", line 228, in deco
    return f(*a, **kw)
  File "/databricks/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/protocol.py", line 326, in get_return_value
    raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o956.showString.
: org.apach



[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
File [0;32m<command-209729011843712>:1[0m
[0;32m----> 1[0m [43memployee_data1[49m[38;5;241;43m.[39;49m[43mshow[49m[43m([49m[43m)[49m

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     res [38;5;241m=[39m [43mfunc[49m[43m([49m[38;5;241;43m*[39;49m[43margs[49m[43m,[49m[43m [49m[38;5;241;43m*[39;49m[38;5;241;43m*[39;49m[43mkwargs[49m[43m)[49m
[1;32m     49[0m     logger[38;5;241m.[39mlog_success(
[1;32m     50[0m         module_name, class_name, function_name, time[38;5;241m.[39mperf_counter() [38;5;241m-[39m start, signature
[1;32

In [0]:
employee_data1 = spark.read.format("csv")\
    .option("header",True)\
    .option("inferSchema",True)\
    .option("badRecordsPath","/FileStore/tables/bad_records")\
    .load("/FileStore/tables/employee_data_csv-1.csv")
employee_data1.show()


+---+------+---+------+------------+--------+
| id|  name|age|salary|     address| nominee|
+---+------+---+------+------------+--------+
|  1|Manish| 26| 75000|       bihar|nominee1|
|  2|Nikita| 23|100000|uttarpradesh|nominee2|
|  5|Vikash| 31|300000|        null|nominee5|
+---+------+---+------+------------+--------+



In [0]:
%fs

ls /FileStore/tables

path,name,size,modificationTime
dbfs:/FileStore/tables/Sample_Spreadsheet_100_rows.csv,Sample_Spreadsheet_100_rows.csv,10998,1698776214000
dbfs:/FileStore/tables/bad_records/,bad_records/,0,0
dbfs:/FileStore/tables/employee_data_csv-1.csv,employee_data_csv-1.csv,225,1728911978000
dbfs:/FileStore/tables/employee_data_csv.csv,employee_data_csv.csv,225,1728910862000
dbfs:/FileStore/tables/employees.csv,employees.csv,3777,1698809475000
dbfs:/FileStore/tables/employees1.csv,employees1.csv,269,1698948156000
dbfs:/FileStore/tables/flight_data.csv,flight_data.csv,7120,1728836371000
dbfs:/FileStore/tables/movies.dat,movies.dat,171370,1704036093000
dbfs:/FileStore/tables/ratings.dat,ratings.dat,24594131,1704036116000
dbfs:/FileStore/tables/users.dat,users.dat,134368,1704048652000


In [0]:
%fs 
ls dbfs:/FileStore/tables/bad_records/20241014T181328/bad_records/

path,name,size,modificationTime
dbfs:/FileStore/tables/bad_records/20241014T181328/bad_records/part-00000-338bf7a4-254e-4e43-9e02-a4d26225ff72,part-00000-338bf7a4-254e-4e43-9e02-a4d26225ff72,506,1728929612000


In [0]:
bad_records_df = spark.read.format("json").load("dbfs:/FileStore/tables/bad_records/20241014T181328/bad_records/")

In [0]:
bad_records_df.show(truncate=False)

+----------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------+
|path                                          |reason                                                                                                                          |record                                     |
+----------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------+
|dbfs:/FileStore/tables/employee_data_csv-1.csv|org.apache.spark.SparkRuntimeException: [MALFORMED_CSV_RECORD] Malformed CSV record: 3,Pritam,22,150000,Bangalore,India,nominee3|3,Pritam,22,150000,Bangalore,India,nominee3|
|dbfs:/FileStore/tables/employee_data_csv-1.csv|org.apache.spark.SparkRuntimeException: [MALFORMED_CSV_RECORD] M

# how to read json file in pyspark

File uploaded to /FileStore/tables/multi_line_correct.json

File uploaded to /FileStore/tables/file5.json

File uploaded to /FileStore/tables/multi_line_incorrect.json

File uploaded to /FileStore/tables/line_delimited_json_extrafield.json

File uploaded to /FileStore/tables/line_delimited_json.json

File uploaded to /FileStore/tables/corrupted_json.json

In [0]:
# 1. how to read json file
json_df = spark.read.format("json")\
    .option("inferSchema",True)\
    .option("mode","PERMISSIVE")\
    .load("/FileStore/tables/line_delimited_json.json")
json_df.show()

+---+--------+------+
|age|    name|salary|
+---+--------+------+
| 20|  Manish| 20000|
| 25|  Nikita| 21000|
| 16|  Pritam| 22000|
| 35|Prantosh| 25000|
| 67|  Vikash| 40000|
+---+--------+------+



In [0]:
# 2. what if i have 3 keys in all and 4 keys in one lin
json_df1 = spark.read.format("json")\
    .option("inferSchema",True)\
    .option("mode","PERMISSIVE")\
    .load("/FileStore/tables/line_delimited_json_extrafield.json")
json_df1.show()

+---+------+--------+------+
|age|gender|    name|salary|
+---+------+--------+------+
| 20|  null|  Manish| 20000|
| 25|  null|  Nikita| 21000|
| 16|  null|  Pritam| 22000|
| 35|  null|Prantosh| 25000|
| 67|     M|  Vikash| 40000|
+---+------+--------+------+



In [0]:
# 5. read multiline json
json_df2 = spark.read.format("json")\
    .option("inferSchema",True)\
    .option("mode","PERMISSIVE")\
    .option("multiline","true")\
    .load("/FileStore/tables/multi_line_correct.json")
json_df2.show()

+---+--------+------+
|age|    name|salary|
+---+--------+------+
| 20|  Manish| 20000|
| 25|  Nikita| 21000|
| 16|  Pritam| 22000|
| 35|Prantosh| 25000|
| 67|  Vikash| 40000|
+---+--------+------+



In [0]:
# 5. read multiline incorrect json
json_df2 = spark.read.format("json")\
    .option("inferSchema",True)\
    .option("mode","PERMISSIVE")\
    .option("multiline","true")\
    .load("/FileStore/tables/multi_line_incorrect.json")
json_df2.show()

+---+------+------+
|age|  name|salary|
+---+------+------+
| 20|Manish| 20000|
+---+------+------+



In [0]:
# 6. what if i read corrupted recors
json_df1 = spark.read.format("json")\
    .option("inferSchema",True)\
    .option("mode","PERMISSIVE")\
    .load("/FileStore/tables/corrupted_json.json")
json_df1.show(truncate=False)

+----------------------------------------+----+--------+------+
|_corrupt_record                         |age |name    |salary|
+----------------------------------------+----+--------+------+
|null                                    |20  |Manish  |20000 |
|null                                    |25  |Nikita  |21000 |
|null                                    |16  |Pritam  |22000 |
|null                                    |35  |Prantosh|25000 |
|{"name":"Vikash","age":67,"salary":40000|null|null    |null  |
+----------------------------------------+----+--------+------+



In [0]:
# 5. read nested json
json_df5 = spark.read.format("json")\
    .option("inferSchema","true")\
    .option("mode","PERMISSIVE")\
    .load("/FileStore/tables/file5.json")

# json_df5.show()
json_df5.filter(json_df5["_corrupt_record"].isNotNull()).show(truncate=False)

# Show valid records (those without corrupt records)
json_df5.filter(json_df5["_corrupt_record"].isNull()).show(truncate=False)


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/databricks/python/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3378, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<command-4022323206842845>", line 8, in <module>
    json_df5.filter(json_df5["_corrupt_record"].isNotNull()).show(truncate=False)
  File "/databricks/spark/python/pyspark/instrumentation_utils.py", line 48, in wrapper
    res = func(*args, **kwargs)
  File "/databricks/spark/python/pyspark/sql/dataframe.py", line 933, in show
    print(self._jdf.showString(n, int_truncate, vertical))
  File "/databricks/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1321, in __call__
    return_value = get_return_value(
  File "/databricks/spark/python/pyspark/errors/exceptions.py", line 234, in deco
    raise converted from None
pyspark.errors.exceptions.AnalysisException: Since Spark 2.3, the queries from raw JSON/CSV files are disallowed when the
referenced columns only in

