# 1. Reading CSV File

In [0]:
file_path = "dbfs:/FileStore/shared_uploads/yateed1437@gmail.com/zipcodes.csv"
# Method 1
df = spark.read.option("inferSchema", True)\
    .option("header", True)\
    .csv((file_path))
df.show()

+------------+-------+-----------+-------------------+-----+--------------+-----+-------+-----+-----+-----+-----------+-------+--------------------+--------------------+-------------+---------------+-------------------+----------+-------------+
|RecordNumber|Zipcode|ZipCodeType|               City|State|  LocationType|  Lat|   Long|Xaxis|Yaxis|Zaxis|WorldRegion|Country|        LocationText|            Location|Decommisioned|TaxReturnsFiled|EstimatedPopulation|TotalWages|        Notes|
+------------+-------+-----------+-------------------+-----+--------------+-----+-------+-----+-----+-----+-----------+-------+--------------------+--------------------+-------------+---------------+-------------------+----------+-------------+
|           1|    704|   STANDARD|        PARC PARQUE|   PR|NOT ACCEPTABLE|17.96| -66.22| 0.38|-0.87|  0.3|         NA|     US|     Parc Parque, PR|NA-US-PR-PARC PARQUE|        false|           null|               null|      null|         null|
|           2|    70

In [0]:
# Method 2
df2 = spark.read.options(header = True, inferSchema = True) \
    .csv(file_path)
df2.show()

+------------+-------+-----------+-------------------+-----+--------------+-----+-------+-----+-----+-----+-----------+-------+--------------------+--------------------+-------------+---------------+-------------------+----------+-------------+
|RecordNumber|Zipcode|ZipCodeType|               City|State|  LocationType|  Lat|   Long|Xaxis|Yaxis|Zaxis|WorldRegion|Country|        LocationText|            Location|Decommisioned|TaxReturnsFiled|EstimatedPopulation|TotalWages|        Notes|
+------------+-------+-----------+-------------------+-----+--------------+-----+-------+-----+-----+-----+-----------+-------+--------------------+--------------------+-------------+---------------+-------------------+----------+-------------+
|           1|    704|   STANDARD|        PARC PARQUE|   PR|NOT ACCEPTABLE|17.96| -66.22| 0.38|-0.87|  0.3|         NA|     US|     Parc Parque, PR|NA-US-PR-PARC PARQUE|        false|           null|               null|      null|         null|
|           2|    70

In [0]:
# Method 3 - Loading the CSV
df3 = spark.read.format("csv")\
    .load(file_path)
df3.show()

+------------+-------+-----------+-------------------+-----+--------------+-----+-------+-----+-----+-----+-----------+-------+--------------------+--------------------+-------------+---------------+-------------------+----------+-------------+
|         _c0|    _c1|        _c2|                _c3|  _c4|           _c5|  _c6|    _c7|  _c8|  _c9| _c10|       _c11|   _c12|                _c13|                _c14|         _c15|           _c16|               _c17|      _c18|         _c19|
+------------+-------+-----------+-------------------+-----+--------------+-----+-------+-----+-----+-----+-----------+-------+--------------------+--------------------+-------------+---------------+-------------------+----------+-------------+
|RecordNumber|Zipcode|ZipCodeType|               City|State|  LocationType|  Lat|   Long|Xaxis|Yaxis|Zaxis|WorldRegion|Country|        LocationText|            Location|Decommisioned|TaxReturnsFiled|EstimatedPopulation|TotalWages|        Notes|
|           1|    70

In [0]:
# df4 is created in order to check how nullValue works
# In this, Where ever PR is written, it will be marked as null through out the entire DataFrame
df4 = spark.read.option("inferSchema", True) \
    .option("header", "true") \
    .option("nullValue", "PR") \
        .csv(file_path)
df4.createOrReplaceTempView("temp")

spark.sql("""
          SELECT * from temp
          """)\
              .display()

RecordNumber,Zipcode,ZipCodeType,City,State,LocationType,Lat,Long,Xaxis,Yaxis,Zaxis,WorldRegion,Country,LocationText,Location,Decommisioned,TaxReturnsFiled,EstimatedPopulation,TotalWages,Notes
1,704,STANDARD,PARC PARQUE,,NOT ACCEPTABLE,17.96,-66.22,0.38,-0.87,0.3,,US,"Parc Parque, PR",NA-US-PR-PARC PARQUE,False,,,,
2,704,STANDARD,PASEO COSTA DEL SUR,,NOT ACCEPTABLE,17.96,-66.22,0.38,-0.87,0.3,,US,"Paseo Costa Del Sur, PR",NA-US-PR-PASEO COSTA DEL SUR,False,,,,
10,709,STANDARD,BDA SAN LUIS,,NOT ACCEPTABLE,18.14,-66.26,0.38,-0.86,0.31,,US,"Bda San Luis, PR",NA-US-PR-BDA SAN LUIS,False,,,,
61391,76166,UNIQUE,CINGULAR WIRELESS,TX,NOT ACCEPTABLE,32.72,-97.31,-0.1,-0.83,0.54,,US,"Cingular Wireless, TX",NA-US-TX-CINGULAR WIRELESS,False,,,,
61392,76177,STANDARD,FORT WORTH,TX,PRIMARY,32.75,-97.33,-0.1,-0.83,0.54,,US,"Fort Worth, TX",NA-US-TX-FORT WORTH,False,2126.0,4053.0,122396986.0,
61393,76177,STANDARD,FT WORTH,TX,ACCEPTABLE,32.75,-97.33,-0.1,-0.83,0.54,,US,"Ft Worth, TX",NA-US-TX-FT WORTH,False,2126.0,4053.0,122396986.0,
4,704,STANDARD,URB EUGENE RICE,,NOT ACCEPTABLE,17.96,-66.22,0.38,-0.87,0.3,,US,"Urb Eugene Rice, PR",NA-US-PR-URB EUGENE RICE,False,,,,
39827,85209,STANDARD,MESA,AZ,PRIMARY,33.37,-111.64,-0.3,-0.77,0.55,,US,"Mesa, AZ",NA-US-AZ-MESA,False,14962.0,26883.0,563792730.0,"no NWS data,"
39828,85210,STANDARD,MESA,AZ,PRIMARY,33.38,-111.84,-0.31,-0.77,0.55,,US,"Mesa, AZ",NA-US-AZ-MESA,False,14374.0,25446.0,471000465.0,
49345,32046,STANDARD,HILLIARD,FL,PRIMARY,30.69,-81.92,0.12,-0.85,0.51,,US,"Hilliard, FL",NA-US-FL-HILLIARD,False,3922.0,7443.0,133112149.0,


In [0]:
# df5 is created in order to understand the concept of dateFormat
df5 = spark.read.option("inferSchema", True) \
    .option("header", True)\
    .option("dateFormat", "dd-MMM-yyyy")\
        .csv(file_path)
df5.show()

+------------+-------+-----------+-------------------+-----+--------------+-----+-------+-----+-----+-----+-----------+-------+--------------------+--------------------+-------------+---------------+-------------------+----------+-------------+
|RecordNumber|Zipcode|ZipCodeType|               City|State|  LocationType|  Lat|   Long|Xaxis|Yaxis|Zaxis|WorldRegion|Country|        LocationText|            Location|Decommisioned|TaxReturnsFiled|EstimatedPopulation|TotalWages|        Notes|
+------------+-------+-----------+-------------------+-----+--------------+-----+-------+-----+-----+-----+-----------+-------+--------------------+--------------------+-------------+---------------+-------------------+----------+-------------+
|           1|    704|   STANDARD|        PARC PARQUE|   PR|NOT ACCEPTABLE|17.96| -66.22| 0.38|-0.87|  0.3|         NA|     US|     Parc Parque, PR|NA-US-PR-PARC PARQUE|        false|           null|               null|      null|         null|
|           2|    70

In [0]:
# Providing Custom Schema in case we donot want to use the inferSchema
# Create a schema
from pyspark.sql.types import StructType, IntegerType, StringType, DoubleType, BooleanType
schema = StructType() \
    .add("RecordNumber", IntegerType(), True) \
    .add("Zipcode",IntegerType(),True) \
    .add("ZipCodeType",StringType(),True) \
    .add("City",StringType(),True) \
    .add("State",StringType(),True) \
    .add("LocationType",StringType(),True) \
    .add("Lat",DoubleType(),True) \
    .add("Long",DoubleType(),True) \
    .add("Xaxis",IntegerType(),True) \
    .add("Yaxis",DoubleType(),True) \
    .add("Zaxis",DoubleType(),True) \
    .add("WorldRegion",StringType(),True) \
    .add("Country",StringType(),True) \
    .add("LocationText",StringType(),True) \
    .add("Location",StringType(),True) \
    .add("Decommisioned",BooleanType(),True) \
    .add("TaxReturnsFiled",StringType(),True) \
    .add("EstimatedPopulation",IntegerType(),True) \
    .add("TotalWages",IntegerType(),True) \
    .add("Notes",StringType(),True)

# Reading the CSV format into DataFrame
df6 = spark.read.format("csv")\
    .option("header", True)\
    .schema(schema)\
        .load(file_path)
df6.show()

+------------+-------+-----------+-------------------+-----+--------------+-----+-------+-----+-----+-----+-----------+-------+--------------------+--------------------+-------------+---------------+-------------------+----------+-------------+
|RecordNumber|Zipcode|ZipCodeType|               City|State|   LocationTyp|  Lat|   Long|Xaxis|Yaxis|Zaxis|WorldRegion|Country|        LocationText|            Location|Decommisioned|TaxReturnsFiled|EstimatedPopulation|TotalWages|        Notes|
+------------+-------+-----------+-------------------+-----+--------------+-----+-------+-----+-----+-----+-----------+-------+--------------------+--------------------+-------------+---------------+-------------------+----------+-------------+
|           1|    704|   STANDARD|        PARC PARQUE|   PR|NOT ACCEPTABLE|17.96| -66.22| null|-0.87|  0.3|         NA|     US|     Parc Parque, PR|NA-US-PR-PARC PARQUE|        false|           null|               null|      null|         null|
|           2|    70

# 2. Writing DataFrame into CSV

In [0]:
path = "/dbfs/temp/zip.csv"

# Writing into CSV File
df5.write.option("header", True)\
    .option("mode", "overwrite")\
    .csv(path)

# Checking whether it is saved by loading it again
df7 = spark.read.format("csv")\
    .option("header", "True")\
    .option("inferSchema", "True")\
    .load(path)
df7.show()

In [0]:
# Saving into csv
file_p = "/dbfs/temp/zipcode_csv"
df4.write.format("csv")\
    .mode("overwrite")\
    .option("header", "true")\
    .option("inferschema", True)\
        .save(file_p)

# Checking it is existing or not
df8 = spark.read.option("header", "true").csv(file_p)
df8.show()