In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("file-io").getOrCreate()

2021-09-22 08:48:51,296 WARN util.Utils: Your hostname, tb-LinuxBox resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
2021-09-22 08:48:51,300 WARN util.Utils: Set SPARK_LOCAL_IP if you need to bind to another address
2021-09-22 08:48:52,520 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# CSV

Read

In [11]:
df = spark.read.format("csv").options(header=True,inferSchema=True).load("/sparkdata/flightData.csv")

In [12]:
df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

In [13]:
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [15]:
'''If we do not infer schema, then we can create a custom schema of the DF and then load the file in the DF.
But for this, we ousl have to be aware of the data present in the dataset.'''

'If we do not infer schema, then we can create a custom schema of the DF and then load the file in the DF.\nBut for this, we ousl have to be aware of the data present in the dataset.'

In [16]:
df = spark.read.format("csv").options(header=True).load("/sparkdata/flightData.csv")

In [73]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [21]:
schema = StructType()\
                    .add("Destination_Country",StringType(),False)\
                    .add("Origin_Country",StringType(),False)\
                    .add("Flight_Count",IntegerType(),False)

In [22]:
df = spark.read.format("csv").options(header=True).schema(schema=schema).load("/sparkdata/flightData.csv")

In [24]:
df.show(truncate=False)

+------------------------+----------------+------------+
|Destination_Country     |Origin_Country  |Flight_Count|
+------------------------+----------------+------------+
|United States           |Romania         |15          |
|United States           |Croatia         |1           |
|United States           |Ireland         |344         |
|Egypt                   |United States   |15          |
|United States           |India           |62          |
|United States           |Singapore       |1           |
|United States           |Grenada         |62          |
|Costa Rica              |United States   |588         |
|Senegal                 |United States   |40          |
|Moldova                 |United States   |1           |
|United States           |Sint Maarten    |325         |
|United States           |Marshall Islands|39          |
|Guyana                  |United States   |64          |
|Malta                   |United States   |1           |
|Anguilla                |Unite

2021-09-22 09:23:37,835 WARN csv.CSVHeaderChecker: CSV header does not conform to the schema.
 Header: DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count
 Schema: Destination_Country, Origin_Country, Flight_Count
Expected: Destination_Country but found: DEST_COUNTRY_NAME
CSV file: hdfs://localhost:9000/sparkdata/flightData.csv


Transform

In [25]:
# Find out how many flights are there from India to United States

In [34]:
df.filter((df.Origin_Country=="India")&(df.Destination_Country=="United States")).select(df.Flight_Count).show()

+------------+
|Flight_Count|
+------------+
|          62|
+------------+



2021-09-22 09:28:13,329 WARN csv.CSVHeaderChecker: CSV header does not conform to the schema.
 Header: DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count
 Schema: Destination_Country, Origin_Country, Flight_Count
Expected: Destination_Country but found: DEST_COUNTRY_NAME
CSV file: hdfs://localhost:9000/sparkdata/flightData.csv


In [35]:
# Find out where do flights go from India

In [40]:
df.filter(df.Destination_Country=="United States").show()

+-------------------+--------------------+------------+
|Destination_Country|      Origin_Country|Flight_Count|
+-------------------+--------------------+------------+
|      United States|             Romania|          15|
|      United States|             Croatia|           1|
|      United States|             Ireland|         344|
|      United States|               India|          62|
|      United States|           Singapore|           1|
|      United States|             Grenada|          62|
|      United States|        Sint Maarten|         325|
|      United States|    Marshall Islands|          39|
|      United States|            Paraguay|           6|
|      United States|           Gibraltar|           1|
|      United States|Federated States ...|          69|
|      United States|              Russia|         161|
|      United States|         Netherlands|         660|
|      United States|             Senegal|          42|
|      United States|              Angola|      

2021-09-22 09:29:47,228 WARN csv.CSVHeaderChecker: CSV header does not conform to the schema.
 Header: DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count
 Schema: Destination_Country, Origin_Country, Flight_Count
Expected: Destination_Country but found: DEST_COUNTRY_NAME
CSV file: hdfs://localhost:9000/sparkdata/flightData.csv


In [41]:
# If Destination_Country == 'United States', change it to "USA"
# If Origin_Country == "United States", change it to "USA"

In [44]:
from pyspark.sql.functions import when

In [49]:
df = df.withColumn("Destination_Country", when(df.Destination_Country == "United States", "USA")\
                .otherwise(df.Destination_Country))\
    .withColumn("Origin_Country", when(df.Origin_Country == "United States", "USA")\
                .otherwise(df.Origin_Country))

In [50]:
df.show()

+--------------------+----------------+------------+
| Destination_Country|  Origin_Country|Flight_Count|
+--------------------+----------------+------------+
|                 USA|         Romania|          15|
|                 USA|         Croatia|           1|
|                 USA|         Ireland|         344|
|               Egypt|             USA|          15|
|                 USA|           India|          62|
|                 USA|       Singapore|           1|
|                 USA|         Grenada|          62|
|          Costa Rica|             USA|         588|
|             Senegal|             USA|          40|
|             Moldova|             USA|           1|
|                 USA|    Sint Maarten|         325|
|                 USA|Marshall Islands|          39|
|              Guyana|             USA|          64|
|               Malta|             USA|           1|
|            Anguilla|             USA|          41|
|             Bolivia|             USA|       

2021-09-22 09:52:58,808 WARN csv.CSVHeaderChecker: CSV header does not conform to the schema.
 Header: DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count
 Schema: Destination_Country, Origin_Country, Flight_Count
Expected: Destination_Country but found: DEST_COUNTRY_NAME
CSV file: hdfs://localhost:9000/sparkdata/flightData.csv


write

In [52]:
df.write.format("csv").options(header=True).mode("overwrite").save("/sparkdata/flights_data_transformed")
# This is stored as a directory and within it are success file and csv file

2021-09-22 09:57:49,867 WARN csv.CSVHeaderChecker: CSV header does not conform to the schema.
 Header: DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count
 Schema: Destination_Country, Origin_Country, Flight_Count
Expected: Destination_Country but found: DEST_COUNTRY_NAME
CSV file: hdfs://localhost:9000/sparkdata/flightData.csv


# Parquet

In [56]:
df = spark.read.csv("/data/retail_db/categories/part-00000")
# Define schema in any of the below ways

In [71]:
schema = StructType()\
                     .add("S.No.",IntegerType(),False)\
                     .add("Quantity.",IntegerType(),True)\
                    .add("Category",StringType(),False)                         

In [74]:
schema = StructType([
                    StructField("S.No.",IntegerType(),False),\
                    StructField("Quantity",IntegerType(),True),\
                    StructField("Category",StringType(),False)
                    ])

In [75]:
df = spark.read.format("csv")\
            .options(header=True)\
            .schema(schema=schema)\
            .load("/data/retail_db/categories/part-00000")

In [77]:
df.show()

+-----+--------+-------------------+
|S.No.|Quantity|           Category|
+-----+--------+-------------------+
|    2|       2|             Soccer|
|    3|       2|Baseball & Softball|
|    4|       2|         Basketball|
|    5|       2|           Lacrosse|
|    6|       2|   Tennis & Racquet|
|    7|       2|             Hockey|
|    8|       2|        More Sports|
|    9|       3|   Cardio Equipment|
|   10|       3|  Strength Training|
|   11|       3|Fitness Accessories|
|   12|       3|       Boxing & MMA|
|   13|       3|        Electronics|
|   14|       3|     Yoga & Pilates|
|   15|       3|  Training by Sport|
|   16|       3|    As Seen on  TV!|
|   17|       4|             Cleats|
|   18|       4|     Men's Footwear|
|   19|       4|   Women's Footwear|
|   20|       4|     Kids' Footwear|
|   21|       4|     Featured Shops|
+-----+--------+-------------------+
only showing top 20 rows



2021-09-22 21:58:10,254 WARN csv.CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 1, 2, Football
 Schema: S.No., Quantity, Category
Expected: S.No. but found: 1
CSV file: hdfs://localhost:9000/data/retail_db/categories/part-00000


In [78]:
# Now save this file as parquet

In [79]:
df.write.format("parquet").mode("overwrite").options(header=True).save("/sparkdata/categories.parquet")

2021-09-22 21:59:56,227 WARN csv.CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 1, 2, Football
 Schema: S.No., Quantity, Category
Expected: S.No. but found: 1
CSV file: hdfs://localhost:9000/data/retail_db/categories/part-00000
                                                                                

In [80]:
'''parquet_file = /sparkdata/categories.parquet/part-00000-ee70d5b8-a569-4230-966c-1cc8650c3e1d-c000.snappy.parquet'''

'parquet_file = /sparkdata/categories.parquet/part-00000-ee70d5b8-a569-4230-966c-1cc8650c3e1d-c000.snappy.parquet'

In [81]:
df = spark.read.format("parquet").\
    load("/sparkdata/categories.parquet/part-00000-ee70d5b8-a569-4230-966c-1cc8650c3e1d-c000.snappy.parquet")

[Stage 30:>                                                         (0 + 1) / 1]                                                                                

In [82]:
df.show()

+-----+--------+-------------------+
|S.No.|Quantity|           Category|
+-----+--------+-------------------+
|    2|       2|             Soccer|
|    3|       2|Baseball & Softball|
|    4|       2|         Basketball|
|    5|       2|           Lacrosse|
|    6|       2|   Tennis & Racquet|
|    7|       2|             Hockey|
|    8|       2|        More Sports|
|    9|       3|   Cardio Equipment|
|   10|       3|  Strength Training|
|   11|       3|Fitness Accessories|
|   12|       3|       Boxing & MMA|
|   13|       3|        Electronics|
|   14|       3|     Yoga & Pilates|
|   15|       3|  Training by Sport|
|   16|       3|    As Seen on  TV!|
|   17|       4|             Cleats|
|   18|       4|     Men's Footwear|
|   19|       4|   Women's Footwear|
|   20|       4|     Kids' Footwear|
|   21|       4|     Featured Shops|
+-----+--------+-------------------+
only showing top 20 rows



[Stage 31:>                                                         (0 + 1) / 1]                                                                                

# json

In [85]:
df = spark.read.format("json").options(header=True)\
.load("/data/retail_db_json/products/part-r-00000-158b7037-4a23-47e6-8cb3-8cbf878beff7")

In [86]:
df.show()

+-------------------+-------------------+----------+--------------------+--------------------+-------------+
|product_category_id|product_description|product_id|       product_image|        product_name|product_price|
+-------------------+-------------------+----------+--------------------+--------------------+-------------+
|                  2|                   |         1|http://images.acm...|Quest Q64 10 FT. ...|        59.98|
|                  2|                   |         2|http://images.acm...|Under Armour Men'...|       129.99|
|                  2|                   |         3|http://images.acm...|Under Armour Men'...|        89.99|
|                  2|                   |         4|http://images.acm...|Under Armour Men'...|        89.99|
|                  2|                   |         5|http://images.acm...|Riddell Youth Rev...|       199.99|
|                  2|                   |         6|http://images.acm...|Jordan Men's VI R...|       134.99|
|                  