In [1]:
# Spark Session
from pyspark.sql import SparkSession

# Start (or reuse) a SparkSession running in local mode for this notebook.
spark = SparkSession.builder.appName("Reading Complex Data Formats").master("local[*]").getOrCreate()

# Bare expression so the notebook renders the session summary as the cell output.
spark


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/15 16:23:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [9]:
# Load the sales file through the Parquet-specific reader shortcut.
df_parquet = spark.read.parquet("data/sales_data.parquet")

                                                                                

In [10]:
# Print each column's name, type and nullability as read from the Parquet file.
df_parquet.printSchema()

root
 |-- transacted_at: timestamp (nullable = true)
 |-- trx_id: integer (nullable = true)
 |-- retailer_id: integer (nullable = true)
 |-- description: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- city_id: integer (nullable = true)



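Parquet stores its schema (column names and data types) as metadata inside the file itself, which is why Spark can report the schema above without us defining it.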

In [11]:
# Display the first 20 rows (show's default) without truncating long column values.
df_parquet.show(n=20, truncate=False)

                                                                                

+-------------------+----------+-----------+-----------------------------------------------+-------+----------+
|transacted_at      |trx_id    |retailer_id|description                                    |amount |city_id   |
+-------------------+----------+-----------+-----------------------------------------------+-------+----------+
|2017-11-24 19:00:00|1995601912|2077350195 |Walgreen       11-25                           |197.23 |216510442 |
|2017-11-24 19:00:00|1734117021|644879053  |unkn    ppd id: 768641     11-26               |8.58   |930259917 |
|2017-11-24 19:00:00|1734117022|847200066  |Wal-Mart  ppd id: 555914     Algiers    11-26  |1737.26|1646415505|
|2017-11-24 19:00:00|1734117030|1953761884 |Home Depot     ppd id: 265293   11-25          |384.5  |287177635 |
|2017-11-24 19:00:00|1734117089|1898522855 |Target        11-25                            |66.33  |1855530529|
|2017-11-24 19:00:00|1734117117|997626433  |Sears  ppd id: 856095  Ashgabat                |298.87 |9573

In [13]:
# Load the ORC sales file through the ORC-specific reader shortcut.
df_orc = spark.read.orc("data/sales_data.orc")

In [14]:
# Benefits of Columnar Storage

# Let's create a simple Python decorator, `get_time`, to measure execution time.
# If you don't know about Python decorators, check out: https://www.geeksforgeeks.org/decorators-in-python/
import time

def get_time(func):
    """Time one call of ``func``, print the result, and return a timing wrapper.

    Meant to be applied as ``@get_time`` to a zero-argument function. The
    function is executed once, immediately, at decoration time and a line of
    the form ``Execution time: <milliseconds> ms`` is printed, so simply
    running the defining cell shows the timing.

    Args:
        func: Zero-argument callable to time. Its return value is discarded.

    Returns:
        A zero-argument wrapper around ``func``. Each call runs ``func`` again
        and returns the ``"Execution time: ... ms"`` string (without printing).
    """
    import functools  # local import: this cell only has `time` imported

    @functools.wraps(func)
    def inner_get_time() -> str:
        # perf_counter() is monotonic and high-resolution; time.time() can jump
        # if the system clock is adjusted, which would distort the measurement.
        start_time = time.perf_counter()
        func()
        end_time = time.perf_counter()
        return f"Execution time: {(end_time - start_time)*1000} ms"

    # Keep the notebook behaviour: run and report once, at decoration time.
    print(inner_get_time())
    # Return the wrapper so the decorated name stays callable instead of None.
    return inner_get_time

In [16]:
@get_time
def x():
    """Time loading the Parquet sales file and counting all of its rows."""
    spark.read.format("parquet").load("data/sales_data.parquet").count()

Execution time: 791.6536331176758 ms


In [18]:
@get_time
def x():
    """Time the row count when only the ``trx_id`` column is selected."""
    trx_ids = spark.read.format("parquet").load("data/sales_data.parquet").select("trx_id")
    trx_ids.count()

Execution time: 255.80644607543945 ms


Observe that the read is significantly faster when we select just one column: because Parquet is a columnar format, Spark only has to read the columns the query actually needs.

# BONUS TIP
# RECURSIVE READ

```
sales_recursive

|__ sales_1\1.parquet

|__ sales_1\sales_2\2.parquet
```

In [None]:
# recursiveFileLookup makes the reader pick up files in nested sub-directories
# (such as sales_1/ and sales_1/sales_2/ in the tree shown above).
df_1 = (
    spark.read
    .option("recursiveFileLookup", True)
    .parquet("data/input/sales_recursive/")
)
df_1.show()