In [4]:
from pyspark.sql import SparkSession
import getpass
# Name of the current OS user; used below to build a per-user warehouse path.
username = getpass.getuser()

# Create (or reuse) a Hive-enabled SparkSession that runs on YARN, with the
# SQL warehouse directory placed under /user/<username>/warehouse.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .config("spark.shuffle.useOldFetchProtocol", "true")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

##### Note: You can either use the path mentioned in the video (/public/trendytech/datasets/parquet-schema-evol-demo/csv), or copy the files from Hadoop to your local machine and then put them into your own Hadoop directory, as in the path below:

##### /user/itv006277/datasets/orders1.csv

##### Here, "itv006277" should be replaced with your own ID number.

In [5]:
# DDL schema for the first CSV file: two columns only.
orders_schema = (
    "order_id long, "
    "order_date date"
)

In [6]:
# Load orders1.csv with the explicit two-column schema (no schema inference).
orders_df = (
    spark.read
    .format("csv")
    .schema(orders_schema)
    .load("/user/itv006277/datasets/orders1.csv")
)

In [7]:
# Preview the rows loaded from orders1.csv (show()'s defaults, spelled out).
orders_df.show(n=20, truncate=True)

+--------+----------+
|order_id|order_date|
+--------+----------+
|       1|2013-07-25|
|       2|2013-07-25|
+--------+----------+



In [8]:
# Write the orders1.csv rows to the Parquet directory, replacing whatever was
# there before so the notebook starts from a clean state on every full run.
# The format is named explicitly (as the CSV reads do) instead of relying on
# the session's default data source (spark.sql.sources.default).
(
    orders_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv006277/datasets1/parquet")
    .save()
)

In [9]:
# Read the freshly written directory back. The Parquet format is stated
# explicitly rather than left to the session's default data source.
orders_parquet_df = (
    spark.read
    .format("parquet")
    .load("/user/itv006277/datasets1/parquet")
)

In [10]:
# Check the Parquet round trip (show()'s defaults, spelled out).
orders_parquet_df.show(n=20, truncate=True)

+--------+----------+
|order_id|order_date|
+--------+----------+
|       1|2013-07-25|
|       2|2013-07-25|
+--------+----------+



In [11]:
# DDL schema for the second CSV file: adds a customer_id column.
orders_schema = (
    "order_id long, "
    "order_date date,"
    "customer_id long"
)

In [12]:
# Load orders2.csv with the current (three-column) schema.
orders_df = (
    spark.read
    .format("csv")
    .schema(orders_schema)
    .load("/user/itv006277/datasets/orders2.csv")
)

In [13]:
# Preview the rows loaded from orders2.csv (show()'s defaults, spelled out).
orders_df.show(n=20, truncate=True)

+--------+----------+-----------+
|order_id|order_date|customer_id|
+--------+----------+-----------+
|       3|2013-07-25|      12111|
|       4|2013-07-25|       8827|
+--------+----------+-----------+



In [14]:
# Append the orders2.csv rows (which carry the extra customer_id column) to the
# existing Parquet directory. The format is named explicitly (as the CSV reads
# do) instead of relying on the session's default data source
# (spark.sql.sources.default).
(
    orders_df.write
    .format("parquet")
    .mode("append")
    .option("path", "/user/itv006277/datasets1/parquet")
    .save()
)

In [15]:
# Read the Parquet directory back WITHOUT the mergeSchema option, for
# comparison with the mergeSchema read that follows.
# NOTE: this rebinds `orders_df`, which until now held the CSV data; the name
# is kept because the following cell displays `orders_df`.
# The Parquet format is stated explicitly rather than left to the session's
# default data source.
orders_df = (
    spark.read
    .format("parquet")
    .load("/user/itv006277/datasets1/parquet")
)

In [16]:
# Display the Parquet data read without mergeSchema (show()'s defaults, spelled out).
orders_df.show(n=20, truncate=True)

+--------+----------+-----------+
|order_id|order_date|customer_id|
+--------+----------+-----------+
|       3|2013-07-25|      12111|
|       4|2013-07-25|       8827|
|       1|2013-07-25|       null|
|       2|2013-07-25|       null|
+--------+----------+-----------+



In [17]:
# Read the Parquet directory with schema merging enabled, so columns that
# exist only in some of the files are all part of the resulting schema.
# The Parquet format is stated explicitly rather than left to the session's
# default data source.
orders_parquet_merged_df = (
    spark.read
    .format("parquet")
    .option("mergeSchema", True)
    .load("/user/itv006277/datasets1/parquet")
)

In [18]:
# Display the merged-schema view (show()'s defaults, spelled out).
orders_parquet_merged_df.show(n=20, truncate=True)

+--------+----------+-----------+
|order_id|order_date|customer_id|
+--------+----------+-----------+
|       3|2013-07-25|      12111|
|       4|2013-07-25|       8827|
|       1|2013-07-25|       null|
|       2|2013-07-25|       null|
+--------+----------+-----------+



In [19]:
# DDL schema for orders4.csv: four columns, with order_status listed
# before customer_id.
orders_schema = (
    "order_id long, "
    "order_date date, "
    "order_status string, "
    "customer_id long"
)

In [20]:
# Load orders4.csv with the current (four-column) schema.
orders_df = (
    spark.read
    .format("csv")
    .schema(orders_schema)
    .load("/user/itv006277/datasets/orders4.csv")
)

In [21]:
# Append the orders4.csv rows (which add an order_status column) to the
# existing Parquet directory. The format is named explicitly (as the CSV reads
# do) instead of relying on the session's default data source
# (spark.sql.sources.default).
(
    orders_df.write
    .format("parquet")
    .mode("append")
    .option("path", "/user/itv006277/datasets1/parquet")
    .save()
)

In [22]:
# Re-read the Parquet directory with schema merging enabled so the newly
# appended order_status column is part of the resulting schema.
# The Parquet format is stated explicitly rather than left to the session's
# default data source.
orders_parquet_merged_df = (
    spark.read
    .format("parquet")
    .option("mergeSchema", True)
    .load("/user/itv006277/datasets1/parquet")
)

In [23]:
# Display the merged-schema view after the orders4.csv append
# (show()'s defaults, spelled out).
orders_parquet_merged_df.show(n=20, truncate=True)

+--------+----------+-----------+------------+
|order_id|order_date|customer_id|order_status|
+--------+----------+-----------+------------+
|       5|2013-07-25|      11318|    COMPLETE|
|       6|2013-07-25|       7130|    COMPLETE|
|       3|2013-07-25|      12111|        null|
|       4|2013-07-25|       8827|        null|
|       1|2013-07-25|       null|        null|
|       2|2013-07-25|       null|        null|
+--------+----------+-----------+------------+



In [24]:
# DDL schema for orders3.csv: the same four columns, this time with
# customer_id listed before order_status.
orders_schema = (
    "order_id long, "
    "order_date date, "
    "customer_id long, "
    "order_status string"
)

In [25]:
# Load orders3.csv with the current (four-column) schema.
orders_df = (
    spark.read
    .format("csv")
    .schema(orders_schema)
    .load("/user/itv006277/datasets/orders3.csv")
)

In [26]:
# Preview the rows loaded from orders3.csv (show()'s defaults, spelled out).
orders_df.show(n=20, truncate=True)

+--------+----------+-----------+------------+
|order_id|order_date|customer_id|order_status|
+--------+----------+-----------+------------+
|       5|2013-07-25|      11318|    COMPLETE|
|       6|2013-07-25|       7130|    COMPLETE|
+--------+----------+-----------+------------+



In [27]:
# Append the orders3.csv rows to the existing Parquet directory. The format is
# named explicitly (as the CSV reads do) instead of relying on the session's
# default data source (spark.sql.sources.default).
(
    orders_df.write
    .format("parquet")
    .mode("append")
    .option("path", "/user/itv006277/datasets1/parquet")
    .save()
)

In [28]:
# Re-read the Parquet directory with schema merging enabled after the
# orders3.csv append.
# The Parquet format is stated explicitly rather than left to the session's
# default data source.
orders_parquet_merged_df = (
    spark.read
    .format("parquet")
    .option("mergeSchema", True)
    .load("/user/itv006277/datasets1/parquet")
)

In [29]:
# Display the final merged-schema view of every appended file
# (show()'s defaults, spelled out).
orders_parquet_merged_df.show(n=20, truncate=True)

+--------+----------+-----------+------------+
|order_id|order_date|customer_id|order_status|
+--------+----------+-----------+------------+
|       5|2013-07-25|      11318|    COMPLETE|
|       6|2013-07-25|       7130|    COMPLETE|
|       5|2013-07-25|      11318|    COMPLETE|
|       6|2013-07-25|       7130|    COMPLETE|
|       3|2013-07-25|      12111|        null|
|       4|2013-07-25|       8827|        null|
|       1|2013-07-25|       null|        null|
|       2|2013-07-25|       null|        null|
+--------+----------+-----------+------------+

