In [54]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import * #for window

spark = SparkSession\
.builder\
.master("yarn")\
.appName("partitionBy")\
.enableHiveSupport()\
.config("spark.sql.warehouse.dir","/user/itv009490/warehouse")\
.getOrCreate()

spark

In [55]:
order_schema = 'order_id  long, order_date string, customer_id long, order_status string '

In [32]:
df = spark.read.format("csv").\
schema(order_schema).\
load("/user/itv009490/spark_writer_demo2/part*")

In [33]:
df.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [34]:
df.createOrReplaceTempView("orders")

In [35]:
spark.sql("select count(*) from orders where order_status = 'CLOSED' ").show()

+--------+
|count(1)|
+--------+
| 2833500|
+--------+



In [36]:
df.write.format("csv").mode("overwrite").partitionBy("order_status").\
option("path","/user/itv009490/spark_writer_demo3").save()

In [37]:
df.rdd.getNumPartitions()

9

In [56]:
df = spark.read.format("csv").\
schema(order_schema).\
load("/user/itv009490/spark_writer_demo3/")

In [47]:
df.show()

+--------+--------------------+-----------+------------+
|order_id|          order_date|customer_id|order_status|
+--------+--------------------+-----------+------------+
|    2480|2013-08-07 00:00:...|       3807|    COMPLETE|
|   30479|2014-01-30 00:00:...|       9265|    COMPLETE|
|    2481|2013-08-07 00:00:...|       2476|    COMPLETE|
|   30481|2014-01-30 00:00:...|       9240|    COMPLETE|
|    2483|2013-08-07 00:00:...|      10453|    COMPLETE|
|   30484|2014-01-30 00:00:...|       2876|    COMPLETE|
|    2484|2013-08-07 00:00:...|       9256|    COMPLETE|
|   30485|2014-01-30 00:00:...|       1069|    COMPLETE|
|    2488|2013-08-07 00:00:...|       1255|    COMPLETE|
|   30486|2014-01-30 00:00:...|       1151|    COMPLETE|
|    2491|2013-08-07 00:00:...|        247|    COMPLETE|
|   30487|2014-01-30 00:00:...|       6772|    COMPLETE|
|    2495|2013-08-07 00:00:...|       9011|    COMPLETE|
|   30489|2014-01-30 00:00:...|       5717|    COMPLETE|
|    2498|2013-08-07 00:00:...|

In [57]:
df.createOrReplaceTempView("orders")

In [58]:
spark.sql("select count(*) from orders where order_status = 'CLOSED' ").show()

+--------+
|count(1)|
+--------+
| 2833500|
+--------+



In [59]:
spark.sql("select count(*) from orders where order_status = 'PENDING_PAYMENT' ").show()

+--------+
|count(1)|
+--------+
| 5636250|
+--------+



In [60]:
spark.sql("select count(*) from orders where customer_id = 8827 ").show()

+--------+
|count(1)|
+--------+
|    2250|
+--------+



In [63]:
df2 = spark.read.format("csv").option("inferSchema",True).\
load("/public/trendytech/retail_db/customers/part-00000")

In [68]:
df2 = df2.toDF("customer_id","firstname","lastname","email","password","street","city","state","zipcode")

In [69]:
df2.show()

+-----------+-----------+---------+---------+---------+--------------------+-------------+-----+-------+
|customer_id|  firstname| lastname|    email| password|              street|         city|state|zipcode|
+-----------+-----------+---------+---------+---------+--------------------+-------------+-----+-------+
|          1|    Richard|Hernandez|XXXXXXXXX|XXXXXXXXX|  6303 Heather Plaza|  Brownsville|   TX|  78521|
|          2|       Mary|  Barrett|XXXXXXXXX|XXXXXXXXX|9526 Noble Embers...|    Littleton|   CO|  80126|
|          3|        Ann|    Smith|XXXXXXXXX|XXXXXXXXX|3422 Blue Pioneer...|       Caguas|   PR|    725|
|          4|       Mary|    Jones|XXXXXXXXX|XXXXXXXXX|  8324 Little Common|   San Marcos|   CA|  92069|
|          5|     Robert|   Hudson|XXXXXXXXX|XXXXXXXXX|10 Crystal River ...|       Caguas|   PR|    725|
|          6|       Mary|    Smith|XXXXXXXXX|XXXXXXXXX|3151 Sleepy Quail...|      Passaic|   NJ|   7055|
|          7|    Melissa|   Wilcox|XXXXXXXXX|XXXXXXXXX|

In [77]:
#2 levels pf partitioning

df2.write.format("parquet").mode("overwrite").partitionBy("state","city").\
option("path","/user/itv009490/partition_demo_output").save()

In [78]:
!hdfs dfs -ls /user/itv009490/partition_demo_output

Found 45 items
-rw-r--r--   3 itv009490 supergroup          0 2023-12-01 14:42 /user/itv009490/partition_demo_output/_SUCCESS
drwxr-xr-x   - itv009490 supergroup          0 2023-12-01 14:41 /user/itv009490/partition_demo_output/state=AL
drwxr-xr-x   - itv009490 supergroup          0 2023-12-01 14:41 /user/itv009490/partition_demo_output/state=AR
drwxr-xr-x   - itv009490 supergroup          0 2023-12-01 14:41 /user/itv009490/partition_demo_output/state=AZ
drwxr-xr-x   - itv009490 supergroup          0 2023-12-01 14:42 /user/itv009490/partition_demo_output/state=CA
drwxr-xr-x   - itv009490 supergroup          0 2023-12-01 14:42 /user/itv009490/partition_demo_output/state=CO
drwxr-xr-x   - itv009490 supergroup          0 2023-12-01 14:42 /user/itv009490/partition_demo_output/state=CT
drwxr-xr-x   - itv009490 supergroup          0 2023-12-01 14:42 /user/itv009490/partition_demo_output/state=DC
drwxr-xr-x   - itv009490 supergroup          0 2023-12-01 14:42 /user/itv009490/partition_demo_ou

In [79]:
!hdfs dfs -ls /user/itv009490/partition_demo_output/state=CA

Found 151 items
drwxr-xr-x   - itv009490 supergroup          0 2023-12-01 14:41 /user/itv009490/partition_demo_output/state=CA/city=Alameda
drwxr-xr-x   - itv009490 supergroup          0 2023-12-01 14:41 /user/itv009490/partition_demo_output/state=CA/city=Alhambra
drwxr-xr-x   - itv009490 supergroup          0 2023-12-01 14:41 /user/itv009490/partition_demo_output/state=CA/city=Anaheim
drwxr-xr-x   - itv009490 supergroup          0 2023-12-01 14:41 /user/itv009490/partition_demo_output/state=CA/city=Antioch
drwxr-xr-x   - itv009490 supergroup          0 2023-12-01 14:41 /user/itv009490/partition_demo_output/state=CA/city=Azusa
drwxr-xr-x   - itv009490 supergroup          0 2023-12-01 14:41 /user/itv009490/partition_demo_output/state=CA/city=Bakersfield
drwxr-xr-x   - itv009490 supergroup          0 2023-12-01 14:41 /user/itv009490/partition_demo_output/state=CA/city=Baldwin Park
drwxr-xr-x   - itv009490 supergroup          0 2023-12-01 14:41 /user/itv009490/partition_demo_output/state=

In [80]:
df3 = spark.read.format("parquet").option("inferSchema","true").load("/user/itv009490/partition_demo_output")

In [81]:
df3.show()

+-----------+---------+---------+---------+---------+--------------------+-------+-----+------+
|customer_id|firstname| lastname|    email| password|              street|zipcode|state|  city|
+-----------+---------+---------+---------+---------+--------------------+-------+-----+------+
|          3|      Ann|    Smith|XXXXXXXXX|XXXXXXXXX|3422 Blue Pioneer...|    725|   PR|Caguas|
|          5|   Robert|   Hudson|XXXXXXXXX|XXXXXXXXX|10 Crystal River ...|    725|   PR|Caguas|
|          7|  Melissa|   Wilcox|XXXXXXXXX|XXXXXXXXX|9453 High Concession|    725|   PR|Caguas|
|          9|     Mary|    Perez|XXXXXXXXX|XXXXXXXXX| 3616 Quaking Street|    725|   PR|Caguas|
|         11|     Mary|  Huffman|XXXXXXXXX|XXXXXXXXX|    3169 Stony Woods|    725|   PR|Caguas|
|         13|     Mary|  Baldwin|XXXXXXXXX|XXXXXXXXX|7922 Iron Oak Gar...|    725|   PR|Caguas|
|         16|  Tiffany|    Smith|XXXXXXXXX|XXXXXXXXX|      6651 Iron Port|    725|   PR|Caguas|
|         19|Stephanie| Mitchell|XXXXXXX

In [82]:
df3.createOrReplaceTempView("customers")

In [83]:
spark.sql("""select count(*) from customers where state = 'CA' """).show()

+--------+
|count(1)|
+--------+
|    2012|
+--------+



In [84]:
spark.sql("""select count(*) from customers where state = 'AL' """).show()

+--------+
|count(1)|
+--------+
|       3|
+--------+



In [87]:
spark.sql("""select * from customers where city='Caguas' and customer_id = 19 """).show()

+-----------+---------+--------+---------+---------+--------------------+-------+-----+------+
|customer_id|firstname|lastname|    email| password|              street|zipcode|state|  city|
+-----------+---------+--------+---------+---------+--------------------+-------+-----+------+
|         19|Stephanie|Mitchell|XXXXXXXXX|XXXXXXXXX|3543 Red Treasure...|    725|   PR|Caguas|
+-----------+---------+--------+---------+---------+--------------------+-------+-----+------+



In [88]:
spark.sql("""select * from customers where state='CA' and customer_id = 4 """).show()

+-----------+---------+--------+---------+---------+------------------+-------+-----+----------+
|customer_id|firstname|lastname|    email| password|            street|zipcode|state|      city|
+-----------+---------+--------+---------+---------+------------------+-------+-----+----------+
|          4|     Mary|   Jones|XXXXXXXXX|XXXXXXXXX|8324 Little Common|  92069|   CA|San Marcos|
+-----------+---------+--------+---------+---------+------------------+-------+-----+----------+

