In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import * #for window

# Build (or reuse) the Hive-enabled session on YARN that the later cells rely on.
session_builder = (
    SparkSession.builder
    .master("yarn")
    .appName("spark optimizations")
    .enableHiveSupport()
    .config("spark.shuffle.useOldFetchProtocol", "true")
    .config("spark.sql.warehouse.dir", "/user/itv009490/warehouse")
)
spark = session_builder.getOrCreate()

# Bare expression so the notebook renders the session summary.
spark

In [6]:
# Explicit column names/types for the orders CSV (order_date is kept as a string).
order_schema = 'order_id  long, order_date string, customer_id long, order_status string '
orders_reader = spark.read.format("csv").schema(order_schema)
df = orders_reader.load("/public/trendytech/orders/orders_1gb.csv")

In [7]:
# Preview the first rows of the orders DataFrame to sanity-check the schema.
df.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [8]:
# Partition count of the RDD backing the orders DataFrame.
df.rdd.getNumPartitions()

9

In [9]:
customers_schema = """customerid long, customer_fname string, customer_lname string, username string, password string,
address string, city string, state string, pincode long"""

In [10]:
# Load the customers CSV using the explicit customers_schema string.
customers_reader = spark.read.format("csv").schema(customers_schema)
cust_df = customers_reader.load("/public/trendytech/retail_db/customers")

In [11]:
# Preview the first rows of the customers DataFrame.
cust_df.show()

+----------+--------------+--------------+---------+---------+--------------------+-------------+-----+-------+
|customerid|customer_fname|customer_lname| username| password|             address|         city|state|pincode|
+----------+--------------+--------------+---------+---------+--------------------+-------------+-----+-------+
|         1|       Richard|     Hernandez|XXXXXXXXX|XXXXXXXXX|  6303 Heather Plaza|  Brownsville|   TX|  78521|
|         2|          Mary|       Barrett|XXXXXXXXX|XXXXXXXXX|9526 Noble Embers...|    Littleton|   CO|  80126|
|         3|           Ann|         Smith|XXXXXXXXX|XXXXXXXXX|3422 Blue Pioneer...|       Caguas|   PR|    725|
|         4|          Mary|         Jones|XXXXXXXXX|XXXXXXXXX|  8324 Little Common|   San Marcos|   CA|  92069|
|         5|        Robert|        Hudson|XXXXXXXXX|XXXXXXXXX|10 Crystal River ...|       Caguas|   PR|    725|
|         6|          Mary|         Smith|XXXXXXXXX|XXXXXXXXX|3151 Sleepy Quail...|      Passaic|   NJ| 

In [12]:
# Partition count of the RDD backing the customers DataFrame.
cust_df.rdd.getNumPartitions()

1

In [13]:
# Read the session's current automatic broadcast-join size threshold.
spark.conf.get('spark.sql.autoBroadcastJoinThreshold')

'10485760b'

In [19]:
# Set the threshold to -1, which disables automatic broadcast joins for this session.
spark.conf.set('spark.sql.autoBroadcastJoinThreshold','-1')

In [20]:
# Confirm the threshold change took effect.
spark.conf.get('spark.sql.autoBroadcastJoinThreshold')

'-1'

In [22]:
# Inner-join orders to customers on the customer key while the broadcast
# threshold is -1, then write to the "noop" format so the join is executed
# without storing the result anywhere.
orders_with_customers = df.join(
    cust_df,
    on=df.customer_id == cust_df.customerid,
    how="inner",
)
orders_with_customers.write.format("noop").mode("overwrite").save()

In [23]:
# Bytes in one mebibyte (1024 * 1024); the default threshold shown earlier,
# 10485760b, is ten times this value.
1024 ** 2

1048576

In [26]:
# Restore the threshold to the value read earlier in the notebook (10485760b).
spark.conf.set('spark.sql.autoBroadcastJoinThreshold','10485760b')

In [27]:
# Confirm the threshold is back at its earlier value.
spark.conf.get('spark.sql.autoBroadcastJoinThreshold')

'10485760b'

In [28]:
# Same orders-to-customers inner join as before, re-run now that the broadcast
# threshold has been restored; the "noop" write only forces execution.
orders_with_customers = df.join(
    cust_df,
    on=df.customer_id == cust_df.customerid,
    how="inner",
)
orders_with_customers.write.format("noop").mode("overwrite").save()

In [14]:
# NOTE: this rebinds `df` to the "ordersnew" dataset; from this cell onward
# `df` no longer refers to the orders_1gb.csv data loaded earlier.
order_schema = 'order_id  long, order_date string, customer_id long, order_status string '
ordersnew_reader = spark.read.format("csv").schema(order_schema)
df = ordersnew_reader.load("/public/trendytech/retail_db/ordersnew")

In [15]:
# Pipe-delimited file with two columns: an order status and an integer code.
mapping_schema = 'order_status string, code int'
mapping_reader = spark.read.format("csv").option("delimiter", "|").schema(mapping_schema)
mapping_df = mapping_reader.load("/public/trendytech/datasets/mapping_data")

In [17]:
# Partition count of the RDD backing the status-to-code mapping DataFrame.
mapping_df.rdd.getNumPartitions()

1

In [19]:
# Preview the "ordersnew" data now bound to `df`.
df.show()

+--------+--------------------+-----------+------------+
|order_id|          order_date|customer_id|order_status|
+--------+--------------------+-----------+------------+
|    2480|2013-08-07 00:00:...|       3807|    COMPLETE|
|   30479|2014-01-30 00:00:...|       9265|    COMPLETE|
|    2481|2013-08-07 00:00:...|       2476|    COMPLETE|
|   30481|2014-01-30 00:00:...|       9240|    COMPLETE|
|    2483|2013-08-07 00:00:...|      10453|    COMPLETE|
|   30484|2014-01-30 00:00:...|       2876|    COMPLETE|
|    2484|2013-08-07 00:00:...|       9256|    COMPLETE|
|   30485|2014-01-30 00:00:...|       1069|    COMPLETE|
|    2488|2013-08-07 00:00:...|       1255|    COMPLETE|
|   30486|2014-01-30 00:00:...|       1151|    COMPLETE|
|    2491|2013-08-07 00:00:...|        247|    COMPLETE|
|   30487|2014-01-30 00:00:...|       6772|    COMPLETE|
|    2495|2013-08-07 00:00:...|       9011|    COMPLETE|
|   30489|2014-01-30 00:00:...|       5717|    COMPLETE|
|    2498|2013-08-07 00:00:...|

In [20]:
spark.conf.set('spark.sql.autoBroadcastJoinThreshold','-1') # Disable automatic broadcast joins (-1 turns the threshold off)

In [23]:
# Inner equi-join of orders with the status-to-code mapping on order_status.
# Joining by column name (instead of df.order_status == mapping_df.order_status)
# keeps a single order_status column in the result rather than two identically
# named, ambiguous ones. The "noop" write only forces the join to execute.
orders_with_codes = df.join(mapping_df, on="order_status", how="inner")
orders_with_codes.write.format("noop").mode("overwrite").save()

In [24]:
# Load the "ordersnew" dataset under its own name, orders_df.
order_schema = 'order_id  long, order_date string, customer_id long, order_status string '
ordersnew_reader = spark.read.format("csv").schema(order_schema)
orders_df = ordersnew_reader.load("/public/trendytech/retail_db/ordersnew")

In [29]:
# Explicit schema for the "customersnew" CSV; the embedded newline keeps the
# string identical to the triple-quoted form used elsewhere in the notebook.
customers_schema = (
    "customerid long, customer_fname string, customer_lname string, username string, password string,\n"
    "address string, city string, state string, pincode long"
)
customersnew_reader = spark.read.format("csv").schema(customers_schema)
customers_df = customersnew_reader.load("/public/trendytech/retail_db/customersnew")

In [30]:
# Preview the first rows of orders_df.
orders_df.show()

+--------+--------------------+-----------+------------+
|order_id|          order_date|customer_id|order_status|
+--------+--------------------+-----------+------------+
|    2480|2013-08-07 00:00:...|       3807|    COMPLETE|
|   30479|2014-01-30 00:00:...|       9265|    COMPLETE|
|    2481|2013-08-07 00:00:...|       2476|    COMPLETE|
|   30481|2014-01-30 00:00:...|       9240|    COMPLETE|
|    2483|2013-08-07 00:00:...|      10453|    COMPLETE|
|   30484|2014-01-30 00:00:...|       2876|    COMPLETE|
|    2484|2013-08-07 00:00:...|       9256|    COMPLETE|
|   30485|2014-01-30 00:00:...|       1069|    COMPLETE|
|    2488|2013-08-07 00:00:...|       1255|    COMPLETE|
|   30486|2014-01-30 00:00:...|       1151|    COMPLETE|
|    2491|2013-08-07 00:00:...|        247|    COMPLETE|
|   30487|2014-01-30 00:00:...|       6772|    COMPLETE|
|    2495|2013-08-07 00:00:...|       9011|    COMPLETE|
|   30489|2014-01-30 00:00:...|       5717|    COMPLETE|
|    2498|2013-08-07 00:00:...|

In [31]:
# Preview the first rows of customers_df.
customers_df.show()

+----------+--------------+--------------+---------+---------+--------------------+-------------+-----+-------+
|customerid|customer_fname|customer_lname| username| password|             address|         city|state|pincode|
+----------+--------------+--------------+---------+---------+--------------------+-------------+-----+-------+
|         1|       Richard|     Hernandez|XXXXXXXXX|XXXXXXXXX|  6303 Heather Plaza|  Brownsville|   TX|  78521|
|         2|          Mary|       Barrett|XXXXXXXXX|XXXXXXXXX|9526 Noble Embers...|    Littleton|   CO|  80126|
|         3|           Ann|         Smith|XXXXXXXXX|XXXXXXXXX|3422 Blue Pioneer...|       Caguas|   PR|    725|
|         4|          Mary|         Jones|XXXXXXXXX|XXXXXXXXX|  8324 Little Common|   San Marcos|   CA|  92069|
|         5|        Robert|        Hudson|XXXXXXXXX|XXXXXXXXX|10 Crystal River ...|       Caguas|   PR|    725|
|         6|          Mary|         Smith|XXXXXXXXX|XXXXXXXXX|3151 Sleepy Quail...|      Passaic|   NJ| 

In [33]:
# De-duplicate the customers once and reuse that one DataFrame for both the
# join input and the join condition. The original called distinct() twice,
# so the condition referenced a different DataFrame object from the one
# actually being joined.
customers_unique = customers_df.distinct()

# Inner-join orders to the de-duplicated customers on the customer key; the
# "noop" write only forces the join to execute.
orders_df.join(
    customers_unique,
    orders_df.customer_id == customers_unique.customerid,
    "inner",
).write.format("noop").mode("overwrite").save()