In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import * #for window

# Create (or reuse) the SparkSession for this notebook. It targets the YARN
# master, enables Hive support, and sets two configuration keys: the shuffle
# fetch-protocol flag and the SQL warehouse directory.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("spark joins")
    .enableHiveSupport()
    .config("spark.shuffle.useOldFetchProtocol", 'true')
    .config("spark.sql.warehouse.dir", "/user/itv009490/warehouse")
    .getOrCreate()
)

# Bare trailing expression so the notebook renders the session object.
spark

In [2]:
# Column names and types for the orders CSV, passed to the reader as an
# explicit schema.
order_schema = 'order_id  long, order_date string, customer_id long, order_status string '

# Read the orders sample file as CSV using the schema above.
orders_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/datasets/orders_sample.csv")
)

In [4]:
# Column names and types for the customers CSV, passed to the reader as an
# explicit schema. Note the key column is spelled `customerid` here, whereas
# the orders schema uses `customer_id`.
customers_schema = """customerid long, customer_fname string, customer_lname string, username string, password string,
address string, city string, state string, pincode long"""

# Read the customers sample file as CSV using the schema above.
customers_df = (
    spark.read
    .format("csv")
    .schema(customers_schema)
    .load("/public/trendytech/datasets/customers_sample.csv")
)

In [6]:
# Preview the customers frame; the arguments spell out show()'s defaults
# (first 20 rows, long cell values truncated).
customers_df.show(n=20, truncate=True)

+----------+--------------+--------------+---------+---------+--------------------+-------------+-----+-------+
|customerid|customer_fname|customer_lname| username| password|             address|         city|state|pincode|
+----------+--------------+--------------+---------+---------+--------------------+-------------+-----+-------+
|     11599|       Richard|     Hernandez|XXXXXXXXX|XXXXXXXXX|  6303 Heather Plaza|  Brownsville|   TX|  78521|
|       256|          Mary|       Barrett|XXXXXXXXX|XXXXXXXXX|9526 Noble Embers...|    Littleton|   CO|  80126|
|     12111|           Ann|         Smith|XXXXXXXXX|XXXXXXXXX|3422 Blue Pioneer...|       Caguas|   PR|    725|
|      8827|          Mary|         Jones|XXXXXXXXX|XXXXXXXXX|  8324 Little Common|   San Marcos|   CA|  92069|
|     11318|        Robert|        Hudson|XXXXXXXXX|XXXXXXXXX|10 Crystal River ...|       Caguas|   PR|    725|
|      7130|          Mary|         Smith|XXXXXXXXX|XXXXXXXXX|3151 Sleepy Quail...|      Passaic|   NJ| 

In [7]:
# Preview the orders frame; the arguments spell out show()'s defaults
# (first 20 rows, long cell values truncated).
orders_df.show(n=20, truncate=True)

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [8]:
# Inner join: only orders whose customer_id equals some customerid are kept.
# Displays up to 50 rows.
(
    orders_df
    .join(
        customers_df,
        on=orders_df["customer_id"] == customers_df["customerid"],
        how="inner",
    )
    .show(n=50)
)

+--------+--------------------+-----------+---------------+----------+--------------+--------------+---------+---------+--------------------+-----------+-----+-------+
|order_id|          order_date|customer_id|   order_status|customerid|customer_fname|customer_lname| username| password|             address|       city|state|pincode|
+--------+--------------------+-----------+---------------+----------+--------------+--------------+---------+---------+--------------------+-----------+-----+-------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|     11599|       Richard|     Hernandez|XXXXXXXXX|XXXXXXXXX|  6303 Heather Plaza|Brownsville|   TX|  78521|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|       256|          Mary|       Barrett|XXXXXXXXX|XXXXXXXXX|9526 Noble Embers...|  Littleton|   CO|  80126|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|     12111|           Ann|         Smith|XXXXXXXXX|XXXXXXXXX|3422 Blue Pioneer...|     Caguas|   PR| 

In [9]:
# Left join: every row of orders_df is kept; the customer columns are null
# for orders with no matching customerid. Displays up to 50 rows.
(
    orders_df
    .join(
        customers_df,
        on=orders_df["customer_id"] == customers_df["customerid"],
        how="left",
    )
    .show(n=50)
)

+--------+--------------------+-----------+---------------+----------+--------------+--------------+---------+---------+--------------------+-----------+-----+-------+
|order_id|          order_date|customer_id|   order_status|customerid|customer_fname|customer_lname| username| password|             address|       city|state|pincode|
+--------+--------------------+-----------+---------------+----------+--------------+--------------+---------+---------+--------------------+-----------+-----+-------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|     11599|       Richard|     Hernandez|XXXXXXXXX|XXXXXXXXX|  6303 Heather Plaza|Brownsville|   TX|  78521|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|       256|          Mary|       Barrett|XXXXXXXXX|XXXXXXXXX|9526 Noble Embers...|  Littleton|   CO|  80126|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|     12111|           Ann|         Smith|XXXXXXXXX|XXXXXXXXX|3422 Blue Pioneer...|     Caguas|   PR| 

In [12]:
# Right join: every row of customers_df is kept; the order columns are null
# for customers with no matching order. Displays up to 50 rows.
(
    orders_df
    .join(
        customers_df,
        on=orders_df["customer_id"] == customers_df["customerid"],
        how="right",
    )
    .show(n=50)
)

+--------+--------------------+-----------+---------------+----------+--------------+--------------+---------+---------+--------------------+-------------+-----+-------+
|order_id|          order_date|customer_id|   order_status|customerid|customer_fname|customer_lname| username| password|             address|         city|state|pincode|
+--------+--------------------+-----------+---------------+----------+--------------+--------------+---------+---------+--------------------+-------------+-----+-------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|     11599|       Richard|     Hernandez|XXXXXXXXX|XXXXXXXXX|  6303 Heather Plaza|  Brownsville|   TX|  78521|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|       256|          Mary|       Barrett|XXXXXXXXX|XXXXXXXXX|9526 Noble Embers...|    Littleton|   CO|  80126|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|     12111|           Ann|         Smith|XXXXXXXXX|XXXXXXXXX|3422 Blue Pioneer...|       Ca

In [13]:
# Customers that have no matching order: right-join orders onto customers and
# keep only the rows where the orders side is absent.
#
# The filter tests the orders-side join key rather than order_status. After
# the equality join, orders_df.customer_id is null exactly when no order
# matched, whereas order_status is a payload column that could itself be null
# on a matched order, which would wrongly list that customer here.
# NOTE(review): assumes the intent of this cell is "customers with no orders".
(
    orders_df
    .join(
        customers_df,
        on=orders_df["customer_id"] == customers_df["customerid"],
        how="right",
    )
    .where(orders_df["customer_id"].isNull())
    .show(n=50)
)

+--------+----------+-----------+------------+----------+--------------+--------------+---------+---------+--------------------+-------------+-----+-------+
|order_id|order_date|customer_id|order_status|customerid|customer_fname|customer_lname| username| password|             address|         city|state|pincode|
+--------+----------+-----------+------------+----------+--------------+--------------+---------+---------+--------------------+-------------+-----+-------+
|    null|      null|       null|        null|        11|          Mary|       Huffman|XXXXXXXXX|XXXXXXXXX|    3169 Stony Woods|       Caguas|   PR|    725|
|    null|      null|       null|        null|        12|   Christopher|         Smith|XXXXXXXXX|XXXXXXXXX|5594 Jagged Ember...|  San Antonio|   TX|  78227|
|    null|      null|       null|        null|        13|          Mary|       Baldwin|XXXXXXXXX|XXXXXXXXX|7922 Iron Oak Gar...|       Caguas|   PR|    725|
|    null|      null|       null|        null|        14| 

In [14]:
# Full outer join: rows from both frames are kept, with nulls on whichever
# side has no match (visible in the output below). Displays up to 50 rows.
(
    orders_df
    .join(
        customers_df,
        on=orders_df["customer_id"] == customers_df["customerid"],
        how="full",
    )
    .show(n=50)
)

+--------+--------------------+-----------+---------------+----------+--------------+--------------+---------+---------+--------------------+-------------+-----+-------+
|order_id|          order_date|customer_id|   order_status|customerid|customer_fname|customer_lname| username| password|             address|         city|state|pincode|
+--------+--------------------+-----------+---------------+----------+--------------+--------------+---------+---------+--------------------+-------------+-----+-------+
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|      7130|          Mary|         Smith|XXXXXXXXX|XXXXXXXXX|3151 Sleepy Quail...|      Passaic|   NJ|   7055|
|    null|                null|       null|           null|        19|     Stephanie|      Mitchell|XXXXXXXXX|XXXXXXXXX|3543 Red Treasure...|       Caguas|   PR|    725|
|      15|2013-07-25 00:00:...|       2568|       COMPLETE|      null|          null|          null|     null|     null|                null|         

In [16]:
# Semi join with customers_df on the left: returns only customer columns, for
# customers that have at least one matching order. Displays up to 50 rows.
(
    customers_df
    .join(
        orders_df,
        on=orders_df["customer_id"] == customers_df["customerid"],
        how="semi",
    )
    .show(n=50)
)

+----------+--------------+--------------+---------+---------+--------------------+-----------+-----+-------+
|customerid|customer_fname|customer_lname| username| password|             address|       city|state|pincode|
+----------+--------------+--------------+---------+---------+--------------------+-----------+-----+-------+
|     11599|       Richard|     Hernandez|XXXXXXXXX|XXXXXXXXX|  6303 Heather Plaza|Brownsville|   TX|  78521|
|       256|          Mary|       Barrett|XXXXXXXXX|XXXXXXXXX|9526 Noble Embers...|  Littleton|   CO|  80126|
|     12111|           Ann|         Smith|XXXXXXXXX|XXXXXXXXX|3422 Blue Pioneer...|     Caguas|   PR|    725|
|      8827|          Mary|         Jones|XXXXXXXXX|XXXXXXXXX|  8324 Little Common| San Marcos|   CA|  92069|
|     11318|        Robert|        Hudson|XXXXXXXXX|XXXXXXXXX|10 Crystal River ...|     Caguas|   PR|    725|
|      7130|          Mary|         Smith|XXXXXXXXX|XXXXXXXXX|3151 Sleepy Quail...|    Passaic|   NJ|   7055|
|      453

In [17]:
# Anti join with customers_df on the left: returns only customer columns, for
# customers that have no matching order. Displays up to 50 rows.
(
    customers_df
    .join(
        orders_df,
        on=orders_df["customer_id"] == customers_df["customerid"],
        how="anti",
    )
    .show(n=50)
)

+----------+--------------+--------------+---------+---------+--------------------+-------------+-----+-------+
|customerid|customer_fname|customer_lname| username| password|             address|         city|state|pincode|
+----------+--------------+--------------+---------+---------+--------------------+-------------+-----+-------+
|        11|          Mary|       Huffman|XXXXXXXXX|XXXXXXXXX|    3169 Stony Woods|       Caguas|   PR|    725|
|        12|   Christopher|         Smith|XXXXXXXXX|XXXXXXXXX|5594 Jagged Ember...|  San Antonio|   TX|  78227|
|        13|          Mary|       Baldwin|XXXXXXXXX|XXXXXXXXX|7922 Iron Oak Gar...|       Caguas|   PR|    725|
|        14|     Katherine|         Smith|XXXXXXXXX|XXXXXXXXX|5666 Hazy Pony Sq...|  Pico Rivera|   CA|  90660|
|        15|          Jane|          Luna|XXXXXXXXX|XXXXXXXXX|    673 Burning Glen|      Fontana|   CA|  92336|
|        16|       Tiffany|         Smith|XXXXXXXXX|XXXXXXXXX|      6651 Iron Port|       Caguas|   PR| 