In [1]:
from pyspark.sql import SparkSession
import getpass
# Current OS user; used below to point the SQL warehouse at a per-user path.
username = getpass.getuser()

# Build (or reuse, if one already exists) a Hive-enabled session on YARN.
spark = (
    SparkSession.builder
    # Port 0 lets the Spark UI bind to any free port instead of a fixed one.
    .config("spark.ui.port", "0")
    .config("spark.shuffle.useOldFetchProtocol", "true")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [2]:
# DDL-style schema for the orders CSV, one column per fragment.
# Adjacent literals are concatenated by Python, so the separators (including
# their uneven spacing) are kept exactly as part of each fragment.
orders_schema = (
    "order_id long , "
    "order_date string, "
    "customer_id long,"
    "order_status string"
)

In [3]:
# Load the orders CSV using the explicit schema defined in `orders_schema`.
# No reader options (e.g. header) are set, so Spark's CSV defaults apply.
orders_df = spark.read.load(
    "/public/trendytech/orders/orders_1gb.csv",
    format="csv",
    schema=orders_schema,
)

In [4]:
# DDL-style schema for the customers data, one column per fragment.
# Adjacent literals are concatenated by Python; the separators and the
# trailing space after the last column are kept exactly as in the fragments.
customer_schema = (
    "customerid long , "
    "customer_fname string , "
    "customer_lname string , "
    "user_name string,"
    "password string , "
    "address string, "
    "city string, "
    "state string, "
    "pincode long "
)

In [5]:
# Load the customers dataset as CSV using the explicit `customer_schema`.
# No reader options (e.g. header) are set, so Spark's CSV defaults apply.
customers_df = spark.read.load(
    "/public/trendytech/retail_db/customersnew",
    format="csv",
    schema=customer_schema,
)

In [6]:
# Inner join: only order/customer pairs whose ids match; print the first 50 rows.
orders_df.join(
    other=customers_df,
    on=orders_df.customer_id == customers_df.customerid,
    how="inner",
).show(n=50)

+--------+--------------------+-----------+------------+----------+--------------+--------------+---------+---------+--------------------+--------+-----+-------+
|order_id|          order_date|customer_id|order_status|customerid|customer_fname|customer_lname|user_name| password|             address|    city|state|pincode|
+--------+--------------------+-----------+------------+----------+--------------+--------------+---------+---------+--------------------+--------+-----+-------+
|   21730|2013-12-05 00:00:...|         26|      CLOSED|        26|        Johnny|          Hood|XXXXXXXXX|XXXXXXXXX|9576 Middle Hills...|Glenview|   IL|  60025|
|   21730|2013-12-05 00:00:...|         26|      CLOSED|        26|        Johnny|          Hood|XXXXXXXXX|XXXXXXXXX|9576 Middle Hills...|Glenview|   IL|  60025|
|   21730|2013-12-05 00:00:...|         26|      CLOSED|        26|        Johnny|          Hood|XXXXXXXXX|XXXXXXXXX|9576 Middle Hills...|Glenview|   IL|  60025|
|   21730|2013-12-05 00:00:.

In [7]:
# Left join: every order is kept; customer columns are null when no customer matches.
# show() with no argument prints the default number of rows.
orders_df.join(
    other=customers_df,
    on=orders_df.customer_id == customers_df.customerid,
    how="left",
).show()

+--------+--------------------+-----------+------------+----------+--------------+--------------+---------+---------+--------------------+--------+-----+-------+
|order_id|          order_date|customer_id|order_status|customerid|customer_fname|customer_lname|user_name| password|             address|    city|state|pincode|
+--------+--------------------+-----------+------------+----------+--------------+--------------+---------+---------+--------------------+--------+-----+-------+
|   64185|2014-03-28 00:00:...|         26|     PENDING|        26|        Johnny|          Hood|XXXXXXXXX|XXXXXXXXX|9576 Middle Hills...|Glenview|   IL|  60025|
|   64185|2014-03-28 00:00:...|         26|     PENDING|        26|        Johnny|          Hood|XXXXXXXXX|XXXXXXXXX|9576 Middle Hills...|Glenview|   IL|  60025|
|   64185|2014-03-28 00:00:...|         26|     PENDING|        26|        Johnny|          Hood|XXXXXXXXX|XXXXXXXXX|9576 Middle Hills...|Glenview|   IL|  60025|
|   64185|2014-03-28 00:00:.

In [8]:
# Right join: every customer is kept; order columns are null when the customer has no orders.
# show() with no argument prints the default number of rows.
orders_df.join(
    other=customers_df,
    on=orders_df.customer_id == customers_df.customerid,
    how="right",
).show()

+--------+--------------------+-----------+------------+----------+--------------+--------------+---------+---------+--------------------+--------+-----+-------+
|order_id|          order_date|customer_id|order_status|customerid|customer_fname|customer_lname|user_name| password|             address|    city|state|pincode|
+--------+--------------------+-----------+------------+----------+--------------+--------------+---------+---------+--------------------+--------+-----+-------+
|   21730|2013-12-05 00:00:...|         26|      CLOSED|        26|        Johnny|          Hood|XXXXXXXXX|XXXXXXXXX|9576 Middle Hills...|Glenview|   IL|  60025|
|   26441|2014-01-05 00:00:...|         26|    COMPLETE|        26|        Johnny|          Hood|XXXXXXXXX|XXXXXXXXX|9576 Middle Hills...|Glenview|   IL|  60025|
|   30480|2014-01-30 00:00:...|         26|     PENDING|        26|        Johnny|          Hood|XXXXXXXXX|XXXXXXXXX|9576 Middle Hills...|Glenview|   IL|  60025|
|   34653|2014-02-23 00:00:.

In [9]:
# Full outer join: rows from both sides are kept, matched where the ids agree;
# print the first 50 rows.
orders_df.join(
    other=customers_df,
    on=orders_df.customer_id == customers_df.customerid,
    how="full",
).show(n=50)

+--------+--------------------+-----------+------------+----------+--------------+--------------+---------+---------+--------------------+--------+-----+-------+
|order_id|          order_date|customer_id|order_status|customerid|customer_fname|customer_lname|user_name| password|             address|    city|state|pincode|
+--------+--------------------+-----------+------------+----------+--------------+--------------+---------+---------+--------------------+--------+-----+-------+
|   64185|2014-03-28 00:00:...|         26|     PENDING|        26|        Johnny|          Hood|XXXXXXXXX|XXXXXXXXX|9576 Middle Hills...|Glenview|   IL|  60025|
|   64185|2014-03-28 00:00:...|         26|     PENDING|        26|        Johnny|          Hood|XXXXXXXXX|XXXXXXXXX|9576 Middle Hills...|Glenview|   IL|  60025|
|   64185|2014-03-28 00:00:...|         26|     PENDING|        26|        Johnny|          Hood|XXXXXXXXX|XXXXXXXXX|9576 Middle Hills...|Glenview|   IL|  60025|
|   64185|2014-03-28 00:00:.

In [10]:
# Row count of the full outer join. Left as the cell's final expression so the
# notebook displays the resulting integer.
orders_df.join(
    other=customers_df,
    on=orders_df.customer_id == customers_df.customerid,
    how="full",
).count()

361636170

In [11]:
# Semi join driven from the customers side: customers that have at least one
# matching order. Only the customers_df columns appear in the result.
customers_df.join(
    other=orders_df,
    on=orders_df.customer_id == customers_df.customerid,
    how="semi",
).show()

+----------+--------------+--------------+---------+---------+--------------------+----------+-----+-------+
|customerid|customer_fname|customer_lname|user_name| password|             address|      city|state|pincode|
+----------+--------------+--------------+---------+---------+--------------------+----------+-----+-------+
|        26|        Johnny|          Hood|XXXXXXXXX|XXXXXXXXX|9576 Middle Hills...|  Glenview|   IL|  60025|
|        26|        Johnny|          Hood|XXXXXXXXX|XXXXXXXXX|9576 Middle Hills...|  Glenview|   IL|  60025|
|        26|        Johnny|          Hood|XXXXXXXXX|XXXXXXXXX|9576 Middle Hills...|  Glenview|   IL|  60025|
|        26|        Johnny|          Hood|XXXXXXXXX|XXXXXXXXX|9576 Middle Hills...|  Glenview|   IL|  60025|
|        26|        Johnny|          Hood|XXXXXXXXX|XXXXXXXXX|9576 Middle Hills...|  Glenview|   IL|  60025|
|        26|        Johnny|          Hood|XXXXXXXXX|XXXXXXXXX|9576 Middle Hills...|  Glenview|   IL|  60025|
|        26|       

In [12]:
# Anti join driven from the customers side: customers with no matching order.
# Only the customers_df columns appear in the result.
customers_df.join(
    other=orders_df,
    on=orders_df.customer_id == customers_df.customerid,
    how="anti",
).show()

+----------+--------------+--------------+---------+---------+--------------------+--------+-----+-------+
|customerid|customer_fname|customer_lname|user_name| password|             address|    city|state|pincode|
+----------+--------------+--------------+---------+---------+--------------------+--------+-----+-------+
|     10913|          Mary|      Williams|XXXXXXXXX|XXXXXXXXX|9113 Grand Hills ...|San Jose|   CA|  95123|
|     10913|          Mary|      Williams|XXXXXXXXX|XXXXXXXXX|9113 Grand Hills ...|San Jose|   CA|  95123|
|     10913|          Mary|      Williams|XXXXXXXXX|XXXXXXXXX|9113 Grand Hills ...|San Jose|   CA|  95123|
|     10913|          Mary|      Williams|XXXXXXXXX|XXXXXXXXX|9113 Grand Hills ...|San Jose|   CA|  95123|
|     10913|          Mary|      Williams|XXXXXXXXX|XXXXXXXXX|9113 Grand Hills ...|San Jose|   CA|  95123|
|     10913|          Mary|      Williams|XXXXXXXXX|XXXXXXXXX|9113 Grand Hills ...|San Jose|   CA|  95123|
|     10913|          Mary|      Will