In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [3]:
# Obtain (or reuse) a Spark session for this notebook, running against the
# "local" master under the application name "join_df2".
spark = (
    SparkSession.builder
    .master("local")
    .appName("join_df2")
    .getOrCreate()
)

In [5]:
# Employee rows, one tuple per employee: (emp_id, name, emp_dept_id).
empData = [
    (1, "Smith", 10),
    (2, "Rose", 20),
    (3, "Williams", 10),
    (4, "Jones", 30),
]
empColumns = ["emp_id", "name", "emp_dept_id"]

# Only column names are supplied; no explicit column types are given.
empDF = spark.createDataFrame(empData, schema=empColumns)

In [6]:
# Department rows, one tuple per department: (dept_name, dept_id).
# Department 40 ("IT") is not referenced by any employee row defined above.
deptData = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
deptColumns = ["dept_name", "dept_id"]

# Only column names are supplied; no explicit column types are given.
deptDF = spark.createDataFrame(deptData, schema=deptColumns)

In [7]:
# Address rows, one tuple per address: (emp_id, addline1, city, state).
# emp_id 5 has no matching employee row, so it drops out of the inner joins below.
addData = [
    (1, "1523 Main St", "SFO", "CA"),
    (2, "3453 Orange St", "SFO", "NY"),
    (3, "34 Warner St", "Jersey", "NJ"),
    (4, "221 Cavalier St", "Newark", "DE"),
    (5, "789 Walnut St", "Sandiago", "CA"),
]
addColumns = ["emp_id", "addline1", "city", "state"]

# Only column names are supplied; no explicit column types are given.
addDF = spark.createDataFrame(addData, schema=addColumns)

In [8]:
# Inner join of employees to addresses on emp_id.
# Joining on a column expression keeps the emp_id column from both sides.
empDF.join(
    addDF,
    on=empDF["emp_id"] == addDF["emp_id"],
    how="inner",
).show()

+------+--------+-----------+------+---------------+------+-----+
|emp_id|    name|emp_dept_id|emp_id|       addline1|  city|state|
+------+--------+-----------+------+---------------+------+-----+
|     1|   Smith|         10|     1|   1523 Main St|   SFO|   CA|
|     3|Williams|         10|     3|   34 Warner St|Jersey|   NJ|
|     2|    Rose|         20|     2| 3453 Orange St|   SFO|   NY|
|     4|   Jones|         30|     4|221 Cavalier St|Newark|   DE|
+------+--------+-----------+------+---------------+------+-----+



In [9]:
# Chain two joins: employees -> addresses on emp_id, then -> departments
# on emp_dept_id == dept_id, with each condition passed directly to join().
(
    empDF
    .join(addDF, on=empDF["emp_id"] == addDF["emp_id"])
    .join(deptDF, on=empDF["emp_dept_id"] == deptDF["dept_id"])
    .show()
)

+------+--------+-----------+------+---------------+------+-----+---------+-------+
|emp_id|    name|emp_dept_id|emp_id|       addline1|  city|state|dept_name|dept_id|
+------+--------+-----------+------+---------------+------+-----+---------+-------+
|     1|   Smith|         10|     1|   1523 Main St|   SFO|   CA|  Finance|     10|
|     3|Williams|         10|     3|   34 Warner St|Jersey|   NJ|  Finance|     10|
|     4|   Jones|         30|     4|221 Cavalier St|Newark|   DE|    Sales|     30|
|     2|    Rose|         20|     2| 3453 Orange St|   SFO|   NY|Marketing|     20|
+------+--------+-----------+------+---------------+------+-----+---------+-------+



In [10]:
# Same three-table join, written with condition-less join() calls whose
# matching predicates are supplied by the where() that follows each one.
(
    empDF
    .join(addDF)
    .where(empDF["emp_id"] == addDF["emp_id"])
    .join(deptDF)
    .where(empDF["emp_dept_id"] == deptDF["dept_id"])
    .show()
)

+------+--------+-----------+------+---------------+------+-----+---------+-------+
|emp_id|    name|emp_dept_id|emp_id|       addline1|  city|state|dept_name|dept_id|
+------+--------+-----------+------+---------------+------+-----+---------+-------+
|     1|   Smith|         10|     1|   1523 Main St|   SFO|   CA|  Finance|     10|
|     3|Williams|         10|     3|   34 Warner St|Jersey|   NJ|  Finance|     10|
|     4|   Jones|         30|     4|221 Cavalier St|Newark|   DE|    Sales|     30|
|     2|    Rose|         20|     2| 3453 Orange St|   SFO|   NY|Marketing|     20|
+------+--------+-----------+------+---------------+------+-----+---------+-------+



In [11]:
# Expose each DataFrame to Spark SQL under a temporary view name.
# The three registrations are independent of one another.
addDF.createOrReplaceTempView("ADD")    # addresses
deptDF.createOrReplaceTempView("DEPT")  # departments
empDF.createOrReplaceTempView("EMP")    # employees

In [12]:
# Same three-table join expressed in SQL against the EMP, DEPT and ADD views:
# tables are listed comma-separated and matched in the WHERE clause.
# The query text is split across adjacent string literals; the trailing
# space and newline after "ADD a" are part of the query string.
spark.sql(
    "select * from EMP e, DEPT d, ADD a \n"
    "where e.emp_id=a.emp_id and e.emp_dept_id=d.dept_id"
).show()

+------+--------+-----------+---------+-------+------+---------------+------+-----+
|emp_id|    name|emp_dept_id|dept_name|dept_id|emp_id|       addline1|  city|state|
+------+--------+-----------+---------+-------+------+---------------+------+-----+
|     1|   Smith|         10|  Finance|     10|     1|   1523 Main St|   SFO|   CA|
|     3|Williams|         10|  Finance|     10|     3|   34 Warner St|Jersey|   NJ|
|     2|    Rose|         20|Marketing|     20|     2| 3453 Orange St|   SFO|   NY|
|     4|   Jones|         30|    Sales|     30|     4|221 Cavalier St|Newark|   DE|
+------+--------+-----------+---------+-------+------+---------------+------+-----+

