In [26]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Create (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("chapter-08-join")
    .getOrCreate()
)

In [27]:
# Evaluate the session as the cell's final expression so the notebook displays it.
spark

In [28]:
# Two small frames sharing the key column X1.
# Keys 'a' and 'b' occur in both frames; 'c' exists only in A and 'd' only in B,
# which is what makes the different join types below produce different results.
A = spark.createDataFrame(
    [('a', 1), ('b', 2), ('c', 3)],
    schema=['X1', 'X2'],
)

B = spark.createDataFrame(
    [('a', 1), ('b', 2), ('d', 4)],
    schema=['X1', 'X3'],
)

In [29]:
# Preview both input frames.
# show() prints the table and returns None, so each call is its own statement:
# writing them as a single tuple expression would make the cell echo a
# meaningless (None, None) result after the tables.
A.show()
B.show()

+---+---+
| X1| X2|
+---+---+
|  a|  1|
|  b|  2|
|  c|  3|
+---+---+

+---+---+
| X1| X3|
+---+---+
|  a|  1|
|  b|  2|
|  d|  4|
+---+---+



(None, None)

In [30]:
# Inner join on X1: only keys present in both A and B are kept.
(
    A.join(B, on='X1', how='inner')
    .orderBy('X1')  # ascending is the default sort direction
    .show()
)

+---+---+---+
| X1| X2| X3|
+---+---+---+
|  a|  1|  1|
|  b|  2|  2|
+---+---+---+



In [31]:
# Full outer join on X1: every key from either frame is kept; the columns of
# the side that has no match are filled with null.
(
    A.join(B, on='X1', how='full')
    .orderBy('X1')  # ascending is the default sort direction
    .show()
)

+---+----+----+
| X1|  X2|  X3|
+---+----+----+
|  a|   1|   1|
|  b|   2|   2|
|  c|   3|null|
|  d|null|   4|
+---+----+----+



In [32]:
# Left outer join on X1: every row of A is kept; X3 is null where B has no match.
(
    A.join(B, on='X1', how='left')
    .orderBy('X1')  # ascending is the default sort direction
    .show()
)

+---+---+----+
| X1| X2|  X3|
+---+---+----+
|  a|  1|   1|
|  b|  2|   2|
|  c|  3|null|
+---+---+----+



In [33]:
# Left semi join on X1: rows of A whose key also exists in B.
# Only A's columns appear in the result; nothing from B is added.
(
    A.join(B, on='X1', how='left_semi')
    .orderBy('X1')  # ascending is the default sort direction
    .show()
)

+---+---+
| X1| X2|
+---+---+
|  a|  1|
|  b|  2|
+---+---+



In [34]:
# Left anti join on X1: rows of A whose key does NOT exist in B.
# Only A's columns appear in the result.
(
    A.join(B, on='X1', how='left_anti')
    .orderBy('X1')  # ascending is the default sort direction
    .show()
)

+---+---+
| X1| X2|
+---+---+
|  c|  3|
+---+---+



In [35]:
# Right outer join on X1: every row of B is kept; X2 is null where A has no match.
(
    A.join(B, on='X1', how='right')
    .orderBy('X1')  # ascending is the default sort direction
    .show()
)

+---+----+---+
| X1|  X2| X3|
+---+----+---+
|  a|   1|  1|
|  b|   2|  2|
|  d|null|  4|
+---+----+---+



In [36]:
# Cartesian product: each row of A is paired with each row of B.
# No join key is used, so the result carries both frames' X1 columns.
(
    A
    .crossJoin(B)
    .show()
)

+---+---+---+---+
| X1| X2| X1| X3|
+---+---+---+---+
|  a|  1|  a|  1|
|  a|  1|  b|  2|
|  a|  1|  d|  4|
|  b|  2|  a|  1|
|  b|  2|  b|  2|
|  b|  2|  d|  4|
|  c|  3|  a|  1|
|  c|  3|  b|  2|
|  c|  3|  d|  4|
+---+---+---+---+



In [14]:
# Two frames with an identical schema (X1, X2) for the set-style operations.
# Rows ('b', 2) and ('c', 3) are common to both; ('a', 1) is only in Y and
# ('d', 4) is only in Z.
Y = spark.createDataFrame(
    [('a', 1), ('b', 2), ('c', 3)],
    schema=['X1', 'X2'],
)

Z = spark.createDataFrame(
    [('b', 2), ('c', 3), ('d', 4)],
    schema=['X1', 'X2'],
)

In [15]:
# Preview both frames before combining them with union / intersect / subtract.
Y.show()
Z.show()

+---+---+
| X1| X2|
+---+---+
|  a|  1|
|  b|  2|
|  c|  3|
+---+---+

+---+---+
| X1| X2|
+---+---+
|  b|  2|
|  c|  3|
|  d|  4|
+---+---+



In [20]:
# union() appends Z's rows to Y's without de-duplicating, so rows present in
# both frames show up twice.
(
    Y.union(Z)
    .orderBy('X1')
    .show()
)

+---+---+
| X1| X2|
+---+---+
|  a|  1|
|  b|  2|
|  b|  2|
|  c|  3|
|  c|  3|
|  d|  4|
+---+---+



In [21]:
# Union followed by dropDuplicates(): each distinct row appears exactly once.
(
    Y.union(Z)
    .dropDuplicates()
    .orderBy('X1')
    .show()
)

+---+---+
| X1| X2|
+---+---+
|  a|  1|
|  b|  2|
|  c|  3|
|  d|  4|
+---+---+



In [22]:
# intersect(): only the rows that exist in both Y and Z.
(
    Y.intersect(Z)
    .orderBy('X1')
    .show()
)

+---+---+
| X1| X2|
+---+---+
|  b|  2|
|  c|  3|
+---+---+



In [23]:
# subtract(): rows of Y that do not appear in Z.
(
    Y.subtract(Z)
    .orderBy('X1')
    .show()
)

+---+---+
| X1| X2|
+---+---+
|  a|  1|
+---+---+



In [24]:
# subtract() is not symmetric: this is the reverse direction, rows of Z that
# do not appear in Y.
(
    Z.subtract(Y)
    .orderBy('X1')
    .show()
)

+---+---+
| X1| X2|
+---+---+
|  d|  4|
+---+---+



In [25]:
# Stop the Spark session created at the top of the notebook.
spark.stop()