In [3]:

import matplotlib.pyplot as plt
import pyspark.sql.functions as F
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SQLContext
from pyspark.sql.functions import col, regexp_replace, udf, when
from pyspark.sql.types import DoubleType, IntegerType, StringType

In [4]:
# Reuse the running SparkContext when this cell is executed again in the same
# kernel; constructing SparkContext('local') a second time raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate( SparkConf().setMaster( 'local' ) )
# SQL entry point used by the cells below to build DataFrames.
sqlCtx = SQLContext( sc )

In [5]:
# Employee rows as (name, deptid). deptid 5 has no row in `dept`.
emp = [
    ('홍길동', 1),
    ('이순신', 2),
    ('임꺽정', 3),
    ('김철수', 3),
    ('김철수1', 5),
]
# Department rows as (deptname, deptid). deptid 4 has no row in `emp`.
dept = [
    ('개발', 1),
    ('연구', 2),
    ('영업', 3),
    ('기획', 4),
]
empA = sqlCtx.createDataFrame(emp, schema=['name', 'deptid'])
deptB = sqlCtx.createDataFrame(dept, schema=['deptname', 'deptid'])

In [6]:
# Print the employee DataFrame (show()'s defaults spelled out).
empA.show(n=20, truncate=True)

+-------+------+
|   name|deptid|
+-------+------+
| 홍길동|     1|
| 이순신|     2|
| 임꺽정|     3|
| 김철수|     3|
|김철수1|     5|
+-------+------+



In [7]:
# Print the department DataFrame (show()'s defaults spelled out).
deptB.show(n=20, truncate=True)

+--------+------+
|deptname|deptid|
+--------+------+
|    개발|     1|
|    연구|     2|
|    영업|     3|
|    기획|     4|
+--------+------+



In [12]:
# Join on the shared deptid column without naming a join type; only
# employees whose deptid also appears in deptB are printed.
(
    empA
    .join(deptB, on='deptid')
    .show()
)

+------+------+--------+
|deptid|  name|deptname|
+------+------+--------+
|     1|홍길동|    개발|
|     3|임꺽정|    영업|
|     3|김철수|    영업|
|     2|이순신|    연구|
+------+------+--------+



In [11]:
# Explicit inner join on deptid: rows are kept only when the key exists
# in both DataFrames.
(
    empA
    .join(deptB, on='deptid', how='inner')
    .show()
)

+------+------+--------+
|deptid|  name|deptname|
+------+------+--------+
|     1|홍길동|    개발|
|     3|임꺽정|    영업|
|     3|김철수|    영업|
|     2|이순신|    연구|
+------+------+--------+



In [15]:
# Left join on deptid: every employee row is kept, and deptname is null
# where deptB has no matching deptid.
(
    empA
    .join(deptB, on='deptid', how='left')
    .show()
)

+------+-------+--------+
|deptid|   name|deptname|
+------+-------+--------+
|     5|김철수1|    null|
|     1| 홍길동|    개발|
|     3| 임꺽정|    영업|
|     3| 김철수|    영업|
|     2| 이순신|    연구|
+------+-------+--------+



In [16]:
# Right join on deptid: every department row is kept, and name is null
# where empA has no matching deptid.
(
    empA
    .join(deptB, on='deptid', how='right')
    .show()
)

+------+------+--------+
|deptid|  name|deptname|
+------+------+--------+
|     1|홍길동|    개발|
|     3|임꺽정|    영업|
|     3|김철수|    영업|
|     2|이순신|    연구|
|     4|  null|    기획|
+------+------+--------+



In [17]:
# Full outer join on deptid: rows from both sides are kept, with nulls
# filling whichever side has no match.
(
    empA
    .join(deptB, on='deptid', how='full')
    .show()
)

+------+-------+--------+
|deptid|   name|deptname|
+------+-------+--------+
|     5|김철수1|    null|
|     1| 홍길동|    개발|
|     3| 임꺽정|    영업|
|     3| 김철수|    영업|
|     2| 이순신|    연구|
|     4|   null|    기획|
+------+-------+--------+



In [18]:
# Two small frames that share the `name` column but use different id
# column names (`id` vs `myid`); only the names 'A' and 'C' occur in both.
testA = list(zip(['A', 'B', 'C', 'D'], [1, 2, 3, 4]))
testB = list(zip(['E', 'A', 'C', 'F'], [1, 2, 3, 4]))
testAA = sqlCtx.createDataFrame(testA, schema=['name', 'id'])
testBB = sqlCtx.createDataFrame(testB, schema=['name', 'myid'])
testAA.show()
testBB.show()

+----+---+
|name| id|
+----+---+
|   A|  1|
|   B|  2|
|   C|  3|
|   D|  4|
+----+---+

+----+----+
|name|myid|
+----+----+
|   E|   1|
|   A|   2|
|   C|   3|
|   F|   4|
+----+----+



In [19]:
# Join on name without naming a join type: only names present in both
# frames are printed.
(
    testAA
    .join(testBB, on='name')
    .show()
)

+----+---+----+
|name| id|myid|
+----+---+----+
|   C|  3|   3|
|   A|  1|   2|
+----+---+----+



In [20]:
# Left join on name: all testAA rows are kept, and myid is null for names
# missing from testBB.
(
    testAA
    .join(testBB, on='name', how='left')
    .show()
)

+----+---+----+
|name| id|myid|
+----+---+----+
|   B|  2|null|
|   D|  4|null|
|   C|  3|   3|
|   A|  1|   2|
+----+---+----+



In [21]:
# Right join on name: all testBB rows are kept, and id is null for names
# missing from testAA.
(
    testAA
    .join(testBB, on='name', how='right')
    .show()
)

+----+----+----+
|name|  id|myid|
+----+----+----+
|   F|null|   4|
|   E|null|   1|
|   C|   3|   3|
|   A|   1|   2|
+----+----+----+



In [29]:
# Full outer join on name, then replace the nulls the join introduces:
# a missing id becomes 100 and a missing myid becomes 200.
# (Alternative: dd.dropna(subset=['id']) would drop rows lacking an id instead.)
dd = (
    testAA
    .join(testBB, on='name', how='full')
    .fillna({'id': 100, 'myid': 200})
)
dd.show()

+----+---+----+
|name| id|myid|
+----+---+----+
|   F|100|   4|
|   E|100|   1|
|   B|  2| 200|
|   D|  4| 200|
|   C|  3|   3|
|   A|  1|   2|
+----+---+----+



In [30]:
# Append testBB's rows beneath testAA's. The printed result keeps testAA's
# column names, so testBB's myid values appear under `id`.
(
    testAA
    .unionAll(other=testBB)
    .show()
)

+----+---+
|name| id|
+----+---+
|   A|  1|
|   B|  2|
|   C|  3|
|   D|  4|
|   E|  1|
|   A|  2|
|   C|  3|
|   F|  4|
+----+---+

