# Belajar PySpark - Join

In [29]:
%pip install pyspark #jalankan untuk google colab



In [30]:
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException

In [31]:
# Create (or reuse) the Spark session used by every cell below.
# The application name matches this notebook's topic (joins).
spark = SparkSession.builder.appName("Belajar PySpark - Join").getOrCreate()

# DataFrame yang digunakan

In [32]:
# Left-hand table for the join examples: one row per student
# (columns: nama, jurusan, nilai).
kolom1 = ["nama", "jurusan", "nilai"]
data1 = [
    ["Agus", "Fisika", 130],
    ["Budi", "Biologi", 200],
    ["Bayu", "Manajemen", 180],
    ["Dedi", "Akuntansi", 50],
]

df1 = spark.createDataFrame(data=data1, schema=kolom1)
df1.show()

+----+---------+-----+
|nama|  jurusan|nilai|
+----+---------+-----+
|Agus|   Fisika|  130|
|Budi|  Biologi|  200|
|Bayu|Manajemen|  180|
|Dedi|Akuntansi|   50|
+----+---------+-----+



In [67]:
# Right-hand reference table: one row per "jurusan" with its code and faculty.
# "Bisnis" has no student in df1 and df1's "Akuntansi" is absent here, so the
# two tables only partially overlap on the join key.
kolom2 = ["jurusan", "kode_jurusan", "fakultas"]
data2 = [
    ["Biologi", "BIO", "MIPA"],
    ["Fisika", "FIS", "MIPA"],
    ["Bisnis", "BIS", "Ekonomi"],
    ["Manajemen", "MAN", "Ekonomi"],
]

df2 = spark.createDataFrame(data=data2, schema=kolom2)
df2.show()

+---------+------------+--------+
|  jurusan|kode_jurusan|fakultas|
+---------+------------+--------+
|  Biologi|         BIO|    MIPA|
|   Fisika|         FIS|    MIPA|
|   Bisnis|         BIS| Ekonomi|
|Manajemen|         MAN| Ekonomi|
+---------+------------+--------+



# Inner Join

In [34]:
# Inner join expressed as a column comparison. Because the key is given as an
# expression, the result keeps the "jurusan" column from both inputs.
inner = df1.join(df2, on=df1["jurusan"] == df2["jurusan"], how="inner")
inner.show()

+----+---------+-----+---------+------------+--------+
|nama|  jurusan|nilai|  jurusan|kode_jurusan|fakultas|
+----+---------+-----+---------+------------+--------+
|Budi|  Biologi|  200|  Biologi|         BIO|    MIPA|
|Agus|   Fisika|  130|   Fisika|         FIS|    MIPA|
|Bayu|Manajemen|  180|Manajemen|         MAN| Ekonomi|
+----+---------+-----+---------+------------+--------+



In [27]:
# `inner` carries two columns named "jurusan" (one from each input), so
# selecting it by name is ambiguous and Spark raises AnalysisException.
# The error is the point of this cell, so it is caught and printed; that way
# the notebook still runs top to bottom.
try:
    inner["nama","jurusan"].show()
except AnalysisException as err:
    print(f"AnalysisException: {err}")

AnalysisException: ignored

## Menghilangkan kolom duplikat

In [28]:
# Passing the key by column name (instead of an expression) yields a single
# "jurusan" column in the result.
df1.join(df2, on="jurusan", how="inner").show()

+---------+----+-----+------------+--------+
|  jurusan|nama|nilai|kode_jurusan|fakultas|
+---------+----+-----+------------+--------+
|  Biologi|Budi|  200|         BIO|    MIPA|
|   Fisika|Agus|  130|         FIS|    MIPA|
|Manajemen|Bayu|  180|         MAN| Ekonomi|
+---------+----+-----+------------+--------+



## Join dengan lebih dari 1 key

In [72]:
# Data for the two-key join example. The major "Kimia" exists in two
# faculties, so "jurusan" alone does not identify a reference row.
# Dedicated variable names are used here so the lists and column names that
# df1/df2 were built from earlier are not rebound to a different schema.
kolom_a = ["nama", "jurusan", "fakultas", "nilai"]
data_a = [
    ["Agus", "Fisika", "MIPA", 130],
    ["Budi", "Kimia", "MIPA", 200],
    ["Bayu", "Mesin", "Teknik", 180],
    ["Dedi", "Kimia", "Teknik", 50],
]

kolom_b = ["jurusan", "kode_jurusan", "fakultas"]
data_b = [
    ["Kimia", "KIM", "MIPA"],
    ["Fisika", "FIS", "MIPA"],
    ["Kimia", "TKIM", "Teknik"],
    ["Mesin", "TMSN", "Teknik"],
]

dfa = spark.createDataFrame(data_a, kolom_a)
dfb = spark.createDataFrame(data_b, kolom_b)

In [76]:
# Join on both keys using an explicit boolean expression; both copies of the
# key columns ("jurusan" and "fakultas") remain in the result.
dfa.join(
    dfb,
    on=(dfa["jurusan"] == dfb["jurusan"]) & (dfa["fakultas"] == dfb["fakultas"]),
    how="inner",
).show()

+----+-------+--------+-----+-------+------------+--------+
|nama|jurusan|fakultas|nilai|jurusan|kode_jurusan|fakultas|
+----+-------+--------+-----+-------+------------+--------+
|Agus| Fisika|    MIPA|  130| Fisika|         FIS|    MIPA|
|Budi|  Kimia|    MIPA|  200|  Kimia|         KIM|    MIPA|
|Dedi|  Kimia|  Teknik|   50|  Kimia|        TKIM|  Teknik|
|Bayu|  Mesin|  Teknik|  180|  Mesin|        TMSN|  Teknik|
+----+-------+--------+-----+-------+------------+--------+



In [74]:
# Same two-key join, with the keys given by name: each key column appears
# only once in the result.
dfa.join(dfb, on=["jurusan", "fakultas"], how="inner").show()

+-------+--------+----+-----+------------+
|jurusan|fakultas|nama|nilai|kode_jurusan|
+-------+--------+----+-----+------------+
| Fisika|    MIPA|Agus|  130|         FIS|
|  Kimia|    MIPA|Budi|  200|         KIM|
|  Kimia|  Teknik|Dedi|   50|        TKIM|
|  Mesin|  Teknik|Bayu|  180|        TMSN|
+-------+--------+----+-----+------------+



# Left Outer Join

In [40]:
# Left outer join: every df1 row is kept; df2 columns are null where the
# "jurusan" has no match (Akuntansi).
df1.join(df2, on="jurusan", how="left").show()

+---------+----+-----+------------+--------+
|  jurusan|nama|nilai|kode_jurusan|fakultas|
+---------+----+-----+------------+--------+
|   Fisika|Agus|  130|         FIS|    MIPA|
|  Biologi|Budi|  200|         BIO|    MIPA|
|Manajemen|Bayu|  180|         MAN| Ekonomi|
|Akuntansi|Dedi|   50|        null|    null|
+---------+----+-----+------------+--------+



# Right Outer Join

In [44]:
# Right outer join: every df2 row is kept; df1 columns are null where the
# "jurusan" has no student (Bisnis).
df1.join(df2, on="jurusan", how="right").show()

+---------+----+-----+------------+--------+
|  jurusan|nama|nilai|kode_jurusan|fakultas|
+---------+----+-----+------------+--------+
|   Fisika|Agus|  130|         FIS|    MIPA|
|  Biologi|Budi|  200|         BIO|    MIPA|
|   Bisnis|null| null|         BIS| Ekonomi|
|Manajemen|Bayu|  180|         MAN| Ekonomi|
+---------+----+-----+------------+--------+



# Full Outer Join

In [46]:
# Full outer join: unmatched rows from both sides are kept, with nulls filling
# the columns of the side that has no match.
df1.join(df2, on="jurusan", how="full").show()

+---------+----+-----+------------+--------+
|  jurusan|nama|nilai|kode_jurusan|fakultas|
+---------+----+-----+------------+--------+
|Akuntansi|Dedi|   50|        null|    null|
|  Biologi|Budi|  200|         BIO|    MIPA|
|   Bisnis|null| null|         BIS| Ekonomi|
|   Fisika|Agus|  130|         FIS|    MIPA|
|Manajemen|Bayu|  180|         MAN| Ekonomi|
+---------+----+-----+------------+--------+



# Cross Join

In [53]:
# "cross" combined with a join key: the displayed result contains only the
# matching rows, i.e. the same rows as the inner join on "jurusan", not the
# full Cartesian product.
df1.join(df2, on="jurusan", how="cross").show()

+---------+----+-----+------------+--------+
|  jurusan|nama|nilai|kode_jurusan|fakultas|
+---------+----+-----+------------+--------+
|  Biologi|Budi|  200|         BIO|    MIPA|
|   Fisika|Agus|  130|         FIS|    MIPA|
|Manajemen|Bayu|  180|         MAN| Ekonomi|
+---------+----+-----+------------+--------+



In [54]:
# Join with neither a key nor a join type: the output pairs each df1 row with
# each df2 row.
(
    df1
    .join(df2)
    .show()
)

+----+---------+-----+---------+------------+--------+
|nama|  jurusan|nilai|  jurusan|kode_jurusan|fakultas|
+----+---------+-----+---------+------------+--------+
|Agus|   Fisika|  130|  Biologi|         BIO|    MIPA|
|Agus|   Fisika|  130|   Fisika|         FIS|    MIPA|
|Budi|  Biologi|  200|  Biologi|         BIO|    MIPA|
|Budi|  Biologi|  200|   Fisika|         FIS|    MIPA|
|Agus|   Fisika|  130|   Bisnis|         BIS| Ekonomi|
|Agus|   Fisika|  130|Manajemen|         MAN| Ekonomi|
|Budi|  Biologi|  200|   Bisnis|         BIS| Ekonomi|
|Budi|  Biologi|  200|Manajemen|         MAN| Ekonomi|
|Bayu|Manajemen|  180|  Biologi|         BIO|    MIPA|
|Bayu|Manajemen|  180|   Fisika|         FIS|    MIPA|
|Dedi|Akuntansi|   50|  Biologi|         BIO|    MIPA|
|Dedi|Akuntansi|   50|   Fisika|         FIS|    MIPA|
|Bayu|Manajemen|  180|   Bisnis|         BIS| Ekonomi|
|Bayu|Manajemen|  180|Manajemen|         MAN| Ekonomi|
|Dedi|Akuntansi|   50|   Bisnis|         BIS| Ekonomi|
|Dedi|Akuntansi|   50|Manajemen|         MAN| Ekonomi|
+----+---------+-----+---------+------------+--------+

In [55]:
# Explicit Cartesian product via crossJoin; the output pairs each df1 row with
# each df2 row.
(
    df1
    .crossJoin(df2)
    .show()
)

+----+---------+-----+---------+------------+--------+
|nama|  jurusan|nilai|  jurusan|kode_jurusan|fakultas|
+----+---------+-----+---------+------------+--------+
|Agus|   Fisika|  130|  Biologi|         BIO|    MIPA|
|Agus|   Fisika|  130|   Fisika|         FIS|    MIPA|
|Budi|  Biologi|  200|  Biologi|         BIO|    MIPA|
|Budi|  Biologi|  200|   Fisika|         FIS|    MIPA|
|Agus|   Fisika|  130|   Bisnis|         BIS| Ekonomi|
|Agus|   Fisika|  130|Manajemen|         MAN| Ekonomi|
|Budi|  Biologi|  200|   Bisnis|         BIS| Ekonomi|
|Budi|  Biologi|  200|Manajemen|         MAN| Ekonomi|
|Bayu|Manajemen|  180|  Biologi|         BIO|    MIPA|
|Bayu|Manajemen|  180|   Fisika|         FIS|    MIPA|
|Dedi|Akuntansi|   50|  Biologi|         BIO|    MIPA|
|Dedi|Akuntansi|   50|   Fisika|         FIS|    MIPA|
|Bayu|Manajemen|  180|   Bisnis|         BIS| Ekonomi|
|Bayu|Manajemen|  180|Manajemen|         MAN| Ekonomi|
|Dedi|Akuntansi|   50|   Bisnis|         BIS| Ekonomi|
|Dedi|Akuntansi|   50|Manajemen|         MAN| Ekonomi|
+----+---------+-----+---------+------------+--------+

# Duplikasi hasil

In [58]:
# Student rows in which the key "Manajemen" occurs twice (Bayu and Andi).
data_dup = [
    ["Agus", "Fisika", 130],
    ["Budi", "Biologi", 200],
    ["Bayu", "Manajemen", 180],
    ["Dedi", "Akuntansi", 50],
    ["Andi", "Manajemen", 80],
]
kolom = ["nama", "jurusan", "nilai"]

df_dup = spark.createDataFrame(data=data_dup, schema=kolom)
df_dup.show()

# Reference rows in which "Manajemen" also occurs twice (codes MAN and MNG).
dataref_dup = [
    ["Biologi", "BIO", "MIPA"],
    ["Fisika", "FIS", "MIPA"],
    ["Bisnis", "BIS", "Ekonomi"],
    ["Manajemen", "MAN", "Ekonomi"],
    ["Manajemen", "MNG", "Ekonomi"],
]
# `kolom` is rebound here to the reference table's column names.
kolom = ["jurusan", "kode_jurusan", "fakultas"]

ref_dup = spark.createDataFrame(data=dataref_dup, schema=kolom)
ref_dup.show()

+----+---------+-----+
|nama|  jurusan|nilai|
+----+---------+-----+
|Agus|   Fisika|  130|
|Budi|  Biologi|  200|
|Bayu|Manajemen|  180|
|Dedi|Akuntansi|   50|
|Andi|Manajemen|   80|
+----+---------+-----+

+---------+------------+--------+
|  jurusan|kode_jurusan|fakultas|
+---------+------------+--------+
|  Biologi|         BIO|    MIPA|
|   Fisika|         FIS|    MIPA|
|   Bisnis|         BIS| Ekonomi|
|Manajemen|         MAN| Ekonomi|
|Manajemen|         MNG| Ekonomi|
+---------+------------+--------+



In [60]:
# Inner join with a key repeated on both sides: each matching pair produces a
# row, so "Manajemen" (2 students x 2 reference rows) yields 4 rows.
df_dup.join(ref_dup, on="jurusan", how="inner").show()

+---------+----+-----+------------+--------+
|  jurusan|nama|nilai|kode_jurusan|fakultas|
+---------+----+-----+------------+--------+
|  Biologi|Budi|  200|         BIO|    MIPA|
|   Fisika|Agus|  130|         FIS|    MIPA|
|Manajemen|Bayu|  180|         MAN| Ekonomi|
|Manajemen|Bayu|  180|         MNG| Ekonomi|
|Manajemen|Andi|   80|         MAN| Ekonomi|
|Manajemen|Andi|   80|         MNG| Ekonomi|
+---------+----+-----+------------+--------+



In [61]:
# Left join with repeated keys: the matched "Manajemen" rows are multiplied
# the same way, and the unmatched "Akuntansi" row is kept with nulls.
df_dup.join(ref_dup, on="jurusan", how="left").show()

+---------+----+-----+------------+--------+
|  jurusan|nama|nilai|kode_jurusan|fakultas|
+---------+----+-----+------------+--------+
|   Fisika|Agus|  130|         FIS|    MIPA|
|  Biologi|Budi|  200|         BIO|    MIPA|
|Manajemen|Bayu|  180|         MNG| Ekonomi|
|Manajemen|Bayu|  180|         MAN| Ekonomi|
|Manajemen|Andi|   80|         MNG| Ekonomi|
|Manajemen|Andi|   80|         MAN| Ekonomi|
|Akuntansi|Dedi|   50|        null|    null|
+---------+----+-----+------------+--------+



In [66]:
# Right join with repeated keys: all reference rows are kept ("Bisnis" with
# nulls), and the "Manajemen" pairs are multiplied as before.
df_dup.join(ref_dup, on="jurusan", how="right").show()

+---------+----+-----+------------+--------+
|  jurusan|nama|nilai|kode_jurusan|fakultas|
+---------+----+-----+------------+--------+
|   Fisika|Agus|  130|         FIS|    MIPA|
|  Biologi|Budi|  200|         BIO|    MIPA|
|   Bisnis|null| null|         BIS| Ekonomi|
|Manajemen|Andi|   80|         MAN| Ekonomi|
|Manajemen|Bayu|  180|         MAN| Ekonomi|
|Manajemen|Andi|   80|         MNG| Ekonomi|
|Manajemen|Bayu|  180|         MNG| Ekonomi|
+---------+----+-----+------------+--------+



# Semi Join

In [48]:
# Semi join: the output keeps only df1's columns, for rows whose "jurusan"
# has a match in df2.
df1.join(df2, on="jurusan", how="semi").show()

+---------+----+-----+
|  jurusan|nama|nilai|
+---------+----+-----+
|  Biologi|Budi|  200|
|   Fisika|Agus|  130|
|Manajemen|Bayu|  180|
+---------+----+-----+



In [63]:
# Semi join with repeated keys: each matching left row appears once in the
# output, even though "Manajemen" matches two reference rows.
df_dup.join(ref_dup, on="jurusan", how="semi").show()

+---------+----+-----+
|  jurusan|nama|nilai|
+---------+----+-----+
|  Biologi|Budi|  200|
|   Fisika|Agus|  130|
|Manajemen|Bayu|  180|
|Manajemen|Andi|   80|
+---------+----+-----+



# Anti Join

In [51]:
# Anti join: the output keeps only the df1 rows whose "jurusan" has no match
# in df2 (Akuntansi).
df1.join(df2, on="jurusan", how="anti").show()

+---------+----+-----+
|  jurusan|nama|nilai|
+---------+----+-----+
|Akuntansi|Dedi|   50|
+---------+----+-----+



In [65]:
# Anti join on the tables with repeated keys: only the unmatched left row
# (Akuntansi) remains.
df_dup.join(ref_dup, on="jurusan", how="anti").show()

+---------+----+-----+
|  jurusan|nama|nilai|
+---------+----+-----+
|Akuntansi|Dedi|   50|
+---------+----+-----+

