<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark/examples/04-joins.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Joins
- inner join
- left/right join
- full join
- left anti join
- cartesian product
- union/unionAll
- minus
- intersect

# Setting up PySpark

In [None]:
%pip install pyspark

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession used throughout the notebook:
# local master, fixed application name, Spark UI port set to 4050.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Spark Course')
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

# Handle on the SparkContext that backs the session
sc = spark.sparkContext

In [None]:
# Sample data: employees as (id, name, dpto) and departments as (dpto, deptname).
# Ids and department keys are integers everywhere, so the joins and set
# operations in this notebook compare values of the same type instead of
# depending on Spark's implicit string <-> number casts.
# Employee 107 references department 5, which is not in dpto_data, and
# department 4 has no employees: both cases show up in the outer/anti joins.
employee_data = [
    (101, "Chloe", 3),
    (102, "Paul", 1),
    (103, "John", 1),
    (104, "Lisa", 2),
    (105, "Evan", 3),
    (106, "Amy", 3),
    (107, "Jimmy", 5),
]
dpto_data = [(1, "Engineering"), (2, "Sales"), (3, "Marketing"), (4, "Finance")]

employee_columns = ["id", "name", "dpto"]
dpto_columns = ["dpto", "deptname"]

# Create the dataframes directly from the session (no intermediate RDD needed)
employee = spark.createDataFrame(employee_data, schema=employee_columns)
dpto = spark.createDataFrame(dpto_data, schema=dpto_columns)

In [None]:
# Display the employees dataframe
employee.show()

In [None]:
# Display the departments dataframe
dpto.show()

# Joins

In [None]:
# Inner join - keep only the rows whose dpto exists on both sides.
# First form: join on the shared column name.
employee.join(dpto, on=["dpto"], how="inner").show()
# or, second form: spell out the join condition between the two dataframes
employee.join(dpto, on=(employee.dpto == dpto.dpto), how="inner").show()

In [None]:
# Left join - keep every row from the left side (employee) and attach the
# matching row from the right side (dpto) when there is one
employee.join(dpto, on=(employee.dpto == dpto.dpto), how="left").show()

In [None]:
# Right join - keep every row from the right side (dpto) and attach the
# matching rows from the left side (employee) when there are any
employee.join(dpto, on=(employee.dpto == dpto.dpto), how="right").show()

In [None]:
# Full join - keep every row from both sides, matched where possible
employee.join(dpto, on=(employee.dpto == dpto.dpto), how="full").show()

In [None]:
# Left anti join - rows from the left side (employee) that have no match on the right
employee.join(dpto, on=(employee.dpto == dpto.dpto), how="left_anti").show()

# Right anti join - rows from the right side (dpto) that have no match on the left.
# Written as a left_anti join with the two dataframes swapped.
dpto.join(employee, on=(employee.dpto == dpto.dpto), how="left_anti").show()

### Using SQL

In [None]:
# Register both dataframes as temporary views so SQL can refer to them by name
dpto.createOrReplaceTempView("dpto")
employee.createOrReplaceTempView("employee")

# Bring all the employees, with or without a department
spark.sql(
    "select * from employee left join dpto using (dpto)"
).show()

# Union / Minus / Intersect

In [None]:
# The HR records are the employee dataframe built earlier in the notebook
employee_hr = employee

# The ERP records: people missing from HR, people also present in HR,
# and Amy, whose department differs from the HR record
data = [
    (200, "George", 5),
    (201, "Anna", 5),
    (202, "Carl", 3),
    (101, "Chloe", 3),
    (103, "John", 1),
    (106, "Amy", 1),
]
employee_erp = spark.createDataFrame(data, ["id", "name", "dpto"])

print("HR database")
employee_hr.show()

print("ERP database")
employee_erp.show()

In [None]:
# union / unionByName - stack the rows of both dataframes, matching columns by name
print("Combine both dataframes")
combined = employee_hr.unionByName(employee_erp)  # union, unionAll
combined.show()

# intersect - rows present in both dataframes
print("Get values that are common in both dataframes")
common = employee_hr.intersect(employee_erp)
common.show()

# exceptAll - rows of the first dataframe that are not in the second
print("Get only the difference - does not exist on the second dataframe")
only_in_hr = employee_hr.exceptAll(employee_erp)
only_in_hr.show()

# Questions

In [None]:
# Q1
# Implement Cartesian Product using dataframe and SQL
# Use employee and dpto

In [None]:
# Q2
# Implement "Left Anti Join" using SQL
# Use employee and dpto