### Manipulação de Dados com DataFrames no PySpark - Joins

### Criando DataFrames de Exemplo e Tabelas Temporárias


In [0]:
# Display the active SparkSession. `spark` is not created in this notebook;
# presumably it is injected by the runtime (e.g. Databricks) — confirm before
# running elsewhere.
spark

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import logging

# Schemas: both DataFrames share a nullable integer "id" column used as join key.
schema1 = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
)

schema2 = (
    StructType()
    .add("id", IntegerType(), True)
    .add("gender", StringType(), True)
)

# Sample rows: ids 1 and 2 exist on both sides, 3 and 4 only in data1 and
# 5 only in data2, so both matched and unmatched rows exist on each side.
data1 = [
    (1, "Alice"),
    (2, "Bob"),
    (3, "Charlie"),
    (4, "David"),
]
data2 = [
    (1, "Feminino"),
    (2, "Masculino"),
    (5, "Outro"),
]

# Build each DataFrame and register it as a temp view so the %sql cells can
# query it under the same name.
df1 = spark.createDataFrame(data1, schema1)
df1.createOrReplaceTempView("df1")

df2 = spark.createDataFrame(data2, schema2)
df2.createOrReplaceTempView("df2")

# Show the first sample DataFrame.
print("DataFrame 1:")
display(df1)

In [0]:
# Show the second sample DataFrame.
print("DataFrame 2:")
display(df2)


### Exemplos de Joins com DataFrames e Equivalente SQL

---

#### INNER JOIN

![INNER_JOIN.drawio.png](https://raw.githubusercontent.com/rafael-negrao/laboratorio-spark/main/imagens/INNER_JOIN.drawio.png)


In [0]:
# Inner join in PySpark: keeps only the ids present in both DataFrames.
df_inner = df1.join(df2, on="id", how="inner")
display(df_inner)


**Equivalente em SQL**


In [0]:
%sql
-- Inner join: only ids present in both views are returned.
SELECT
  df1.id,
  df1.name,
  df2.gender
FROM df1
INNER JOIN df2
  ON df1.id = df2.id


---

#### LEFT JOIN

![LEFT_JOIN.drawio.png](https://raw.githubusercontent.com/rafael-negrao/laboratorio-spark/main/imagens/LEFT_JOIN.drawio.png)


In [0]:
# Left join in PySpark: keeps every df1 row; gender is null when df2 has no match.
df_left = df1.join(df2, on="id", how="left")
display(df_left)


**Equivalente em SQL**


In [0]:
%sql
-- Left join: every df1 row is kept; df2 columns are NULL when there is no match.
SELECT
  df1.id,
  df1.name,
  df2.gender
FROM df1
LEFT JOIN df2
  ON df1.id = df2.id


---

#### RIGHT JOIN

![RIGHT_JOIN.drawio.png](https://raw.githubusercontent.com/rafael-negrao/laboratorio-spark/main/imagens/RIGHT_JOIN.drawio.png)


In [0]:
# Right join in PySpark: keeps every df2 row; name is null when df1 has no match.
df_right = df1.join(df2, on="id", how="right")
display(df_right)


**Equivalente em SQL**


In [0]:
%sql
-- Right join: every df2 row is kept; df1 columns are NULL when there is no match.
-- The id must come from df2: df1.id is NULL for the unmatched df2 row (id 5),
-- while the DataFrame join on "id" returns the right-side key for a right join.
SELECT
  df2.id,
  df1.name,
  df2.gender
FROM df1
RIGHT JOIN df2
  ON df1.id = df2.id

---

#### FULL OUTER JOIN

![FULL_OUTER_JOIN.drawio.png](https://raw.githubusercontent.com/rafael-negrao/laboratorio-spark/main/imagens/FULL_OUTER_JOIN.drawio.png)


In [0]:
# Full Outer Join no PySpark
df_full_outer = df1.join(df2, "id", "full")
display(df_full_outer)


**Equivalente em SQL**


In [0]:
%sql
-- Full outer join: every row from both views is kept.
-- COALESCE reproduces the single "id" column of the DataFrame join on "id";
-- selecting df1.id alone would return NULL for rows that exist only in df2 (id 5).
SELECT
  COALESCE(df1.id, df2.id) AS id,
  df1.name,
  df2.gender
FROM df1
FULL OUTER JOIN df2
  ON df1.id = df2.id


---

#### LEFT SEMI JOIN


In [0]:
# Left semi join in PySpark: df1 rows that have a match in df2, df1 columns only.
df_left_semi = df1.join(df2, on="id", how="left_semi")
display(df_left_semi)


**Equivalente em SQL**


In [0]:
%sql
-- Left semi join: df1 rows that have a match in df2; only df1 columns are selected.
SELECT
  df1.id,
  df1.name
FROM df1
LEFT SEMI JOIN df2
  ON df1.id = df2.id


---

#### LEFT ANTI JOIN

![LEFT_ANTI_JOIN.drawio.png](https://raw.githubusercontent.com/rafael-negrao/laboratorio-spark/main/imagens/LEFT_ANTI_JOIN.drawio.png)


In [0]:
# Left Anti Join no PySpark
df_left_anti = df1.join(df2, "id", "left_anti")
display(df_left_anti)


**Equivalente em SQL**


In [0]:
%sql
-- Left anti join: df1 rows that have no match in df2; only df1 columns are selected.
SELECT
  df1.id,
  df1.name
FROM df1
LEFT ANTI JOIN df2
  ON df1.id = df2.id


---

#### ANTI LEFT JOIN

![ANTI_LEFT_JOIN.drawio.png](https://raw.githubusercontent.com/rafael-negrao/laboratorio-spark/main/imagens/ANTI_LEFT_JOIN.drawio.png)

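> Produz as mesmas linhas do **LEFT ANTI JOIN**, mas o resultado também traz a coluna `gender` de `df2` (sempre nula), enquanto o **LEFT ANTI JOIN** retorna apenas as colunas de `df1`.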


In [0]:
import pyspark.sql.functions as F
# Anti left join in PySpark: left join, then keep only the rows where the
# df2 side of the key is null, i.e. df1 rows without a match in df2.
df_anti_left = (
    df1.alias("df1")
    .join(df2.alias("df2"), on="id", how="left")
    .where(F.col("df2.id").isNull())
)
display(df_anti_left)


**Equivalente em SQL**


In [0]:
%sql
-- Anti left join: left join, then keep only the df1 rows with no match in df2.
SELECT *
FROM df1
LEFT JOIN df2
  ON df1.id = df2.id
WHERE df2.id IS NULL


---

#### ANTI RIGHT JOIN

![ANTI_RIGHT_JOIN.drawio.png](https://raw.githubusercontent.com/rafael-negrao/laboratorio-spark/main/imagens/ANTI_RIGHT_JOIN.drawio.png)


In [0]:
import pyspark.sql.functions as F
# Anti right join in PySpark: right join, then keep only the rows where the
# df1 side of the key is null, i.e. df2 rows without a match in df1.
df_anti_right = (
    df1.alias("df1")
    .join(df2.alias("df2"), on="id", how="right")
    .where(F.col("df1.id").isNull())
)
display(df_anti_right)


**Equivalente em SQL**


In [0]:
%sql
-- Anti right join: right join, then keep only the df2 rows with no match in df1.
SELECT *
FROM df1
RIGHT JOIN df2
  ON df1.id = df2.id
WHERE df1.id IS NULL


---

#### ANTI OUTER JOIN

![ANTI_OUTER_JOIN.drawio.png](https://raw.githubusercontent.com/rafael-negrao/laboratorio-spark/main/imagens/ANTI_OUTER_JOIN.drawio.png)


In [0]:
import pyspark.sql.functions as F
# Anti outer join in PySpark: full join, then keep only the rows where either
# side of the key is null, i.e. rows that exist in just one of the DataFrames.
df_anti_outer = (
    df1.alias("df1")
    .join(df2.alias("df2"), on="id", how="full")
    .where(F.col("df1.id").isNull() | F.col("df2.id").isNull())
)
display(df_anti_outer)


**Equivalente em SQL**


In [0]:
%sql
-- Anti outer join: full outer join, then keep only rows unmatched on either side.
SELECT *
FROM df1
FULL OUTER JOIN df2
  ON df1.id = df2.id
WHERE df1.id IS NULL
   OR df2.id IS NULL


---

#### CROSS JOIN

![CROSS_JOIN.drawio.png](https://raw.githubusercontent.com/rafael-negrao/laboratorio-spark/main/imagens/CROSS_JOIN.drawio.png)



In [0]:
# Cross join in PySpark: pairs every df1 row with every df2 row; no join key is
# used, so the result carries the "id" column of both DataFrames.
df_cross = df1.crossJoin(df2)
display(df_cross)


**Equivalente em SQL**


In [0]:
%sql
-- Cross join: every df1 row paired with every df2 row (no join condition).
SELECT
  df1.id,
  df1.name,
  df2.id,
  df2.gender
FROM df1
CROSS JOIN df2

`[INFO]: FIM DO NOTEBOOK`
