In [1]:
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SparkSession
# Build (or reuse) a local-mode SparkSession and take the SparkContext from it.
# getOrCreate() keeps this cell safe to re-run: calling SparkContext("local") a
# second time in the same kernel fails with
# "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.master("local").getOrCreate()
sc = spark.sparkContext

# DataFrame joining example

In [2]:
from pyspark.sql.types import StringType, StructField, IntegerType, StructType
# --- Employees ---------------------------------------------------------------
# Every column is nullable; EmployeeID is stored as a string, Age as an integer.
employee_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("EmployeeID", StringType()),
        ("Name", StringType()),
        ("Age", IntegerType()),
        ("Department", StringType()),
    ]
])

# Sample rows, one tuple per employee, in schema column order.
employee_data = [
    ("1", "Alice", 30, "Engineering"),
    ("2", "Bob", 25, "Engineering"),
    ("3", "Charlie", 35, "HR"),
    ("4", "David", 28, "Finance"),
    ("5", "Eve", 22, "HR"),
]

employee_df = spark.createDataFrame(employee_data, employee_schema)

# --- Departments -------------------------------------------------------------
# Two nullable string columns: the department name and where it is located.
department_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("Department", "Location")
])

# Sample rows, one tuple per department (extend this list if needed).
department_data = [
    ("Engineering", "New York"),
    ("HR", "San Francisco"),
    ("Finance", "Los Angeles"),
]

department_df = spark.createDataFrame(department_data, department_schema)

# Print both DataFrames.
employee_df.show()
department_df.show()

+----------+-------+---+-----------+
|EmployeeID|   Name|Age| Department|
+----------+-------+---+-----------+
|         1|  Alice| 30|Engineering|
|         2|    Bob| 25|Engineering|
|         3|Charlie| 35|         HR|
|         4|  David| 28|    Finance|
|         5|    Eve| 22|         HR|
+----------+-------+---+-----------+

+-----------+-------------+
| Department|     Location|
+-----------+-------------+
|Engineering|     New York|
|         HR|San Francisco|
|    Finance|  Los Angeles|
+-----------+-------------+



# Join the DataFrames
When we join DataFrames that share a column name, the result contains two columns with that name, which makes it a problem to select one of them by name.


In [3]:
# Right join on Department. Each input contributes its own Department column,
# so the result ends up with two columns of that name.
same_department = employee_df["Department"] == department_df["Department"]
emp_dept = employee_df.join(department_df, on=same_department, how="right")
emp_dept.show()

+----------+-------+---+-----------+-----------+-------------+
|EmployeeID|   Name|Age| Department| Department|     Location|
+----------+-------+---+-----------+-----------+-------------+
|         1|  Alice| 30|Engineering|Engineering|     New York|
|         2|    Bob| 25|Engineering|Engineering|     New York|
|         4|  David| 28|    Finance|    Finance|  Los Angeles|
|         3|Charlie| 35|         HR|         HR|San Francisco|
|         5|    Eve| 22|         HR|         HR|San Francisco|
+----------+-------+---+-----------+-----------+-------------+



The following code raises the exception "AnalysisException: Reference 'Department' is ambiguous, could be: Department, Department." because the joined DataFrame contains two columns named `Department`.

In [4]:
from pyspark.sql.utils import AnalysisException

# 'Department' exists on both sides of the join, so selecting it by its bare
# name is ambiguous. The error is the point of this cell, so catch it and print
# it: an uncaught exception would stop "Restart & Run All" here and the cells
# below would never execute.
try:
    emp_dept.select('Name', 'Department', 'Location').show()
except AnalysisException as exc:
    print(f"{type(exc).__name__}: {exc}")

AnalysisException: Reference 'Department' is ambiguous, could be: Department, Department.

To avoid this exception, we can give each DataFrame an alias before joining, so that columns can be selected by their alias-qualified names.

In [5]:
# Same right join on Department, but each input is given an alias
# ('emp' / 'dept') so its columns can be addressed as '<alias>.<column>'.
emp_dept = (
    employee_df.alias('emp')
    .join(
        department_df.alias('dept'),
        on=employee_df.Department == department_df.Department,
        how='right',
    )
)
emp_dept.show()

+----------+-------+---+-----------+-----------+-------------+
|EmployeeID|   Name|Age| Department| Department|     Location|
+----------+-------+---+-----------+-----------+-------------+
|         1|  Alice| 30|Engineering|Engineering|     New York|
|         2|    Bob| 25|Engineering|Engineering|     New York|
|         4|  David| 28|    Finance|    Finance|  Los Angeles|
|         3|Charlie| 35|         HR|         HR|San Francisco|
|         5|    Eve| 22|         HR|         HR|San Francisco|
+----------+-------+---+-----------+-----------+-------------+



Use the alias `dept` to access the columns of the departments DataFrame and the alias `emp` to access the columns of the employees DataFrame.

In [6]:
# Qualify each Department with its DataFrame alias so the reference is unambiguous.
emp_dept.select([
    'Name',
    'dept.Department',
    'emp.Department',
    'dept.Location',
]).show()

+-------+-----------+-----------+-------------+
|   Name| Department| Department|     Location|
+-------+-----------+-----------+-------------+
|  Alice|Engineering|Engineering|     New York|
|    Bob|Engineering|Engineering|     New York|
|  David|    Finance|    Finance|  Los Angeles|
|Charlie|         HR|         HR|San Francisco|
|    Eve|         HR|         HR|San Francisco|
+-------+-----------+-----------+-------------+

