## Task

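Given a dataset with columns PERSON, TYPE, and AGE, pair each adult with a child by age rank: the oldest adult is paired with the youngest child, the second-oldest adult with the second-youngest child, and so on. The output contains one row per ADULT–CHILD pair; if one group has more members than the other, the unmatched people are kept and paired with NULL.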

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Obtain the Spark session used by every cell below
# (getOrCreate returns the existing session if one is already running)
spark = (
    SparkSession.builder
    .appName("AdultChildPairs")
    .getOrCreate()
)

In [2]:
# Column names, in the same order as the fields of each record below
columns = ["PERSON", "TYPE", "AGE"]

# Sample records: five adults followed by four children
data = [
    ("A1", "ADULT", 54), ("A2", "ADULT", 53), ("A3", "ADULT", 52),
    ("A4", "ADULT", 58), ("A5", "ADULT", 54),
    ("C1", "CHILD", 20), ("C2", "CHILD", 19), ("C3", "CHILD", 22),
    ("C4", "CHILD", 15),
]

# Build the input DataFrame; column types are inferred from the tuples
df = spark.createDataFrame(data, schema=columns)

In [6]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Rank each group by age so that equal ranks can be paired up in the next step.
# PERSON is a secondary sort key: two adults (A1, A5) share AGE 54, and without
# a tie-breaker row_number() may number the tied rows differently between runs,
# which would change who gets paired with whom.
# Neither window has a partitionBy, so Spark ranks all rows in a single
# partition - acceptable for this small dataset.
window_spec_desc = Window.orderBy(col('AGE').desc(), col('PERSON').asc())
window_spec_asc = Window.orderBy(col('AGE').asc(), col('PERSON').asc())

# Children, youngest first: row_number 1 is the youngest child
df_child = (
    df.filter(col('TYPE') == 'CHILD')
      .withColumn('row_number', row_number().over(window_spec_asc))
)
df_child.show()

# Adults, oldest first: row_number 1 is the oldest adult
df_adult = (
    df.filter(col('TYPE') == 'ADULT')
      .withColumn('row_number', row_number().over(window_spec_desc))
)
df_adult.show()

+------+-----+---+----------+
|PERSON| TYPE|AGE|row_number|
+------+-----+---+----------+
|    C4|CHILD| 15|         1|
|    C2|CHILD| 19|         2|
|    C1|CHILD| 20|         3|
|    C3|CHILD| 22|         4|
+------+-----+---+----------+

+------+-----+---+----------+
|PERSON| TYPE|AGE|row_number|
+------+-----+---+----------+
|    A4|ADULT| 58|         1|
|    A1|ADULT| 54|         2|
|    A5|ADULT| 54|         3|
|    A2|ADULT| 53|         4|
|    A3|ADULT| 52|         5|
+------+-----+---+----------+



In [8]:
# Pair adults and children rank-for-rank. The full outer join keeps people whose
# rank has no counterpart in the other group; their partner columns are NULL.
# Spark does not guarantee row order after a join, so sort by rank explicitly
# before dropping the helper column.
# NOTE: the result carries PERSON/TYPE/AGE twice (adult columns first, then the
# child columns), so referring to them by bare name downstream is ambiguous.
df_pairs = (
    df_adult.join(df_child, on='row_number', how='full_outer')
            .orderBy('row_number')
            .drop('row_number')
)

df_pairs.show()

+------+-----+---+------+-----+----+
|PERSON| TYPE|AGE|PERSON| TYPE| AGE|
+------+-----+---+------+-----+----+
|    A4|ADULT| 58|    C4|CHILD|  15|
|    A1|ADULT| 54|    C2|CHILD|  19|
|    A5|ADULT| 54|    C1|CHILD|  20|
|    A2|ADULT| 53|    C3|CHILD|  22|
|    A3|ADULT| 52|  NULL| NULL|NULL|
+------+-----+---+------+-----+----+

