In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

# Reuse the active Spark session if there is one; otherwise create it.
spark = SparkSession.builder.getOrCreate()

# --- Patients fixture -------------------------------------------------------
# Explicit schema so column types do not depend on inference from the tuples.
patients_schema = (
    StructType()
    .add("patient_id", IntegerType(), True)
    .add("patient_name", StringType(), True)
    .add("age", IntegerType(), True)
)

# Rows follow the schema order: (patient_id, patient_name, age).
patients_data = [
    (1, "Alice Smith", 28),
    (2, "Bob Johnson", 35),
    (3, "Carol Davis", 42),
    (4, "David Wilson", 31),
    (5, "Emma Brown", 29),
]

patients = spark.createDataFrame(patients_data, schema=patients_schema)

# --- Covid tests fixture ----------------------------------------------------
# test_date is stored as a 'YYYY-MM-DD' string here; cast it to DateType
# downstream when real date semantics are needed.
covid_tests_schema = (
    StructType()
    .add("test_id", IntegerType(), True)
    .add("patient_id", IntegerType(), True)
    .add("test_date", StringType(), True)
    .add("result", StringType(), True)
)

# Rows follow the schema order: (test_id, patient_id, test_date, result).
covid_tests_data = [
    (1, 1, "2023-01-15", "Positive"),
    (2, 1, "2023-01-25", "Negative"),
    (3, 2, "2023-02-01", "Positive"),
    (4, 2, "2023-02-05", "Inconclusive"),
    (5, 2, "2023-02-12", "Negative"),
    (6, 3, "2023-01-20", "Negative"),
    (7, 3, "2023-02-10", "Positive"),
    (8, 3, "2023-02-20", "Negative"),
    (9, 4, "2023-01-10", "Positive"),
    (10, 4, "2023-01-18", "Positive"),
    (11, 5, "2023-02-15", "Negative"),
    (12, 5, "2023-02-20", "Negative"),
]

covid_tests = spark.createDataFrame(covid_tests_data, schema=covid_tests_schema)


In [0]:
# Print both input tables (patients first, then tests) as a quick sanity check.
for preview_df in (patients, covid_tests):
    preview_df.show()

In [0]:
from pyspark.sql.functions import *
from pyspark.sql import Window

In [0]:
# Recovery time per patient: days from a positive test to a later negative test.
#
# `positive` and `negative` are both filters of `covid_tests`, so their columns
# share the same underlying attributes. Referring to positive.test_date and
# negative.test_date after joining them is an ambiguous self-join reference:
# depending on configuration Spark either rejects the query or compares the
# column with itself. Each side is therefore projected to a distinctly named,
# properly typed date column before the join.
positive = covid_tests.filter(col('result') == 'Positive')
negative = covid_tests.filter(col('result') == 'Negative')

positive_dates = positive.select(
    'patient_id',
    to_date(col('test_date')).alias('positive_date'),
)
negative_dates = negative.select(
    'patient_id',
    to_date(col('test_date')).alias('negative_date'),
)

# Rank candidate (positive, negative) pairs per patient. The negative date is a
# tie-breaker so that row_number() is deterministic when several negative tests
# follow the chosen positive one: the earliest following negative wins.
# NOTE(review): the latest qualifying positive test is used as the baseline,
# matching the original ordering; use .asc() on positive_date instead if the
# first positive test is the intended starting point.
recovery_window = Window.partitionBy('patient_id').orderBy(
    col('positive_date').desc(),
    col('negative_date').asc(),
)

first_recovered = (
    positive_dates.join(negative_dates, on='patient_id', how='inner')
    # Keep only negatives taken strictly after the positive test.
    .filter(col('positive_date') < col('negative_date'))
    .withColumn('rn', row_number().over(recovery_window))
    .filter(col('rn') == 1)
    .withColumn('recovery_time', datediff(col('negative_date'), col('positive_date')))
    .select('patient_id', 'recovery_time')
)

# Attach patient details; shortest recovery first, names break ties.
(
    first_recovered
    .join(patients, on='patient_id', how='inner')
    .select('patient_id', 'patient_name', 'age', 'recovery_time')
    .orderBy('recovery_time', 'patient_name')
    .show()
)
