In [0]:

from pyspark.sql.types import StructType,StructField, StringType, IntegerType,DayTimeIntervalType,TimestampType
from pyspark.sql import functions as SF

# Column names and the text pattern of the raw call timestamps.
call_category_col = "call_category"
policy_holder_col = "policy_holder_id"
call_received_col = "call_received"
date_format = "MM-dd-yyyy HH:mm"

# Sample data: one tuple per call, (policy_holder_id, call_category, call_received as text).
callers_data = [
    (50837000, "claims", "03-09-2022 02:51"),
    (50837000, "IT_support", "03-12-2022 05:37"),
    (50837000, "benefits", "05-13-2022 18:19"),
    (50936674, "claims", "05-31-2022 07:27"),
    (50886837, "IT_support", "03-11-2022 03:38"),
    (50886837, "", "03-19-2022 10:52"),
]

# Schema of the raw rows; call_received is loaded as a string and parsed below.
mySchema = StructType([
    StructField(policy_holder_col, IntegerType(), True),
    StructField(call_category_col, StringType(), True),
    StructField(call_received_col, StringType(), True),
])

# Build the raw DataFrame from the sample rows defined above.
# NOTE(review): assumes `spark` is the SparkSession supplied by the notebook runtime.
callers_raw_df = spark.createDataFrame(data=callers_data, schema=mySchema)

# Data cleaning: replace the text column with a parsed timestamp column.
# The raw frame keeps its own name so re-running this cell always starts from the same input.
call_received = SF.to_timestamp(call_received_col, date_format).cast(TimestampType())
callers_df = callers_raw_df.withColumn(call_received_col, call_received)
callers_df.show()
callers_df.printSchema()

+----------------+-------------+-------------------+
|policy_holder_id|call_category|      call_received|
+----------------+-------------+-------------------+
|        50837000|       claims|2022-03-09 02:51:00|
|        50837000|   IT_support|2022-03-12 05:37:00|
|        50837000|     benefits|2022-05-13 18:19:00|
|        50936674|       claims|2022-05-31 07:27:00|
|        50886837|   IT_support|2022-03-11 03:38:00|
|        50886837|             |2022-03-19 10:52:00|
+----------------+-------------+-------------------+

root
 |-- policy_holder_id: integer (nullable = true)
 |-- call_category: string (nullable = true)
 |-- call_received: timestamp (nullable = true)



In [0]:
# Count the distinct policy holders who have a later call whose date is fewer than
# `max_days_between_calls` days after one of their earlier calls.
days_col = "days"
join_type = "inner"
final_col = "patient_count"
policy_holder_id2 = "policy_holder_id2"
call_received_col2 = "call_received2"
max_days_between_calls = 7  # exclusive upper bound on the day difference

# Right-hand side of the self-join: same rows as callers_df, with the id and timestamp
# columns renamed so they can be told apart from the left-hand side after joining.
callers_df2 = callers_df.select(
    SF.col(policy_holder_col).alias(policy_holder_id2),
    call_category_col,
    SF.col(call_received_col).alias(call_received_col2),
)

# Pair each call with every strictly later call made by the same policy holder.
join_cond1 = callers_df[policy_holder_col] == callers_df2[policy_holder_id2]
join_cond2 = callers_df2[call_received_col2] > callers_df[call_received_col]
# Day difference between the later call and the earlier call.
dateDiff = SF.datediff(SF.col(call_received_col2), SF.col(call_received_col))

# Keep result_df bound to the DataFrame itself; show() is called separately because it
# only prints and returns None.
result_df = (
    callers_df.join(callers_df2, on=[join_cond1, join_cond2], how=join_type)
    .withColumn(days_col, dateDiff)
    .filter(SF.col(days_col) < max_days_between_calls)
    .agg(SF.countDistinct(SF.col(policy_holder_col)).alias(final_col))
)
result_df.show()

+-------------+
|patient_count|
+-------------+
|            1|
+-------------+

