# This notebook joins all the individual feature sets onto the main HCP target spine dataset to create a master table containing all the features.
### This dataset is the input to the machine learning model: it contains the HCPs with their corresponding target class flag and features

In [0]:
# Importing packages
from pyspark.sql import functions as F  # Importing functions from pyspark.sql
import pandas as pd

In [0]:
%run "../00_config/set-up"

In [0]:
# Manually controlled month / date parameters.
# Overall month range, written as YYYY-MM.
first_month, last_month = "2019-12", "2024-11"

# Study period bounds as full dates (YYYY-MM-DD) ...
study_period_start_date, study_period_end_date = "2023-01-01", "2024-11-30"
# ... and the same bounds at month granularity (YYYY-MM).
study_period_start_month, study_period_end_month = "2023-01", "2024-11"

In [0]:
def filter_rows_with_nulls(df):
    """
    Return the rows of a DataFrame that contain a null in at least one column.

    Args:
    df (DataFrame): The input Spark DataFrame to filter.

    Returns:
    DataFrame: A new DataFrame containing only the rows where at least one column is null.
    If ``df`` has no columns, the result is empty (no column can be null).
    """
    # Start from a constant FALSE so the condition is always a valid Column,
    # even when df has no columns (previously None was passed to filter()).
    any_column_is_null = F.lit(False)

    # OR together one "is null" check per column
    for column_name in df.columns:
        any_column_is_null = any_column_is_null | F.col(column_name).isNull()

    return df.filter(any_column_is_null)

### Reading in all the component feature sets needed to create a master table containing the features and the target variable

In [0]:
# Reading the calls activity data from Hivestore (table jivi_new_writer_model.monthly_hcp_calls_feats)
monthly_hcp_calls_feats_sdf = spark.sql("SELECT * FROM jivi_new_writer_model.monthly_hcp_calls_feats")
# Report the table's dimensions as a sanity check; count() is a Spark action, so this executes the query
print(
    "Row count: ",
    monthly_hcp_calls_feats_sdf.count(),
    "Column Count: ",
    len(monthly_hcp_calls_feats_sdf.columns),
)

In [0]:
# Preview up to 15 rows of the call-activity features
display(monthly_hcp_calls_feats_sdf.limit(15))

In [0]:
# Reading the HCP monthly target spine from Hivestore (table jivi_new_writer_model.hcp_target_spine)
hcp_target_spine_sdf = spark.sql("SELECT * FROM jivi_new_writer_model.hcp_target_spine")
# Report the table's dimensions as a sanity check; count() is a Spark action, so this executes the query
print(
    "Row count: ",
    hcp_target_spine_sdf.count(),
    "Column Count: ",
    len(hcp_target_spine_sdf.columns),
)

In [0]:
# Preview up to 15 rows of the target spine
display(hcp_target_spine_sdf.limit(15))

In [0]:
# Reading the HCP monthly features from Hivestore (table jivi_new_writer_model.all_hcp_monthly_features)
hcp_monthly_feats_all_sdf = spark.sql("SELECT * FROM jivi_new_writer_model.all_hcp_monthly_features")
# Report the table's dimensions as a sanity check; count() is a Spark action, so this executes the query
print(
    "Row count: ",
    hcp_monthly_feats_all_sdf.count(),
    "Column Count: ",
    len(hcp_monthly_feats_all_sdf.columns),
)

In [0]:
# Preview up to 100 rows of the HCP monthly features
display(hcp_monthly_feats_all_sdf.limit(100))

In [0]:
# Display the count of null values for each column in the hcp_monthly_feats_all_sdf DataFrame.
# when() without an otherwise() yields a non-null value only for null cells, and count() only
# counts non-null values, so each output column holds that column's number of nulls.
display(hcp_monthly_feats_all_sdf.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in hcp_monthly_feats_all_sdf.columns]))

### Joining the call activity feature set to the target spine

In [0]:
# Rename the calls table's key columns to the names used in the join below
# (BH_ID, COHORT_MONTH) so the tables can be joined on identically named columns
monthly_hcp_calls_feats_sdf = (
    monthly_hcp_calls_feats_sdf
    .withColumnRenamed("BAYER_HCP_ID", "BH_ID")
    .withColumnRenamed("CALL_MONTH", "COHORT_MONTH")
)

In [0]:
# Left-join the monthly HCP calls features onto the HCP target spine on BH_ID and COHORT_MONTH
# (BAYER_HCP_ID and CALL_MONTH in the calls table before the rename), then sort by
# JIVI_NEW_WRITER_FLG descending, followed by BH_ID and COHORT_MONTH.
hcp_monthly_calls_feats_with_target_sdf = (
    hcp_target_spine_sdf
    .join(monthly_hcp_calls_feats_sdf, on=["BH_ID", "COHORT_MONTH"], how="left")
    .orderBy(F.desc("JIVI_NEW_WRITER_FLG"), "BH_ID", "COHORT_MONTH")
)

# Report the dimensions of the joined table
print(
    "Row count: ", hcp_monthly_calls_feats_with_target_sdf.count(),
    "Column count: ", len(hcp_monthly_calls_feats_with_target_sdf.columns),
)

In [0]:
# Ensuring the count of rows in the HCP target spine table is equal to the count of rows in the
# features table after joining. A left join keeps every spine row, so a mismatch means rows were
# duplicated by repeated join keys on the calls side.
assert hcp_target_spine_sdf.count() == hcp_monthly_calls_feats_with_target_sdf.count(), (
    "Row count changed after left-joining the call features onto the target spine; "
    "check the calls feature table for duplicate (BH_ID, COHORT_MONTH) keys."
)

In [0]:
# Show the spine joined with the call features
display(hcp_monthly_calls_feats_with_target_sdf)

In [0]:
# Collect the rows of the joined table that contain a null in at least one column
rows_with_nulls = filter_rows_with_nulls(hcp_monthly_calls_feats_with_target_sdf)

# Display the filtered DataFrame
display(rows_with_nulls)

In [0]:
# Replace nulls with zeros. Author's rationale: the target spine is created from Overlap data,
# so many of its HCPs may have no calls at all, or may not even be on the call plan.
hcp_monthly_calls_feats_with_target_sdf = hcp_monthly_calls_feats_with_target_sdf.fillna(0)

In [0]:
# Checking the target class distribution in the features table:
# number of distinct BH_ID values per JIVI_NEW_WRITER_FLG value
display(hcp_monthly_calls_feats_with_target_sdf.groupBy('JIVI_NEW_WRITER_FLG').agg(F.countDistinct('BH_ID').alias('distinct_BH_ID_cnt')))

### Joining HCP features set to the target spine

In [0]:
# Left-join the HCP monthly features onto the spine-plus-calls table on BH_ID and COHORT_MONTH,
# then sort by JIVI_NEW_WRITER_FLG descending, followed by BH_ID and COHORT_MONTH.
hcp_monthly_feats_with_target_sdf = (
    hcp_monthly_calls_feats_with_target_sdf
    .join(hcp_monthly_feats_all_sdf, on=["BH_ID", "COHORT_MONTH"], how="left")
    .orderBy(F.desc("JIVI_NEW_WRITER_FLG"), "BH_ID", "COHORT_MONTH")
)

# Report the dimensions of the joined table
print(
    "Row count: ", hcp_monthly_feats_with_target_sdf.count(),
    "Column count: ", len(hcp_monthly_feats_with_target_sdf.columns),
)

In [0]:
# display(hcp_monthly_feats_with_target_sdf)

In [0]:
# Ensuring the count of rows in the HCP target spine table is equal to the count of rows in the
# features table after joining. Left joins keep every spine row, so a mismatch means rows were
# duplicated by repeated join keys in one of the joined feature tables.
assert hcp_target_spine_sdf.count() == hcp_monthly_feats_with_target_sdf.count(), (
    "Row count changed after left-joining the feature tables onto the target spine; "
    "check the joined feature tables for duplicate (BH_ID, COHORT_MONTH) keys."
)

In [0]:
# Collect the rows of the fully joined table that contain a null in at least one column.
# Disabled alternative that drops column AFFL_WI_INSN before the null check:
# rows_with_nulls = filter_rows_with_nulls(hcp_monthly_feats_with_target_sdf.drop("AFFL_WI_INSN"))

rows_with_nulls = filter_rows_with_nulls(hcp_monthly_feats_with_target_sdf)

# Display the filtered DataFrame
display(rows_with_nulls)

In [0]:
# Sum the target flag over the rows that contain nulls, i.e. how many of those rows are
# positive-class (assuming a 0/1 flag — TODO confirm).
# F.sum is the Spark column aggregate; the Python builtin sum() cannot aggregate a column name.
display(rows_with_nulls.agg(F.sum("JIVI_NEW_WRITER_FLG")))

In [0]:
# Author's rationale: the rows with nulls mostly come from the negative class, so the missing
# values are filled with zeros.
# NOTE(review): fillna(0) should only affect numeric columns; nulls in string-typed columns
# (if any) would remain — confirm against the schema.
hcp_monthly_feats_with_target_sdf = hcp_monthly_feats_with_target_sdf.fillna(0)

In [0]:
# Show the master table after null filling
display(hcp_monthly_feats_with_target_sdf)

## Saving the features master table to Hivestore

In [0]:
# Save the master table. save_sdf is not defined in this notebook (presumably provided by the
# set-up notebook run earlier); its string arguments look like the target schema and table
# name — confirm against its definition.
save_sdf(hcp_monthly_feats_with_target_sdf, 'jivi_new_writer_model', 'hcp_feats_master_w_target')