In [0]:
import pandas as pd
from pyspark.sql import functions as F

In [0]:
%run "../00_config/set-up"

In [0]:
%run "/Workspace/Repos/yuan.niu@bayer.com/heme_new_writer_models_dev_repo/02_data_processing/helper_functions"

In [0]:
# Earliest and latest shipment months ('YYYY-MM') referenced by this notebook
first_month, last_month = "2019-12", "2024-11"

# Study period bounds; the month keys are the 'YYYY-MM' prefix of the corresponding dates
study_period_start_date = "2023-01-01"
study_period_end_date = "2024-11-30"
study_period_start_month = study_period_start_date[:7]
study_period_end_month = study_period_end_date[:7]

# Number of months to look back from the first prescription event
rx_lookback_months = 24

In [0]:
def flag_jivi_writers(df, start_month, end_month):
    """
    Build a per-HCP Boolean flag telling whether any 'JIVI' record exists in a month window.

    Parameters:
    df (pd.DataFrame): Prescription data with 'BH_ID', 'PRD_NM' and 'SHP_YR_MO' columns.
    start_month (str): First month of the window (inclusive), 'YYYY-MM'.
    end_month (str): Last month of the window (inclusive), 'YYYY-MM'.

    Returns:
    pd.DataFrame: One row per 'BH_ID' present in the window, with the Boolean column
    'jivi_rx_flg' set to True when at least one of that HCP's records has PRD_NM == 'JIVI'.
    """
    # Keep only records whose shipment month lies inside the window
    month = df['SHP_YR_MO']
    in_window = df[(month >= start_month) & (month <= end_month)]

    # Row-level JIVI indicator, collapsed to "any JIVI row" per HCP
    is_jivi = in_window['PRD_NM'].eq('JIVI')
    hcp_flags = is_jivi.groupby(in_window['BH_ID']).any()

    return hcp_flags.reset_index(name='jivi_rx_flg')

In [0]:
def get_first_rx_event(data, date_col, id_col, drg_nm_col, drg_nm, study_period_start_date, study_period_end_date, rx_lookback_months):
    """
    Find each healthcare provider's first prescription date within the study period and
    derive the start of the look-back window that precedes it.

    The input DataFrame is not modified; date parsing is done on a derived frame.

    Parameters:
    data (pd.DataFrame): The input data containing prescription records.
    date_col (str): The name of the column containing prescription dates.
    id_col (str): The name of the column containing unique identifiers for healthcare providers.
    drg_nm_col (str): The name of the column containing drug names.
    drg_nm (str): The specific drug name to filter the data. If None (or empty), no filtering on drug name is applied.
    study_period_start_date (str): The start date of the study period in 'YYYY-MM-DD' format (inclusive).
    study_period_end_date (str): The end date of the study period in 'YYYY-MM-DD' format (inclusive).
    rx_lookback_months (int): The number of months to look back from the first prescription date.

    Returns:
    pd.DataFrame: One row per healthcare provider with the columns
    [id_col, 'OVP_FST_<drg_nm>_DT' or 'OVP_FST_RX_DT', 'OVP_FST_<drg_nm>_MONTH' or 'OVP_FST_RX_MONTH',
    'rx_look_back_start_dt', 'rx_look_back_start_month'].
    """
    # Parse the dates on a new frame so the caller's DataFrame is left untouched
    data = data.assign(**{date_col: pd.to_datetime(data[date_col])})

    # Rows inside the study period (both bounds inclusive), optionally restricted to one drug
    in_scope = (data[date_col] >= study_period_start_date) & (data[date_col] <= study_period_end_date)
    if drg_nm:
        in_scope &= data[drg_nm_col] == drg_nm
    df_rx = data.loc[in_scope]

    # Define column names based on drug name
    first_rx_date_col = f'OVP_FST_{drg_nm}_DT' if drg_nm else 'OVP_FST_RX_DT'
    first_rx_month_col = f'OVP_FST_{drg_nm}_MONTH' if drg_nm else 'OVP_FST_RX_MONTH'

    # Earliest in-scope prescription date per healthcare provider
    first_rx = df_rx.groupby(id_col)[date_col].min().reset_index(name=first_rx_date_col)

    # Look-back start date: first prescription date minus (rx_lookback_months + 1) months.
    # NOTE(review): with month-level filters of the form
    # "SHP_YR_MO >= rx_look_back_start_month and SHP_YR_MO < first Rx month" this spans
    # rx_lookback_months + 1 calendar months -- confirm the extra month is intended.
    first_rx['rx_look_back_start_dt'] = first_rx[first_rx_date_col] - pd.DateOffset(months=rx_lookback_months + 1)

    # Year-month ('YYYY-MM') keys for the first prescription date and the look-back start date
    first_rx[first_rx_month_col] = first_rx[first_rx_date_col].dt.strftime('%Y-%m')
    first_rx['rx_look_back_start_month'] = first_rx['rx_look_back_start_dt'].dt.strftime('%Y-%m')

    # Return the relevant columns
    return first_rx[[id_col, first_rx_date_col, first_rx_month_col, 'rx_look_back_start_dt', 'rx_look_back_start_month']]

In [0]:
# Load the preprocessed overlap table and report its dimensions
overlap_subset = spark.sql("SELECT * FROM heme_data.overlap_preprocessed")
n_rows, n_cols = overlap_subset.count(), len(overlap_subset.columns)
print("Row count: ", n_rows, "Column Count: ", n_cols)

In [0]:
display(overlap_subset.limit(15))

In [0]:
# Collect the full Spark DataFrame into a pandas DataFrame; the pandas-based steps below read from this copy.
overlap_subset_pdf = overlap_subset.toPandas()

**Number of unique patients in the Study period**

In [0]:
# Distinct patients with a shipment month inside the study period (both bounds inclusive)
in_study_period = F.col('SHP_YR_MO').between(study_period_start_month, study_period_end_month)
unique_patient_count = overlap_subset.filter(in_study_period).agg(
    F.countDistinct('PATIENT_ID').alias('unique_pat_cnt')
)

display(unique_patient_count)

**Number of unique JIVI patients in the Study period**

In [0]:
# Distinct patients with a JIVI record whose shipment month is inside the study period (inclusive)
in_study_period = F.col('SHP_YR_MO').between(study_period_start_month, study_period_end_month)
is_jivi_record = F.col('PRD_NM') == 'JIVI'
unique_jivi_patient_count = overlap_subset.filter(in_study_period & is_jivi_record).agg(
    F.countDistinct('PATIENT_ID').alias('unique_JIVI_pat_cnt')
)

display(unique_jivi_patient_count)

### Step 1: Identify unique HCPs who have patients (any drugs) between Jan-2023 and Nov-2024.

In [0]:
# Distinct patient count per HCP (BH_ID) for the study period, ordered by that count
study_period_rows = overlap_subset.filter(
    F.col('SHP_YR_MO').between(study_period_start_month, study_period_end_month)
)
study_period_hcp_pats_cnt = (
    study_period_rows
    .groupBy('BH_ID')
    .agg(F.countDistinct('PATIENT_ID').alias('pats_cnt'))
    .orderBy('pats_cnt')
)
print("Row count: ", study_period_hcp_pats_cnt.count(), "Column Count: ", len(study_period_hcp_pats_cnt.columns))

In [0]:
display(study_period_hcp_pats_cnt)

### Check: How many HCPs have at least one patient (any drugs) in between Dec'19 and Dec'22

In [0]:
# Distinct patient count per HCP (BH_ID) for the prior period: first_month (inclusive)
# up to, but not including, the study period start month
shp_month = F.col('SHP_YR_MO')
prior_period_rows = overlap_subset.filter(
    (shp_month >= first_month) & (shp_month < study_period_start_month)
)
prior_period_hcp_pats_cnt = (
    prior_period_rows
    .groupBy('BH_ID')
    .agg(F.countDistinct('PATIENT_ID').alias('pats_cnt'))
    .orderBy('pats_cnt')
)
print("Row count: ", prior_period_hcp_pats_cnt.count(), "Column Count: ", len(prior_period_hcp_pats_cnt.columns))

In [0]:
display(prior_period_hcp_pats_cnt)

In [0]:
# Compare the HCP populations of the prior period and the study period

prior_period_hcp_pats_cnt_pdf = prior_period_hcp_pats_cnt.toPandas()
study_period_hcp_pats_cnt_pdf = study_period_hcp_pats_cnt.toPandas()

# Distinct BH_IDs seen in each period, and those seen in both
prior_bh_ids = set(prior_period_hcp_pats_cnt_pdf['BH_ID'])
study_bh_ids = set(study_period_hcp_pats_cnt_pdf['BH_ID'])
common_bh_ids = prior_bh_ids & study_bh_ids

print("Number of HCPs with atleast one patient in period prior to study period:", len(prior_bh_ids))
print("Number of HCPs with atleast one patient in study period:", len(study_bh_ids))

print("Number of HCPs with atleast one patient in prior period and one patient in study period:", len(common_bh_ids))

In [0]:
# Number of HCPs with at least one patient in either period (prior + study - common),
# computed from the ID sets instead of counts copied from earlier output
len(prior_bh_ids | study_bh_ids)

### Step 2: Identify HCPs who prescribed Jivi between Jan'23 and Nov'24

In [0]:
# One row per HCP active in the study period, flagged True when the HCP has any JIVI record in it
study_period_flagged_hcps = flag_jivi_writers(
    df=overlap_subset_pdf,
    start_month=study_period_start_month,
    end_month=study_period_end_month,
)

In [0]:
# Split the flagged HCPs into JIVI writers and non-writers for the study period
is_jivi_writer = study_period_flagged_hcps['jivi_rx_flg'].astype(bool)
study_period_jivi_rx_hcps = study_period_flagged_hcps.loc[is_jivi_writer, 'BH_ID'].unique().tolist()
study_period_no_jivi_rx_hcps = study_period_flagged_hcps.loc[~is_jivi_writer, 'BH_ID'].unique().tolist()

In [0]:
print("Number of HCPs with prescriptions in the study period: ", study_period_flagged_hcps.shape[0])

In [0]:
print("Number of HCPs with Jivi Rx during study period: ", len(study_period_jivi_rx_hcps))

In [0]:
print("Number of HCPs which DO NOT HAVE Jivi Rx during study period: ", len(study_period_no_jivi_rx_hcps))

**HCPs with a Jivi prescription during the study period and at least one patient in both the prior period (Dec-2019 to Dec-2022) and the study period (Jan-2023 to Nov-2024)**

In [0]:
# JIVI writers of the study period that also have at least one patient in the prior period
# and at least one in the study period
inclusion_jivi_bh_ids = common_bh_ids & set(study_period_jivi_rx_hcps)
n_inclusion_jivi = len(inclusion_jivi_bh_ids)
print("Number of HCPs with Jivi Rx during study period but have atleast one patient from Dec-2019 to Dec-2022 and one patient from Jan-2023 to Nov-2024: ", n_inclusion_jivi)

**HCPs with NO Jivi prescription during the study period and at least one patient in both the prior period (Dec-2019 to Dec-2022) and the study period (Jan-2023 to Nov-2024)**

In [0]:
# HCPs without any JIVI Rx during the study period that have at least one patient in the
# prior period and at least one patient in the study period (common_bh_ids)
inclusion_bh_ids = common_bh_ids.intersection(study_period_no_jivi_rx_hcps)
print("Number of HCPs with NO Jivi Rx during study period but have atleast one patient from Dec-2019 to Dec-2022 and one patient from Jan-2023 to Nov-2024: ", len(inclusion_bh_ids))

In [0]:
# hcp_patient_cnts = overlap_subset_pdf.query('SHP_YR_MO >= @first_month and SHP_YR_MO <= @last_month').groupby('BH_ID')['PATIENT_ID'].nunique().sort_values(ascending=True).reset_index(name='pats_cnt')
# hcps_with_atleast_one_pat = hcp_patient_cnts.query('pats_cnt >= 1')


In [0]:
# hcps_with_atleast_one_pat.BH_ID.nunique()

In [0]:
# len(set(study_period_jivi_rx_hcps).intersection(set(hcps_with_atleast_one_pat.BH_ID)))

### Step 3: For Jivi NEW prescriber HCPs during study period

In [0]:
# All records (any month, any product) of the HCPs flagged as JIVI writers in the study period.
# NOTE(review): this filters on study_period_jivi_rx_hcps, not on inclusion_jivi_bh_ids computed
# earlier -- confirm whether the prior-period inclusion criterion should be applied here.
study_period_jivi_rx_hcps_data = overlap_subset_pdf[overlap_subset_pdf['BH_ID'].isin(study_period_jivi_rx_hcps)]
study_period_jivi_rx_hcps_data.shape

In [0]:
# display(study_period_jivi_rx_hcps_data)

In [0]:
# First JIVI Rx date per HCP inside the study period, plus the derived look-back window start
jivi_first_rx_dates = get_first_rx_event(
    data=study_period_jivi_rx_hcps_data.copy(),
    date_col='SHP_DT',
    id_col='BH_ID',
    drg_nm_col='PRD_NM',
    drg_nm='JIVI',
    study_period_start_date=study_period_start_date,
    study_period_end_date=study_period_end_date,
    rx_lookback_months=rx_lookback_months,
)

In [0]:
# jivi_first_rx_dates is a pandas DataFrame; use the notebook display() function as the other cells do
display(jivi_first_rx_dates)

In [0]:
# Attach each HCP's first-JIVI-Rx date and look-back window start to every one of its records
study_period_jivi_rx_hcps_with_first_rx = study_period_jivi_rx_hcps_data.merge(
    jivi_first_rx_dates,
    on='BH_ID',
    how='inner',
)
print(study_period_jivi_rx_hcps_with_first_rx.shape)

In [0]:
display(study_period_jivi_rx_hcps_with_first_rx)

In [0]:
study_period_jivi_rx_hcps_with_first_rx['BH_ID'].nunique()

In [0]:
# function to use in apply() on grouped data.
def count_jivi_records_in_lookback(grouped_data, first_rx_month_col='OVP_FST_JIVI_MONTH'):
  """
  Count JIVI and non-JIVI records and distinct patients inside the look-back window.

  A record is inside the window when its 'SHP_YR_MO' is >= 'rx_look_back_start_month'
  and strictly < the value in `first_rx_month_col` (row-wise comparison of 'YYYY-MM' strings).

  Parameters:
  grouped_data (pd.DataFrame): Records (typically one HCP's group) with the columns 'PRD_NM',
      'SHP_YR_MO', 'PATIENT_ID', 'rx_look_back_start_month' and `first_rx_month_col`.
  first_rx_month_col (str): Column holding the first-Rx month that closes the window.

  Returns:
  pd.Series: 'jivi_rx_cnt_in_lookback', 'jivi_pat_cnt_in_lookback',
      'other_rx_cnt_in_lookback', 'other_pat_cnt_in_lookback'.
  """
  # Evaluate the window membership once and reuse it for both product splits
  shp_month = grouped_data['SHP_YR_MO']
  in_lookback = (shp_month < grouped_data[first_rx_month_col]) & (shp_month >= grouped_data['rx_look_back_start_month'])

  jivi_res_df = grouped_data[in_lookback & (grouped_data['PRD_NM'] == 'JIVI')]
  other_res_df = grouped_data[in_lookback & (grouped_data['PRD_NM'] != 'JIVI')]

  return pd.Series({
        'jivi_rx_cnt_in_lookback': jivi_res_df.shape[0],  # number of JIVI records in the window
        'jivi_pat_cnt_in_lookback': jivi_res_df['PATIENT_ID'].nunique(),
        'other_rx_cnt_in_lookback': other_res_df.shape[0],
        'other_pat_cnt_in_lookback': other_res_df['PATIENT_ID'].nunique(),
    })

In [0]:
# One row per BH_ID with the look-back counts returned by count_jivi_records_in_lookback for that HCP's records
jivi_hcp_jivi_rx_cnt_in_lookback = study_period_jivi_rx_hcps_with_first_rx.groupby('BH_ID').apply(count_jivi_records_in_lookback).reset_index()

In [0]:
# Look-back counts for the HCPs with a JIVI Rx during the study period. The JIVI Rx count in the
# look-back window separates new writers (no JIVI Rx in the window) from existing writers (at
# least one). The other-product counts show whether new writers had other prescriptions in the window.
# NOTE(review): the input is built from study_period_jivi_rx_hcps and is not restricted to
# inclusion_jivi_bh_ids -- confirm whether the inclusion criterion should be applied.
jivi_rx_in_lookback = jivi_hcp_jivi_rx_cnt_in_lookback['jivi_rx_cnt_in_lookback']

jivi_hcp_jivi_rx_cnt_in_lookback['JIVI_NEW_WRITER_FLG'] = (jivi_rx_in_lookback == 0).astype(int)
jivi_hcp_jivi_rx_cnt_in_lookback['JIVI_EXISTING_WRITER_FLG'] = (jivi_rx_in_lookback > 0).astype(int)

### This table gives results for Step 3A, 3B

In [0]:
display(jivi_hcp_jivi_rx_cnt_in_lookback)

In [0]:
# Column totals of the counts and flags; BH_ID is an identifier, so it is excluded from the sum
jivi_hcp_jivi_rx_cnt_in_lookback.drop(columns='BH_ID').sum()

In [0]:
# BH_IDs flagged as new writers and not as existing writers
is_new_only = (
    (jivi_hcp_jivi_rx_cnt_in_lookback['JIVI_NEW_WRITER_FLG'] == 1)
    & (jivi_hcp_jivi_rx_cnt_in_lookback['JIVI_EXISTING_WRITER_FLG'] == 0)
)
jivi_new_writer_hcp_ids = jivi_hcp_jivi_rx_cnt_in_lookback.loc[is_new_only, 'BH_ID'].tolist()
print(len(jivi_new_writer_hcp_ids))

### Check: number of Jivi Rx and Jivi patients per new writer

In [0]:
# Records of the JIVI new writers; the JIVI rows are grouped per HCP once and reused below
filtered_overlap_subset_pdf = overlap_subset_pdf[overlap_subset_pdf['BH_ID'].isin(jivi_new_writer_hcp_ids)]
jivi_rows_by_hcp = filtered_overlap_subset_pdf.loc[filtered_overlap_subset_pdf['PRD_NM'] == 'JIVI'].groupby('BH_ID')

# Number of JIVI records and number of distinct JIVI patients per HCP
jivi_records_per_bh_id = jivi_rows_by_hcp.size().reset_index(name='JIVI_records')
jivi_patients_per_bh_id = jivi_rows_by_hcp['PATIENT_ID'].nunique().reset_index(name='JIVI_patients')

jivi_new_writers_rx_pat_cnt = jivi_records_per_bh_id.merge(jivi_patients_per_bh_id, on='BH_ID')


In [0]:
display(jivi_new_writers_rx_pat_cnt)

### Step 4: For forever non-writers of JIVI
1) Get the SHP_DT of the first Rx by HCP in the observation period Jan'23 - Nov'24; store its month in a helper column (OVP_FST_RX_MONTH) for ease of the next step
2) Count the number of Jivi Rx for each HCP in the 24-month window prior to OVP_FST_RX_MONTH

In [0]:
# All records (any month, any product) of the HCPs with no JIVI record in the study period.
# NOTE(review): this filters on study_period_no_jivi_rx_hcps, not on inclusion_bh_ids computed
# earlier -- confirm whether the prior-period inclusion criterion should be applied here.
study_period_other_rx_hcps_data = overlap_subset_pdf[overlap_subset_pdf['BH_ID'].isin(study_period_no_jivi_rx_hcps)]
study_period_other_rx_hcps_data.shape

In [0]:
# First Rx date (any product) per HCP inside the study period, plus the derived look-back window start
first_rx_dates = get_first_rx_event(
    data=study_period_other_rx_hcps_data.copy(),
    date_col='SHP_DT',
    id_col='BH_ID',
    drg_nm_col='PRD_NM',
    drg_nm=None,
    study_period_start_date=study_period_start_date,
    study_period_end_date=study_period_end_date,
    rx_lookback_months=rx_lookback_months,
)

In [0]:
display(first_rx_dates)

In [0]:
# Attach each HCP's first-Rx date and look-back window start to every one of its records
study_period_other_rx_hcps_with_first_rx = study_period_other_rx_hcps_data.merge(
    first_rx_dates,
    on='BH_ID',
    how='inner',
)
print(study_period_other_rx_hcps_with_first_rx.shape)

In [0]:
display(study_period_other_rx_hcps_with_first_rx)

In [0]:
# function to use in apply() on grouped data.
def count_records_in_lookback(grouped_data, first_rx_month_col='OVP_FST_RX_MONTH'):
  """
  Count JIVI and non-JIVI records and distinct patients inside the look-back window.

  A record is inside the window when its 'SHP_YR_MO' is >= 'rx_look_back_start_month'
  and strictly < the value in `first_rx_month_col` (row-wise comparison of 'YYYY-MM' strings).

  Parameters:
  grouped_data (pd.DataFrame): Records (typically one HCP's group) with the columns 'PRD_NM',
      'SHP_YR_MO', 'PATIENT_ID', 'rx_look_back_start_month' and `first_rx_month_col`.
  first_rx_month_col (str): Column holding the first-Rx month that closes the window.

  Returns:
  pd.Series: 'jivi_rx_cnt_in_lookback', 'jivi_pat_cnt_in_lookback',
      'other_rx_cnt_in_lookback', 'other_pat_cnt_in_lookback'.
  """
  # Evaluate the window membership once and reuse it for both product splits
  shp_month = grouped_data['SHP_YR_MO']
  in_lookback = (shp_month < grouped_data[first_rx_month_col]) & (shp_month >= grouped_data['rx_look_back_start_month'])

  jivi_res_df = grouped_data[in_lookback & (grouped_data['PRD_NM'] == 'JIVI')]
  other_res_df = grouped_data[in_lookback & (grouped_data['PRD_NM'] != 'JIVI')]

  return pd.Series({
        'jivi_rx_cnt_in_lookback': jivi_res_df.shape[0],  # number of JIVI records in the window
        'jivi_pat_cnt_in_lookback': jivi_res_df['PATIENT_ID'].nunique(),
        'other_rx_cnt_in_lookback': other_res_df.shape[0],
        'other_pat_cnt_in_lookback': other_res_df['PATIENT_ID'].nunique(),
    })

In [0]:
# One row per BH_ID with the look-back counts returned by count_records_in_lookback for that HCP's records
other_hcp_jivi_rx_cnt_in_lookback = study_period_other_rx_hcps_with_first_rx.groupby('BH_ID').apply(count_records_in_lookback).reset_index()

In [0]:
# Look-back counts for the HCPs without any JIVI Rx during the study period. The JIVI Rx count in
# the look-back window separates forever non-writers (no JIVI Rx in the window) from discontinued
# writers (at least one). The other-product counts show activity with other brands in the window.
# NOTE(review): the input is built from study_period_no_jivi_rx_hcps and is not restricted to
# inclusion_bh_ids -- confirm whether the inclusion criterion should be applied.

# None of these HCPs has a JIVI Rx in the study period, so the new-writer flag is 0 for all of
# them (the previous double assignment always ended up as 0 as well).
other_hcp_jivi_rx_cnt_in_lookback['JIVI_NEW_WRITER_FLG'] = 0

# Discontinued writer: at least one JIVI Rx in the look-back window but none in the study period
other_hcp_jivi_rx_cnt_in_lookback['JIVI_DSCTND_WRITER_FLG'] = (other_hcp_jivi_rx_cnt_in_lookback['jivi_rx_cnt_in_lookback'] > 0).astype(int)

In [0]:
display(other_hcp_jivi_rx_cnt_in_lookback)

In [0]:
# Column totals of the counts and flags; BH_ID is an identifier, so it is excluded from the sum
other_hcp_jivi_rx_cnt_in_lookback.drop(columns='BH_ID').sum()

In [0]:
# Forever non-writers: HCPs whose discontinued-writer flag is not set
not_discontinued = other_hcp_jivi_rx_cnt_in_lookback['JIVI_DSCTND_WRITER_FLG'].ne(1)
forever_non_writer_hcps = not_discontinued.sum()
print("Count of forever non-writers (JIVI): ", forever_non_writer_hcps)

### Step 5a: Select Jivi new writers (JIVI_NEW_WRITER_FLG = 1 & JIVI_EXISTING_WRITER_FLG = 0); get their OVP_FST_JIVI_MONTH and set COHORT_MONTH = OVP_FST_JIVI_MONTH

In [0]:
# New writers only (new-writer flag set, existing-writer flag not set), keeping the ID and the flag
is_new_only = (
    (jivi_hcp_jivi_rx_cnt_in_lookback['JIVI_NEW_WRITER_FLG'] == 1)
    & (jivi_hcp_jivi_rx_cnt_in_lookback['JIVI_EXISTING_WRITER_FLG'] == 0)
)
jivi_new_writer_hcp_ids_w_flag = jivi_hcp_jivi_rx_cnt_in_lookback.loc[is_new_only, ['BH_ID', 'JIVI_NEW_WRITER_FLG']]
print(jivi_new_writer_hcp_ids_w_flag.shape)

In [0]:
# jivi_new_writer_hcp_ids_w_flag_fst_date= pd.merge(jivi_new_writer_hcp_ids_w_flag, 
#                                                   jivi_first_rx_dates[['BH_ID', 'OVP_FST_JIVI_MONTH']], 
#                                                   on='BH_ID', how='inner')
# print(jivi_new_writer_hcp_ids_w_flag_fst_date.shape)

In [0]:
# One (BH_ID, first JIVI month) row per HCP, joined onto the new-writer flags
jivi_first_month_per_hcp = (
    study_period_jivi_rx_hcps_with_first_rx[['BH_ID', 'OVP_FST_JIVI_MONTH']]
    .drop_duplicates(subset='BH_ID', keep='first')
)
jivi_new_writer_hcp_ids_w_flag_fst_date = jivi_new_writer_hcp_ids_w_flag.merge(
    jivi_first_month_per_hcp,
    on='BH_ID',
    how='inner',
)
print(jivi_new_writer_hcp_ids_w_flag_fst_date.shape)

In [0]:
# The first JIVI month becomes COHORT_MONTH; columns are then put in the spine order
jivi_new_writer_hcp_ids_w_flag_fst_date = (
    jivi_new_writer_hcp_ids_w_flag_fst_date
    .rename(columns={'OVP_FST_JIVI_MONTH': 'COHORT_MONTH'})
    [['BH_ID', 'COHORT_MONTH', 'JIVI_NEW_WRITER_FLG']]
)

In [0]:
# This is a pandas DataFrame; use the notebook display() function as the other cells do
display(jivi_new_writer_hcp_ids_w_flag_fst_date)

### Step 5b: Select non-writers (JIVI_NEW_WRITER_FLG = 0 & JIVI_DSCTND_WRITER_FLG = 0); set COHORT_MONTH for each HCP to the month of its first Rx activity in the study period (OVP_FST_RX_MONTH)

In [0]:
# Forever non-writers: neither new-writer nor discontinued-writer flag set; keep the ID and the flag
is_forever_non_writer = (
    (other_hcp_jivi_rx_cnt_in_lookback['JIVI_NEW_WRITER_FLG'] == 0)
    & (other_hcp_jivi_rx_cnt_in_lookback['JIVI_DSCTND_WRITER_FLG'] == 0)
)
forever_non_writer_hcp_ids_w_flag = other_hcp_jivi_rx_cnt_in_lookback.loc[is_forever_non_writer, ['BH_ID', 'JIVI_NEW_WRITER_FLG']]
print(forever_non_writer_hcp_ids_w_flag.shape)

In [0]:
# One (BH_ID, first Rx month) row per HCP, joined onto the forever-non-writer flags
first_rx_month_per_hcp = (
    study_period_other_rx_hcps_with_first_rx[['BH_ID', 'OVP_FST_RX_MONTH']]
    .drop_duplicates(subset='BH_ID', keep='first')
)
forever_non_writer_hcp_ids_w_flag_fst_date = forever_non_writer_hcp_ids_w_flag.merge(
    first_rx_month_per_hcp,
    on='BH_ID',
    how='inner',
)
print(forever_non_writer_hcp_ids_w_flag_fst_date.shape)

In [0]:
# The first Rx month becomes COHORT_MONTH; columns are then put in the spine order
forever_non_writer_hcp_ids_w_flag_fst_date = (
    forever_non_writer_hcp_ids_w_flag_fst_date
    .rename(columns={'OVP_FST_RX_MONTH': 'COHORT_MONTH'})
    [['BH_ID', 'COHORT_MONTH', 'JIVI_NEW_WRITER_FLG']]
)

In [0]:
display(forever_non_writer_hcp_ids_w_flag_fst_date)

In [0]:
# Stack the new-writer and forever-non-writer cohorts into one spine.
# ignore_index=True gives the result a fresh 0..n-1 index instead of repeating the
# row labels carried over from the two input frames.
hcp_target_spine = pd.concat(
    [jivi_new_writer_hcp_ids_w_flag_fst_date, forever_non_writer_hcp_ids_w_flag_fst_date],
    ignore_index=True,
)
display(hcp_target_spine)

In [0]:
print(hcp_target_spine.shape)

In [0]:
hcp_target_spine[hcp_target_spine.BH_ID.duplicated()]

In [0]:
hcp_target_spine_sdf = spark.createDataFrame(hcp_target_spine)

In [0]:
# save_sdf(hcp_target_spine_sdf, 'heme_data', 'hcp_target_spine')

In [0]:
# Number of BH_IDs that appear in both the new-writer and the forever-non-writer cohort
set1 = set(jivi_new_writer_hcp_ids_w_flag_fst_date['BH_ID'])
set2 = set(forever_non_writer_hcp_ids_w_flag_fst_date['BH_ID'])
len(set2 & set1)

In [0]:
print(len(set1))
print(len(set2))

In [0]:
# Count HCPs per (COHORT_MONTH, JIVI_NEW_WRITER_FLG) combination
monthly_counts = (
    hcp_target_spine_sdf
    .groupBy('COHORT_MONTH', 'JIVI_NEW_WRITER_FLG')
    .count()
)

# Pivot to one row per COHORT_MONTH. The flag values are listed explicitly so that both
# count columns always exist (and are renamed) even when one flag value is absent from the data.
wide_format_counts = (
    monthly_counts
    .groupBy('COHORT_MONTH')
    .pivot('JIVI_NEW_WRITER_FLG', [0, 1])
    .sum('count')
    .orderBy('COHORT_MONTH')
    .withColumnRenamed('0', 'jivi_non_writers_cnt')
    .withColumnRenamed('1', 'jivi_new_writers_cnt')
)

display(wide_format_counts)