In [0]:
import pandas as pd
from pyspark.sql import functions as F

In [0]:
%run "../00_config/set-up"

In [0]:
%run "/Workspace/Repos/yuan.niu@bayer.com/heme_new_writer_models_dev_repo/02_data_processing/helper_functions"

In [0]:
# Overall data window as 'YYYY-MM' month keys; start_month is the lower bound
# of the prior-period filter further down.
start_month, end_month = "2019-12", "2024-11"

# Study period bounds as ISO dates.
study_period_start_date, study_period_end_date = "2023-01-01", "2024-11-30"

# Month keys for the study period: the leading 'YYYY-MM' of the ISO dates above,
# so the date and month forms cannot drift apart.
study_period_start_month = study_period_start_date[:7]
study_period_end_month = study_period_end_date[:7]

# Look-back length (in months) handed to get_first_rx_event.
rx_lookback_months = 24

In [0]:
# Read every column of the preprocessed overlap table as a Spark DataFrame.
overlap_subset = spark.table("heme_data.overlap_preprocessed")
print("Row count: ", overlap_subset.count(), "Column Count: ", len(overlap_subset.columns))

In [0]:
# Preview at most 15 rows of the Spark DataFrame to eyeball columns and values.
display(overlap_subset.limit(15))

In [0]:
# Collect the whole Spark DataFrame into a pandas DataFrame; the pandas-based
# steps below (query / isin / merge) all work from this copy.
# NOTE(review): toPandas() pulls every row to the driver — confirm the table fits in memory.
overlap_subset_pdf = overlap_subset.toPandas()

### Step 1: Identify unique HCPs who have patients (any drugs) between Jan-2023 and Nov-2024.

In [0]:
# Distinct patients per HCP (BH_ID) across the study period.
# Column.between is inclusive on both ends, i.e. start_month <= SHP_YR_MO <= end_month.
study_period_hcp_pats_cnt = (
    overlap_subset
    .where(F.col('SHP_YR_MO').between(study_period_start_month, study_period_end_month))
    .groupBy('BH_ID')
    .agg(F.countDistinct('PATIENT_ID').alias('pats_cnt'))
    .sort('pats_cnt')
)
print("Row count: ", study_period_hcp_pats_cnt.count(), "Column Count: ", len(study_period_hcp_pats_cnt.columns))

In [0]:
# Show the per-HCP distinct-patient counts for the study period (ordered by pats_cnt).
display(study_period_hcp_pats_cnt)

### Check: How many HCPs have at least one patient (any drug) in the prior period, between Dec'19 and Dec'22?

In [0]:
# Distinct patients per HCP (BH_ID) in the prior period:
# from start_month (inclusive) up to, but not including, the first study month.
prior_period_hcp_pats_cnt = (
    overlap_subset
    .where(F.col('SHP_YR_MO') >= start_month)
    .where(F.col('SHP_YR_MO') < study_period_start_month)
    .groupBy('BH_ID')
    .agg(F.countDistinct('PATIENT_ID').alias('pats_cnt'))
    .sort('pats_cnt')
)
print("Row count: ", prior_period_hcp_pats_cnt.count(), "Column Count: ", len(prior_period_hcp_pats_cnt.columns))

In [0]:
# Show the per-HCP distinct-patient counts for the prior period (ordered by pats_cnt).
display(prior_period_hcp_pats_cnt)

In [0]:
# HCPs that appear in BOTH the prior-period and the study-period patient counts.

# Bring both per-HCP count tables to pandas so the BH_IDs can be handled as Python sets.
prior_period_hcp_pats_cnt_pdf = prior_period_hcp_pats_cnt.toPandas()
study_period_hcp_pats_cnt_pdf = study_period_hcp_pats_cnt.toPandas()

prior_bh_ids = set(prior_period_hcp_pats_cnt_pdf['BH_ID'])
study_bh_ids = set(study_period_hcp_pats_cnt_pdf['BH_ID'])

# `&` on sets is the intersection: BH_IDs present in both periods.
common_bh_ids = prior_bh_ids & study_bh_ids

print("Number of HCPs with atleast one patient in period prior to study period:", len(prior_bh_ids))
print("Number of HCPs with atleast one patient in study period:", len(study_bh_ids))

print("Number of HCPs with atleast one patient in prior period and study period:", len(common_bh_ids))

### Step 2: Identify HCPs who prescribed Jivi between Jan'23 and Nov'24

In [0]:
# Jivi rows whose shipment month falls inside the study period (both bounds inclusive).
study_period_jivi_rx = overlap_subset_pdf.loc[
    overlap_subset_pdf['SHP_YR_MO'].between(study_period_start_month, study_period_end_month)
    & (overlap_subset_pdf['PRD_NM'] == "JIVI")
]
print(study_period_jivi_rx.shape)

# Unique HCPs (BH_ID) behind those Jivi rows, in order of first appearance.
study_period_jivi_rx_hcps = study_period_jivi_rx['BH_ID'].unique().tolist()
print("Number of HCPs with Jivi Rx during study period:", len(study_period_jivi_rx_hcps))

In [0]:
# Non-Jivi rows whose shipment month falls inside the study period.
study_period_no_jivi_rx = (
  overlap_subset_pdf
  .query('SHP_YR_MO >= @study_period_start_month and SHP_YR_MO <= @study_period_end_month and PRD_NM != "JIVI"')
)
print(study_period_no_jivi_rx.shape)

# An HCP with a non-Jivi row may ALSO have Jivi rows in the same period, so the
# HCPs behind the non-Jivi rows are not automatically "HCPs without Jivi Rx".
# Remove every HCP that has at least one study-period Jivi row before counting.
study_period_jivi_prescriber_ids = set(
  overlap_subset_pdf
  .query('SHP_YR_MO >= @study_period_start_month and SHP_YR_MO <= @study_period_end_month and PRD_NM == "JIVI"')
  ['BH_ID']
)
study_period_no_jivi_rx_hcps = [
  bh_id
  for bh_id in study_period_no_jivi_rx['BH_ID'].unique().tolist()
  if bh_id not in study_period_jivi_prescriber_ids
]
print("Number of HCPs which DO NOT HAVE Jivi Rx during study period:", len(study_period_no_jivi_rx_hcps))

**HCPs with a Jivi prescription during the study period who also have at least one patient (any drug) in the prior period (Dec-2019 to Dec-2022)**

In [0]:
# Inclusion cohort: HCPs with a Jivi Rx in the study period that are also in
# common_bh_ids, i.e. they have at least one patient (any drug) in BOTH the
# prior period and the study period.
inclusion_bh_ids = common_bh_ids.intersection(study_period_jivi_rx_hcps)
print("Number of HCPs with Jivi Rx during study period and at least one patient in both the prior period and the study period: ", len(inclusion_bh_ids))

### Step 3: For HCPs who prescribed Jivi during the study period, find the first Jivi Rx date and count Jivi Rx in the look-back window before it

In [0]:
# Every row of the overlap data that belongs to an HCP in the inclusion cohort;
# this step does not restrict by drug or by month.
study_period_jivi_rx_hcps_data = overlap_subset_pdf.loc[
    lambda pdf: pdf['BH_ID'].isin(inclusion_bh_ids)
]
study_period_jivi_rx_hcps_data.shape

In [0]:
# display(study_period_jivi_rx_hcps_data)

In [0]:
def get_first_rx_event(data, date_col, id_col, drg_nm_col, drg_nm, study_period_start_date, study_period_end_date, rx_lookback_months):
    """
    Find, per healthcare provider, the first prescription of a given drug inside the study period.

    Parameters:
    data (pd.DataFrame): The input data containing prescription records. It is not modified.
    date_col (str): The name of the column containing prescription dates (anything pd.to_datetime can parse).
    id_col (str): The name of the column containing unique identifiers for healthcare providers.
    drg_nm_col (str): The name of the column containing drug names.
    drg_nm (str): The specific drug name to filter the data; also embedded in the output column names.
    study_period_start_date (str or datetime-like): Inclusive start of the study period.
    study_period_end_date (str or datetime-like): Inclusive end of the study period.
    rx_lookback_months (int): Look-back length in months. The look-back start date is placed
        `rx_lookback_months + 1` months before the first prescription date.

    Returns:
    pd.DataFrame: One row per `id_col` value that has at least one matching prescription, with columns
        [id_col, 'OVP_FST_{drg_nm}_DT', 'OVP_FST_{drg_nm}_MONTH', 'rx_look_back_start_dt',
        'rx_look_back_start_month']. The two *_MONTH columns are 'YYYY-MM' strings.
    """
    first_dt_col = f'OVP_FST_{drg_nm}_DT'
    first_month_col = f'OVP_FST_{drg_nm}_MONTH'

    # Work on a private copy of just the needed columns so the caller's frame is never mutated.
    rx = data[[id_col, drg_nm_col, date_col]].copy()
    rx[date_col] = pd.to_datetime(rx[date_col])

    period_start = pd.to_datetime(study_period_start_date)
    period_end = pd.to_datetime(study_period_end_date)

    # Keep only prescriptions of the requested drug dated inside the study period (bounds inclusive).
    df_rx = rx[(rx[drg_nm_col] == drg_nm) &
               (rx[date_col] >= period_start) &
               (rx[date_col] <= period_end)]

    # Earliest in-period prescription date per provider, grouped on the caller-supplied id column.
    first_rx = (
        df_rx
        .groupby(id_col, as_index=False)[date_col]
        .min()
        .rename(columns={date_col: first_dt_col})
    )

    # NOTE(review): the offset is rx_lookback_months + 1, so a month-level window
    # [rx_look_back_start_month, first Rx month) spans rx_lookback_months + 1 calendar
    # months — confirm this extra month is intended.
    first_rx['rx_look_back_start_dt'] = first_rx[first_dt_col] - pd.DateOffset(months=rx_lookback_months + 1)

    # 'YYYY-MM' keys for the first prescription date and the look-back start date.
    first_rx[first_month_col] = first_rx[first_dt_col].dt.strftime('%Y-%m')
    first_rx['rx_look_back_start_month'] = first_rx['rx_look_back_start_dt'].dt.strftime('%Y-%m')

    return first_rx[[id_col, first_dt_col, first_month_col, 'rx_look_back_start_dt', 'rx_look_back_start_month']]

In [0]:
# First in-study-period Jivi Rx per HCP, plus the start of its look-back window.
# The helper receives a copy so study_period_jivi_rx_hcps_data itself is left untouched.
first_rx_dates = get_first_rx_event(
    data=study_period_jivi_rx_hcps_data.copy(),
    date_col='SHP_DT',
    id_col='BH_ID',
    drg_nm_col='PRD_NM',
    drg_nm='JIVI',
    study_period_start_date=study_period_start_date,
    study_period_end_date=study_period_end_date,
    rx_lookback_months=rx_lookback_months,
)

In [0]:
# first_rx_dates is a pandas DataFrame, which has no .display() method;
# use the notebook's display() function instead.
display(first_rx_dates)

In [0]:
# Attach each HCP's first-Jivi-Rx date and look-back window to all of that HCP's rows.
# The inner join keeps only HCPs present in first_rx_dates.
merged_data = study_period_jivi_rx_hcps_data.merge(first_rx_dates, on='BH_ID', how='inner')
print(merged_data.shape)

In [0]:
# Render the merged per-row data together with the first-Rx / look-back columns.
# NOTE(review): this displays the whole frame; consider a .head() preview if it is large.
display(merged_data)

In [0]:
# Number of distinct HCPs remaining after the merge.
merged_data['BH_ID'].nunique()

In [0]:
def count_jivi_records(grouped_data):
  """Count the JIVI rows dated inside the look-back window.

  A row is counted when PRD_NM is "JIVI" and its SHP_YR_MO is on or after
  rx_look_back_start_month and strictly before OVP_FST_JIVI_MONTH, each
  compared row by row.

  Parameters:
  grouped_data (pd.DataFrame): Rows with the columns PRD_NM, SHP_YR_MO,
      OVP_FST_JIVI_MONTH and rx_look_back_start_month.

  Returns:
  int: Number of rows that satisfy all three conditions.
  """
  is_jivi = grouped_data['PRD_NM'] == "JIVI"
  before_first_rx = grouped_data['SHP_YR_MO'] < grouped_data['OVP_FST_JIVI_MONTH']
  within_lookback = grouped_data['SHP_YR_MO'] >= grouped_data['rx_look_back_start_month']
  return int((is_jivi & before_first_rx & within_lookback).sum())

In [0]:
# Per-HCP number of Jivi rows that fall in the look-back window before the
# HCP's first study-period Jivi month.
jivi_rx_hcp = (
    merged_data
    .groupby('BH_ID')
    .apply(count_jivi_records)
    .reset_index(name='jivi_rx_cnt')
)
display(jivi_rx_hcp)

In [0]:
import pandas as pd

# Collapse merged_data to one row per HCP: the shipment dates, products and
# shipment months become lists, while the per-HCP first-Jivi month and look-back
# start month (constant within an HCP after the merge) are taken from the first row.
_per_hcp_aggregations = {
    'SHP_DT': list,
    'PRD_NM': list,
    'SHP_YR_MO': list,
    'OVP_FST_JIVI_MONTH': 'first',
    'rx_look_back_start_month': 'first',
}
grouped_data = merged_data.groupby('BH_ID').agg(_per_hcp_aggregations).reset_index()

def count_jivi_rx(row):
    """Count the JIVI entries of one HCP row that fall inside the look-back window.

    `row` provides parallel sequences under 'PRD_NM' and 'SHP_YR_MO', plus the
    scalar bounds 'rx_look_back_start_month' (inclusive) and 'OVP_FST_JIVI_MONTH'
    (exclusive). Returns the number of positions whose product is "JIVI" and
    whose shipment month lies within those bounds.
    """
    return sum(
        1
        for product, ship_month in zip(row['PRD_NM'], row['SHP_YR_MO'])
        if product == "JIVI"
        and row['rx_look_back_start_month'] <= ship_month < row['OVP_FST_JIVI_MONTH']
    )

# Row-wise (axis=1) count of look-back-window Jivi Rx for every HCP.
grouped_data['jivi_rx_count'] = grouped_data.apply(count_jivi_rx, axis=1)
display(grouped_data)

# Keep only the HCP identifier and its look-back Jivi Rx count.
jivi_rx_count = grouped_data.loc[:, ['BH_ID', 'jivi_rx_count']]

display(jivi_rx_count)

In [0]:
def get_first_rx_month(overlap_rx_df, id_colname, date_colname, prd_nm):
    # Sort by SHP_DT to ensure the first prescription is identified correctly
    overlap_rx_df = overlap_rx_df.sort_values(by=date_colname)