In [0]:
import pandas as pd
from pyspark.sql import functions as F

In [0]:
%run "../00_config/set-up"

In [0]:
%run "/Workspace/Repos/yuan.niu@bayer.com/heme_new_writer_models_dev_repo/02_data_processing/helper_functions"

In [0]:
# Month boundaries are "YYYY-MM" strings; the filters further down compare them
# against SHP_YR_MO with >=, < and <=.
# NOTE(review): that comparison is only chronological if SHP_YR_MO is stored in the
# same zero-padded "YYYY-MM" text form — confirm against the source table.
start_month = "2019-12"  # inclusive lower bound of the prior period
end_month = "2024-11"  # not referenced in the visible cells; presumably the last month of data — TODO confirm
study_period_start_month = "2023-01"  # inclusive start of the study period, exclusive end of the prior period
study_period_end_month = "2024-11"  # inclusive end of the study period
lookback_months = 12  # not referenced in the visible cells; presumably used by later steps — TODO confirm

In [0]:
# Load the preprocessed overlap table as a Spark DataFrame and report its size.
# NOTE(review): the cells below read a frame named `overlap_subset_df`, which is not
# created in any visible cell — confirm it is defined by one of the %run notebooks,
# otherwise a fresh "Run All" will fail with a NameError.
overlap_rx = spark.sql("SELECT * FROM heme_data.overlap_preprocessed")
overlap_rx_row_count = overlap_rx.count()
overlap_rx_column_count = len(overlap_rx.columns)
print("Row count: ", overlap_rx_row_count, "Column Count: ", overlap_rx_column_count)

### Step 1: Identify unique HCPs who have patients (on any drug) between Jan-2023 and Nov-2024.

In [0]:
# Distinct-patient count per HCP (BH_ID) over the study period, any product.
# Both month bounds are inclusive.
study_period_rows = overlap_subset_df.query(
  'SHP_YR_MO >= @study_period_start_month and SHP_YR_MO <= @study_period_end_month'
)
study_period_hcp_pats_cnt = pd.DataFrame(
  study_period_rows
  .groupby('BH_ID')
  .agg(pats_cnt=('PATIENT_ID', 'nunique'))  # one row per BH_ID, counting unique patients
  .reset_index()
  .sort_values(by='pats_cnt')  # ascending: smallest panels first
)
print(study_period_hcp_pats_cnt.shape)

In [0]:
# Show the study-period per-HCP patient counts (sorted ascending by pats_cnt).
display(study_period_hcp_pats_cnt)

In [0]:
# Distinct-patient count per HCP (BH_ID) over the prior period, any product.
# The window starts at start_month (inclusive) and stops just before the study period.
prior_period_rows = overlap_subset_df.query(
  'SHP_YR_MO >= @start_month and SHP_YR_MO < @study_period_start_month'
)
prior_period_hcp_pats_cnt = pd.DataFrame(
  prior_period_rows
  .groupby('BH_ID')
  .agg(pats_cnt=('PATIENT_ID', 'nunique'))  # one row per BH_ID, counting unique patients
  .reset_index()
  .sort_values(by='pats_cnt')  # ascending: smallest panels first
)
print(prior_period_hcp_pats_cnt.shape)

In [0]:
# Show the prior-period per-HCP patient counts (sorted ascending by pats_cnt).
display(prior_period_hcp_pats_cnt)

In [0]:
# HCPs that appear in both the prior-period and the study-period patient-count frames.

# Gather each frame's BH_ID values into a set so the two cohorts can be compared directly
prior_bh_ids = {bh_id for bh_id in prior_period_hcp_pats_cnt['BH_ID']}
print("Number of HCPs with atleast one patient in period prior to study period:", len(prior_bh_ids))
study_bh_ids = {bh_id for bh_id in study_period_hcp_pats_cnt['BH_ID']}
print("Number of HCPs with atleast one patient in study period:", len(study_bh_ids))

# Set intersection: BH_IDs present in both periods
common_bh_ids = prior_bh_ids & study_bh_ids

print("Number of HCPs with atleast one patient in prior period and study period:", len(common_bh_ids))

### Step 2: Identify HCPs who prescribed Jivi between Jan'23 and Nov'24

In [0]:
# Study-period rows restricted to Jivi (both month bounds inclusive)
study_period_jivi_rx = overlap_subset_df.query(
  'SHP_YR_MO >= @study_period_start_month and SHP_YR_MO <= @study_period_end_month and PRD_NM == "JIVI"'
)
print(study_period_jivi_rx.shape)

# Distinct prescribers (BH_ID) among those rows, as a plain Python list
jivi_bh_id_values = study_period_jivi_rx['BH_ID'].unique()
study_period_jivi_rx_hcps = jivi_bh_id_values.tolist()
print("Number of HCPs with Jivi Rx during study period:", len(study_period_jivi_rx_hcps))

In [0]:
# Study-period rows for every product other than Jivi (both month bounds inclusive).
study_period_no_jivi_rx = (
  overlap_subset_df
  .query('SHP_YR_MO >= @study_period_start_month and SHP_YR_MO <= @study_period_end_month and PRD_NM != "JIVI"')
)
print(study_period_no_jivi_rx.shape)

# An HCP who wrote both Jivi and another product during the study period also appears
# in the non-Jivi rows above, so taking BH_IDs straight from those rows would report
# Jivi prescribers as "no Jivi". Collect the study-period Jivi prescribers in this cell
# and exclude them, so the list matches the label printed below.
study_period_jivi_rows = overlap_subset_df.query(
  'SHP_YR_MO >= @study_period_start_month and SHP_YR_MO <= @study_period_end_month and PRD_NM == "JIVI"'
)
study_period_jivi_bh_ids = study_period_jivi_rows['BH_ID'].unique()

non_jivi_row_bh_ids = study_period_no_jivi_rx['BH_ID']
wrote_no_jivi = ~non_jivi_row_bh_ids.isin(study_period_jivi_bh_ids)
study_period_no_jivi_rx_hcps = non_jivi_row_bh_ids[wrote_no_jivi].unique().tolist()
print("Number of HCPs which DO NOT HAVE Jivi Rx during study period:", len(study_period_no_jivi_rx_hcps))

**HCPs with a Jivi prescription during the study period who also had at least one patient in the prior period (Dec-2019 to Dec-2022)**

In [0]:
# `common_bh_ids` holds HCPs with at least one patient in BOTH the prior period and the
# study period, so intersecting it with the study-period Jivi prescribers counts Jivi
# prescribers who were already active before the study period. The earlier message
# described this as "at least one patient from Dec-2019 to Nov-2024", which every
# study-period prescriber satisfies and which is not what is being counted.
jivi_hcps_active_in_prior_period = common_bh_ids.intersection(study_period_jivi_rx_hcps)
print(
  f"Number of HCPs with Jivi Rx during study period who also had at least one patient in the prior period "
  f"({start_month} up to, but not including, {study_period_start_month}): ",
  len(jivi_hcps_active_in_prior_period),
)

### Step 3: For Jivi prescriber HCPs during study period

In [0]:
# Keep every row (all months, all products) belonging to an HCP who prescribed Jivi
# during the study period.
is_study_period_jivi_hcp = overlap_subset_df['BH_ID'].isin(study_period_jivi_rx_hcps)
study_period_jivi_rx_hcps_df = overlap_subset_df[is_study_period_jivi_hcp]
study_period_jivi_rx_hcps_df.shape