In [0]:
from pyspark.sql import functions as F
import pandas as pd
import random

In [0]:
%run "/Workspace/Repos/yuan.niu@bayer.com/heme_new_writer_models_dev_repo/02_data_processing/helper_functions"

## EDA for new Overlap data in PHCDW.PHCDW_CDM.TMP_HEM_OVLP_DLT_VW

### The Overlap dataset covers Patients & HCPs from both the Symphony Claims data and the Specialty Pharmacy data

In [0]:
# NOTE: this cell holds a disabled Snowflake (CDP) connection helper. Everything between the
# triple quotes below is a plain string literal, so none of it executes: running this cell does
# NOT define `prodUsername`, `prodPassword` or `get_data_snowflake`.
# The string is the cell's only expression, so the notebook echoes it back as the cell output.
"""

# #--------------------------------------------CDP Connections -------------------------------------------------------
prodUsername = dbutils.secrets.get(scope="US_DSAA_PROD_GROUP_Snowflake_Admin", key="Prod-Username")
prodPassword = dbutils.secrets.get(scope="US_DSAA_PROD_GROUP_Snowflake_Admin", key="Prod-Password")
 
def get_data_snowflake(query, Schema = "PHCDW_DSAA", Role="PROD_PHCDW_PROD_CYRUS_GREY_PHCDW_CDM_RWC"):
  sfOptions = {
      "sfURL" : "bayer_cphcdp_prod.us-east-1.snowflakecomputing.com:443",
      "sfUser" : prodUsername,
      "sfPassword" : prodPassword,
      "sfRole" : Role,
      "sfSchema" : Schema,
      "sfDatabase" : "PHCDW",
      "sfWarehouse" : "PROD_CYRUS_BI_WH",
      "purge" : "off",
      "autopushdown" : "on"
    }
  return spark.read.format("net.snowflake.spark.snowflake").options(**sfOptions).option('query',query).load()

  """

In [0]:
# Load the full overlap Rx table and report its dimensions.
overlap_raw_data = spark.table("heme_data.overlap_rx")
n_rows = overlap_raw_data.count()
n_cols = len(overlap_raw_data.columns)
print('Row count: ', n_rows, 'Column Count: ', n_cols)

In [0]:
# Preview the first 15 rows of the raw overlap table (limit keeps the rendered output small).
display(overlap_raw_data.limit(15))

**Checking NULL percentages in columns**

In [0]:
# Profile the raw overlap table with `spark_data_profiler`, which is not defined in this
# notebook (presumably it comes from the helper_functions notebook pulled in via %run — confirm).
# NOTE(review): the name `res_df` is assigned again further down for a different result.
res_df = spark_data_profiler(overlap_raw_data)
display(res_df)

In [0]:
# Summary statistics for the IU column, computed in Spark.
overlap_raw_data.describe('IU').show()

In [0]:
# Convert the Spark overlap data to a pandas DataFrame.
#
# The DecimalType columns are cast to a primitive type first, because toPandas() warns that
# converting DecimalType columns is inefficient and may take a long time.
# They are cast to "double" (64-bit) rather than "float": Spark's "float" is single precision
# (about 7 significant digits), which would round IU and claim amounts and distort the
# sums and summary statistics computed on the pandas side.
decimal_cols = ["IU", "PTD_FNL_CLM_AMT"]
for decimal_col in decimal_cols:
    overlap_raw_data = overlap_raw_data.withColumn(
        decimal_col, overlap_raw_data[decimal_col].cast("double")
    )

# Collect the full table to the driver as a pandas DataFrame.
overlap_data_df = overlap_raw_data.toPandas()

In [0]:
# Show floats with three fixed decimals instead of scientific notation.
pd.set_option('display.float_format', '{:.3f}'.format)

# Summary statistics of the IU column on the pandas side.
iu_summary = overlap_data_df['IU'].describe()
print(iu_summary)

In [0]:
# Print the distinct values of each categorical column, one table per column.
# show() prints only the first 20 rows by default, so high-cardinality columns are truncated.
for categorical_col in ['AFFL_SEG', 'AFFL_TYP', 'WRITE_DATE', 'PAYR_TYP', 'PAYR_NM']:
    overlap_raw_data.select(categorical_col).distinct().show()

**TODO:**
- Create features which can be used for ML
	- HCPs switching to Kovaltry
	- HCP domain features
	- Patient domain features
	- Affiliation domain features

### Checking how many unique Patient IDs are associated to each HCP ID

In [0]:
# For every HCP (BH_ID), count the distinct patients under both patient identifiers
# (PATIENT_ID and PTNT_ID), then order HCPs from most to fewest patients.
patient_count_aggs = [
    F.count_distinct('PATIENT_ID').alias('PATIENT_ID_count'),
    F.count_distinct('PTNT_ID').alias('PTNT_ID_count'),
]
hcp_result = (
    overlap_raw_data
    .groupBy('BH_ID')
    .agg(*patient_count_aggs)
    .orderBy(F.desc('PATIENT_ID_count'), F.desc('PTNT_ID_count'))
)

# Top 10 HCPs by number of distinct patients.
display(hcp_result.limit(10))

### Checking how many unique HCP IDs are associated to each Patient ID

In [0]:
# For every patient (PATIENT_ID), count the distinct HCPs (BH_ID) seen in the data,
# ordered from most to fewest HCPs.
patient_result = (
    overlap_raw_data
    .groupBy('PATIENT_ID')
    .agg(F.count_distinct('BH_ID').alias('HCP_ID_count'))
    .orderBy(F.desc('HCP_ID_count'))
)

# Patients with the most HCPs, then (after a blank line) those with the fewest.
display(patient_result.limit(10))
print("")
display(patient_result.orderBy('HCP_ID_count').limit(10))

### Number of unique HCPs and Unique Patients since Jan-2023 till end of data for *Overlap* data table

In [0]:
# Keep only shipments dated on or after the start of the study period (Jan 2023).
study_start = '2023-01-01'
two_yr_overlap_data = overlap_raw_data.where(F.col('SHP_DT') >= study_start)
print(f'Row count: {two_yr_overlap_data.count()}')

Checking the earliest and latest shipment dates (`SHP_DT`) in the filtered data

In [0]:
# Date range covered by the study-period data: earliest and latest SHP_DT.
two_yr_overlap_data.agg(F.min("SHP_DT"), F.max("SHP_DT")).show()

### Number of unique HCPs and Patients in the overlap data since Jan-2023

In [0]:
# Distinct HCPs (BH_ID) and patients (PATIENT_ID) in the data from Jan-2023 onwards.
columns_to_count = ["BH_ID", "PATIENT_ID"]
unique_counts = {}
for id_column in columns_to_count:
    unique_counts[id_column] = two_yr_overlap_data.select(id_column).distinct().count()
display(unique_counts)

In [0]:
# Collect the distinct values of the product / market columns.
# DataFrame.show() only prints and returns None, so building the dict from show() stored
# None for every column; collect() brings the actual distinct values back to the driver.
columns = ["PRD_NM", "MKT_NM", "PRD_GRP_NM"]
unique_vals = {
    column_name: [
        row[column_name]
        for row in overlap_raw_data.select(column_name).distinct().collect()
    ]
    for column_name in columns
}
display(unique_vals)

In [0]:
# Convert the study-period (SHP_DT >= 2023-01-01) Spark DataFrame to a pandas DataFrame.
# toPandas() collects every row to the driver, so this only works while the filtered data fits in driver memory.
two_yr_overlap_df = two_yr_overlap_data.toPandas()

### Number of HCPs and Patients with respect to Drugs in the data from Jan-2023 onwards

In [0]:
# Distinct HCP and patient counts per drug within the study-period data.
drugs = ['JIVI', 'KOVALTRY', 'ALTUVIIIO', 'HEMLIBRA']
hcp_counts = {}
pat_counts = {}
for drg in drugs:
    drg_rows = two_yr_overlap_df[two_yr_overlap_df['PRD_NM'] == drg]
    hcp_counts[drg] = drg_rows['BH_ID'].nunique()
    pat_counts[drg] = drg_rows['PATIENT_ID'].nunique()

# One-row tables with one column per drug.
df_hcps = pd.DataFrame(hcp_counts, index=[0])
display(df_hcps)
df_pats = pd.DataFrame(pat_counts, index=[0])
display(df_pats)

### Monthly IU totals for Jivi and Kovaltry

In [0]:
# Monthly IU totals for JIVI over the study-period data.
# `calculate_monthly_iu_total` is not defined in this notebook (presumably from the %run helper notebook — confirm).
jivi_monthly_iu = calculate_monthly_iu_total(two_yr_overlap_df, 'JIVI')
display(jivi_monthly_iu)

In [0]:
# Monthly IU totals for KOVALTRY over the study-period data.
# `calculate_monthly_iu_total` is not defined in this notebook (presumably from the %run helper notebook — confirm).
kova_monthly_iu = calculate_monthly_iu_total(two_yr_overlap_df, 'KOVALTRY')
display(kova_monthly_iu)

In [0]:
# Monthly IU totals with no drug argument.
# NOTE(review): presumably this totals across all products — confirm against the helper's default.
monthly_iu = calculate_monthly_iu_total(two_yr_overlap_df)
display(monthly_iu)

In [0]:
# Keep only the columns needed for the "new to drug" analysis: product, shipment date, HCP ID, patient ID.
subset_cols = ['PRD_NM', 'SHP_DT', "BH_ID", "PATIENT_ID"]
subset_overlap_df = overlap_data_df[subset_cols]

In [0]:
# Share of records per drug (in percent) across the full overlap data.
subset_overlap_df['PRD_NM'].value_counts(normalize=True).mul(100)

In [0]:
# Share of records per drug (in percent) within the study period only.
two_yr_overlap_df['PRD_NM'].value_counts(normalize=True).mul(100)

### We consider the study period of our data from Jan-2023 to Oct-2024 (end of data)

### EDA Q7: a. Monthly and overall count of 'New-to-JIVI' HCPs between Jan'23 and end of data date. 

**'New to JIVI' HCPs are defined as those who 1. Prescribed JIVI in the study period and 2. Didn't write JIVI in the 24 months prior**

In [0]:
# New-to-JIVI HCPs: monthly counts plus the collection of qualifying BH_IDs.
# `calculate_new_to_drug` is not defined in this notebook (presumably from the %run helper notebook).
# NOTE(review): lookback_period=2 presumably means 2 years, i.e. the 24-month window described above — confirm.
monthly_new_to_drg_hcps, new_to_drg_hcp_ids = calculate_new_to_drug(
    data=subset_overlap_df,
    drg_nm='JIVI',
    drg_nm_col='PRD_NM',
    start_date='2023-01-01',
    id_col='BH_ID',
    date_col='SHP_DT',
    lookback_period=2,
)
display(monthly_new_to_drg_hcps)

**Retrieving records for a given HCP (randomly sampled HCP ID)**

In [0]:
# Show all records of one "new to drug" HCP, picked pseudo-randomly but reproducibly.
# The IDs are sorted first so the pick does not depend on iteration order (in case they arrive
# in an unordered collection such as a set), and a locally seeded generator makes the same HCP
# appear on every Restart & Run All without touching the global `random` state.
SAMPLE_SEED = 42  # arbitrary; change it to inspect a different HCP
candidate_hcp_ids = sorted(new_to_drg_hcp_ids, key=str)
if not candidate_hcp_ids:
    # An empty input would otherwise fail with an opaque sampling error.
    raise ValueError("new_to_drg_hcp_ids is empty: there is no 'new to drug' HCP to sample")
sample_id = random.Random(SAMPLE_SEED).choice(candidate_hcp_ids)
display(overlap_data_df.query('BH_ID == @sample_id'))

### EDA Q6: a. Monthly and overall counts of 'Jivi new patients' between Jan'23 and end of data date.

**Jivi new patients are those who 1. had a Jivi Rx in the study period (Jan'23-end of data date) and 2. did not have Jivi Rx in the 24 months prior**

In [0]:
# New-to-JIVI patients: monthly counts plus the collection of qualifying PATIENT_IDs.
# `calculate_new_to_drug` is not defined in this notebook (presumably from the %run helper notebook).
# NOTE(review): lookback_period=2 presumably means 2 years, i.e. the 24-month window described above — confirm.
monthly_new_to_drg_pats, new_to_drg_pat_ids = calculate_new_to_drug(
    data=subset_overlap_df,
    drg_nm='JIVI',
    drg_nm_col='PRD_NM',
    start_date='2023-01-01',
    id_col='PATIENT_ID',
    date_col='SHP_DT',
    lookback_period=2,
)
display(monthly_new_to_drg_pats)

### Checking "new to JIVI" patient records to see whether any patients have switched pharmacies

In [0]:

# Per-patient pharmacy-switch flag for the 'New-to-Brand' patients.
#
# `new_to_drg_pat_ids` is whatever the most recently executed patient-level
# calculate_new_to_drug cell produced, so run this right after the intended drug's cell.
#
# The frame is built as one non-mutating chain. Sorting in place and then assigning a column
# on a filtered slice of `overlap_data_df` triggers pandas' SettingWithCopyWarning and leaves
# it ambiguous whether the parent frame is affected.
is_new_patient = overlap_data_df['PATIENT_ID'].isin(new_to_drg_pat_ids)
new_to_drg_pats_data = (
    overlap_data_df.loc[is_new_patient]
    .sort_values(by=['PATIENT_ID', 'SHP_DT'])
    # A record is flagged (1) when WINNING_PATIENT_ID differs from SP_SOURCE_PTNT_ID and
    # SRC_SP is not 'SHS'; this is used as the indicator of a pharmacy switch.
    # NOTE(review): rows where either ID is missing also compare as "different" — confirm this is intended.
    .assign(
        SP_switch_flag=lambda df: (
            df['WINNING_PATIENT_ID'].ne(df['SP_SOURCE_PTNT_ID']) & df['SRC_SP'].ne('SHS')
        ).astype(int)
    )
)

# A patient counts as having switched if any of their records is flagged.
res_df = new_to_drg_pats_data.groupby(['PATIENT_ID'])['SP_switch_flag'].max().reset_index()
display(res_df)

In [0]:
# Add the pharmacy-switch flag to the full overlap frame: 1 when WINNING_PATIENT_ID differs
# from SP_SOURCE_PTNT_ID and SRC_SP is not 'SHS', otherwise 0.
ids_differ = overlap_data_df['WINNING_PATIENT_ID'].ne(overlap_data_df['SP_SOURCE_PTNT_ID'])
source_not_shs = overlap_data_df['SRC_SP'].ne('SHS')
overlap_data_df['SP_switch_flag'] = (ids_differ & source_not_shs).astype(int)

In [0]:
# Percentage of records per SRC_SP value, as a two-column table (SRC_SP, Proportion).
src_sp_counts = (
    overlap_data_df['SRC_SP']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('SRC_SP')
    .reset_index(name='Proportion')
)
display(src_sp_counts)

**Filter the data for a given patient who has switched pharmacies**

In [0]:
# Records of one example patient, key columns first, ordered by shipment date.
# Requires the SP_switch_flag column to already exist on overlap_data_df.
pat_id = '0377CD39-3B12-489D-936E-37CE7C193910'
front_cols = ['PATIENT_ID', 'WINNING_PATIENT_ID', 'SP_SOURCE_PTNT_ID',  'SHP_DT', 'SP_switch_flag', 'BH_ID', 'SRC_SP', 'PRD_NM']
example_pat_data = overlap_data_df.query("PATIENT_ID == @pat_id")
# Move the key columns to the front for easy viewing; the rest keep their original order.
remaining_cols = [name for name in example_pat_data.columns if name not in front_cols]
example_pat_data = example_pat_data[front_cols + remaining_cols].sort_values(by=['SHP_DT'])
display(example_pat_data)

**Filter the data for the patient who has NOT switched pharmacies**

In [0]:
# Records of one example patient, key columns first, ordered by shipment date.
# Requires the SP_switch_flag column to already exist on overlap_data_df.
pat_id = '02F40E85-E18E-4BE6-B19A-F565869C1139'
front_cols = ['PATIENT_ID', 'WINNING_PATIENT_ID', 'SP_SOURCE_PTNT_ID',  'SHP_DT', 'SP_switch_flag', 'BH_ID', 'SRC_SP', 'PRD_NM']
example_pat_data = overlap_data_df.query("PATIENT_ID == @pat_id")
# Move the key columns to the front for easy viewing; the rest keep their original order.
remaining_cols = [name for name in example_pat_data.columns if name not in front_cols]
example_pat_data = example_pat_data[front_cols + remaining_cols].sort_values(by=['SHP_DT'])
display(example_pat_data)

### EDA Q7: b. Monthly and overall count of 'New-to-Kovaltry' HCPs between Jan'23 and end of data date. 

**'New to Kovaltry' HCPs are defined as those who 1. Prescribed Kovaltry in the study period and 2. Didn't write Kovaltry in the 24 months prior**

In [0]:
# New-to-KOVALTRY HCPs: monthly counts plus the collection of qualifying BH_IDs.
# NOTE: this reuses the names monthly_new_to_drg_hcps / new_to_drg_hcp_ids, so any cell reading
# them sees the result of whichever drug's cell ran last.
monthly_new_to_drg_hcps, new_to_drg_hcp_ids = calculate_new_to_drug(
    data=subset_overlap_df,
    drg_nm='KOVALTRY',
    drg_nm_col='PRD_NM',
    start_date='2023-01-01',
    id_col='BH_ID',
    date_col='SHP_DT',
    lookback_period=2,
)
display(monthly_new_to_drg_hcps)

### EDA Q6: b. Monthly and overall counts of 'Kovaltry new patients' between Jan'23 and end of data date.

**Kovaltry new patients are those who 1. had a Kovaltry Rx in the study period (Jan'23-end of data date) and 2. did not have Kovaltry Rx in the 24 months prior**

In [0]:
# New-to-KOVALTRY patients: monthly counts plus the collection of qualifying PATIENT_IDs.
# NOTE: this reuses the names monthly_new_to_drg_pats / new_to_drg_pat_ids, so any cell reading
# them sees the result of whichever drug's cell ran last.
monthly_new_to_drg_pats, new_to_drg_pat_ids = calculate_new_to_drug(
    data=subset_overlap_df,
    drg_nm='KOVALTRY',
    drg_nm_col='PRD_NM',
    start_date='2023-01-01',
    id_col='PATIENT_ID',
    date_col='SHP_DT',
    lookback_period=2,
)
display(monthly_new_to_drg_pats)

### Monthly and overall counts of 'New to Hemlibra' HCPs in study period

In [0]:
# New-to-HEMLIBRA HCPs: monthly counts plus the collection of qualifying BH_IDs.
# NOTE: this reuses the names monthly_new_to_drg_hcps / new_to_drg_hcp_ids, so any cell reading
# them sees the result of whichever drug's cell ran last.
monthly_new_to_drg_hcps, new_to_drg_hcp_ids = calculate_new_to_drug(
    data=subset_overlap_df,
    drg_nm='HEMLIBRA',
    drg_nm_col='PRD_NM',
    start_date='2023-01-01',
    id_col='BH_ID',
    date_col='SHP_DT',
    lookback_period=2,
)
display(monthly_new_to_drg_hcps)

### Monthly and overall counts of 'New to ALTUVIIIO' HCPs in study period

In [0]:
# New-to-ALTUVIIIO HCPs: monthly counts plus the collection of qualifying BH_IDs.
# NOTE: this reuses the names monthly_new_to_drg_hcps / new_to_drg_hcp_ids, so any cell reading
# them sees the result of whichever drug's cell ran last.
monthly_new_to_drg_hcps, new_to_drg_hcp_ids = calculate_new_to_drug(
    data=subset_overlap_df,
    drg_nm='ALTUVIIIO',
    drg_nm_col='PRD_NM',
    start_date='2023-01-01',
    id_col='BH_ID',
    date_col='SHP_DT',
    lookback_period=2,
)
display(monthly_new_to_drg_hcps)

### Monthly and overall counts of 'New to Hemlibra' Patients in study period

In [0]:
# New-to-HEMLIBRA patients: monthly counts plus the collection of qualifying PATIENT_IDs.
# NOTE: this reuses the names monthly_new_to_drg_pats / new_to_drg_pat_ids, so any cell reading
# them sees the result of whichever drug's cell ran last.
monthly_new_to_drg_pats, new_to_drg_pat_ids = calculate_new_to_drug(
    data=subset_overlap_df,
    drg_nm='HEMLIBRA',
    drg_nm_col='PRD_NM',
    start_date='2023-01-01',
    id_col='PATIENT_ID',
    date_col='SHP_DT',
    lookback_period=2,
)
display(monthly_new_to_drg_pats)

### Monthly and overall counts of 'New to ALTUVIIIO' Patients in study period

In [0]:
# New-to-ALTUVIIIO patients: monthly counts plus the collection of qualifying PATIENT_IDs.
# NOTE: this reuses the names monthly_new_to_drg_pats / new_to_drg_pat_ids, so any cell reading
# them sees the result of whichever drug's cell ran last.
monthly_new_to_drg_pats, new_to_drg_pat_ids = calculate_new_to_drug(
    data=subset_overlap_df,
    drg_nm='ALTUVIIIO',
    drg_nm_col='PRD_NM',
    start_date='2023-01-01',
    id_col='PATIENT_ID',
    date_col='SHP_DT',
    lookback_period=2,
)
display(monthly_new_to_drg_pats)