In [0]:
%run "../00_config/set-up"

In [0]:
# Load the full HCP/HCO reference table; its BH_ID column is used further down
# to restrict the digital-promotion rows to the HCPs of interest.
overlap_hcp_hco_reference = spark.sql("SELECT * FROM jivi_new_writer_model.overlap_hcp_hco_reference")

In [0]:
# Load the preprocessed overlap data; only its SHP_YR_MO column is read later,
# to build the list of cohort months.
overlap_raw_data = spark.sql("SELECT * FROM heme_data.overlap_preprocessed")

In [0]:
# Pull the non-personal promotion fact rows for the JIVI, KOVALTRY and
# KOGENATE FS brands. `get_data_snowflake` is not defined in this notebook
# (presumably provided by the set-up notebook run in the first cell).
promotions_query = """
  SELECT * FROM CPH_DB_PROD.ANALYTICS_V2.ANLT_BASE_FACT_NONPERSONAL_PROMOTIONS
  WHERE PROD_BRAND_NM IN ('JIVI','KOVALTRY','KOGENATE FS')
"""
sdf = get_data_snowflake(promotions_query)

# Quick sanity check: row count, column count, then the schema.
n_rows = sdf.count()
n_cols = len(sdf.columns)
print(n_rows, n_cols)
sdf.printSchema()

In [0]:
# Keep only the identifier, date, brand, channel and the four count columns
# that the rest of the notebook works with.
digital_columns = [
    'BAYER_HCP_ID',
    'DATE_ID',
    'PROD_BRAND_NM',
    'CHNL_ID',
    'XPSRE_CNT',
    'LIGHT_ENGMNT_CNT',
    'DEEP_ENGMNT_CNT',
    'TOTL_ENGMNT_CNT',
]
sdf_digital = sdf.select(*digital_columns)

In [0]:
# Row count of the projected frame (triggers a full Spark job), followed by an
# interactive preview of its rows.
print(sdf_digital.count())
sdf_digital.display()

In [0]:
from pyspark.sql import functions as F

# Print the shape of the DataFrame before cleaning
print((sdf_digital.count(), len(sdf_digital.columns)))
# Drop rows with null values in 'BAYER_HCP_ID'
sdf_digital = sdf_digital.na.drop(subset=['BAYER_HCP_ID'])
# Remove rows whose 'BAYER_HCP_ID' is -1 (presumably an "unknown HCP" sentinel
# -- TODO confirm). The ID is compared as a string so that one predicate covers
# both an integer -1 and the text '-1', whatever the column type is.
# The previous predicate, (id != -1) | (id != '-1'), was true for every
# non-null value (no value equals both), so it never removed anything.
sdf_digital = sdf_digital.filter(F.col('BAYER_HCP_ID').cast('string') != '-1')
# Print the shape of the DataFrame after filtering
print((sdf_digital.count(), len(sdf_digital.columns)))
# Show the first few rows of the DataFrame
sdf_digital.show()

In [0]:
# Collect the whole reference table to the driver as a pandas DataFrame.
# NOTE(review): `overlap_hcp_hco_reference_pd` is not referenced anywhere else
# in the visible notebook -- confirm it is still needed before paying for the
# full collect.
overlap_hcp_hco_reference_pd = overlap_hcp_hco_reference.toPandas()

In [0]:
# Bring the distinct BH_ID values of the reference table to the driver as a
# plain Python list.
reference_ids = overlap_hcp_hco_reference.select('BH_ID').distinct()
hcp_list = [id_row[0] for id_row in reference_ids.collect()]

# Keep only promotion rows whose BAYER_HCP_ID appears in that list.
sdf_digital = sdf_digital.filter(F.col('BAYER_HCP_ID').isin(hcp_list))

# DATE_ID is re-assembled from its characters 1-4, 5-6 and 7-8 into a
# 'yyyy-MM-dd' string and then parsed into a DateType column.
dash = F.lit('-')
iso_date_string = F.concat(
    F.substring('DATE_ID', 1, 4),
    dash,
    F.substring('DATE_ID', 5, 2),
    dash,
    F.substring('DATE_ID', 7, 2),
)
sdf_digital = sdf_digital.withColumn('DATE_ID', F.to_date(iso_date_string, 'yyyy-MM-dd'))

# Shape after filtering, then an interactive preview.
print((sdf_digital.count(), len(sdf_digital.columns)))
sdf_digital.display()

In [0]:
# Add MONTH_ID: the 'yyyy-MM' string of DATE_ID, used below as the monthly
# grouping key.
sdf_digital = sdf_digital.withColumn("MONTH_ID",F.date_format("DATE_ID", "yyyy-MM"))

In [0]:
from pyspark.sql.functions import col, concat, lit

In [0]:

# Restrict to JIVI rows and total each count column per HCP, month and channel.
count_columns = ['XPSRE_CNT', 'LIGHT_ENGMNT_CNT', 'DEEP_ENGMNT_CNT', 'TOTL_ENGMNT_CNT']
monthly_totals = [F.sum(count_name).alias(count_name) for count_name in count_columns]

jivi_rows = sdf_digital.filter(col("PROD_BRAND_NM") == 'JIVI')
sdf_digital = (
    jivi_rows
    .groupBy(['BAYER_HCP_ID', 'MONTH_ID', 'CHNL_ID'])
    .agg(*monthly_totals)
)

# Shape of the aggregated frame
print((sdf_digital.count(), len(sdf_digital.columns)))

# First few aggregated rows
sdf_digital.show()

In [0]:
# Rename the HCP key to BH_ID, the column name the later join is keyed on.
sdf_digital = sdf_digital.withColumnRenamed("BAYER_HCP_ID",'BH_ID')

In [0]:
# Distinct cohort months (SHP_YR_MO exposed as COHORT_MONTH) from '2023-01' onward.
cohort_month_col = col("SHP_YR_MO").alias("COHORT_MONTH")
unique_month = (
    overlap_raw_data
    .select(cohort_month_col)
    .filter(col("COHORT_MONTH") >= '2023-01')
    .distinct()
)

# Every HCP that has digital activity, paired with every cohort month.
distinct_hcps = sdf_digital.select("BH_ID").distinct()
hcp_month_pair = (
    distinct_hcps
    .crossJoin(unique_month)
    .orderBy('BH_ID', 'COHORT_MONTH')
)

In [0]:
# Attach every monthly activity row of an HCP to each of that HCP's
# (BH_ID, COHORT_MONTH) pairs; nulls left by the left join are replaced with 0.
activity_per_cohort = hcp_month_pair.join(sdf_digital, on=['BH_ID'], how='left')
sdf_digital_explode = (
    activity_per_cohort
    .orderBy('BH_ID', 'COHORT_MONTH')
    .fillna(0)
)

In [0]:
# Preview the cohort-month x activity-month rows produced by the join.
sdf_digital_explode.display()

In [0]:
# month_diff: months from the activity month (MONTH_ID) to the cohort month.
months_back = F.months_between(F.col("COHORT_MONTH"), F.col("MONTH_ID"))
sdf_digital_explode = sdf_digital_explode.withColumn("month_diff", months_back)

# ROLLING_WIN: label of the first bucket whose upper bound covers month_diff
# (only strictly positive differences qualify); everything else becomes 'NA'.
# NOTE(review): because the first matching branch wins, the buckets are
# mutually exclusive -- e.g. 'LAST_3M' only receives rows with month_diff in
# (1, 3], not the whole last three months. Confirm that non-cumulative windows
# are what the downstream features are meant to contain.
window_upper_bounds = [
    (1, 'LAST_1M'),
    (3, 'LAST_3M'),
    (6, 'LAST_6M'),
    (9, 'LAST_9M'),
    (12, 'LAST_12M'),
]
rolling_win = None
for upper_bound, win_label in window_upper_bounds:
    in_window = (F.col("month_diff") <= upper_bound) & (F.col("month_diff") > 0)
    if rolling_win is None:
        rolling_win = F.when(in_window, win_label)
    else:
        rolling_win = rolling_win.when(in_window, win_label)

sdf_digital_explode = sdf_digital_explode.withColumn("ROLLING_WIN", rolling_win.otherwise('NA'))

In [0]:
# Drop rows that fell outside every look-back bucket ('NA'), then total each
# count column per HCP, cohort month, channel and bucket.
# F.sum is referenced explicitly: a bare `sum` is Python's builtin unless some
# earlier code star-imported the Spark functions, and the builtin cannot
# aggregate a column name. This also matches the monthly aggregation above.
sdf_digital_explode = (
    sdf_digital_explode
    .filter(F.col("ROLLING_WIN") != 'NA')
    .groupBy("BH_ID", "COHORT_MONTH", "CHNL_ID", 'ROLLING_WIN')
    .agg(
        F.sum("XPSRE_CNT").alias('XPSRE_CNT'),
        F.sum("LIGHT_ENGMNT_CNT").alias('LIGHT_ENGMNT_CNT'),
        F.sum("DEEP_ENGMNT_CNT").alias('DEEP_ENGMNT_CNT'),
        F.sum("TOTL_ENGMNT_CNT").alias('TOTL_ENGMNT_CNT'),
    )
    .orderBy('BH_ID', 'COHORT_MONTH')
)

In [0]:
def long_to_wide(df, group_cols, pivot_col, agg_col, agg_func):
    """
    Pivot a long Spark DataFrame into wide format.

    Parameters
    ----------
    df : DataFrame
        Long-format input.
    group_cols : str or sequence of str
        Column(s) identifying one output row; passed to ``groupBy``.
    pivot_col : str
        Column whose distinct values become the new output columns.
    agg_col : str
        Column aggregated inside each pivot cell.
    agg_func : str
        Aggregate function name, used as the value of the
        ``{agg_col: agg_func}`` mapping handed to ``agg``.

    Returns
    -------
    DataFrame
        Cached frame holding the grouping columns unchanged, plus every other
        pivot output column renamed to ``f"{agg_col}_{<original name>}"``.
    """
    # Normalise to a list: with a bare string, the `c not in group_cols` test
    # below would be a substring match instead of a column-name match.
    if isinstance(group_cols, str):
        group_cols = [group_cols]
    else:
        group_cols = list(group_cols)

    # No cache here: only the renamed frame is returned, so caching the
    # intermediate would just hold a second copy that is never released.
    df_pivot = (df.groupBy(group_cols)
                  .pivot(pivot_col)
                  .agg({agg_col: agg_func}))

    # Prefix the pivoted columns with the aggregated column's name.
    # F.col is used rather than a bare `col` so the helper does not depend on
    # the notebook-level name `col` still being bound to the Spark function.
    renamed_columns = [
        F.col(c) if c in group_cols else F.col(c).alias(f"{agg_col}_{c}")
        for c in df_pivot.columns
    ]

    return df_pivot.select(*renamed_columns).cache()

In [0]:
# Rewrite the two e-mail channel labels into upper-snake form; any other
# CHNL_ID value passes through unchanged.
channel = F.col('CHNL_ID')
normalised_channel = (
    F.when(channel == 'HQ Email', 'HQ_EMAIL')
     .when(channel == 'iRep Email', 'IREP_EMAIL')
     .otherwise(channel)
)
sdf_digital_explode = sdf_digital_explode.withColumn('CHNL_ID', normalised_channel)

In [0]:
# Keep only the three channels used as features. Values are matched exactly as
# spelled here, i.e. after the e-mail labels were rewritten in the previous cell.
sdf_digital_explode = sdf_digital_explode.filter(col("CHNL_ID").isin(['HQ_EMAIL','IREP_EMAIL','BANNER']))

In [0]:
# Build the pivot key "<CHNL_ID>_<ROLLING_WIN>" (e.g. a channel label followed
# by a bucket label), later used as the column to pivot on.
sdf_digital_explode = sdf_digital_explode.withColumn("CHNL_ID_WIN",concat(col("CHNL_ID"), lit("_"), col("ROLLING_WIN")))

In [0]:
# Sanity check: list the channel labels that survived the filter.
sdf_digital_explode.select(col("CHNL_ID")).distinct().show()

In [0]:
# Preview 20 rows of the long-format table before pivoting.
display(sdf_digital_explode.limit(20))

In [0]:
# Column names of the long-format table.
sdf_digital_explode.columns

In [0]:
## Reshape to wide format: one row per (BH_ID, COHORT_MONTH), with one column
## per CHNL_ID_WIN value and count metric.
group_cols = ["BH_ID", "COHORT_MONTH"]
pivot_col = "CHNL_ID_WIN"

# F.first is used as the pivot aggregate for each of the four count columns.
count_metrics = ["XPSRE_CNT", "LIGHT_ENGMNT_CNT", "DEEP_ENGMNT_CNT", "TOTL_ENGMNT_CNT"]
first_of_each_metric = [F.first(metric).alias(metric) for metric in count_metrics]

pivoted = (
    sdf_digital_explode
    .groupBy(group_cols)
    .pivot(pivot_col)
    .agg(*first_of_each_metric)
)

# Replace the nulls left by the pivot with 0.
sdf_digital_explode_wide = pivoted.fillna(0)

In [0]:
# Preview the wide feature table.
display(sdf_digital_explode_wide)

In [0]:
# Column names from index 2 onward -- presumably everything after the two
# grouping columns (BH_ID, COHORT_MONTH), i.e. the pivoted feature columns.
sdf_digital_explode_wide.columns[2:]

In [0]:
# Per-COHORT_MONTH summary of every column from index 2 onward: count, mean,
# stddev, min, max and sum. Output columns are ordered statistic by statistic
# (all *_count columns first, then all *_mean columns, and so on).
feature_columns = sdf_digital_explode_wide.columns[2:]
summary_functions = [
    ('count', F.count),
    ('mean', F.mean),
    ('stddev', F.stddev),
    ('min', F.min),
    ('max', F.max),
    ('sum', F.sum),
]
summary_exprs = [
    stat_fn(F.col(feature)).alias(f'{feature}_{stat_name}')
    for stat_name, stat_fn in summary_functions
    for feature in feature_columns
]
result = sdf_digital_explode_wide.groupBy('COHORT_MONTH').agg(*summary_exprs)

# Show the result


In [0]:
# Preview the per-cohort-month summary statistics.
result.display()

In [0]:
# Bring the summary (one row per COHORT_MONTH) to the driver as a pandas
# DataFrame; the plotting cell reads from it.
result_df = result.toPandas()

In [0]:
# Column names of the pandas summary table.
result_df.columns

In [0]:
# Plotting
# For every summary column (all columns after the first) and every row of
# result_df, draw that single value as a histogram labelled with the row's
# COHORT_MONTH.
# The loop variable is deliberately NOT called `col`: that name is imported
# from pyspark.sql.functions earlier in the notebook, and rebinding it to a
# string here would break any Spark cell that is (re-)run afterwards.
# `plt` is not imported in this notebook; it is assumed to come from the
# set-up notebook -- TODO confirm.
plt.figure(figsize=(10, 6))

for stat_col in result_df.columns[1:]:
    for _, row in result_df.iterrows():
        plt.hist(row[stat_col], bins=30, alpha=0.5, label=str(row['COHORT_MONTH']))

plt.title('Histogram by Group')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.legend(title='COHORT_MONTH')
plt.show()

In [0]:
# Distribution by month

In [0]:
# Final check of the wide table's column names before it is saved.
sdf_digital_explode_wide.columns

In [0]:
# Save the wide per-HCP, per-cohort-month digital feature table.
# `save_sdf` is not defined in this notebook (presumably provided by the set-up
# notebook); its arguments appear to be (DataFrame, database/schema name,
# table name) -- TODO confirm against the helper.
save_sdf(sdf_digital_explode_wide, 'jivi_new_writer_model', 'hcp_monthly_digital_features')

In [0]:
# pandas-style aggregation: sum the four count columns per HCP, date and channel.
# NOTE(review): `df_digital` is never assigned in the visible notebook (the
# Spark frame is called `sdf_digital`), so this cell raises NameError on a
# fresh run unless the name is supplied by the set-up notebook. It also runs
# after the features have been saved -- looks like a leftover from an earlier
# pandas version; confirm whether it should be removed.
df_digital = df_digital.groupby(['BAYER_HCP_ID','DATE_ID','CHNL_ID']).agg('sum')[['XPSRE_CNT','LIGHT_ENGMNT_CNT','DEEP_ENGMNT_CNT','TOTL_ENGMNT_CNT']].reset_index()
print(df_digital.shape)
df_digital.head()

In [0]:
# Right-join the pandas digital table onto `df` (BAYER_HCP_ID = BH_ID), so
# every row of `df` is kept, then print the shapes before and after the merge.
# NOTE(review): neither `df` nor `df_digital` is assigned anywhere in the
# visible notebook -- this cell fails on a fresh run unless both come from the
# set-up notebook; confirm whether it is still needed.
df_digital_merged = df_digital.merge(df,left_on='BAYER_HCP_ID',right_on='BH_ID',how='right')
print(df_digital.shape)
print(df.shape)
print(df_digital_merged.shape)
df_digital_merged.head()