In [0]:
# Importing packages
from pyspark.sql import functions as F  # Importing functions from pyspark.sql
import pandas as pd

In [0]:
%run "../00_config/set-up"

In [0]:
# Columns kept for downstream analysis; columns with a high share of nulls are generally left out.
# NOTE: CALL_MONTH is not read from the source table as-is here - it is (re)derived from
# CALL_DATE further down in this notebook, before the shortlist is applied.
col_shortlist = [
    "BAYER_HCP_ID",
    "CALL_DATE",
    "CALL_MONTH",
    "PROD_BRAND_NM",
]

In [0]:
# Load call activity restricted to the Hematology field-force codes ('14' and '110').
heme_calls_query = "SELECT * FROM heme_data.call_activity_data WHERE FLD_FORCE_ID in ('14','110')"
call_data_sdf = spark.sql(heme_calls_query)

# Report the size of the filtered dataset (count() runs a Spark job).
n_rows = call_data_sdf.count()
n_cols = len(call_data_sdf.columns)
print("Row count: ", n_rows, "Column Count: ", n_cols)

In [0]:
# Sort by HCP first, then by call date within each HCP (orderBy is ascending by default).
sort_keys = ['BAYER_HCP_ID', 'CALL_DATE']
call_data_sdf = call_data_sdf.orderBy(*sort_keys)

### Performing some checks on the quality of data

In [0]:
# Number of distinct full-row values that occur more than once. Grouping on every
# column means two rows only count as duplicates when all of their columns match.
# NOTE: this is the number of duplicated row *groups*, not the number of surplus rows.
duplicate_records_count = (
    call_data_sdf
    .groupBy(call_data_sdf.columns)
    .count()
    .filter("count > 1")
    .count()
)
# The result is a plain Python int, so print it with a label like the other scalar
# counts in this notebook; display() is intended for DataFrames and visualisations.
print("Count of distinct rows that appear more than once: ", duplicate_records_count)

In [0]:
# Null check on both HCP identifier columns; each count() runs its own Spark job.
for id_col in ('CUST_HCP_ID', 'BAYER_HCP_ID'):
    null_id_rows = call_data_sdf.filter(call_data_sdf[id_col].isNull())
    print(f"Count of records where {id_col} is null: ", null_id_rows.count())

In [0]:
# Rows where the two HCP identifiers disagree.
# A plain `!=` evaluates to NULL whenever either side is NULL, and filter() drops such
# rows, so records with exactly one missing identifier would be hidden from this check.
# The null-safe comparison keeps those rows visible while still treating
# "both NULL" as equal.
hcp_id_mismatch = ~call_data_sdf['CUST_HCP_ID'].eqNullSafe(call_data_sdf['BAYER_HCP_ID'])
display(call_data_sdf.filter(hcp_id_mismatch))

In [0]:
# Distinct-value counts for both HCP identifier columns, computed in a single aggregation.
hcp_id_cols = ["CUST_HCP_ID", "BAYER_HCP_ID"]
distinct_id_counts = [
    F.countDistinct(id_col).alias(f"unique_{id_col}_cnt") for id_col in hcp_id_cols
]
call_data_sdf.select(*hcp_id_cols).agg(*distinct_id_counts).display()

In [0]:
# Show the rows whose BAYER_HCP_ID equals -1.
# NOTE(review): -1 looks like a placeholder for an unmatched HCP - confirm with the data owner.
is_minus_one_id = F.col('BAYER_HCP_ID') == F.lit(-1)
display(call_data_sdf.where(is_minus_one_id))

### Exploratory data analysis (EDA)

In [0]:
# Row count per call channel.
calls_per_channel = call_data_sdf.groupBy('CALL_CHNL_NM').count()
display(calls_per_channel)

In [0]:
# Row count per product brand.
calls_per_brand = call_data_sdf.groupBy('PROD_BRAND_NM').count()
display(calls_per_brand)

In [0]:
# Derive a 'yyyy-MM' month key from CALL_DATE (withColumn replaces CALL_MONTH if it already exists).
call_month_key = F.date_format(F.col('CALL_DATE'), 'yyyy-MM')
call_data_sdf = call_data_sdf.withColumn('CALL_MONTH', call_month_key)

# Rows per month, sorted by the month key ('yyyy-MM' strings sort chronologically).
monthly_call_counts = call_data_sdf.groupBy('CALL_MONTH').count().sort('CALL_MONTH')
display(monthly_call_counts)

In [0]:
# Count how many months from 2023-01 through 2024-11 (both inclusive) have call records.
# That window spans 23 calendar months, so a smaller number means a month with no rows.
in_review_window = F.col('CALL_MONTH').between('2023-01', '2024-11')
filtered_monthly_call_counts = monthly_call_counts.filter(in_review_window)
print(filtered_monthly_call_counts.count())

**Writing the call activity data to the Hive store, keeping only the required columns and only the rows for the Hematology field force codes**

In [0]:
# Keep only the shortlisted columns. CALL_MONTH must already have been derived on
# call_data_sdf, since it is part of col_shortlist.
call_data_subset = call_data_sdf.select(*col_shortlist)

In [0]:
# Persist the column-reduced call activity data.
# NOTE(review): save_sdf is not defined in this notebook - presumably it comes from the
# "../00_config/set-up" notebook run at the top; TODO confirm.
# NOTE(review): the arguments look like (DataFrame, database/schema name, table name);
# overwrite-vs-append behaviour is decided inside save_sdf - verify before re-running.
save_sdf(call_data_subset, 'heme_data', 'call_activity_data_preprocessed')