In [0]:
import sys
sys.path.append("../src")

from utils import *

In [0]:
from functools import reduce

import matplotlib.pyplot as plt
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.functions import when
from pyspark.sql.functions import expr


In [0]:
# Paths (relative to this notebook) of the synthetic Excel workbook for each cohort.
cohort_1_link = "../Dummy_Data/Cohort_1_synth.xlsx"
cohort_2_link = "../Dummy_Data/Cohort_2_synth.xlsx"
cohort_3_link = "../Dummy_Data/Cohort_3_synth.xlsx"

In [0]:
# Cohort 1 is read from the synthetic Excel workbook into a pandas DataFrame.
# Parquet/Spark alternative, kept for reference:
#   cohort_1_df = spark.read.format("parquet").load(cohort_1_link)
cohort_1_df = pd.read_excel(io=cohort_1_link)

In [0]:
# Cohort 2 is read from the synthetic Excel workbook with pandas and then
# converted to a Spark DataFrame: every later cell calls Spark methods
# (withColumn / groupBy / filter) on cohort_2_df, which a pandas DataFrame
# does not provide.
# Parquet alternative for non-synthetic data:
#   cohort_2_df = spark.read.format("parquet").load(cohort_2_link)
# NOTE(review): columns that mix text with missing values may need an explicit
# schema for Spark's type inference - confirm against the workbook.
cohort_2_df = SparkSession.builder.getOrCreate().createDataFrame(pd.read_excel(cohort_2_link))

In [0]:
# Pathway type: "in" when the last WLMDS type change contains "IRTT", otherwise "op".
is_irtt_pathway = col("wlmds_type_changes_last").contains("IRTT")
cohort_2_df = cohort_2_df.withColumn("wl_type", when(is_irtt_pathway, "in").otherwise("op"))

# Band the wait length against week thresholds expressed as multiples of 7.
# NOTE(review): a null ndl_wait_length matches none of the conditions and
# therefore lands in the ">52 weeks" fallback - confirm this is intended.
cohort_2_wait_length = col("ndl_wait_length")
cohort_2_wait_band = (
    when(cohort_2_wait_length <= 18 * 7, "<18 weeks")
    .when((cohort_2_wait_length > 18 * 7) & (cohort_2_wait_length <= 36 * 7), "19-36 weeks")
    .when((cohort_2_wait_length > 36 * 7) & (cohort_2_wait_length <= 52 * 7), "37-52 weeks")
    .otherwise(">52 weeks")
)
cohort_2_df = cohort_2_df.withColumn("waiting_group", cohort_2_wait_band)
display(cohort_2_df)

In [0]:
# Percentage of cohort 2 rows in each waiting band, per pathway type and specialty.
speciality_comparison_df = (
    cohort_2_df.groupBy("wl_type", "Specialty", "waiting_group")
    .agg(F.count("*").alias("count"))
)

# Denominator: all rows for the same pathway type and specialty.
total_counts = cohort_2_df.groupBy("wl_type", "Specialty").agg(F.count("*").alias("total_count"))

speciality_comparison_df = (
    speciality_comparison_df
    .join(total_counts, on=["wl_type", "Specialty"], how="left")
    # "count" is already an integer, so only the percentage needs rounding.
    .withColumn("percentage", F.round((col("count") / col("total_count")) * 100, 2))
)

speciality_comparison_pd_df = speciality_comparison_df.toPandas()

wl_types = speciality_comparison_pd_df['wl_type'].unique()
waiting_group_order = ["<18 weeks", "19-36 weeks", "37-52 weeks", ">52 weeks"]

# One stacked bar chart per pathway type.
for wl_type in wl_types:
    fig, ax = plt.subplots(figsize=(10, 6))
    df_filtered = speciality_comparison_pd_df[speciality_comparison_pd_df['wl_type'] == wl_type]
    df_pivot = df_filtered.pivot(index='Specialty', columns='waiting_group', values='percentage')
    # reindex (rather than df_pivot[waiting_group_order]) so that a band with
    # no rows for this pathway type becomes an empty column instead of a KeyError.
    df_pivot = df_pivot.reindex(columns=waiting_group_order)
    df_pivot.plot(kind='bar', stacked=True, ax=ax)
    ax.set_title(f'Specialty Comparison by Waiting Group for {wl_type}')
    ax.set_xlabel('Specialty')
    ax.set_ylabel('Percentage')
    ax.legend(title='Waiting Group')
    ax.grid(True)
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)

In [0]:
# Restrict cohort 2 to rows whose wlmds_status equals 30.
cohort_2_tr = cohort_2_df.filter(col("wlmds_status") == 30)

# Percentage of those rows in each waiting band, per pathway type and specialty.
speciality_comparison_df = (
    cohort_2_tr.groupBy("wl_type", "Specialty", "waiting_group")
    .agg(F.count("*").alias("count"))
)

# Denominator: all status-30 rows for the same pathway type and specialty.
total_counts = cohort_2_tr.groupBy("wl_type", "Specialty").agg(F.count("*").alias("total_count"))

speciality_comparison_df = (
    speciality_comparison_df
    .join(total_counts, on=["wl_type", "Specialty"], how="left")
    # "count" is already an integer, so only the percentage needs rounding.
    .withColumn("percentage", F.round((col("count") / col("total_count")) * 100, 2))
)

speciality_comparison_pd_df = speciality_comparison_df.toPandas()

wl_types = speciality_comparison_pd_df['wl_type'].unique()
waiting_group_order = ["<18 weeks", "19-36 weeks", "37-52 weeks", ">52 weeks"]

# One stacked bar chart per pathway type.
for wl_type in wl_types:
    fig, ax = plt.subplots(figsize=(10, 6))
    df_filtered = speciality_comparison_pd_df[speciality_comparison_pd_df['wl_type'] == wl_type]
    df_pivot = df_filtered.pivot(index='Specialty', columns='waiting_group', values='percentage')
    # reindex (rather than df_pivot[waiting_group_order]) so that a band with
    # no rows for this pathway type becomes an empty column instead of a KeyError.
    df_pivot = df_pivot.reindex(columns=waiting_group_order)
    df_pivot.plot(kind='bar', stacked=True, ax=ax)
    ax.set_title(f'Specialty Comparison by Waiting Group for {wl_type}')
    ax.set_xlabel('Specialty')
    ax.set_ylabel('Percentage')
    ax.legend(title='Waiting Group')
    ax.grid(True)
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)


In [0]:
# Wait-band distribution by patient characteristic for the "op" pathways of cohort_2_tr.
# NOTE(review): calculate_wait_band_distribution_characteristics is defined in a
# later cell of this notebook; unless `utils` also exports it, this cell fails
# on a fresh Restart & Run All.
columns = [
    "ndl_age_band",
    "ndl_imd_quantile",
    "ndl_ethnicity",
    "ndl_ltc",
    "Sex",
    "Frailty_level",
]
group_stats = calculate_wait_band_distribution_characteristics(
    cohort_2_tr.filter(col("wl_type") == 'op'),
    columns,
)
# Serialise the result as comma-separated text and show that string.
group_stats_str = group_stats.toPandas().to_csv(index=False, sep=',', lineterminator='')
display(group_stats_str)

In [0]:
# Cohort 3 is read from the synthetic Excel workbook with pandas and then
# converted to a Spark DataFrame: every later cell calls Spark methods
# (filter / alias / join / selectExpr) on cohort_3_df, which a pandas
# DataFrame does not provide.
# Parquet alternative for non-synthetic data:
#   cohort_3_df = spark.read.format("parquet").load(cohort_3_link)
# NOTE(review): columns that mix text with missing values may need an explicit
# schema for Spark's type inference - confirm against the workbook.
cohort_3_df = SparkSession.builder.getOrCreate().createDataFrame(pd.read_excel(cohort_3_link))

In [0]:
# Split cohort 3 by treatment function code: "502" (the gynae frame) versus everything else.
gynae_patients_df = cohort_3_df.filter(col("wlmds_treatment_function_code") == "502")
other_specialities_df = cohort_3_df.filter(col("wlmds_treatment_function_code") != "502")

# Start of each waiting interval, and its end computed as start + ndl_wait_length.
gynae_wait_start = col("gynae.wlmds_rtt_start_date_conc3")
other_wait_start = col("other.wlmds_rtt_start_date_conc3")
gynae_wait_end = date_add(gynae_wait_start, col("gynae.ndl_wait_length"))
other_wait_end = date_add(other_wait_start, col("other.ndl_wait_length"))

# Two intervals overlap when one of them starts inside the other.
same_patient = col("gynae.wlmds_patient_id") == col("other.wlmds_patient_id")
other_starts_within_gynae = (gynae_wait_start <= other_wait_start) & (gynae_wait_end >= other_wait_start)
gynae_starts_within_other = (gynae_wait_start >= other_wait_start) & (gynae_wait_start <= other_wait_end)

# Distinct patients with at least one "502" wait that overlaps a wait under another code.
overlapping_waiting_pathways_df = (
    gynae_patients_df.alias("gynae")
    .join(
        other_specialities_df.alias("other"),
        same_patient & (other_starts_within_gynae | gynae_starts_within_other),
        "inner",
    )
    .select(col("gynae.wlmds_patient_id"))
    .distinct()
)

overlapping_count = overlapping_waiting_pathways_df.count()
total_gynae_patients = gynae_patients_df.select("wlmds_patient_id").distinct().count()

# Share of distinct "502" patients that have such an overlap.
percentage_overlapping = (overlapping_count / total_gynae_patients) * 100

display(percentage_overlapping)

In [0]:
# Aliased views of the two frames so the join condition can qualify columns.
gynae_patients_df_alias = gynae_patients_df.alias("gynae")
other_specialities_df_alias = other_specialities_df.alias("other")

# Start of each waiting interval, and its end computed as start + ndl_wait_length.
gynae_interval_start = col("gynae.wlmds_rtt_start_date_conc3")
other_interval_start = col("other.wlmds_rtt_start_date_conc3")
gynae_interval_end = date_add(gynae_interval_start, col("gynae.ndl_wait_length"))
other_interval_end = date_add(other_interval_start, col("other.ndl_wait_length"))

# Same patient, and one interval starts inside the other.
overlap_condition = (
    (col("gynae.wlmds_patient_id") == col("other.wlmds_patient_id"))
    & (
        ((gynae_interval_start <= other_interval_start) & (gynae_interval_end >= other_interval_start))
        | ((gynae_interval_start >= other_interval_start) & (gynae_interval_start <= other_interval_end))
    )
)

# Number of overlapping pathway pairs per treatment function code of the "other" side.
overlapping_tfc_df = (
    gynae_patients_df_alias
    .join(other_specialities_df_alias, overlap_condition, "inner")
    .select("other.wlmds_treatment_function_code")
    .groupBy("wlmds_treatment_function_code")
    .count()
)

display(overlapping_tfc_df)

In [0]:
# Rows per patient in gynae_patients_df, built once and reused for both counts.
pathways_per_patient = gynae_patients_df.groupBy("wlmds_patient_id").count()

# Count patients with single and multiple pathways for the same speciality
single_pathway_count = pathways_per_patient.filter(col("count") == 1).count()
multiple_pathways_same_speciality_count = pathways_per_patient.filter(col("count") > 1).count()

# Count patients with overlapping pathways with other specialities
overlapping_count = overlapping_waiting_pathways_df.count()

# Calculate percentages (denominator: distinct patients in gynae_patients_df)
total_gynae_patients = gynae_patients_df.select("wlmds_patient_id").distinct().count()
single_pathway_percentage = (single_pathway_count / total_gynae_patients) * 100
multiple_pathways_same_speciality_percentage = (multiple_pathways_same_speciality_count / total_gynae_patients) * 100
overlapping_percentage = (overlapping_count / total_gynae_patients) * 100

# Data for plotting.
# The first two bars partition the patients; the third is computed separately
# from the overlap join, so the three bars need not sum to 100.
labels = ['Single Pathway', 'Multiple Pathways (Same Speciality)', 'Overlapping Pathways (Other Specialities)']
percentages = [single_pathway_percentage, multiple_pathways_same_speciality_percentage, overlapping_percentage]

# Plotting
fig, ax = plt.subplots()
ax.bar(labels, percentages, color=['blue', 'orange', 'green'])
ax.set_ylabel('Percentage of Patients')
ax.set_title('Patient Pathways Distribution')
# Rotate the tick labels that bar() already created; calling set_xticklabels
# without fixing the tick positions first makes matplotlib emit a warning.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
# Leave room for the rotated labels, then render (this also stops the cell
# from echoing a list of Text objects as its output).
fig.tight_layout()
plt.show()



In [0]:
def calculate_wait_band_distribution_characteristics(df, input_cols):
    """
    Calculate the distribution of wait bands for each of the given input columns.

    Rows are first collapsed to one per distinct combination of the input
    columns, "wlmds_patient_id", "wlmds_treatment_function_code",
    "wlmds_rtt_start_date_conc3" and "waiting_group". For every input column
    the collapsed rows are then counted per column value and wait band, with
    null values of the input column reported as "unknown".

    Args:
        df (DataFrame): Input Spark DataFrame. Must contain the four columns
            named above in addition to every column in ``input_cols``.
        input_cols (list): Non-empty list of column names to break down by.

    Returns:
        DataFrame: One row per (input column, value) with the columns
            "value", one count column per wait band, "total_count", one
            "<band>_percentage" column per wait band (rounded to 2 decimals)
            and "Variable" (the input column name).

    Raises:
        ValueError: If ``input_cols`` is empty.
    """
    input_cols = list(input_cols)
    if not input_cols:
        # Without this guard the final reduce() fails with an opaque TypeError.
        raise ValueError("input_cols must contain at least one column name")

    pathway_keys = [
        "wlmds_patient_id",
        "wlmds_treatment_function_code",
        "wlmds_rtt_start_date_conc3",
        "waiting_group",
    ]
    # Collapse duplicates: the counts below count rows of this frame, not its "count" column.
    personal_ch = df.groupBy(input_cols + pathway_keys).agg(F.count("*").alias("count"))

    grouped_counts_list = []

    for var in input_cols:
        # Cast to string explicitly so the "unknown" label does not depend on
        # implicit string/numeric coercion (which is stricter under ANSI mode).
        personal_ch = personal_ch.withColumn(
            var, when(col(var).isNull(), "unknown").otherwise(col(var).cast("string"))
        )
        grouped_counts = (
            personal_ch.groupBy(var)
            .pivot("waiting_group")
            .agg(F.count("*").alias("count"))
            .withColumnRenamed(var, "value")
        )
        total_counts = (
            personal_ch.groupBy(var)
            .agg(F.count("*").alias("total_count"))
            .withColumnRenamed(var, "value")
        )
        grouped_counts = grouped_counts.join(total_counts, on="value", how="left")

        # Snapshot the wait-band columns before "<band>_percentage" columns are added.
        band_columns = [c for c in grouped_counts.columns if c not in ["value", "total_count"]]
        for c in band_columns:
            # A band missing for this value comes out of the pivot as null; treat it as 0.
            band_count = F.coalesce(col(c), F.lit(0))
            grouped_counts = (
                grouped_counts
                .withColumn(f"{c}_percentage", F.round((band_count / col("total_count")) * 100, 2))
                .withColumn(c, F.round(band_count, 2))
            )
        grouped_counts = grouped_counts.withColumn("Variable", F.lit(var))
        grouped_counts_list.append(grouped_counts)

    # Stack the per-variable tables; they share column names, so match by name.
    grouped_counts = reduce(lambda df1, df2: df1.unionByName(df2), grouped_counts_list)

    return grouped_counts

In [0]:
# Wait-band distribution by patient characteristic for cohort 3.
# NOTE(review): the helper groups by "waiting_group", but this notebook only
# adds that column to cohort_3_df in a later cell - confirm the source data
# already carries it, or run that cell first.
columns = [
    "ndl_age_band",
    "ndl_imd_quantile",
    "ndl_ethnicity",
    "ndl_ltc",
    "Sex",
    "Frailty_level",
]
group_stats = calculate_wait_band_distribution_characteristics(
    cohort_3_df,
    columns,
)
# Serialise the result as comma-separated text and show that string.
group_stats_str = group_stats.toPandas().to_csv(index=False, sep=',', lineterminator='')
display(group_stats_str)

In [0]:
# Number of cohort 3 rows per Specialty value.
cohort_3_specialty_counts = cohort_3_df.groupBy("Specialty").count()
display(cohort_3_specialty_counts)

In [0]:
# Band the wait length against week thresholds expressed as multiples of 7,
# then keep only rows whose Specialty is "Gynaecology".
# NOTE: this overwrites cohort_3_df with the filtered subset, so cells that
# need the full cohort must run before this one.
# NOTE(review): a null ndl_wait_length matches none of the conditions and
# therefore lands in the ">52 weeks" fallback - confirm this is intended.
cohort_3_wait_length = col("ndl_wait_length")
cohort_3_wait_band = (
    when(cohort_3_wait_length <= 18 * 7, "<18 weeks")
    .when((cohort_3_wait_length > 18 * 7) & (cohort_3_wait_length <= 36 * 7), "19-36 weeks")
    .when((cohort_3_wait_length > 36 * 7) & (cohort_3_wait_length <= 52 * 7), "37-52 weeks")
    .otherwise(">52 weeks")
)
cohort_3_df = (
    cohort_3_df
    .withColumn("waiting_group", cohort_3_wait_band)
    .filter(col("Specialty") == "Gynaecology")
)

In [0]:
# Wide-to-long reshape of the utilisation columns of cohort_3_df.
#
# The column names follow two patterns:
#   <delivery point>_healthcare_use_sum_<period>
#   <prescription / sick-note metric>_sum_<period>
# They are generated here instead of being listed by hand so that the row
# count passed to stack() can never drift from the number of columns.
# The *_Total_Cost_* and all_pres_sum_* columns are deliberately left out.
delivery_point_prefixes = ["gp", "u111", "u999", "u00H", "ae", "nel", "el", "op"]
other_metric_prefixes = ["antib_pres", "antidep_pres", "pain_pres", "sick_note"]
period_suffixes = ["3m_before", "waiting_time", "3m_after"]

# Order: all delivery points for each period, then all other metrics for each period.
metric_columns = [
    f"{prefix}_healthcare_use_sum_{period}"
    for period in period_suffixes
    for prefix in delivery_point_prefixes
] + [
    f"{prefix}_sum_{period}"
    for period in period_suffixes
    for prefix in other_metric_prefixes
]

# stack(n, 'name1', col1, 'name2', col2, ...) emits one (metric, value) row per pair.
stack_pairs = ", ".join(f"'{name}', {name}" for name in metric_columns)
stack_expr = f"stack({len(metric_columns)}, {stack_pairs}) as (metric, healthcare_utilization)"

reshaped_df = cohort_3_df.selectExpr(
    "Specialty as specialty",
    "wlmds_patient_id",
    "ndl_wait_length",
    "waiting_group",
    stack_expr,
).withColumn(
    # Human-readable label derived from the metric name's prefix.
    "delivery_point", expr(
        "CASE "
        "WHEN metric LIKE 'gp%' THEN 'GP' "
        "WHEN metric LIKE 'u111%' THEN 'U111' "
        "WHEN metric LIKE 'u999%' THEN 'U999' "
        "WHEN metric LIKE 'u00H%' THEN 'U00H' "
        "WHEN metric LIKE 'ae%' THEN 'AE' "
        "WHEN metric LIKE 'nel%' THEN 'NEL' "
        "WHEN metric LIKE 'el%' THEN 'EL' "
        "WHEN metric LIKE 'op%' THEN 'OP' "
        "WHEN metric LIKE 'antib_pres%' THEN 'Antibiotic Prescriptions' "
        "WHEN metric LIKE 'antidep_pres%' THEN 'Antidepressant Prescriptions' "
        "WHEN metric LIKE 'pain_pres%' THEN 'Pain Prescriptions' "
        "WHEN metric LIKE 'sick_note%' THEN 'Sick Notes' "
        "END"
    )
).withColumn(
    # Human-readable label derived from the metric name's suffix.
    "time_period", expr(
        "CASE "
        "WHEN metric LIKE '%3m_before' THEN '3 months before' "
        "WHEN metric LIKE '%waiting_time' THEN 'during waiting' "
        "WHEN metric LIKE '%3m_after' THEN '3 months after' "
        "END"
    )
).drop("metric")

display(reshaped_df)

In [0]:
# Rows for the two "3 months ..." periods get a fixed length of 7 * 13 in place
# of the pathway's own ndl_wait_length; "during waiting" rows keep theirs.
three_month_window_length = 7 * 13
is_three_month_window = col("time_period").contains("3 months")
reshaped_df = reshaped_df.withColumn(
    "ndl_wait_length",
    when(is_three_month_window, three_month_window_length).otherwise(col("ndl_wait_length")),
)

display(reshaped_df)

In [0]:
df_total_hc = reshaped_df.toPandas()

# Drop any '6 months before' rows (the reshape cell in this notebook only
# produces the three periods listed below, so this currently removes nothing).
df_total_hc = df_total_hc[df_total_hc['time_period'] != '6 months before']

# Group by delivery_point and time_period, summing healthcare utilization
grouped = df_total_hc.groupby(['delivery_point', 'time_period'])['healthcare_utilization'].sum().reset_index()

# Pivot the data for better visualization: one row per delivery point, one column per period
pivot = grouped.pivot(index='delivery_point', columns='time_period', values='healthcare_utilization')

# Reorder columns chronologically
pivot = pivot[['3 months before', 'during waiting', '3 months after']]

# Plotting.
# Draw onto an explicit Axes: calling plt.figure() and then pivot.plot(figsize=...)
# opens two figures and leaves the first one empty in the cell output.
fig, ax = plt.subplots(figsize=(10, 6))
pivot.plot(kind='bar', stacked=False, ax=ax)

ax.set_title("Total Healthcare Utilization per Delivery Point Across Time Periods", fontsize=16)
ax.set_xlabel("Delivery Point", fontsize=14)
ax.set_ylabel("Total Healthcare Utilization", fontsize=14)
ax.legend(title="Time Period", fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [0]:
df_total_hc = reshaped_df.toPandas()

# Drop '6 months before' rows, then total the utilisation per
# delivery point, time period and waiting group.
df_total_hc = df_total_hc.loc[df_total_hc['time_period'].ne('6 months before')]
grouped = (
    df_total_hc
    .groupby(['delivery_point', 'time_period', 'waiting_group'])['healthcare_utilization']
    .sum()
    .reset_index()
)

# One row per (delivery point, waiting group), one column per time period.
pivot = grouped.pivot_table(
    index=['delivery_point', 'waiting_group'],
    columns='time_period',
    values='healthcare_utilization',
    aggfunc='sum',
).reset_index()

# Put the period columns in chronological order after the two key columns.
total_period_columns = ['3 months before', 'during waiting', '3 months after']
pivot = pivot[['delivery_point', 'waiting_group'] + total_period_columns]

# One grouped bar chart per waiting group.
waiting_groups = pivot['waiting_group'].unique()
for group in waiting_groups:
    group_data = pivot.loc[pivot['waiting_group'] == group].set_index('delivery_point')
    group_data[total_period_columns].plot(kind='bar', stacked=False, figsize=(10, 6))

    plt.title(f"Total Healthcare Utilization per Delivery Point Across Time Periods for Waiting Group {group}", fontsize=16)
    plt.xlabel("Delivery Point", fontsize=14)
    plt.ylabel("Total Healthcare Utilization", fontsize=14)
    plt.legend(title="Time Period", fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

In [0]:
df_total_hc = reshaped_df.toPandas()

# Drop '6 months before' rows, then aggregate per delivery point, time period
# and waiting group: total utilisation, total wait length and row count.
df_total_hc = df_total_hc.loc[df_total_hc['time_period'].ne('6 months before')]
grouped = (
    df_total_hc
    .groupby(['delivery_point', 'time_period', 'waiting_group'])
    .agg(
        healthcare_utilization_sum=('healthcare_utilization', 'sum'),
        ndl_wait_length_sum=('ndl_wait_length', 'sum'),
        count=('healthcare_utilization', 'count'),
    )
    .reset_index()
)

# Rate = total utilisation per (total wait length / 7), scaled by 1000.
# NOTE(review): this reads as "per 1000 weeks" if ndl_wait_length is in days - confirm.
grouped = grouped.assign(
    rate_per_1000=(grouped['healthcare_utilization_sum'] / (grouped['ndl_wait_length_sum'] / 7)) * 1000
)

# One row per (delivery point, waiting group), one column per time period.
pivot_1 = grouped.pivot_table(
    index=['delivery_point', 'waiting_group'],
    columns='time_period',
    values='rate_per_1000',
).reset_index()

# Put the period columns in chronological order after the two key columns.
rate_period_columns = ['3 months before', 'during waiting', '3 months after']
pivot_1 = pivot_1[['delivery_point', 'waiting_group'] + rate_period_columns]

# Largest rate across all groups and periods, used as a shared y-axis limit.
y_max = pivot_1[rate_period_columns].max().max()

# One grouped bar chart per waiting group, all on the same y scale.
waiting_groups = pivot_1['waiting_group'].unique()
for group in waiting_groups:
    group_data = pivot_1.loc[pivot_1['waiting_group'] == group].set_index('delivery_point')
    ax = group_data[rate_period_columns].plot(kind='bar', stacked=False, figsize=(10, 6))

    plt.title(f"Healthcare Utilization Rate per 1000 per Delivery Point Across Time Periods for Waiting Group {group}", fontsize=16)
    plt.xlabel("Delivery Point", fontsize=14)
    plt.ylabel("Healthcare Utilization Rate per 1000", fontsize=14)
    plt.legend(title="Time Period", fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.ylim(0, y_max)
    plt.tight_layout()
    plt.show()

In [0]:
# Render the `grouped` table as plain text and show that string.
grouped_str = grouped.to_string()
display(grouped_str)