## EDA

In [None]:
import random

import pandas as pd
import plotly.express as px
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:
# Tier distribution by Education Level & Income Group
#
# For each demographic column, compute the share of every category within a
# tier (percentages sum to 100 per tier) and draw one grouped bar chart.

# NOTE(review): this binds `df` to a pandas DataFrame, while later cells call
# Spark methods (`df.groupBy`, `df.withColumn`) on `df` -- confirm which object
# `df` is meant to be on a Restart & Run All.
df = spark.table("XXX").toPandas()

# Each column carries its own axis label; previously both charts were labelled
# "Education Level". Assumes the columns are listed in the same order as the
# cell header (education first, income second) -- TODO confirm.
for c, axis_label in [('XXX', "Education Level"), ('XXX', "Income Group")]:
    # Row count per (tier, category); missing values are kept as their own group.
    tmp = (df
           .groupby(['Tier', c], dropna=False)
           .size()
           .reset_index(name='count'))

    # Percentage of each category within its tier. dropna=False matches the
    # grouping above so rows with a missing tier still get a denominator.
    tier_totals = tmp.groupby('Tier', dropna=False)['count'].transform('sum')
    tmp['% of Tier'] = tmp['count'] / tier_totals * 100

    fig = px.bar(
        tmp,
        x=c,
        y='% of Tier',
        color="Tier",
        barmode="group",
        text_auto=False,
        title=f"Distribution of {c} across Tiers",
        labels={c: axis_label, "count": "x"},
    )
    # Tick labels on the y axis and the legend are hidden to mask actual values.
    fig.update_layout(
        template="plotly_white",
        bargap=0.05,
        xaxis_tickangle=-30,
        height=500,
        width=900,
        yaxis=dict(showticklabels=False),
        showlegend=False,
    )

    fig.show()

In [0]:
# Average Loss Ratio by Occupation
#
# Bar chart of the mean LossRatio per occupation, sorted from highest to lowest,
# with a dashed reference line at the mean over all rows.
# NOTE(review): `pdf` is not created in this cell; it must already be a pandas
# frame with "Occupation_-_Person" and "LossRatio" columns -- confirm its source.

# Per-occupation mean and row count of LossRatio.
avg_df = (
    pdf.groupby("Occupation_-_Person", as_index=False)
       .agg(
           avg_loss_ratio=pd.NamedAgg(column="LossRatio", aggfunc="mean"),
           count=pd.NamedAgg(column="LossRatio", aggfunc="size"),
       )
)

# Mean over every row (not the mean of the per-occupation means).
overall_mean = pdf["LossRatio"].mean()

fig = px.bar(
    avg_df.sort_values(by="avg_loss_ratio", ascending=False),
    x="Occupation_-_Person",
    y="avg_loss_ratio",
    color="avg_loss_ratio",
    title="Average Loss Ratio by Occupation",
    labels={
        "avg_loss_ratio": "Average Loss Ratio",
        "Occupation_-_Person": "Occupation",
    },
)

# The y-axis tick labels are hidden so the chart shows relative heights only.
fig.update_layout(
    template="plotly_white",
    xaxis_tickangle=-45,
    height=600,
    width=900,
    showlegend=False,
    yaxis={"showticklabels": False},
)

fig.add_hline(
    y=overall_mean,
    line_color="black",
    line_dash="dash",
    annotation_text="Overall mean",
    annotation_position="top left",
)

fig.show()


In [0]:
# Behavior propensities by tier
#
# Heatmap of `pct` by tier and behavior, with both axes relabelled to
# anonymous names ("Tier A", "Behavior 1", ...).
# NOTE(review): `rows` is not built in this cell; it is expected to be a list of
# DataFrames with "behavior", TIER_COL and "pct" columns -- confirm where it is
# populated.

TIER_COL = "XXX"

# Combine; fall back to an empty frame with the expected columns when there is
# nothing to concatenate.
heat = pd.concat(rows, ignore_index=True) if rows else pd.DataFrame(columns=["behavior", TIER_COL, "pct"])

# Keep the first 50 rows. The explicit copy makes `heat` an independent frame,
# so the column assignments below do not write into a slice of the concatenated
# frame (which raises SettingWithCopyWarning).
heat = heat.head(50).copy()

# --- ANONYMIZE LABELS ---
# Labels follow order of first appearance in `heat`.
unique_behaviors = heat["behavior"].unique().tolist()
unique_tiers = heat[TIER_COL].unique().tolist()

anon_tier_map = {tier: f"Tier {chr(65+i)}" for i, tier in enumerate(unique_tiers)}
anon_behavior_map = {behavior: f"Behavior {i+1}" for i, behavior in enumerate(unique_behaviors)}

heat["behavior_anon"] = heat["behavior"].map(anon_behavior_map)
heat["tier_anon"] = heat[TIER_COL].map(anon_tier_map)

# --- HEATMAP ---
fig = px.density_heatmap(
    heat,
    x="tier_anon",
    y="behavior_anon",
    z="pct",
    color_continuous_scale="Viridis",
)

# --- LAYOUT CLEANUP ---
# Height grows with the number of behaviors so rows stay readable.
fig.update_layout(
    template="plotly_white",
    xaxis_title="Tier",
    yaxis_title="Behavior",
    height=450 + 20 * len(unique_behaviors),
    width=1000,
    coloraxis_showscale=True,
)

# Replace numeric colorbar ticks with "Low"/"High" placed at the smallest and
# largest `pct` in the data, masking the actual values.
fig.update_coloraxes(
    colorbar=dict(
        title="",
        tickvals=[heat["pct"].min(), heat["pct"].max()],
        ticktext=["Low", "High"],
        ticks='outside',
        showticklabels=True,
    )
)

# Optional: hide axis tick labels for masking
#fig.update_xaxes(showticklabels=False)
#fig.update_yaxes(showticklabels=False)

fig.show()

In [0]:
# Tier distribution by policy renewal month
#
# Stacked bar chart of `pct` per renewal month, coloured by tier, with tiers and
# months replaced by anonymous letter labels.
# NOTE(review): `df` must be a Spark DataFrame here (`groupBy` / `withColumn`) --
# confirm it has not been rebound to a pandas frame by an earlier cell.

# Row count per (tier, renewal month).
month_tier = (
    df.groupBy('Tier', "RenewalMonth")
    .agg(F.count("*").alias("count"))
)
# Fraction of each tier's rows that fall in the month (sums to 1 within a tier).
month_tier = month_tier.withColumn(
    "pct",
    F.col("count") / F.sum("count").over(Window.partitionBy("Tier"))
)

pdf = month_tier.orderBy("Tier", "RenewalMonth").toPandas()

# Random letter stride (1-3) used to scramble the anonymous labels. It is not
# seeded, so labels differ between runs. `rand` is also read by the age/income
# heatmap cell further down.
rand = random.randint(1, 3)

# Build both maps from the values actually present in `pdf`, so every row gets a
# label (a map built from another cell's frame can miss tiers and yield NaN).
tier_values = pdf['Tier'].unique().tolist()
month_values = pdf['RenewalMonth'].unique().tolist()

# The offset wraps modulo 26 so the label is always a letter A-Z; without the
# wrap, 12 months with a stride of 3 run past 'Z' into punctuation.
anon_tier_map = {t: f"Tier {chr(65 + (i * rand) % 26)}" for i, t in enumerate(tier_values)}
anon_month_map = {m: f"Renewal Month {chr(65 + (i * rand) % 26)}" for i, m in enumerate(month_values)}

# Wrapped letters stay distinct for up to 13 values at any stride in 1-3; a
# collision would silently merge bars, so fail loudly instead.
assert len(set(anon_tier_map.values())) == len(anon_tier_map), "anonymized tier labels collide"
assert len(set(anon_month_map.values())) == len(anon_month_map), "anonymized month labels collide"

pdf["month_label"] = pdf["RenewalMonth"].map(anon_month_map)
pdf['Tier'] = pdf['Tier'].map(anon_tier_map)

fig = px.bar(
    pdf,
    x="month_label",
    y="pct",
    color='Tier',
    barmode="stack",
    #text_auto=".1%",
    title="Tier composition by Auto Policy Renewal Month",
    labels={"month_label": "Renewal Month", "pct": "", "Tier": "Tier"},
)
# The legend and y-axis tick labels are hidden to mask actual values.
fig.update_layout(
    template="plotly_white",
    height=600,
    width=900,
    showlegend=False,
    yaxis=dict(showticklabels=False),
)
fig.show()


In [0]:


# Heatmap - corr between age bucket, income bucket & loss ratio
#
# Buckets age, averages LossRatio per (age bucket, income range) in Spark, and
# plots the result as a heatmap with anonymized axis labels.
# NOTE(review): relies on `df` being a Spark DataFrame and on `rand` having been
# set by the renewal-month cell -- confirm both on a fresh kernel.

AGE_COL = "Age_in_One-Year_Increments_-_Person"
INCOME_COL = "Income_-_Estimated_Household_-_Broad_Ranges"

age = F.col(AGE_COL)

# A null age makes every comparison below evaluate to null, which `when` treats
# as not matched; without the explicit null branch such rows would fall through
# to the "70+" bucket.
sdf_demo = df.withColumn(
    "AGE_BUCKET",
    F.when(age.isNull(), "Unknown")
     .when(age < 30, "<30")
     .when(age < 50, "30-49")
     .when(age < 70, "50-69")
     .otherwise("70+"),
)

heat = (sdf_demo.groupBy("AGE_BUCKET", INCOME_COL)
         .agg(F.mean("LossRatio").alias("avg_loss_ratio"))
         .toPandas())

# --- ANONYMIZE LABELS ---
u_a = heat["AGE_BUCKET"].unique().tolist()
u_i = heat[INCOME_COL].unique().tolist()

# The offset wraps modulo 26 so each label is always a letter A-Z even when
# index * stride exceeds 25.
u_a_map = {b: f"Age Group {chr(65 + (i * rand) % 26)}" for i, b in enumerate(u_a)}
u_i_map = {t: f"Income Group {chr(65 + (i * rand) % 26)}" for i, t in enumerate(u_i)}

heat["age_anon"] = heat["AGE_BUCKET"].map(u_a_map)
heat["inc_anon"] = heat[INCOME_COL].map(u_i_map)

fig = px.density_heatmap(heat, x="inc_anon", y="age_anon",
                         z="avg_loss_ratio", color_continuous_scale="Viridis",
                         labels={"age_anon": "Age Group", "inc_anon": "Income Group"}
                         #title="Average Loss Ratio by Age × Income Group"
                         )
fig.update_layout(template="plotly_white", height=500)
# Optional: hide axis tick labels for masking
#fig.update_layout(xaxis=dict(showticklabels=False), yaxis=dict(showticklabels=False))

# Replace numeric colorbar ticks with "Low"/"High" at the smallest and largest
# average loss ratio, masking the actual values.
fig.update_coloraxes(
    colorbar=dict(
        title="",
        tickvals=[heat["avg_loss_ratio"].min(), heat["avg_loss_ratio"].max()],
        ticktext=["Low", "High"],
        ticks='outside',
        showticklabels=True,
    )
)
fig.show()



In [0]:


# Segment size vs. average loss ratio
#
# For each categorical column, draws a bubble chart with one bubble per
# category: x = share of rows, y = mean of TARGET, bubble size = row count.
# NOTE(review): `df` must be a Spark DataFrame here -- confirm on a fresh kernel.

cols = ['XXX', 'XXX', 'XXX', 'XXX']

TARGET = "LossRatio"

# Reference line: mean of TARGET over the same frame that is aggregated below,
# so the line and the bubbles describe the same data. `first()[0]` is None when
# the mean is undefined (no non-null rows).
overall_mean = df.agg(F.mean(TARGET)).first()[0]

# Unpartitioned window spanning every row, so F.sum("n") over it is the grand
# total of the per-category counts. Built once because it does not depend on
# the column being plotted.
all_rows = Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

for CAT_COL in cols:
    # Per-category row count, mean target, and share of all rows.
    agg = (df.groupBy(CAT_COL)
             .agg(F.count("*").alias("n"),
                  F.mean(TARGET).alias("avg_loss"))
             .withColumn("pct", F.col("n") / F.sum("n").over(all_rows))
             .orderBy(F.desc("n"))
             .toPandas())

    fig = px.scatter(
        agg, x="pct", y="avg_loss", size="n", color="avg_loss", size_max=60,
        hover_data=[CAT_COL],
        color_continuous_scale='turbo',
        labels={"pct": "Segmentation Distribution", "avg_loss": "Average Loss Ratio"}
    )
    # Axis tick labels are hidden to mask actual values.
    fig.update_layout(
        xaxis=dict(showticklabels=False),
        yaxis=dict(showticklabels=False),
        height=600,
        width=1000,
    )

    # Skip the reference line when no overall mean could be computed.
    if overall_mean is not None:
        fig.add_hline(y=overall_mean, line_color="black", line_dash="dash",
                      annotation_text="Overall mean", annotation_position="top right")

    # Replace numeric colorbar ticks with "Low"/"High" at the smallest and
    # largest per-category average.
    fig.update_coloraxes(
        colorbar=dict(
            title="",
            tickvals=[agg["avg_loss"].min(), agg["avg_loss"].max()],
            ticktext=["Low", "High"],
            ticks='outside',
            showticklabels=True,
        )
    )

    fig.show()
