In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
# Load the cleaned benefit records.
clean = pd.read_csv('data/cleaned_data.csv')

# Collapse the data to one row per (EmployeeID, BenefitID) pair.
# Explicit rules: mean satisfaction score, first comment, summed usage frequency,
# last-listed LastUsedDate and the maximum benefit cost.
# NOTE(review): 'last' keeps the final row in file order, which is only the most
# recent date if the CSV is sorted chronologically - confirm.
agg_dict = dict(
    SatisfactionScore='mean',
    Comments='first',
    UsageFrequency='sum',
    LastUsedDate='last',
    BenefitCost='max',
)

# Every remaining non-key column simply keeps the first value seen in its group.
handled_cols = {'EmployeeID', 'BenefitID', *agg_dict}
other_cols = [name for name in clean.columns if name not in handled_cols]
agg_dict.update(dict.fromkeys(other_cols, 'first'))

merged = clean.groupby(['EmployeeID', 'BenefitID'], as_index=False).agg(agg_dict)


# Calculate cost-per-usage for every aggregated row.
# Zero usage is mapped to NaN before dividing so the division never produces inf;
# the resulting NaN (no usage, or a missing cost) is then filled with 0.
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on merged['CostPerUsage']: that chained in-place call
# acts on an intermediate object and stops updating `merged` in pandas 3.0.
merged['CostPerUsage'] = (
    merged['BenefitCost']
    .div(merged['UsageFrequency'].replace(0, np.nan))
    .fillna(0)
)

# Build an ROI score from min-max rescaled cost-per-usage and satisfaction.

# Min-max rescale cost-per-usage.
min_cost = merged['CostPerUsage'].min()
max_cost = merged['CostPerUsage'].max()
merged['NormalizedCost'] = merged['CostPerUsage'].sub(min_cost).div(max_cost - min_cost)

# Min-max rescale the satisfaction score in the same way.
min_satisfaction = merged['SatisfactionScore'].min()
max_satisfaction = merged['SatisfactionScore'].max()
merged['NormalizedSatisfaction'] = (
    merged['SatisfactionScore'].sub(min_satisfaction).div(max_satisfaction - min_satisfaction)
)

# ROI = normalised satisfaction per unit of normalised cost.
# A normalised cost of 0 is swapped for NaN so the division gives NaN instead of inf;
# rows without a cost or satisfaction value therefore carry a NaN ROI, handled later.
merged['ROI_Score'] = merged['NormalizedSatisfaction'].div(
    merged['NormalizedCost'].replace(0, np.nan)
)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged['CostPerUsage'].fillna(0, inplace=True)  # Fill NaN with 0 for no usage


In [3]:
# Identify underutilized high-cost subcategories.
# Average cost-per-usage, usage frequency, satisfaction score and ROI score
# within each BenefitSubType.
subcategory_stats = (
    merged
    .groupby('BenefitSubType')
    .agg(dict.fromkeys(
        ['CostPerUsage', 'UsageFrequency', 'SatisfactionScore', 'ROI_Score'],
        'mean',
    ))
    .reset_index()
)

# High-cost cut-off: 75th percentile of the subtype-level cost-per-usage.
high_cost_threshold = subcategory_stats['CostPerUsage'].quantile(q=0.75)

# Low-usage and low-ROI cut-offs: 25th percentile of the respective subtype means.
low_usage_threshold = subcategory_stats['UsageFrequency'].quantile(q=0.25)
roi_usage_treshold = subcategory_stats['ROI_Score'].quantile(q=0.25)

# Visualise and highlight those above/below thresholds
# Highlight subcategories above the high cost threshold and below the low usage threshold with colored bars

def create_bar_plot_treshold(df, x_col, y_col, title, treshold, treshold_label='max', highlight_above=None):
    """Build a bar chart of ``y_col`` by ``x_col`` with threshold highlighting.

    Each bar is coloured by a 'ThresholdFlag' category derived from ``y_col``.
    The flag is computed on a copy of ``df``, so the caller's frame is left
    unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must contain ``x_col`` and ``y_col``.
    x_col : str
        Column used for the x axis.
    y_col : str
        Column used for the bar heights and for the threshold comparison.
    title : str
        Figure title.
    treshold : float
        Cut-off value, also drawn as a dashed dark-red horizontal line.
    treshold_label : str, default 'max'
        'max' flags values strictly above ``treshold`` as 'Above Threshold';
        any other value flags values strictly below it as 'Below Threshold'.
    highlight_above : float, optional
        When given, values strictly above it are re-labelled 'Outstanding'
        (overriding the threshold flag) and a second dashed line is drawn.

    Returns
    -------
    The figure returned by ``px.bar``; the caller decides when to show it.
    """
    # Mapping raw column names to pretty axis labels
    pretty_labels = {
        'CostPerUsage': 'Cost per Usage',
        'UsageFrequency': 'Usage Frequency',
        'ROI_Score': 'ROI Score'
    }

    # Work on a copy so the helper column never leaks into the caller's DataFrame.
    plot_df = df.copy()

    # Base threshold logic: flag one side of the cut-off, everything else is 'Normal'
    if treshold_label == 'max':
        plot_df['ThresholdFlag'] = np.where(plot_df[y_col] > treshold, 'Above Threshold', 'Normal')
    else:
        plot_df['ThresholdFlag'] = np.where(plot_df[y_col] < treshold, 'Below Threshold', 'Normal')

    # Extra highlight for very high values; applied last so it wins over the base flag
    if highlight_above is not None:
        plot_df.loc[plot_df[y_col] > highlight_above, 'ThresholdFlag'] = 'Outstanding'

    # Color map with optional extra category
    color_map = {
        'Above Threshold': 'red',
        'Below Threshold': 'red',
        'Normal': 'lightgray',
        'Outstanding': 'darkgoldenrod'
    }

    # Plot
    fig = px.bar(
        plot_df,
        x=x_col,
        y=y_col,
        color='ThresholdFlag',
        color_discrete_map=color_map,
        title=title
    )

    # Fall back to the column name with underscores replaced when no pretty label exists
    fig.update_yaxes(title_text=pretty_labels.get(y_col, y_col.replace('_', ' ')))
    fig.update_layout(xaxis_tickangle=-90, width=1000, height=500, template='plotly_white')
    fig.add_hline(y=treshold, line_dash="dash", line_color="darkred")

    if highlight_above is not None:
        fig.add_hline(y=highlight_above, line_dash="dash", line_color="darkgoldenrod")

    return fig  # Return the figure instead of showing it



# Render one highlighted bar chart per metric. Each entry is
# (metric column, chart title, cut-off, which side of the cut-off to flag, optional highlight line).
for metric, chart_title, cutoff, direction, gold_line in (
    ('CostPerUsage', 'Average Cost Per Usage by Benefit Subtype', high_cost_threshold, 'max', None),
    ('UsageFrequency', 'Average Usage Frequency by Benefit Subtype', low_usage_threshold, 'min', None),
    ('ROI_Score', 'Average ROI Score by Benefit Subtype', roi_usage_treshold, 'min', 20),
):
    # A fresh per-subtype mean frame is built for every chart.
    metric_means = merged.groupby('BenefitSubType').agg({metric: 'mean'}).reset_index()
    create_bar_plot_treshold(
        metric_means, 'BenefitSubType', metric, chart_title, cutoff, direction,
        highlight_above=gold_line,
    ).show()

# # Find those that have high cost and low usage
# underutilized_high_cost = subcategory_stats[
#     (subcategory_stats['CostPerUsage'] > high_cost_threshold) &
#     (subcategory_stats['UsageFrequency'] < low_usage_threshold)
# ]

# print("Underutilized High-Cost Subcategories:")
# print(underutilized_high_cost)

# # Create quadrant plots (cost vs. usage, cost vs. satisfaction)
# fig = px.scatter(subcategory_stats, x='CostPerUsage', y='UsageFrequency',
#                  color='BenefitSubType',
#                  title='Cost vs Usage Frequency by Benefit Subtype',
#                  labels={'CostPerUsage': 'Average Cost per Usage', 'UsageFrequency': 'Average Usage Frequency'},
#                  hover_data=['BenefitSubType'])
# fig.add_hline(y=low_usage_threshold, line_dash="dash", line_color="red", annotation_text="Low Usage Threshold")
# fig.add_vline(x=high_cost_threshold, line_dash="dash", line_color="blue", annotation_text="High Cost Threshold")
# fig.show()

# # Find those that have high cost and low satisfaction
# underutilized_high_cost_satisfaction = subcategory_stats[
#     (subcategory_stats['CostPerUsage'] > high_cost_threshold) &
#     (subcategory_stats['SatisfactionScore'] < subcategory_stats['SatisfactionScore'].quantile(0.25))
# ]

# print("Underutilized High-Cost Benefits with Low Satisfaction:")
# print(underutilized_high_cost_satisfaction[['BenefitSubType', 'CostPerUsage', 'SatisfactionScore']])

# # Visualise and highlight those above/below thresholds
# fig = px.scatter(subcategory_stats, x='CostPerUsage', y='SatisfactionScore',
#                  color='BenefitSubType',
#                  title='Cost vs SatisfactionScore by Benefit Subtype',
#                  labels={'CostPerUsage': 'Average Cost per Usage', 'SatisfactionScore': 'Average SatisfactionScore'},
#                  hover_data=['BenefitSubType'])
# fig.add_hline(y=subcategory_stats['SatisfactionScore'].quantile(0.25), line_dash="dash", line_color="red", annotation_text="Low Satisfaction Threshold")
# fig.add_vline(x=high_cost_threshold, line_dash="dash", line_color="blue", annotation_text="High Cost Threshold")
# fig.show()

In [4]:
# Rank benefits by cost efficiency (rank 1 = highest mean ROI score).
# method='min' gives tied subtypes the same whole-number rank; the default
# 'average' can produce x.5 ranks that astype(int) would silently truncate.
# na_option='bottom' ranks subtypes without an ROI score last instead of leaving
# NaN, which astype(int) cannot convert.
subcategory_stats['CostEfficiencyRank'] = (
    subcategory_stats['ROI_Score']
    .rank(ascending=False, method='min', na_option='bottom')
    .astype(int)
)

# Flag which benefits could be removed.
# A subtype is marked 'Remove' when its cost-per-usage is above the high-cost
# threshold and, in addition, either its usage is below the low-usage threshold
# or its satisfaction is below the 25th percentile of subtype satisfaction.
low_satisfaction_threshold = subcategory_stats['SatisfactionScore'].quantile(0.25)
is_high_cost = subcategory_stats['CostPerUsage'].gt(high_cost_threshold)
is_low_usage = subcategory_stats['UsageFrequency'].lt(low_usage_threshold)
is_low_satisfaction = subcategory_stats['SatisfactionScore'].lt(low_satisfaction_threshold)

subcategory_stats['RemoveFlag'] = np.where(
    is_high_cost & (is_low_usage | is_low_satisfaction),
    'Remove',
    'Keep',
)
# Look up the BenefitType for each BenefitSubType (one row per distinct pair).
benefit_types = (
    merged[['BenefitType', 'BenefitSubType']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Attach the benefit type to the subtype-level statistics; the left join keeps
# every subtype row from subcategory_stats.
category_stats = pd.merge(
    subcategory_stats,
    benefit_types,
    how='left',
    on='BenefitSubType',
)

# Subtypes that were flagged as candidates for removal.
removable_categories = category_stats.loc[category_stats['RemoveFlag'].eq('Remove')]

In [5]:
# Preview the first five rows of the per-subtype summary.
category_stats.head(n=5)

Unnamed: 0,BenefitSubType,CostPerUsage,UsageFrequency,SatisfactionScore,ROI_Score,CostEfficiencyRank,RemoveFlag,BenefitType
0,401k Basic Matching,212.098774,4.689076,2.878151,3.309877,25,Remove,Retirement Plan
1,401k Catch-Up Contributions,121.324838,4.139442,2.790837,4.937111,16,Keep,Retirement Plan
2,401k High Contribution,59.199464,5.003759,2.918546,11.126834,8,Keep,Retirement Plan
3,401k Investment Fees,151.009728,4.621951,3.085366,4.182699,20,Keep,Retirement Plan
4,401k Maximum Matching,192.073667,4.566524,2.962089,3.08364,26,Keep,Retirement Plan
