In [1]:
import pandas as pd
import numpy as np

# Load the datasets
usage_data = pd.read_csv('data/usage_data.csv')
employee_data = pd.read_csv('data/employee_data.csv')
benefits_data = pd.read_csv('data/benefits_data.csv')
feedback_data = pd.read_csv('data/feedback_data.csv')

# Function to profile a dataframe
def profile_df(df, df_name):
    # Missing values
    missing = df.isnull().sum()
    missing_pct = (missing / len(df)) * 100
    
    # Duplicates
    duplicates = df.duplicated().sum()
    
    # Outliers (IQR for numerical columns)
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    outliers = {}
    for col in numerical_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        outlier_count = ((df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))).sum()
        outliers[col] = outlier_count
    
    # Summary table
    issues = pd.DataFrame({
        'Metric': ['Missing Values', 'Missing %', 'Duplicates', 'Outliers'],
    })
    for col in df.columns:
        issues[col] = [
            missing[col],
            missing_pct[col],
            '-' if col != 'Overall' else duplicates,  # Duplicates are overall
            outliers.get(col, '-')
        ]
    issues['Overall'] = [missing.sum(), missing_pct.mean(), duplicates, sum(outliers.values())]
    
    #print(f"Profiling for {df_name}:")
    #display(issues)
    
    return issues

# Profile each dataset
usage_issues = profile_df(usage_data, 'usage_data')
employee_issues = profile_df(employee_data, 'employee_data')
benefits_issues = profile_df(benefits_data, 'benefits_data')
feedback_issues = profile_df(feedback_data, 'feedback_data')

# Merge datasets (assuming common keys: EmployeeID and BenefitID)
merged = pd.merge(usage_data, employee_data, on='EmployeeID', how='inner')
merged = pd.merge(merged, benefits_data, on='BenefitID', how='inner')
merged = pd.merge(merged, feedback_data, on=['EmployeeID', 'BenefitID'], how='inner')

# Standardize datatypes
merged['LastUsedDate'] = pd.to_datetime(merged['LastUsedDate'], errors='coerce')  # Assuming column name
merged['Gender'] = merged['Gender'].astype('category')  # Assuming column
merged['Department'] = merged['Department'].astype('category')

# Feature engineering
merged['age_group'] = pd.cut(merged['Age'], bins=[0, 30, 45, np.inf], labels=['<30', '30-45', '>45'])
merged['tenure_group'] = pd.cut(merged['Tenure'], bins=[0, 5, 10, np.inf], labels=['<5', '5-10', '>10'])  # Assuming Tenure column
# Subcategory flags (one-hot for BenefitSubType)
subcat_dummies = pd.get_dummies(merged['BenefitSubType'], prefix='subcat')
merged = pd.concat([merged, subcat_dummies], axis=1)

# Handle missing values
merged['UsageFrequency'].fillna(merged['UsageFrequency'].median(), inplace=True)  # Median impute
merged.dropna(subset=['Comments'], inplace=True)  # Drop missing Comments

# Validate
merged = merged[merged['UsageFrequency'] >= 0]  # No negatives
# Add more validations as needed, e.g., merged = merged[(merged['SatisfactionScore'] >=1) & (merged['SatisfactionScore'] <=5)]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged['UsageFrequency'].fillna(merged['UsageFrequency'].median(), inplace=True)  # Median impute


## Demographic Insights Discovery
This section analyzes benefit usage across demographic groups (age, gender, department, tenure), visualizes patterns, and highlights actionable insights.

In [75]:
# Demographic Analysis: Usage by Age, Gender, Department, Tenure
import warnings
warnings.filterwarnings("ignore")
import plotly.express as px

# 1. Analyze benefit usage by demographics (Age, Gender, Department, Tenure)
# Group by demographic features and calculate usage statistics
demographics_usage = merged.groupby(['age_group', 'Gender', 'Department', 'tenure_group'])['UsageFrequency'].agg(['mean', 'median', 'std', 'count']).reset_index()
demographics_usage  # View summary statistics

from plotly.subplots import make_subplots
import plotly.graph_objects as go

# categories = ['age_group', 'Gender', 'Department', 'tenure_group']
# fig = make_subplots(rows=2, cols=2, subplot_titles=categories)

# for i, cat in enumerate(categories):
#     row = i // 2 + 1
#     col = i % 2 + 1
#     fig.add_trace(
#         go.Box(
#             y=merged['UsageFrequency'],
#             x=merged[cat],
#             name=cat,
#             hovertemplate='UsageFrequency: %{y:.2f}<br>' + ': %{x}<extra></extra>'
#         ),
#         row=row, col=col
#     )

# fig.update_layout(
#     height=800,
#     width=1000,
#     title_text='UsageFrequency Distribution by Demographic Categories',
#     yaxis=dict(tickformat=".2f"),
#     yaxis2=dict(tickformat=".2f"),
#     yaxis3=dict(tickformat=".2f"),
#     yaxis4=dict(tickformat=".2f")
# )
# fig.show()

# # Compare subtypes across segments (e.g., by age_group, departments)
# def construct_barplot(df, groupby_list, x, color):
#     df_group = df.groupby(groupby_list)['UsageFrequency'].mean().reset_index()
#     return px.bar(df_group, x=x, y='UsageFrequency', color=color, barmode='stack', title=f'Usage by {x} and {color}', hover_data=groupby_list), df_group

# def compare_x_y_heatmap(df, groupby_list, x, y, color='Avg UsageFrequency'):
#     subtype_df = df.groupby(groupby_list)['UsageFrequency'].mean().unstack().reset_index()
#     return px.imshow(
#                 subtype_df.set_index(y),
#                 labels=dict(x=x, y=y, color="Avg UsageFrequency"),
#                 title=f'{x} Usage Heatmap by {y}',
#                 aspect="auto",
#             ).show(), subtype_df.set_index(y)

# Visualize usage by department and age group and gender
fig1 = px.bar(
    merged.groupby(['age_group', 'Gender', 'Department'])['UsageFrequency'].agg(['mean']).reset_index(),
    x='Department',
    y='mean',
    color='Gender',
    barmode='group',
    title='Mean Usage Frequency by Department and Gender',
    hover_data=['age_group'],
    color_discrete_sequence=px.colors.qualitative.Set2)
fig1.show()

# Visualize usage by gender and tenure group
fig2 = px.bar(
        merged.groupby(['tenure_group', 'Gender', 'Department'])['UsageFrequency'].agg(['mean']).reset_index(),
        x='Department', 
        y='mean', 
        color='tenure_group', 
        barmode='group', 
        title='Mean UsageFrequency by Department and Tenure Group', 
        hover_data=['Gender'],
        color_discrete_sequence=px.colors.qualitative.Set1)
fig2.show()

# Visualize usage by gender and tenure group
fig3 = px.bar(
    merged.groupby(['age_group', 'Department'])['UsageFrequency'].agg(['mean']).reset_index(),
    x='Department',
    y='mean',
    color='age_group',
    barmode='stack',
    title='Mean Usage Frequency by Department and Gender',
    color_discrete_sequence=px.colors.qualitative.Set1[::-1])
fig3.show()


In [77]:
# Barplot: Mean UsageFrequency by Department and Gender
import plotly.express as px
dept_gender_df = merged.groupby(['Department', 'Gender'])['UsageFrequency'].mean().reset_index()
fig = px.bar(
    dept_gender_df,
    x='Department',
    y='UsageFrequency',
    color='Gender',
    barmode='group',
    title='Mean UsageFrequency by Department and Gender',
    color_discrete_sequence=px.colors.qualitative.Set2,
    hover_data=['Gender']
 )
fig.update_layout(
    yaxis=dict(title='Mean UsageFrequency', tickformat='.2f'),
    xaxis=dict(title='Department'),
    legend_title_text='Gender'
 )
fig.show()

In [114]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create vertically stacked subplots
fig = make_subplots(rows=2, cols=1, subplot_titles=['Age Group', 'Tenure Group'])

# Unique gender values (sorted for consistency)
genders = merged['Gender'].dropna().unique()

# Assign distinct colors to each gender
color_map = {
    'Female': 'orchid',
    'Male': 'royalblue',
    'Non-Binary': 'seagreen'
}

# Age group subplot (Row 1)
for gender in genders:
    subset = merged[(merged['age_group'].notnull()) & (merged['Gender'] == gender)]
    fig.add_trace(
        go.Box(
            y=subset['UsageFrequency'],
            x=subset['age_group'],
            name=f'Age Group - {gender}',
            marker_color=color_map.get(gender, 'gray'),
            boxmean=True,
            legendgroup=gender,
            showlegend=True,
            hovertemplate=f'UsageFrequency: %{{y:.2f}}<br>Age Group: %{{x}}<br>Gender: {gender}<extra></extra>',
            line=dict(width=2),
            marker=dict(line=dict(width=1, color='black')),
            whiskerwidth=1
        ),
        row=1, col=1
    )

# Tenure group subplot (Row 2)
for gender in genders:
    subset = merged[(merged['tenure_group'].notnull()) & (merged['Gender'] == gender)]
    fig.add_trace(
        go.Box(
            y=subset['UsageFrequency'],
            x=subset['tenure_group'],
            name=f'Tenure Group - {gender}',
            marker_color=color_map.get(gender, 'gray'),
            boxmean=True,
            legendgroup=gender,
            showlegend=False,  # already shown above
            hovertemplate=f'UsageFrequency: %{{y:.2f}}<br>Tenure Group: %{{x}}<br>Gender: {gender}<extra></extra>',
            line=dict(width=2),
            marker=dict(line=dict(width=1, color='black')),
            whiskerwidth=1
        ),
        row=2, col=1
    )

# Update layout
fig.update_layout(
    height=800,
    width=1000,
    title_text='UsageFrequency Distribution by Age and Tenure Group, Split by Gender',
    boxmode='group',
    legend_title='Gender',
    yaxis=dict(title='UsageFrequency', tickformat='.2f'),
    yaxis2=dict(title='UsageFrequency', tickformat='.2f')
)

fig.show()


In [116]:
# Heatmap: UsageFrequency by Demographic Feature and Benefit Subtype
import plotly.express as px
for feature in ['tenure_group', 'age_group', 'Gender', 'Department']:
    heatmap_data = merged.groupby([feature, 'BenefitSubType'])['UsageFrequency'].mean().unstack()
    fig = px.imshow(heatmap_data, labels=dict(x='BenefitSubType', y=feature, color='Avg UsageFrequency'), title=f'UsageFrequency Heatmap: {feature} vs BenefitSubType')
    fig.show()

In [19]:
# Summary statistics for Department vs BenefitSubType heatmap
heatmap_data = merged.groupby(['Department', 'BenefitSubType'])['UsageFrequency'].mean().unstack().fillna(0)
heatmap_data.describe()

BenefitSubType,401k Basic Matching,401k Catch-Up Contributions,401k High Contribution,401k Investment Fees,401k Maximum Matching,401k Standard Matching,After-School Care,Basic Coverage,Conference Attendance,Dependent Coverage,...,PPO Individual,Premium Discount Tier 1,Professional Certification,Supplemental High Amount,Supplemental Standard,Tier 1 Partners,Tier 2 Partners,Tier 3 Partners,Transit Subsidy,Undergraduate Degree
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,3.802199,3.204308,3.97118,3.653997,3.614769,3.081369,3.004986,3.468518,3.453138,3.42441,...,3.109349,3.076344,3.059401,3.381538,3.6923,3.128126,3.208571,3.338036,3.41634,3.328352
std,0.505311,0.138401,0.331229,0.506387,0.426115,0.214874,0.499994,0.515611,0.277127,0.468046,...,0.352049,0.400621,0.285236,0.524816,0.540832,0.454645,0.423604,0.610222,0.376209,0.361701
min,3.253731,3.014706,3.647727,2.864865,3.138889,2.85,2.636364,2.814815,3.138889,2.826087,...,2.568182,2.477612,2.746667,2.847222,3.241935,2.671642,2.529412,2.95082,2.905405,2.784091
25%,3.314286,3.144737,3.661972,3.447761,3.415094,2.867647,2.666667,3.242424,3.325,3.1125,...,2.959184,3.092105,2.868852,2.932203,3.292135,2.830769,3.076923,2.955556,3.296296,3.192308
50%,3.865385,3.230769,3.988764,3.851351,3.473684,3.115942,2.735294,3.520548,3.333333,3.52381,...,3.257143,3.09375,3.061224,3.346154,3.409091,2.984375,3.367347,3.168831,3.306667,3.367089
75%,4.23913,3.239437,4.121951,4.013699,3.792208,3.283784,3.177083,3.536232,3.635135,3.626866,...,3.307692,3.111111,3.129032,3.674419,4.054054,3.346154,3.508197,3.205882,3.74,3.623853
max,4.338462,3.391892,4.435484,4.092308,4.253968,3.289474,3.809524,4.228571,3.833333,4.032787,...,3.454545,3.607143,3.491228,4.107692,4.464286,3.807692,3.560976,4.409091,3.833333,3.674419


In [20]:
# Summary statistics for Gender vs BenefitSubType heatmap
heatmap_data = merged.groupby(['Gender', 'BenefitSubType'])['UsageFrequency'].mean().unstack().fillna(0)
heatmap_data.describe()

BenefitSubType,401k Basic Matching,401k Catch-Up Contributions,401k High Contribution,401k Investment Fees,401k Maximum Matching,401k Standard Matching,After-School Care,Basic Coverage,Conference Attendance,Dependent Coverage,...,PPO Individual,Premium Discount Tier 1,Professional Certification,Supplemental High Amount,Supplemental Standard,Tier 1 Partners,Tier 2 Partners,Tier 3 Partners,Transit Subsidy,Undergraduate Degree
count,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
mean,4.16263,2.82578,4.595232,3.343735,3.988686,3.115821,2.890628,3.626821,2.899713,3.144708,...,3.299564,2.611713,2.885401,3.97111,3.386912,3.737316,2.649866,3.599679,2.671698,3.100401
std,0.754894,0.793411,1.231311,0.735661,0.662895,0.16372,0.292908,0.328539,1.069366,0.611716,...,0.263709,0.965612,0.447602,1.107748,0.50463,1.094078,1.136466,0.540463,1.27588,0.463733
min,3.534247,2.0,3.702857,2.5,3.284091,2.926829,2.615385,3.258242,1.666667,2.5,...,3.106383,1.5,2.6,3.310811,2.833333,3.070922,1.363636,3.151832,1.2,2.636364
25%,3.743944,2.447531,3.892849,3.090136,3.683029,3.066589,2.736705,3.495788,2.562984,2.858571,...,3.149345,2.296784,2.627465,3.331664,3.169753,3.105974,2.215653,3.299519,2.274419,2.868687
50%,3.953642,2.895062,4.08284,3.680272,4.081967,3.206349,2.858025,3.733333,3.459302,3.217143,...,3.192308,3.093567,2.65493,3.352518,3.506173,3.141026,3.067669,3.447205,3.348837,3.10101
75%,4.476821,3.23867,5.04142,3.765602,4.340984,3.210317,3.028249,3.811111,3.516237,3.467062,...,3.396154,3.16757,3.028102,4.301259,3.663701,4.070513,3.292981,3.823602,3.407547,3.33242
max,5.0,3.582278,6.0,3.850932,4.6,3.214286,3.198473,3.888889,3.573171,3.716981,...,3.6,3.241573,3.401274,5.25,3.821229,5.0,3.518293,4.2,3.466258,3.56383


In [123]:
merged.groupby(['Gender','Department'])['UsageFrequency'].mean().unstack()

Department,Finance,HR,IT,Marketing,Sales
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,3.343992,3.581766,3.332756,3.092391,3.262312
Male,3.308917,3.328221,3.478646,3.426261,3.391259
Non-Binary,3.192308,3.553191,2.902439,3.533333,2.217391


#### Observations
- Employees below 30 years have most frequent subcategories related to Retirement and Fitness (Either family membership gym plan, or gym plan)
- Employees aged 30-45 have most frequent subcategory after Retirement connected with Life Insurance.
- The most frequent usage of benefits across tenure groups is tenure between 5-10.
- HR department has the highest usage of benefits between all departments. The lowest usage has Marketing department. All age groups below 45 tend to use more benfits than those over 45.
-  Non-Binary employees have very disparse values across departments - in some depatrments like IT, they have a very high usage of benefits on average whereas in sale it is slightly below 3 benefits on average. Apart from HR department, there is a visible drop in usage for employees with tenure >10 years.
- HR departments have high benefit usage across genders, but female employees in Marketing and Sales show lower engagement.
- Certain benefit subtypes (e.g., Gym Membership, Retirement Plan) are favored by younger employees and HR, while Life Insurance is most popular among employees aged 30-45. 

### Actionable Recommendations
- Promote Retirement Plan and Gym Membership benefits to younger employees (<30).
- Target Life Insurance benefits to employees aged 30-45.
- Encourage benefit usage among tenure group 5-10 years, as they are most active.
- HR department shows highest benefit usage; consider sharing best practices with other departments.
- Non-Binary employees usage of benefits very differs across departments. Try some inclusive benefit options, and feedback to understand how to support them.
- Assess what HR department does differently to engage other departments
- Look at benefits that are barely used and promote those that are in high demand.