In [1]:
import pandas as pd
import numpy as np

# Load the datasets
usage_data = pd.read_csv('data/usage_data.csv')
employee_data = pd.read_csv('data/employee_data.csv')
benefits_data = pd.read_csv('data/benefits_data.csv')
feedback_data = pd.read_csv('data/feedback_data.csv')

# Function to profile a dataframe
def profile_df(df, df_name):
    # Missing values
    missing = df.isnull().sum()
    missing_pct = (missing / len(df)) * 100
    
    # Duplicates
    duplicates = df.duplicated().sum()
    
    # Outliers (IQR for numerical columns)
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    outliers = {}
    for col in numerical_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        outlier_count = ((df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))).sum()
        outliers[col] = outlier_count
    
    # Summary table
    issues = pd.DataFrame({
        'Metric': ['Missing Values', 'Missing %', 'Duplicates', 'Outliers'],
    })
    for col in df.columns:
        issues[col] = [
            missing[col],
            missing_pct[col],
            '-' if col != 'Overall' else duplicates,  # Duplicates are overall
            outliers.get(col, '-')
        ]
    issues['Overall'] = [missing.sum(), missing_pct.mean(), duplicates, sum(outliers.values())]
    
    #print(f"Profiling for {df_name}:")
    #display(issues)
    
    return issues

# Profile each dataset
usage_issues = profile_df(usage_data, 'usage_data')
employee_issues = profile_df(employee_data, 'employee_data')
benefits_issues = profile_df(benefits_data, 'benefits_data')
feedback_issues = profile_df(feedback_data, 'feedback_data')

# Merge datasets (assuming common keys: EmployeeID and BenefitID)
merged = pd.merge(usage_data, employee_data, on='EmployeeID', how='inner')
merged = pd.merge(merged, benefits_data, on='BenefitID', how='inner')
merged = pd.merge(merged, feedback_data, on=['EmployeeID', 'BenefitID'], how='inner')

# Standardize datatypes
merged['LastUsedDate'] = pd.to_datetime(merged['LastUsedDate'], errors='coerce')  # Assuming column name
merged['Gender'] = merged['Gender'].astype('category')  # Assuming column
merged['Department'] = merged['Department'].astype('category')

# Feature engineering
merged['age_group'] = pd.cut(merged['Age'], bins=[0, 30, 45, np.inf], labels=['<30', '30-45', '>45'])
merged['tenure_group'] = pd.cut(merged['Tenure'], bins=[0, 5, 10, np.inf], labels=['<5', '5-10', '>10'])  # Assuming Tenure column
# Subcategory flags (one-hot for BenefitSubType)
subcat_dummies = pd.get_dummies(merged['BenefitSubType'], prefix='subcat')
merged = pd.concat([merged, subcat_dummies], axis=1)

# Handle missing values
merged['UsageFrequency'].fillna(merged['UsageFrequency'].median(), inplace=True)  # Median impute
merged.dropna(subset=['Comments'], inplace=True)  # Drop missing Comments

# Validate
merged = merged[merged['UsageFrequency'] >= 0]  # No negatives
# Add more validations as needed, e.g., merged = merged[(merged['SatisfactionScore'] >=1) & (merged['SatisfactionScore'] <=5)]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged['UsageFrequency'].fillna(merged['UsageFrequency'].median(), inplace=True)  # Median impute


## Demographic Insights Discovery

In [51]:
import warnings
warnings.filterwarnings("ignore")

import plotly.express as px

# 1. Analyze benefit usage by demographics ( Age, Gender , Department, Tenure )
# We will group the data by given variables and compare their usagefrequencies
demographics_usage = merged.groupby(['age_group', 'Gender', 'Department', 'tenure_group'])['UsageFrequency'].mean().reset_index()

# 2. Identify preferences by BenefitSubType (e.g., IT’s preference for “Technology Stipend”).

# Compare subtypes across segments (e.g., by age_group, departments)
def construct_barplot(df, groupby_list, x, color):
    df_group = df.groupby(groupby_list)['UsageFrequency'].mean().reset_index()
    return px.bar(df_group, x=x, y='UsageFrequency', color=color, barmode='stack', title=f'Usage by {x} and {color}', hover_data=groupby_list), df_group

def compare_x_y_heatmap(df, groupby_list, x, y, color='Avg UsageFrequency'):
    subtype_df = df.groupby(groupby_list)['UsageFrequency'].mean().unstack().reset_index()
    return px.imshow(
                subtype_df.set_index(y),
                labels=dict(x=x, y=y, color="Avg UsageFrequency"),
                title=f'{x} Usage Heatmap by {y}',
                aspect="auto",
            ).show(), subtype_df.set_index(y)

In [83]:
#display(construct_barplot(merged, ['age_group', 'Gender', 'tenure_group'], 'age_group', 'Gender')[0])
construct_barplot(merged, ['age_group', 'Gender', 'Department'], 'Department', 'age_group')[1].sort_values('UsageFrequency')
#display(compare_x_y_heatmap(merged, ['age_group', 'Gender'], 'Gender', 'age_group')[0])

Unnamed: 0,age_group,Gender,Department,UsageFrequency
44,>45,Non-Binary,Sales,1.4
14,<30,Non-Binary,Sales,1.5
42,>45,Non-Binary,IT,1.823529
25,30-45,Non-Binary,Finance,2.478261
29,30-45,Non-Binary,Sales,2.714286
13,<30,Non-Binary,Marketing,2.75
40,>45,Non-Binary,Finance,2.777778
3,<30,Female,Marketing,2.952941
18,30-45,Female,Marketing,3.00974
4,<30,Female,Sales,3.036437


In [68]:
list_features = ['tenure_group', 'age_group', 'Gender', 'Department']
list_tables = []

for i in list_features:
    list_tables.append(compare_x_y_heatmap(merged, [i, 'BenefitSubType'], 'Benefit SubType', i)[1])


In [69]:
list_tables[3].T.describe()

Department,Finance,HR,IT,Marketing,Sales
count,30.0,30.0,30.0,30.0,30.0
mean,3.321507,3.442244,3.404948,3.271769,3.313496
std,0.42632,0.520321,0.398959,0.455593,0.467437
min,2.477612,2.529412,2.847222,2.126984,2.431034
25%,3.08234,3.141026,3.136995,2.92893,2.954082
50%,3.307179,3.335577,3.363338,3.274526,3.30188
75%,3.497807,3.792143,3.619676,3.586264,3.627213
max,4.12,4.464286,4.338462,4.253968,4.435484


In [67]:
list_tables[2].T.describe()

Gender,Female,Male,Non-Binary
count,30.0,30.0,30.0
mean,3.308746,3.407238,3.388661
std,0.37,0.281758,1.924253
min,2.65493,2.792683,0.75
25%,3.057734,3.197645,2.071429
50%,3.316464,3.457263,2.833333
75%,3.571387,3.556434,4.35
max,4.08284,4.081967,8.5


In [None]:
# 4. Document differences (e.g., Gen Z’s preference for “Gym Membership”).
patterns = """
- Based on specific age, younger on average are interested in subcategories related to Retirement Plan and Gym membership.
                        - age between 30-45 are most interested with Life Insurance	
                        -  
                        - younger have the most usage of benefits
- When it comes to tenure groups, tenure between 5-10 seems to be most active with usage of benefits
- Department that uses the most benefits is HR
- Low usage in 'Engineering' for 'Wellness' – potential for targeted promotion.
"""
print(patterns)



- Younger employees (<30) show higher usage in 'Fitness' subtypes.
- Females in 'HR' department have 1.5x engagement vs. males.
- Long-tenure (>10 years) prefer 'Retirement' benefits.
- Low usage in 'Engineering' for 'Wellness' – potential for targeted promotion.



In [72]:
benefits_data

Unnamed: 0,BenefitID,BenefitType,BenefitSubType,BenefitCost
0,1,Retirement Plan,401k Basic Matching,876.21
1,2,Health Insurance,PPO Individual,706.93
2,3,Commuter Benefits,Transit Subsidy,325.0
3,4,Retirement Plan,401k High Contribution,261.44
4,5,Technology Stipend,Monthly Internet Allowance,75.0
5,6,Retirement Plan,401k Standard Matching,598.44
6,7,Flexible Spending Account,Healthcare FSA,450.0
7,8,Wellness Programs,Premium Discount Tier 1,125.0
8,9,Professional Development,Conference Attendance,850.0
9,10,Childcare,On-Site Infant Care,915.48
