In [1]:
#load necessary libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go



In [2]:
pip install plotly


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.



##2: Exploratory Data Analysis

###2.1: Utilization Analysis



In [21]:
# Load the datasets
usage_data = pd.read_csv('/Users/inigo/Downloads/usage_data.csv')
employee_data = pd.read_csv('/Users/inigo/Downloads/employee_data.csv')
benefits_data = pd.read_csv('/Users/inigo/Downloads/benefits_data.csv')
feedback_data = pd.read_csv('/Users/inigo/Downloads/feedback_data.csv')

# Function to profile a dataframe
def profile_df(df, df_name):
    # Missing values
    missing = df.isnull().sum()
    missing_pct = (missing / len(df)) * 100
    
    # Duplicates
    duplicates = df.duplicated().sum()
    
    # Outliers (IQR for numerical columns)
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    outliers = {}
    for col in numerical_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        outlier_count = ((df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))).sum()
        outliers[col] = outlier_count
    
    # Summary table
    issues = pd.DataFrame({
        'Metric': ['Missing Values', 'Missing %', 'Duplicates', 'Outliers'],
    })
    for col in df.columns:
        issues[col] = [
            missing[col],
            missing_pct[col],
            '-' if col != 'Overall' else duplicates,  # Duplicates are overall
            outliers.get(col, '-')
        ]
    issues['Overall'] = [missing.sum(), missing_pct.mean(), duplicates, sum(outliers.values())]
    
    #print(f"Profiling for {df_name}:")
    #display(issues)
    
    return issues

# Profile each dataset
usage_issues = profile_df(usage_data, 'usage_data')
employee_issues = profile_df(employee_data, 'employee_data')
benefits_issues = profile_df(benefits_data, 'benefits_data')
feedback_issues = profile_df(feedback_data, 'feedback_data')

# Merge datasets (assuming common keys: EmployeeID and BenefitID)
merged = pd.merge(usage_data, employee_data, on='EmployeeID', how='inner')
merged = pd.merge(merged, benefits_data, on='BenefitID', how='inner')
merged = pd.merge(merged, feedback_data, on=['EmployeeID', 'BenefitID'], how='inner')

# Standardize datatypes
merged['LastUsedDate'] = pd.to_datetime(merged['LastUsedDate'], errors='coerce')  # Assuming column name
merged['Gender'] = merged['Gender'].astype('category')  # Assuming column
merged['Department'] = merged['Department'].astype('category')

# Feature engineering
merged['age_group'] = pd.cut(merged['Age'], bins=[0, 30, 45, np.inf], labels=['<30', '30-45', '>45'])
merged['tenure_group'] = pd.cut(merged['Tenure'], bins=[0, 5, 10, np.inf], labels=['<5', '5-10', '>10'])  # Assuming Tenure column
# Subcategory flags (one-hot for BenefitSubType)
subcat_dummies = pd.get_dummies(merged['BenefitSubType'], prefix='subcat')
merged = pd.concat([merged, subcat_dummies], axis=1)

# Handle missing values
merged['UsageFrequency'].fillna(merged['UsageFrequency'].median(), inplace=True)  # Median impute
merged.dropna(subset=['Comments'], inplace=True)  # Drop missing Comments

# Validate
merged = merged[merged['UsageFrequency'] >= 0]  # No negatives
# Add more validations as needed, e.g., merged = merged[(merged['SatisfactionScore'] >=1) & (merged['SatisfactionScore'] <=5)]


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [23]:
#Convert LastUsedDate to datetime

merged['LastUsedDate']=pd.to_datetime(merged['LastUsedDate'], errors='coerce')
merged['Month']=merged['LastUsedDate'].dt.to_period('M').astype(str)
merged['Month'] = merged['LastUsedDate'].dt.to_period('M').dt.to_timestamp()


#Recategorization of Engagement Level
def categorize_usage(freq):
    if freq>=7:
        return 'High'
    elif freq >=3:
        return 'Medium'
    else:
        return 'Low'

merged['EngagementLevel']=merged['UsageFrequency'].apply(categorize_usage)

In [None]:
#Section 1: Engagement Level Distribution
eng_counts = merged['EngagementLevel'].value_counts().sort_index().reset_index()
eng_counts.columns = ['EngagementLevel', 'Count']
fig1 = px.bar(
    eng_counts,
    x='EngagementLevel',
    y='Count',
    title='Employee Engagement Levels',
    category_orders={'EngagementLevel': ['Low', 'Medium', 'High']},
    labels={'Count': 'Number of Records'},
)
fig1.show()

In [None]:
#Section 2: Benefits by Usage
benefit_freq=merged.groupby('BenefitID')['UsageFrequency'].sum().sort_values(ascending=False).reset_index()
fig1=px.bar(benefit_freq,x='BenefitID',y='UsageFrequency', title='UsageFrequency per BenefitID')
fig1.show()

In [None]:
#Section 3: Usage by Benefit SubType
sub_totals = (
    merged.groupby('BenefitSubType')['UsageFrequency']
          .sum()
          .reset_index()
          .sort_values('UsageFrequency', ascending=False)
)
fig4 = px.bar(
    sub_totals, x='BenefitSubType', y='UsageFrequency',
    title='Usage by Benefit SubType',
    labels={'UsageFrequency':'Total Usage Frequency','BenefitSubType':'SubType'}
)
fig4.update_layout(xaxis_tickangle=-45)
fig4.show()


In [None]:
#Section 4: Monthly usage trend
monthly_usage=merged.groupby('Month')['UsageFrequency'].sum().reset_index()
fig2=px.line(monthly_usage,x='Month', y='UsageFrequency', title='Monthly Usage Trend')
fig2.show()

In [None]:
# Section 5. Department-level heatmap for 2024 usage
dept_month = (
    merged[merged['LastUsedDate'].dt.year == 2024]
          .groupby([merged['LastUsedDate'].dt.to_period('M'),'Department'])['UsageFrequency']
          .sum().reset_index()
)
dept_month['Month'] = dept_month['LastUsedDate'].dt.to_timestamp()
fig5 = px.density_heatmap(
    dept_month, x='Month', y='Department', z='UsageFrequency',
    title='Department-Level Monthly Usage Heatmap'
)
fig5.show()





In [None]:
#Section 6: Monthly Usage Heatmap for top 10 subTypes
usage_by_sub=(merged.groupby('BenefitSubType',as_index=False)['UsageFrequency'].sum().sort_values('UsageFrequency', ascending=False))
month_sub=(merged.groupby(['Month','BenefitSubType'],as_index=False)['UsageFrequency'].sum())
top10=usage_by_sub['BenefitSubType'].head(10).tolist()
hm_data=month_sub[month_sub['BenefitSubType'].isin(top10)]
fig5=px.density_heatmap(hm_data,x='Month',y='BenefitSubType',z='UsageFrequency',title='Monthly Usage Heatmap (Top 10 SubTypes)')
fig5.update_xaxes(dtick='M1', tickformat='%Y-%m')

Observations:

-Most employees fall into the Low engagement bucket (about 4,500), with far fewer in Medium (around 3,200) and High (about 1,900).

-Usage per BenefitID generally sits in the 900–1,400 range, with noticeable peaks at IDs 4, 20, and 26, plus a few that see under 1,000 uses.

-A few subtypes—especially 401k High Contribution and Undergraduate Degree—account for the bulk of activity, while family-care options lag behind.

-Tuition assistance, commuting support, and premium discounts are particularly popular, whereas some child-care programs and basic retirement matches may need stronger incentives.

-There’s a clear seasonal cycle: usage spikes in October–November and May–July, dips in January, and shows an additional peak around June.

-Departments behave differently: IT leads in usage, HR trails significantly, and Sales/Finance sit in the middle but dip during the summer months.

-The top subtypes display strong seasonal peaks (for example, 401k fees/contributions), while other popular subtypes maintain steadier, lower‐level usage throughout the year.

Recommendations:

-Help Low Users

Send simple “How to use your benefits” reminders and offer small rewards (like points or badges) to encourage more use.

-Focus on Winners

Find out why BenefitIDs 4, 20, and 26 are so popular and apply those best practices elsewhere. Survey employees on the least-used benefits and remove any hurdles.

-Boost Care Options

Roll out family-friendly campaigns—emails, posters, or webinars—to promote child-care benefits alongside your most-used programs.

-Plan by Season

Launch major benefit announcements 4–6 weeks before the October and May usage peaks. Run a “New Year, New Benefits” push in December to smooth out the January dip.

-Leverage High-Use Teams

Invite IT team members to share tips with HR and Sales. Create short, department-specific guides and encourage managers to highlight benefits in team meetings.

-Match Promotions to Demand

Schedule subtype-specific workshops when usage usually peaks. Run small mid-year challenges to keep momentum and cross-promote subtypes that stay steady throughout the year.