In [2]:
#los necessary libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go



In [3]:
pip install plotly


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.



##2: Exploratory Data Analysis

###2.1: Utilization Analysis



In [None]:
# Load the datasets
usage_data = pd.read_csv('/Users/iexposit/Downloads/EventCourseFile-1753377040-WA3546-CAP-WA3546-CAP-2 (2)/LabFiles/benefits-optimization/solutions/assets/data/usage_data.csv')
employee_data = pd.read_csv('/Users/iexposit/Downloads/EventCourseFile-1753377040-WA3546-CAP-WA3546-CAP-2 (2)/LabFiles/benefits-optimization/solutions/assets/data/employee_data.csv')
benefits_data = pd.read_csv('/Users/iexposit/Downloads/EventCourseFile-1753377040-WA3546-CAP-WA3546-CAP-2 (2)/LabFiles/benefits-optimization/solutions/assets/data/benefits_data.csv')
feedback_data = pd.read_csv('/Users/iexposit/Downloads/EventCourseFile-1753377040-WA3546-CAP-WA3546-CAP-2 (2)/LabFiles/benefits-optimization/solutions/assets/data/feedback_data.csv')

# Function to profile a dataframe
def profile_df(df, df_name):
    # Missing values
    missing = df.isnull().sum()
    missing_pct = (missing / len(df)) * 100
    
    # Duplicates
    duplicates = df.duplicated().sum()
    
    # Outliers (IQR for numerical columns)
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    outliers = {}
    for col in numerical_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        outlier_count = ((df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))).sum()
        outliers[col] = outlier_count
    
    # Summary table
    issues = pd.DataFrame({
        'Metric': ['Missing Values', 'Missing %', 'Duplicates', 'Outliers'],
    })
    for col in df.columns:
        issues[col] = [
            missing[col],
            missing_pct[col],
            '-' if col != 'Overall' else duplicates,  # Duplicates are overall
            outliers.get(col, '-')
        ]
    issues['Overall'] = [missing.sum(), missing_pct.mean(), duplicates, sum(outliers.values())]
    
    #print(f"Profiling for {df_name}:")
    #display(issues)
    
    return issues

# Profile each dataset
usage_issues = profile_df(usage_data, 'usage_data')
employee_issues = profile_df(employee_data, 'employee_data')
benefits_issues = profile_df(benefits_data, 'benefits_data')
feedback_issues = profile_df(feedback_data, 'feedback_data')

# Merge datasets (assuming common keys: EmployeeID and BenefitID)
merged = pd.merge(usage_data, employee_data, on='EmployeeID', how='inner')
merged = pd.merge(merged, benefits_data, on='BenefitID', how='inner')
merged = pd.merge(merged, feedback_data, on=['EmployeeID', 'BenefitID'], how='inner')

# Standardize datatypes
merged['LastUsedDate'] = pd.to_datetime(merged['LastUsedDate'], errors='coerce')  # Assuming column name
merged['Gender'] = merged['Gender'].astype('category')  # Assuming column
merged['Department'] = merged['Department'].astype('category')

# Feature engineering
merged['age_group'] = pd.cut(merged['Age'], bins=[0, 30, 45, np.inf], labels=['<30', '30-45', '>45'])
merged['tenure_group'] = pd.cut(merged['Tenure'], bins=[0, 5, 10, np.inf], labels=['<5', '5-10', '>10'])  # Assuming Tenure column
# Subcategory flags (one-hot for BenefitSubType)
subcat_dummies = pd.get_dummies(merged['BenefitSubType'], prefix='subcat')
merged = pd.concat([merged, subcat_dummies], axis=1)

# Handle missing values
merged['UsageFrequency'].fillna(merged['UsageFrequency'].median(), inplace=True)  # Median impute
merged.dropna(subset=['Comments'], inplace=True)  # Drop missing Comments

# Validate
merged = merged[merged['UsageFrequency'] >= 0]  # No negatives
# Add more validations as needed, e.g., merged = merged[(merged['SatisfactionScore'] >=1) & (merged['SatisfactionScore'] <=5)]


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [None]:
#Convert LastUsedDate to datetime

merged['LastUsedDate']=pd.to_datetime(merged['LastUsedDate'], errors='coerce')
merged['Month']=merged['LastUsedDate'].dt.to_period('M').astype(str)

#Recategorization of Engagement Level
def categorize_usage(freq):
    if freq>=7:
        return 'High'
    elif freq >=3:
        return 'Medium'
    else:
        return 'Low'

merged['EngagementLevel']=merged['UsageFrequency'].apply(categorize_usage)

#Usage Feequency per BenefitID
top_used=merged.groupby('BenefitID')['UsageFrequency'].sum().sort_values(ascending=False).reset_index()
fig1=px.bar(top_used,x='BenefitID',y='UsageFrequency', title='UsageFrequency per BenefitID')
fig1.show()


Usage across BenefitsIDs is generally balanced, with most ranging from 800 to 1100 uses. BenefitID4, ID21 and ID10 are top performers with more than 1200 uses respectively. BenefitID14 and ID18 have the lowest uptake with around 800 uses

In [22]:
#Monthly usage trend
monthly_usage=merged.groupby('Month')['UsageFrequency'].sum().reset_index()
fig2=px.line(monthly_usage,x='Month', y='UsageFrequency', title='Monthly Usage Trend')
fig2.show()

Over the past year, total monthly benefit use climbed from 2400 uses in August 2023 to a high of roughly 2900 in November, dipped to around 2,500 in january 2024, then rebounded and showed a secondary peak of about 3,00 in June before tapering off again in July. This suggests a modest seasonality with higher engagement in late fall and early summer, and a post-holiday lull at the start of each calendar year.

In [23]:
#Engagement Level Distribution
fig3=px.histogram(merged,x='EngagementLevel', title='Employee Engagement Levels')
fig3.show()

The vast majority of employees fall into the Low Engagement category with a smaller but still substantial group at Medium Engagement and the fewest in the High Engagement bracket. 

In [None]:
#Average Usage per Employee by Benefit Subtype
eng_sub=merged.groupby(['BenefitSubType','EngagementLevel'],as_index=False)['UsageFrequency'].sum()
fig4=px.bar(eng_sub,x='BenefitSubType',y='UsageFrequency',title='Avg Usage per Employee by Benefit Subtype')
fig4.show()

There is a moderate spread in usage across al benefits subtypes, with most falling bewteen roughly 900 to 1100 uses. The plot suggests that tuition assistance, commuting support and premium disounts are specially popular, whereas some child-care and basic retirement match options may need better incentive adjustments.

In [None]:
#Monthly Usage Heatmap for top 10 subTypes
usage_by_sub=(merged.groupby('BenefitSubType',as_index=False)['UsageFrequency'].sum().sort_values('UsageFrequency', ascending=False))
month_sub=(merged.groupby(['Month','BenefitSubType'],as_index=False)['UsageFrequency'].sum())
top10=usage_by_sub['BenefitSubType'].head(10).tolist()
hm_data=month_sub[month_sub['BenefitSubType'].isin(top10)]
fig5=px.density_heatmap(hm_data,x='Month',y='BenefitSubType',z='UsageFrequency',title='Monthly Usage Heatmap (Top 10 SubTypes)')
fig5.update_xaxes(dtick='M1', tickformat='%Y-%m')

Over the past year, "401k high Contribution" and "401k Investment Fees" spike sharply in December (year-end matching and investment deadlines), while "Undergraduate Degree", "Tier 3 Partners" and "PPO Premium Discount" show consistently high usage across most months. Mid year peaks are visible for "Supplemental Standard" and "Monthly Internet Allowance", whereas "Family Membership", "Dependent Coverage" and "Conference Attendance" maintain low levels without pronounced seasonal wings. This suggests that retirement and tuition related benefits have strong seasonal drivers, commuter and premium discount perks are in steady demand and some care oriented subtypes may need further promotion.