In [1]:
import pandas as pd
import numpy as np

merged = pd.read_csv('data/cleaned_data.csv') 

# Prepare data for analysis
# Combine Gender_Female, Gender_Male, Gender_Non-Binary into single column
merged['Gender'] = merged.apply(
    lambda row: 'Female' if row['Gender_Female'] else ('Male' if row['Gender_Male'] else ('Non-Binary' if row['Gender_Non-Binary'] else 'Unknown')),
    axis=1)

# Convert one-hot encoded generation columns to a single 'age_group' column
def get_age_group(row):
    if row['Age_Gen_Boomer']:
        return '58+'
    elif row['Age_Gen_Gen_X']:
        return '42-57'
    elif row['Age_Gen_Millenial']:
        return '26-41'
    elif row['Age_Gen_Gen_Z']:
        return '18-25'
    else:
        return 'Unknown'
    
merged['age_group'] = merged.apply(get_age_group, axis=1)

# Convert one-hot encoded tenure group columns to a single 'tenure_group' column
def get_tenure_group(row):
    if row['TenureGroups_ >25_years']:
        return '>25'
    elif row['TenureGroups_16-25_years']:
        return '16-25'
    elif row['TenureGroups_5-15_years']:
        return '5-15'
    elif row['TenureGroups_<5_years']:
        return '<5'
    else:
        return 'Unknown'

merged['tenure_group'] = merged.apply(get_tenure_group, axis=1)

# Convert one-hot encoded department columns to a single 'Department' column
def get_department(row):
    if row['Department_Finance']:
        return 'Finance'
    elif row['Department_HR']:
        return 'HR'
    elif row['Department_IT']:
        return 'IT'
    elif row['Department_Marketing']:
        return 'Marketing'
    elif row['Department_Sales']:
        return 'Sales'
    else:
        return 'Unknown'

merged['Department'] = merged.apply(get_department, axis=1)

## Recommender System

In [2]:
# Build a user-item matrix (EmployeeID vs BenefitID/BenefitSubType with UsageFrequency)
user_item_matrix = merged.pivot_table(
    index='EmployeeID',
    columns='BenefitSubType',
    values='UsageFrequency',
    fill_value=0
)


# Identify similar users based on their demographic and benefit usage patterns
# Instead of UsageFrequency we will use our own metrics that takes into consideration satisfaction as well

# Calculate the normalized satisfaction score
merged['NormalizedSatisfaction'] = merged['SatisfactionScore'] / merged['SatisfactionScore'].max()
# Calculate the normalized usage frequency across employees
merged['NormalizedUsageFrequency'] = merged.groupby('EmployeeID')['UsageFrequency'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))

# Create a column composite rating that combines normalized satisfaction and usage frequency with different weights
alpha = 0.7  # Weight for normalized satisfaction
beta = 0.3   # Weight for usage frequency

merged['CompositeRating'] = (alpha * merged['NormalizedSatisfaction']) + (beta * merged['NormalizedUsageFrequency'])

# Create a user-item matrix with normalized satisfaction scores and normalizes usage frequency
user_item_matrix = merged.pivot_table(
    index='EmployeeID',
    columns='BenefitSubType',
    values='CompositeRating',
    fill_value=0
)
 
# Enrich the user-item matrix with demographic data
demographic_columns = ['Department', 'tenure_group', 'age_group', 'Gender']

# Enrich the user-item matrix with satisfaction data
satisfaction_columns = ['SatisfactionScore']

# Add demographic and satisfaction data to the user-item matrix
for col in demographic_columns + satisfaction_columns:
    user_item_matrix[col] = merged.groupby('EmployeeID')[col].first()


In [3]:
# Reset index to get EmployeeID back as a column
long_df = user_item_matrix.reset_index()

# Melt the benefit columns (filter out demographic/satisfaction)
benefit_cols = merged['BenefitSubType'].unique().tolist()

ratings_long = long_df.melt(
    id_vars='EmployeeID',
    value_vars=benefit_cols,
    var_name='BenefitSubType',
    value_name='CompositeRating'
)

# Optional: filter out zero ratings (i.e., never interacted)
ratings_long = ratings_long[ratings_long['CompositeRating'] > 0]


In [4]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Define reader
reader = Reader(rating_scale=(0, 1))

# Load dataset
data = Dataset.load_from_df(ratings_long[['EmployeeID', 'BenefitSubType', 'CompositeRating']], reader)

trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train model
model = SVD()
model.fit(trainset)

# Evaluate (optional)
predictions = model.test(testset)
print("RMSE:", accuracy.rmse(predictions))


RMSE: 0.2489
RMSE: 0.24886237599594557


In [5]:
predictions

[Prediction(uid=4618, iid='On-Site Infant Care', r_ui=0.5599999999999999, est=0.5202472162312671, details={'was_impossible': False}),
 Prediction(uid=328, iid='Graduate Degree', r_ui=0.48, est=0.7612605793854452, details={'was_impossible': False}),
 Prediction(uid=1165, iid='Undergraduate Degree', r_ui=0.45999999999999996, est=0.7191635271196065, details={'was_impossible': False}),
 Prediction(uid=1285, iid='Monthly Communications', r_ui=0.8599999999999999, est=0.5899718291807511, details={'was_impossible': False}),
 Prediction(uid=384, iid='401k High Contribution', r_ui=0.27999999999999997, est=0.46851303624020846, details={'was_impossible': False}),
 Prediction(uid=962, iid='Professional Certification', r_ui=0.42, est=0.5316373224457537, details={'was_impossible': False}),
 Prediction(uid=3514, iid='Premium Discount Tier 1', r_ui=1.0, est=0.5524745759937182, details={'was_impossible': False}),
 Prediction(uid=2850, iid='401k Investment Fees', r_ui=1.0, est=0.6512880983201151, details

In [None]:

# Normalize the data
user_item_matrix = user_item_matrix.div(user_item_matrix.sum(axis=1), axis=0)
# Convert the user-item matrix to a DataFrame
user_item_matrix = user_item_matrix.reset_index()

# Fill NaN values with 0
user_item_matrix.fillna(0, inplace=True)

# Calculate the cosine similarity between users
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(user_item_matrix.iloc[:, 1:])
cosine_sim_df = pd.DataFrame(cosine_sim, index=user_item_matrix['EmployeeID'], columns=user_item_matrix['EmployeeID'])

BenefitID,EmployeeID,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,2,0.0,0.0,0.0,0.0,0.000000,1.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0
1,3,0.0,0.0,0.0,0.0,0.181818,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.363636,0.000000,0.0,0.0,0.0,0.0,0.454545,0.0
2,4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,1.0,0.000000,0.0
3,5,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0
4,6,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.266667,...,0.0,0.0,0.000000,0.333333,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3946,4996,0.0,0.0,0.0,1.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0
3947,4997,0.0,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0
3948,4998,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,1.000000,0.0
3949,4999,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0


In [None]:
user_item_matrix

# Count values in each column that are not zero and row has more than 1 column breater than zero
benefit_counts = user_item_matrix.astype(bool).sum(axis=0)  
# Count values in each row that are not zero
user_counts = user_item_matrix.astype(bool).sum(axis=1)
# Filter benefits that are used by at least 10 users
benefits_used_by_at_least_10_users = benefit_counts[benefit_counts >= 10].index
# Filter users that have used at least 5 benefits
users_with_at_least_3_benefits = user_counts[user_counts >= 3].index

In [13]:
benefit_counts

BenefitID
1     189
2     170
3     199
4     209
5     185
6     203
7     200
8     203
9     206
10    161
11    183
12    200
13    175
14    183
15    168
16    164
17    185
18    183
19    184
20    238
21    188
22    192
23    182
24    207
25    194
26    202
27    173
28    194
29    161
30    207
dtype: int64