In [27]:
import pickle
import pandas as pd
from surprise.model_selection import train_test_split
from surprise import SVD, accuracy, Dataset, Reader

df = pd.read_csv('data/cleaned_data.csv') 
df.head()

Unnamed: 0,EmployeeID,BenefitID,SatisfactionScore,Comments,UsageFrequency,LastUsedDate,BenefitType,BenefitSubType,BenefitCost,Gender_Female,...,BenefitFlag_Retirement_Plan_401k_High_Contribution,BenefitFlag_Retirement_Plan_401k_Investment_Fees,BenefitFlag_Retirement_Plan_401k_Maximum_Matching,BenefitFlag_Retirement_Plan_401k_Standard_Matching,BenefitFlag_Technology_Stipend_Monthly_Internet_Allowance,BenefitFlag_Tuition_Reimbursement_Graduate_Degree,BenefitFlag_Tuition_Reimbursement_Individual_Courses,BenefitFlag_Tuition_Reimbursement_Professional_Certification,BenefitFlag_Tuition_Reimbursement_Undergraduate_Degree,BenefitFlag_Wellness_Programs_Premium_Discount_Tier_1
0,2540,3,4,Sufficient time off provided.,3,2023-10-20,Commuter Benefits,Transit Subsidy,325.0,False,...,False,False,False,False,False,False,False,False,False,False
1,3495,25,2,Coverage is minimal.,6,2023-10-06,Life Insurance,Supplemental Standard,774.91,False,...,False,False,False,False,False,False,False,False,False,False
2,403,18,4,Good plan with decent returns.,0,2023-11-20,Retirement Plan,401k Investment Fees,743.01,False,...,False,True,False,False,False,False,False,False,False,False
3,403,18,4,Good plan with decent returns.,5,2023-12-14,Retirement Plan,401k Investment Fees,743.01,False,...,False,True,False,False,False,False,False,False,False,False
4,1982,28,1,Disappointing service.,5,2024-07-02,Life Insurance,Dependent Coverage,165.54,False,...,False,False,False,False,False,False,False,False,False,False


In [28]:
import pandas as pd
import numpy as np

# Prepare data for analysis
# Combine Gender_Female, Gender_Male, Gender_Non-Binary into single column
df['Gender'] = df.apply(
    lambda row: 'Female' if row['Gender_Female'] else ('Male' if row['Gender_Male'] else ('Non-Binary' if row['Gender_Non-Binary'] else 'Unknown')),
    axis=1)

# Convert one-hot encoded generation columns to a single 'age_group' column
def get_age_group(row):
    if row['Age_Gen_Boomer']:
        return '58+'
    elif row['Age_Gen_Gen_X']:
        return '42-57'
    elif row['Age_Gen_Millenial']:
        return '26-41'
    elif row['Age_Gen_Gen_Z']:
        return '18-25'
    else:
        return 'Unknown'
    
df['age_group'] = df.apply(get_age_group, axis=1)

# Convert one-hot encoded tenure group columns to a single 'tenure_group' column
def get_tenure_group(row):
    if row['TenureGroups_ >25_years']:
        return '>25'
    elif row['TenureGroups_16-25_years']:
        return '16-25'
    elif row['TenureGroups_5-15_years']:
        return '5-15'
    elif row['TenureGroups_<5_years']:
        return '<5'
    else:
        return 'Unknown'

df['tenure_group'] = df.apply(get_tenure_group, axis=1)

# Convert one-hot encoded department columns to a single 'Department' column
def get_department(row):
    if row['Department_Finance']:
        return 'Finance'
    elif row['Department_HR']:
        return 'HR'
    elif row['Department_IT']:
        return 'IT'
    elif row['Department_Marketing']:
        return 'Marketing'
    elif row['Department_Sales']:
        return 'Sales'
    else:
        return 'Unknown'

df['Department'] = df.apply(get_department, axis=1)

## Recommender System

In [29]:

df['CombinedScore'] = df['SatisfactionScore'] * df['UsageFrequency'].replace(0.0,1)

user_item_matrix = df.pivot_table(index='EmployeeID', 
                                  columns='BenefitID', 
                                  values='CombinedScore',
                                  fill_value=0)

user_item_matrix

BenefitID,1,2,3,4,5,6,7,8,9,10,...,21,22,23,24,25,26,27,28,29,30
EmployeeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,25.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,...,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4996,0.0,5.0,0.0,1.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0
4999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Created a combined score using SatisfactionScore and UsageFrequency
df['CombinedScore'] = df['SatisfactionScore'] * df['UsageFrequency'].replace(0,1)

cols = ['EmployeeID', 'BenefitID', 'CombinedScore']
scores_df = df[cols].dropna()

reader = Reader(rating_scale=(scores_df['CombinedScore'].min(), scores_df['CombinedScore'].max()))
data = Dataset.load_from_df(scores_df[cols], reader=reader)
scores_df

Unnamed: 0,EmployeeID,BenefitID,CombinedScore
0,2540,3,12
1,3495,25,12
2,403,18,4
3,403,18,20
4,1982,28,5
...,...,...,...
9607,1122,25,6
9608,4809,22,20
9609,4603,3,14
9610,4603,3,6


In [31]:
X_train, X_test = train_test_split(data=data, test_size=0.2, random_state=42)

In [32]:
svd = SVD()
svd.fit(X_train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x14422e060>

In [33]:
preds = svd.test(X_test)
accuracy.rmse(predictions=preds)

RMSE: 11.3376


11.337599742833092

Make Recommendations

Evaluate offline

In [34]:
user_item_matrix.index

Index([   2,    3,    4,    5,    6,    8,   10,   12,   13,   15,
       ...
       4989, 4990, 4993, 4994, 4995, 4996, 4997, 4998, 4999, 5000],
      dtype='int64', name='EmployeeID', length=3951)

In [35]:
def obtain_all_predictions(svd: SVD, df) -> list:
    predictions = []
    benefits = df.columns.to_list()
    employees = df.index.to_list()
    for uid in employees:
        for iid in benefits:
            pred = svd.predict(uid, iid)
            predictions.append((pred[0], pred[1], pred[2], pred[3]))
    
    return predictions

predictions = obtain_all_predictions(svd=svd, df=user_item_matrix)

In [36]:
def get_top_n_benefits(predictions, n=10):
    top_n = {}
    for uid,iid, _, est in predictions:
        if uid not in top_n.keys():
            top_n[uid] = []
        top_n[uid].append((iid, est))

    for uid, user_score in top_n.items():
        user_score.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = [iid for (iid, _) in user_score[:n]]
    
    return top_n

In [37]:
def metrics_at_k(top_n, real_data: list, k=10):

    # Get real scores (relevant_items has real scores)
    relevant_items = {}
    for uid, iid, true_score in real_data:
        if uid not in relevant_items.keys():
                relevant_items[uid] = []   
        if true_score > 0  and iid not in relevant_items[uid]:
            relevant_items[uid].append(iid)

    precisions = []
    recalls = []
    avg_precisions = []

    for uid, pred_benef in top_n.items():
        true_items = {}
        if uid in relevant_items.keys():
            for elem in relevant_items[uid]:
                if uid not in true_items.keys():
                    true_items[uid] = []
                true_items[uid].append(elem)

            # Obtain Precisions
            pred_k = pred_benef[:k]
            num_hits = len(set(pred_k) & set(true_items[uid]))
            precisions.append(num_hits/k)

            # Obtain Recalls
            len_true_items = 0.0001
            if len(true_items) > 0:
                len_true_items = len(true_items)
            
            recalls.append(num_hits/len_true_items)

            # Obtain Avg Precisions
            score = 0.0
            hits = 0
            for i, item in enumerate(pred_k):
                if item in true_items:
                    hits += 1
                    score += hits/(i+1.0)
            avg_precisions.append(score/min(len_true_items, k))

    metrics = {
        'Precision@K': round(sum(precisions)/len(precisions), 2),
        'Recall@K': round(sum(recalls)/len(recalls), 2),
        'MAP@K': round(sum(avg_precisions)/len(avg_precisions), 2)
    }

    return metrics

In [38]:
real_data = list(scores_df.itertuples(index=False, name=None))
top_n = get_top_n_benefits(predictions=predictions, n=10)
metrics_at_k(top_n=top_n, real_data=real_data, k=5)

{'Precision@K': 0.11, 'Recall@K': 0.56, 'MAP@K': 0.0}

Save Model

In [None]:
model_pkl_file = "model/svd_model.pkl"  

with open(model_pkl_file, 'wb') as file:  
    pickle.dump(svd, file)

Online Evaluation A/B Testing

In order to evaluate if the recommender system works correctly doing an online evaluation, we propose the following A/B test:
(We assume that the employees have a webpage were they can see their list of benefits and give their feedback)

- Hipothesis:
    - Null Hypothesis: The recommender system has no impact on the employee engagement.
    - Alternative Hypothesis: The recommender system increases employee engagement.
- Gruoups (Randomly assigned so demographical aspects do not affect the results):
    - Group A (Treatment): These employees will receive personalaized recommendations. 
    - Group B (Control): These employees will continue observing the whole set of benefits with no personalized recommendations.
- Metrics:
    - Check if Group A have a higher or lower engagement.
    - Check if Group A has a higher mean UsageFrequency and SatisfactionScore.
- Time Frame:
    - It depends on the time that employees discover or engage with recommendations.

Suggest benefits based on peer usage or metadata

In [None]:
def get_predictions_by_group(df: pd.DataFrame, filter_by: dict):
    for filter in filter_by.keys():   
        if filter not in df.columns: 
            print(f"Column {filter} not in dataset.")
            return ValueError
        else:
            if filter_by[filter] not in df[filter].unique():
                print(f"Column {filter} does not have value {filter_by[filter]}.")
                return ValueError
            df_filtered = df[df[filter] == filter_by[filter]]

    df_filtered['CombinedScore'] = df_filtered['SatisfactionScore'] * df_filtered['UsageFrequency'].replace(0.0,1)

    cols = ['EmployeeID', 'BenefitID', 'CombinedScore']
    scores_df = df_filtered[cols].dropna()

    reader = Reader(rating_scale=(scores_df['CombinedScore'].min(), scores_df['CombinedScore'].max()))
    data = Dataset.load_from_df(scores_df[cols], reader=reader)

    data_matrix = scores_df.pivot_table(index='EmployeeID', 
                                    columns='BenefitID', 
                                    values='CombinedScore',
                                    fill_value=0)


    X_train, X_test = train_test_split(data=data, test_size=0.2, random_state=42)
    svd = SVD()
    svd.fit(X_train)
    predictions = obtain_all_predictions(svd=svd, df=data_matrix)

    top_n = get_top_n_benefits(predictions=predictions, n=10)
    
    return top_n


group_predictions = get_predictions_by_group(df=df, filter_by={'age_group': "18-25"})
group_predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['CombinedScore'] = df_filtered['SatisfactionScore'] * df_filtered['UsageFrequency'].replace(0.0,1)


{20: [2, 1, 18, 22, 26, 27, 25, 15, 14, 11],
 30: [26, 2, 7, 27, 18, 24, 12, 3, 23, 15],
 53: [22, 2, 1, 18, 15, 26, 14, 27, 25, 11],
 54: [11, 2, 22, 1, 27, 14, 18, 19, 26, 3],
 59: [18, 3, 27, 2, 9, 23, 15, 22, 25, 26],
 65: [4, 2, 1, 18, 12, 6, 21, 20, 7, 19],
 75: [22, 2, 18, 27, 26, 15, 3, 8, 1, 14],
 85: [2, 18, 1, 26, 22, 15, 12, 14, 3, 19],
 86: [1, 2, 23, 27, 15, 14, 25, 26, 22, 21],
 96: [1, 2, 11, 22, 18, 20, 27, 26, 14, 15],
 106: [10, 8, 26, 18, 7, 2, 24, 14, 30, 9],
 116: [20, 18, 14, 22, 19, 24, 9, 27, 1, 4],
 122: [2, 7, 1, 15, 22, 26, 25, 30, 18, 14],
 139: [25, 18, 26, 22, 2, 15, 3, 30, 20, 11],
 177: [18, 25, 26, 15, 20, 22, 30, 24, 11, 8],
 186: [16, 24, 2, 1, 7, 25, 27, 22, 23, 5],
 190: [2, 15, 1, 30, 11, 26, 25, 22, 18, 27],
 224: [2, 22, 21, 18, 26, 25, 8, 19, 15, 27],
 250: [2, 1, 26, 22, 12, 27, 18, 19, 14, 11],
 264: [2, 18, 1, 22, 26, 25, 21, 19, 20, 11],
 283: [2, 1, 18, 22, 26, 27, 25, 15, 14, 11],
 305: [2, 1, 7, 21, 23, 5, 19, 16, 9, 4],
 333: [2, 15, 26