# Clean Data

Imports

In [79]:
import numpy as np
import pandas as pd

Load Datasets

In [80]:
def load_dataset(path: str, file_name: str) -> pd.DataFrame:
    """Read a CSV file from a directory into a DataFrame.

    Args:
        path: Directory that contains the file, with or without a trailing
            path separator.
        file_name: Name of the CSV file inside ``path``.

    Returns:
        The parsed contents of the CSV file.
    """
    import os  # local import keeps this cell self-contained

    # os.path.join only inserts a separator when one is missing, so both
    # 'data/' and 'data' resolve to the same file (plain concatenation
    # silently produced 'dataemployee_data.csv' for the latter).
    return pd.read_csv(os.path.join(path, file_name))

In [81]:
# Directory holding the raw CSV files; also reused later as the output location
# for the cleaned dataset.
path = 'data/'
# One DataFrame per source table.
employee_df = load_dataset(path=path, file_name='employee_data.csv')
benefits_df = load_dataset(path=path, file_name='benefits_data.csv')
feedback_df = load_dataset(path=path, file_name='feedback_data.csv')
usage_df = load_dataset(path=path, file_name='usage_data.csv')

In [82]:
# Schema check: column dtypes and non-null counts of the employee table
employee_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   EmployeeID  5000 non-null   int64 
 1   Age         5000 non-null   int64 
 2   Gender      5000 non-null   object
 3   Department  5000 non-null   object
 4   Tenure      5000 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 195.4+ KB


In [83]:
# Schema check: column dtypes and non-null counts of the benefits table
benefits_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   BenefitID       30 non-null     int64  
 1   BenefitType     30 non-null     object 
 2   BenefitSubType  30 non-null     object 
 3   BenefitCost     30 non-null     float64
dtypes: float64(1), int64(1), object(2)
memory usage: 1.1+ KB


In [84]:
# Schema check: column dtypes and non-null counts of the feedback table
feedback_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   EmployeeID         30000 non-null  int64 
 1   BenefitID          30000 non-null  int64 
 2   SatisfactionScore  30000 non-null  int64 
 3   Comments           30000 non-null  object
dtypes: int64(3), object(1)
memory usage: 937.6+ KB


In [85]:
# Schema check: LastUsedDate is still a plain object (string) column at this point
usage_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   EmployeeID      50000 non-null  int64 
 1   BenefitID       50000 non-null  int64 
 2   UsageFrequency  50000 non-null  int64 
 3   LastUsedDate    50000 non-null  object
dtypes: int64(3), object(1)
memory usage: 1.5+ MB


In [86]:
# Parse LastUsedDate from string to datetime so that date comparisons and the
# `.dt` accessor work in the later validation steps.
usage_df['LastUsedDate'] = pd.to_datetime(usage_df['LastUsedDate'])

In [87]:
# Confirm that LastUsedDate is now a datetime column
usage_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   EmployeeID      50000 non-null  int64         
 1   BenefitID       50000 non-null  int64         
 2   UsageFrequency  50000 non-null  int64         
 3   LastUsedDate    50000 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(3)
memory usage: 1.5 MB


Check for missing data

In [88]:
def check_missing_values(df: pd.DataFrame, df_name: str) -> None:
    """Print a per-column report of null values for a DataFrame.

    Args:
        df: Frame to inspect.
        df_name: Human-readable name used in the printed header.

    Prints a single "no null values" line when the frame is complete,
    otherwise one line for every column that contains at least one null.
    """
    print(f"\nCheck missing values for the {df_name} dataframe:")

    null_counts = df.isnull().sum()

    # Guard clause: nothing to report when the whole frame is complete.
    if null_counts.sum() == 0:
        print("The DataFrame has no null values.")
        return

    for col in df.columns:
        n_null = null_counts[col]
        if n_null == 0:
            continue
        print(f"Column {col} has {n_null.sum()} null values.")



In [89]:
# Run the null report on each of the four raw tables
check_missing_values(df=employee_df, df_name='employee')
check_missing_values(df=benefits_df, df_name='benefits')
check_missing_values(df=feedback_df, df_name='feedback')
check_missing_values(df=usage_df, df_name='usage')


Check missing values for the employee dataframe:
The DataFrame has no null values.

Check missing values for the benefits dataframe:
The DataFrame has no null values.

Check missing values for the feedback dataframe:
The DataFrame has no null values.

Check missing values for the usage dataframe:
The DataFrame has no null values.


Drop duplicates

In [90]:
def remove_duplicates(df: pd.DataFrame, df_name: str) -> pd.DataFrame:
    """Return a copy of ``df`` without fully duplicated rows and report the count.

    Args:
        df: Frame to de-duplicate. It is NOT modified; use the return value.
        df_name: Human-readable name used in the printed message.

    Returns:
        A new DataFrame containing the first occurrence of every distinct row.
        The original index labels are kept (the index is not reset).
    """
    # Work on a new frame instead of `inplace=True`, so re-running the cell or
    # holding another reference to `df` never sees a silently shrunken table.
    deduplicated = df.drop_duplicates()
    n_dropped = len(df) - len(deduplicated)

    if n_dropped == 0:
        print(f"The {df_name} dataframe has no duplicates.")
    else:
        print(f"{n_dropped} rows were dropped from the {df_name} dataset.")

    return deduplicated

In [91]:
# Example duplicate:
#   feedback_df[(feedback_df['EmployeeID'] == 3771) & (feedback_df['BenefitID'] == 20)]
# Each table is replaced by the frame returned from remove_duplicates.
employee_df = remove_duplicates(df=employee_df, df_name='employee')
benefits_df = remove_duplicates(df=benefits_df, df_name='benefits')
feedback_df = remove_duplicates(df=feedback_df, df_name='feedback')
usage_df = remove_duplicates(df=usage_df, df_name='usage')

The employee dataframe has no dupliclates.
The benefits dataframe has no dupliclates.
624 rows were dropped from the feedback dataset.
2 rows were dropped from the usage dataset.


Outliers

In [92]:
def get_outliers(df: pd.DataFrame, df_name: str, remove: bool = False) -> pd.DataFrame:
    """Report, and optionally drop, IQR outliers in every numeric column.

    A value is an outlier when it lies on or beyond the fences
    ``Q1 - 1.5*IQR`` / ``Q3 + 1.5*IQR``. The fences are inclusive, so a value
    exactly equal to a fence is counted.

    Args:
        df: Frame to inspect. It is NOT modified; use the return value.
        df_name: Human-readable name used in the printed header.
        remove: When True, rows flagged as outliers are dropped. Columns are
            processed in order, so the quartiles of later columns are computed
            on the rows that survived the earlier ones.

    Returns:
        ``df`` itself when ``remove`` is False, otherwise a filtered frame that
        keeps the original index labels.
    """
    print(f"\nCheck outliers for the {df_name} dataframe:")

    # Only numeric columns have meaningful quartiles
    numerical_cols = df.select_dtypes(include=np.number).columns

    for col in numerical_cols:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr

        if iqr == 0:
            # Degenerate spread: both fences collapse onto the quartile value,
            # so inclusive fences would flag every typical row. Only values
            # strictly outside count as outliers here.
            is_outlier = (values > upper) | (values < lower)
        else:
            is_outlier = (values >= upper) | (values <= lower)

        print(f"Column {col} has {int(is_outlier.sum())} outliers.")

        if remove:
            # Filter with the boolean mask rather than dropping np.where()
            # positions as index labels: positions and labels differ as soon as
            # the index is not 0..n-1 (e.g. after drop_duplicates).
            df = df.loc[~is_outlier]

    return df

In [93]:
# `remove` is left at its default (False), so these calls only print the
# outlier counts and hand each frame back.
employee_df = get_outliers(df=employee_df, df_name='employee')
benefits_df = get_outliers(df=benefits_df, df_name='benefits')
feedback_df = get_outliers(df=feedback_df, df_name='feedback')
usage_df = get_outliers(df=usage_df, df_name='usage')


Check outliers for the employee dataframe:
Column EmployeeID has 0 outliers.
Column Age has 0 outliers.
Column Tenure has 21 outliers.

Check outliers for the benefits dataframe:
Column BenefitID has 0 outliers.
Column BenefitCost has 0 outliers.

Check outliers for the feedback dataframe:
Column EmployeeID has 0 outliers.
Column BenefitID has 0 outliers.
Column SatisfactionScore has 0 outliers.

Check outliers for the usage dataframe:
Column EmployeeID has 0 outliers.
Column BenefitID has 0 outliers.
Column UsageFrequency has 0 outliers.


In [94]:
# Locate the Tenure outliers with the same inclusive 1.5*IQR fences used by
# get_outliers, instead of hard-coding row positions that go stale as soon as
# the input data changes.
tenure_q1 = employee_df['Tenure'].quantile(0.25)
tenure_q3 = employee_df['Tenure'].quantile(0.75)
tenure_iqr = tenure_q3 - tenure_q1
tenure_is_outlier = (
    (employee_df['Tenure'] >= tenure_q3 + 1.5 * tenure_iqr)
    | (employee_df['Tenure'] <= tenure_q1 - 1.5 * tenure_iqr)
)
# Row positions (not labels), to match the .iloc lookup below
index_outliers = np.where(tenure_is_outlier)[0].tolist()

employee_df.iloc[index_outliers]

Unnamed: 0,EmployeeID,Age,Gender,Department,Tenure
119,120,62,Male,Finance,40
355,356,63,Male,Finance,40
404,405,62,Male,HR,40
609,610,62,Female,Marketing,40
633,634,60,Female,IT,40
885,886,58,Male,Finance,40
1607,1608,62,Male,IT,40
1771,1772,61,Male,HR,40
2099,2100,60,Male,HR,40
2820,2821,63,Male,Finance,40


Define validation rules

In [95]:
def get_unique_values(df: pd.DataFrame, df_name: str) -> None:
    """Print the sorted distinct values of every column whose name lacks 'ID'.

    Args:
        df: Frame to inspect.
        df_name: Human-readable name used in the printed header.
    """
    print(f"\n{df_name} dataframe: ")
    for column in df.columns:
        # Identifier columns are skipped: listing every ID is not informative.
        if 'ID' in column:
            continue
        distinct = df[column].unique().tolist()
        print(f"\t- {column}: {np.sort(distinct)}")


In [96]:
# List the distinct values of each non-ID employee column to spot invalid entries
get_unique_values(df=employee_df, df_name='employee')


employee dataframe: 
	- Age: [22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65]
	- Gender: ['Female' 'Male' 'Non-Binary']
	- Department: ['Finance' 'HR' 'IT' 'Marketing' 'Sales']
	- Tenure: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40]


In [97]:
# List the distinct values of each non-ID benefits column to spot invalid entries
get_unique_values(df=benefits_df, df_name='benefits')


benefits dataframe: 
	- BenefitType: ['Cell Phone Allowance' 'Childcare' 'Commuter Benefits'
 'Flexible Spending Account' 'Gym Membership' 'Health Insurance'
 'Life Insurance' 'Professional Development' 'Retirement Plan'
 'Technology Stipend' 'Tuition Reimbursement' 'Wellness Programs']
	- BenefitSubType: ['401k Basic Matching' '401k Catch-Up Contributions'
 '401k High Contribution' '401k Investment Fees' '401k Maximum Matching'
 '401k Standard Matching' 'After-School Care' 'Basic Coverage'
 'Conference Attendance' 'Dependent Coverage' 'Family Membership'
 'Graduate Degree' 'HDHP Individual' 'HMO Family' 'Healthcare FSA'
 'Individual Courses' 'Monthly Communications'
 'Monthly Internet Allowance' 'On-Site Infant Care' 'PPO Family'
 'PPO Individual' 'Premium Discount Tier 1' 'Professional Certification'
 'Supplemental High Amount' 'Supplemental Standard' 'Tier 1 Partners'
 'Tier 2 Partners' 'Tier 3 Partners' 'Transit Subsidy'
 'Undergraduate Degree']
	- BenefitCost: [ 65.    73.36  75.

In [98]:
# List the distinct values of each non-ID feedback column to spot invalid entries
get_unique_values(df=feedback_df, df_name='feedback')


feedback dataframe: 
	- SatisfactionScore: [1 2 3 4 5]
	- Comments: ['Average amount of vacation days.' 'Average, but necessary.'
 'Barely any time off given.' 'Complicated reimbursement process.'
 'Could use more vacation time.' 'Coverage is minimal.'
 'Difficult to find suitable care.' 'Disappointing service.'
 'Excellent coverage and affordable.' 'Fantastic benefit! I use it daily.'
 'Generous vacation days, very happy.' 'Good plan with decent returns.'
 'Good quality care available.' 'Good value, I enjoy the variety.'
 'Good, but sometimes difficult.' 'Great coverage, but a bit expensive.'
 'Great for further studies.' 'Great peace of mind for my family.'
 'Helps a bit with childcare costs.' 'Helps but limited in scope.'
 'Inconvenient and inefficient.' 'Incredible support for education.'
 "It's okay, I use it occasionally." 'Limited availability.'
 'Membership is not very useful.' 'Not happy with the coverage options.'
 'Not many locations available.' 'Not much flexibility offere

In [99]:
# List the distinct values of each non-ID usage column to spot invalid entries
get_unique_values(df=usage_df, df_name='usage')


usage dataframe: 
	- UsageFrequency: [ 0  1  2  3  4  5  6  7  8  9 10]
	- LastUsedDate: [Timestamp('2023-08-05 00:00:00') Timestamp('2023-08-06 00:00:00')
 Timestamp('2023-08-07 00:00:00') Timestamp('2023-08-08 00:00:00')
 Timestamp('2023-08-09 00:00:00') Timestamp('2023-08-10 00:00:00')
 Timestamp('2023-08-11 00:00:00') Timestamp('2023-08-12 00:00:00')
 Timestamp('2023-08-13 00:00:00') Timestamp('2023-08-14 00:00:00')
 Timestamp('2023-08-15 00:00:00') Timestamp('2023-08-16 00:00:00')
 Timestamp('2023-08-17 00:00:00') Timestamp('2023-08-18 00:00:00')
 Timestamp('2023-08-19 00:00:00') Timestamp('2023-08-20 00:00:00')
 Timestamp('2023-08-21 00:00:00') Timestamp('2023-08-22 00:00:00')
 Timestamp('2023-08-23 00:00:00') Timestamp('2023-08-24 00:00:00')
 Timestamp('2023-08-25 00:00:00') Timestamp('2023-08-26 00:00:00')
 Timestamp('2023-08-27 00:00:00') Timestamp('2023-08-28 00:00:00')
 Timestamp('2023-08-29 00:00:00') Timestamp('2023-08-30 00:00:00')
 Timestamp('2023-08-31 00:00:00') Times

In [100]:
# Check that Age - Tenure (the implied age when the employee joined) is plausible.
# NOTE(review): 16 is presumably the minimum acceptable starting age — confirm.

employee_df[(employee_df['Age'] - employee_df['Tenure']) < 16] # Empty result => all the values are valid

Unnamed: 0,EmployeeID,Age,Gender,Department,Tenure


In [101]:
# Date range covered by the usage table, via the Series' own reductions
print(f"Min Date: {usage_df['LastUsedDate'].min()}")
print(f"Max Date: {usage_df['LastUsedDate'].max()}")

Min Date: 2023-08-05 00:00:00
Max Date: 2024-08-04 00:00:00


## SUMMARY

- The column types were all correct except for the column 'LastUsedDate' in the usage dataset, where we had to convert it from string/object to DateTime.
- For these datasets we did not have to use any technique for imputing missing values, as there was no missing data. 
- The employee and the benefits dataframes did not have any duplicates. On the other hand, we removed duplicates from the feedback and usage dataframes.
- The only column where we found outliers was 'Tenure' in the employee dataframe: they are the few employees who have been working at the company for 40 years. In this case, we have decided not to remove these outliers.

# Data Integration and Preparation

Merge datasets

In [102]:
# The inner join keeps only the EmployeeID/BenefitID pairs present in BOTH the
# feedback and usage tables; the two left joins then attach the employee and
# benefit attributes to every remaining row.
merged_df = (
    feedback_df
    .merge(usage_df, on=['EmployeeID', 'BenefitID'], how='inner')
    .merge(employee_df, on='EmployeeID', how='left')
    .merge(benefits_df, on='BenefitID', how='left')
)
# Fixed seed so the preview is identical after Restart & Run All
merged_df.sample(10, random_state=42)

Unnamed: 0,EmployeeID,BenefitID,SatisfactionScore,Comments,UsageFrequency,LastUsedDate,Age,Gender,Department,Tenure,BenefitType,BenefitSubType,BenefitCost
7689,796,5,1,Inconvenient and inefficient.,5,2024-06-11,31,Male,HR,12,Technology Stipend,Monthly Internet Allowance,75.0
9498,3398,15,3,Helps but limited in scope.,0,2023-10-31,23,Female,IT,2,Tuition Reimbursement,Professional Certification,359.21
2874,4728,21,3,"Average, but necessary.",5,2024-03-01,28,Female,Marketing,1,Life Insurance,Basic Coverage,163.92
7010,2108,25,4,"Solid plan, happy with it.",0,2024-02-09,49,Female,HR,29,Life Insurance,Supplemental Standard,774.91
6104,237,26,1,Membership is not very useful.,6,2024-01-16,48,Male,Sales,2,Gym Membership,Family Membership,519.66
455,2979,4,5,Top-notch retirement plan.,8,2023-09-20,27,Non-Binary,Marketing,3,Retirement Plan,401k High Contribution,261.44
2981,3977,20,3,Helps but limited in scope.,2,2024-07-08,22,Female,Sales,2,Tuition Reimbursement,Undergraduate Degree,489.96
7980,528,8,2,Could use more vacation time.,1,2024-05-16,27,Female,Marketing,4,Wellness Programs,Premium Discount Tier 1,125.0
6137,2878,1,3,"Standard, nothing exceptional.",9,2023-12-01,33,Male,IT,5,Retirement Plan,401k Basic Matching,876.21
561,3681,12,2,Complicated reimbursement process.,5,2023-12-17,39,Male,Marketing,18,Tuition Reimbursement,Graduate Degree,824.53


Handle missing values

In [103]:
# Left joins leave nulls where a key has no match; check the non-null counts
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9612 entries, 0 to 9611
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   EmployeeID         9612 non-null   int64         
 1   BenefitID          9612 non-null   int64         
 2   SatisfactionScore  9612 non-null   int64         
 3   Comments           9612 non-null   object        
 4   UsageFrequency     9612 non-null   int64         
 5   LastUsedDate       9612 non-null   datetime64[ns]
 6   Age                9612 non-null   int64         
 7   Gender             9612 non-null   object        
 8   Department         9612 non-null   object        
 9   Tenure             9612 non-null   int64         
 10  BenefitType        9612 non-null   object        
 11  BenefitSubType     9612 non-null   object        
 12  BenefitCost        9612 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(6), object(5)
memory us

In [104]:
## NOT NECESSARY for this dataset
def input_values_median(df: pd.DataFrame, column_name: str, columun_partition: str) -> pd.DataFrame:
    """Fill missing values of a column with the median of the row's group.

    Args:
        df: Frame to impute. It is NOT modified; use the return value.
        column_name: Numeric column whose nulls are filled.
        columun_partition: Column that defines the groups; each null is
            replaced by the median of ``column_name`` within the same group.

    Returns:
        A copy of ``df`` with ``column_name`` imputed. A value stays null when
        its whole group has no non-null value (or its group key is null).

    Raises:
        KeyError: If ``column_name`` is not a column of ``df``.
    """
    if column_name not in df.columns:
        # Previously the NameError class was *returned*, which only surfaced
        # later as a confusing TypeError in the caller.
        raise KeyError(f"Column '{column_name}' not found in the DataFrame.")

    # Per-row median of the row's own group, aligned with df's index
    group_medians = df.groupby(columun_partition)[column_name].transform('median')

    imputed = df.copy()
    imputed[column_name] = imputed[column_name].fillna(group_medians)
    return imputed

In [105]:
# Fill missing SatisfactionScore values with the median score of the same
# BenefitID, then cast the column back to int.
merged_df = input_values_median(df=merged_df, column_name='SatisfactionScore', columun_partition='BenefitID')
merged_df['SatisfactionScore'] = merged_df['SatisfactionScore'].astype('int')

In [106]:
# Fill missing UsageFrequency values with the median frequency of the same
# BenefitID, then cast the column back to int.
merged_df = input_values_median(df=merged_df, column_name='UsageFrequency', columun_partition='BenefitID')
merged_df['UsageFrequency'] = merged_df['UsageFrequency'].astype('int')

In [107]:
# Re-check dtypes and non-null counts after the imputation step
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9612 entries, 0 to 9611
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   EmployeeID         9612 non-null   int64         
 1   BenefitID          9612 non-null   int64         
 2   SatisfactionScore  9612 non-null   int64         
 3   Comments           9612 non-null   object        
 4   UsageFrequency     9612 non-null   int64         
 5   LastUsedDate       9612 non-null   datetime64[ns]
 6   Age                9612 non-null   int64         
 7   Gender             9612 non-null   object        
 8   Department         9612 non-null   object        
 9   Tenure             9612 non-null   int64         
 10  BenefitType        9612 non-null   object        
 11  BenefitSubType     9612 non-null   object        
 12  BenefitCost        9612 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(6), object(5)
memory us

Create derived fields

In [108]:
# Ages in this dataset are between 22 and 65
def age_to_gen(age: int) -> str:
    """Map an age in years to a generation label.

    Bands (inclusive): 18-25 "Gen_Z", 26-41 "Millenial", 42-57 "Gen_X",
    58 and above "Boomer". Any other value (e.g. below 18) yields None.

    The label spellings are kept exactly as they are because they become
    column names once the result is one-hot encoded.
    """
    if 18 <= age <= 25:
        return "Gen_Z"
    if 26 <= age <= 41:
        return "Millenial"
    if 42 <= age <= 57:
        return "Gen_X"
    if age >= 58:
        return "Boomer"
    # No band matched
    return None

In [109]:
# Replace the raw Age column with its generation label
merged_df = (
    merged_df
    .assign(Age_Gen=merged_df['Age'].apply(age_to_gen))
    .drop(columns=['Age'])
)

In [110]:
def tenure_to_category(tenure: int) -> str:
    """Map a tenure in years to a tenure-band label.

    Bands: below 5 "<5_years", 5-15 "5-15_years", 16-25 "16-25_years",
    above 25 ">25_years" (inner bounds inclusive). A value that falls in none
    of the bands (e.g. a non-integer such as 15.5, or NaN) yields None.
    """
    if tenure < 5:
        return "<5_years"
    if 5 <= tenure <= 15:
        return "5-15_years"
    if 16 <= tenure <= 25:
        return "16-25_years"
    if tenure > 25:
        # Was " >25_years": the stray leading space leaked into the one-hot
        # column name ("TenureGroups_ >25_years") and was inconsistent with
        # the other labels.
        return ">25_years"
    return None

In [111]:
# Replace the raw Tenure column with its tenure band
merged_df = (
    merged_df
    .assign(TenureGroups=merged_df['Tenure'].apply(tenure_to_category))
    .drop(columns=['Tenure'])
)

Standardize data types

In [112]:
# Single identifier per benefit: "<BenefitType> <BenefitSubType>" with every
# space turned into an underscore.
merged_df['BenefitFlag'] = (
    (merged_df['BenefitType'] + ' ' + merged_df['BenefitSubType'])
    .str.replace(' ', '_')
)

# Left disabled, so BenefitType and BenefitSubType remain in merged_df:
#merged_df = merged_df.drop(['BenefitType', 'BenefitSubType'], axis=1)

In [113]:
# Categorical columns to one-hot encode; each is replaced by one indicator
# column per category, named "<column>_<category>".
cols = ['Gender', 'Department', 'Age_Gen', 'TenureGroups', 'BenefitFlag']
merged_df = pd.get_dummies(merged_df, columns=cols)

# Fixed seed so the preview is identical after Restart & Run All
merged_df.sample(5, random_state=42)


Unnamed: 0,EmployeeID,BenefitID,SatisfactionScore,Comments,UsageFrequency,LastUsedDate,BenefitType,BenefitSubType,BenefitCost,Gender_Female,...,BenefitFlag_Retirement_Plan_401k_High_Contribution,BenefitFlag_Retirement_Plan_401k_Investment_Fees,BenefitFlag_Retirement_Plan_401k_Maximum_Matching,BenefitFlag_Retirement_Plan_401k_Standard_Matching,BenefitFlag_Technology_Stipend_Monthly_Internet_Allowance,BenefitFlag_Tuition_Reimbursement_Graduate_Degree,BenefitFlag_Tuition_Reimbursement_Individual_Courses,BenefitFlag_Tuition_Reimbursement_Professional_Certification,BenefitFlag_Tuition_Reimbursement_Undergraduate_Degree,BenefitFlag_Wellness_Programs_Premium_Discount_Tier_1
7497,3249,2,3,"Satisfactory, but could improve.",0,2023-08-07,Health Insurance,PPO Individual,706.93,False,...,False,False,False,False,False,False,False,False,False,False
8750,4313,21,3,"Average, but necessary.",0,2024-04-18,Life Insurance,Basic Coverage,163.92,True,...,False,False,False,False,False,False,False,False,False,False
9511,2900,14,5,Fantastic benefit! I use it daily.,0,2024-04-28,Gym Membership,Tier 1 Partners,73.36,True,...,False,False,False,False,False,False,False,False,False,False
5516,2785,1,5,Top-notch retirement plan.,6,2023-10-30,Retirement Plan,401k Basic Matching,876.21,True,...,False,False,False,False,False,False,False,False,False,False
2231,507,16,1,"Terrible service, would not recommend.",8,2023-11-20,Health Insurance,HDHP Individual,84.55,True,...,False,False,False,False,False,False,False,False,False,False


In [114]:
def dataset_validation(df: pd.DataFrame):
    """Print a pass/fail line for each range rule on the prepared dataset.

    Rules, checked in this order:
      * SatisfactionScore lies within 1-5
      * UsageFrequency lies within 0-10
      * the year of LastUsedDate lies within 2023-2024

    A rule fails as soon as a single row violates it.
    """
    # Each rule maps the frame to a boolean mask of the violating rows.
    rules = (
        (
            "SatisfactionScore",
            lambda frame: (frame['SatisfactionScore'] < 1) | (frame['SatisfactionScore'] > 5),
        ),
        (
            "UsageFrequency",
            lambda frame: (frame['UsageFrequency'] < 0) | (frame['UsageFrequency'] > 10),
        ),
        (
            "LastUsedDate",
            lambda frame: (frame['LastUsedDate'].dt.year < 2023) | (frame['LastUsedDate'].dt.year > 2024),
        ),
    )

    for label, find_violations in rules:
        verdict = "not passed" if find_violations(df).any() else "passed"
        print(f"{label}: Validation test {verdict}.")

# Run the range checks on the merged, encoded dataset before saving it
dataset_validation(df=merged_df)

SatisfactionScore: Validation test passed.
UsageFrequency: Validation test passed.
LastUsedDate: Validation test passed.


Save the cleaned and prepared dataset

In [115]:
# Write the prepared dataset next to the raw inputs (`path` is the data
# directory defined in the loading cell); the index is omitted from the file.
file_name = path+'cleaned_data.csv'
merged_df.to_csv(path_or_buf=file_name, index=False)