    Transform the Data

In [6]:

import pandas as pd

# Load the five cleaned CSV exports from the prep layer. The paths are fed to
# pd.read_csv in order and the resulting frames are unpacked positionally, so
# the order of the names below must match the order of the paths.
(
    users_cleaned_df,
    domains_cleaned_df,
    trainings_cleaned_df,
    training_assignments_cleaned_df,
    scores_cleaned_df,
) = map(
    pd.read_csv,
    (
        '../02Prep_Layer/users_cleaned.csv',
        '../02Prep_Layer/domains_cleaned.csv',
        '../02Prep_Layer/trainings_cleaned.csv',
        '../02Prep_Layer/training_assignments_cleaned.csv',
        '../02Prep_Layer/scores_cleaned.csv',
    ),
)


In [10]:
# A notebook cell renders only its final bare expression, so a bare
# `users_cleaned_df` on an earlier line is evaluated and then discarded.
# display() (provided as a builtin inside an IPython/Jupyter kernel) renders
# the users preview explicitly; the scores frame stays the cell's output.
display(users_cleaned_df.head())
scores_cleaned_df

Unnamed: 0,value,trainingId,employeeId,trainerId
0,8,19,111,20
1,57,16,111,41
2,53,18,111,72
3,14,4,112,90
4,58,6,112,73
...,...,...,...,...
29995,58,18,10109,72
29996,36,10,10109,106
29997,95,9,10110,63
29998,29,5,10110,42


    Data Type Conversion

In [None]:
# Cast the 'id' key column of each lookup table to int, in place.
# NOTE(review): the merge cell joins users on 'userId', not 'id' -- confirm
# that users_cleaned_df really has an 'id' column; if it does not, the first
# iteration raises KeyError.
for frame in (users_cleaned_df, domains_cleaned_df, trainings_cleaned_df):
    frame['id'] = frame['id'].astype(int)

# Parse the training start/end columns into pandas datetimes.
for date_col in ('startDate', 'endDate'):
    trainings_cleaned_df[date_col] = pd.to_datetime(trainings_cleaned_df[date_col])


In [35]:
# Merge the scores table with the users table.
#
# Each score row is enriched with the employee's name and role through a left
# join (scores.employeeId -> users.userId), so every score row is kept even
# when no matching user exists.
#
# Only the user columns that are actually needed are passed to the join; this
# keeps unrelated user columns from colliding with score columns (which would
# produce `_x`/`_y` suffixes and break the column selection below).
user_columns = ['userId', 'firstName', 'lastName', 'role']
new_order = ['employeeId', 'firstName', 'lastName', 'role', 'value', 'trainingId', 'trainerId']

# Built as one non-mutating chain: no `inplace=True` on a sliced frame, and
# re-running the cell always starts again from the untouched inputs.
merged_scores_df = (
    scores_cleaned_df
    .merge(users_cleaned_df[user_columns], left_on='employeeId', right_on='userId', how='left')
    .drop(columns='userId')        # redundant with employeeId after the join
    .loc[:, new_order]             # put the descriptive columns first
    .rename(columns={'firstName': 'First Name', 'lastName': 'Last Name', 'role': 'Role', 'value': 'Value'})
)

# A left join onto a unique key must not change the number of score rows; if
# userId were duplicated, scores would be silently repeated and every
# downstream average/count would be inflated.
assert len(merged_scores_df) == len(scores_cleaned_df), (
    "merge changed the row count - userId is not unique in users_cleaned_df"
)

merged_scores_df

Unnamed: 0,employeeId,First Name,Last Name,Role,Value,trainingId,trainerId
0,111,David,Davis,EMPLOYEE,8,19,20
1,111,David,Davis,EMPLOYEE,57,16,41
2,111,David,Davis,EMPLOYEE,53,18,72
3,112,Kenneth,Gonzalez,EMPLOYEE,14,4,90
4,112,Kenneth,Gonzalez,EMPLOYEE,58,6,73
...,...,...,...,...,...,...,...
29995,10109,Jon,Johnson,EMPLOYEE,58,18,72
29996,10109,Jon,Johnson,EMPLOYEE,36,10,106
29997,10110,Cindy,Gonzalez,EMPLOYEE,95,9,63
29998,10110,Cindy,Gonzalez,EMPLOYEE,29,5,42


In [27]:
# Alternative (disabled): merge scores with only the user columns that are needed

# merged_scores_df = pd.merge(
#     scores_cleaned_df, 
#     users_cleaned_df[['userId', 'firstName', 'lastName', 'role']], 
#     left_on='employeeId', 
#     right_on='userId', 
#     how='left'
# )

# merged_scores_df

In [36]:
# Summarise each employee's scores: the mean of 'Value' and how many non-null
# score rows they have (one row per recorded training score).
scores_by_employee = merged_scores_df.groupby('employeeId')['Value']

average_scores_df = scores_by_employee.agg(
    average_score='mean',
    No_of_Trainings='count',
).reset_index()

average_scores_df

Unnamed: 0,employeeId,average_score,No_of_Trainings
0,111,39.333333,3
1,112,30.333333,3
2,113,23.666667,3
3,114,58.333333,3
4,115,33.666667,3
...,...,...,...
9995,10106,54.333333,3
9996,10107,33.666667,3
9997,10108,74.000000,3
9998,10109,38.333333,3


In [40]:
# Pivot the scores for a better view
# One row per employee, one column per trainingId, cells holding the score.
# pivot_table aggregates with its default (mean) if an employee has more than
# one score for the same training.
# NOTE(review): fill_value=0 makes a missing (employee, training) pair look
# exactly like a recorded score of 0 -- confirm that is acceptable for this view.
employee_keys = ['employeeId', 'First Name', 'Last Name']

pivot_scores_df = pd.pivot_table(
    merged_scores_df,
    values='Value',
    index=employee_keys,
    columns='trainingId',
    fill_value=0,
)
pivot_scores_df = pivot_scores_df.reset_index()

pivot_scores_df

trainingId,employeeId,First Name,Last Name,1,2,3,4,5,6,7,...,11,12,13,14,15,16,17,18,19,20
0,111,David,Davis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,57.0,0.0,53.0,8.0,0.0
1,112,Kenneth,Gonzalez,0.0,0.0,0.0,14.0,0.0,58.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0
2,113,Nathaniel,Henderson,0.0,0.0,0.0,48.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,114,Sergio,Kim,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,75.0,100.0,0.0,0.0,0.0,0.0
4,115,Alex,Malone,0.0,60.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,26.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,10106,Linda,Moss,75.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,42.0
9996,10107,Matthew,Graves,0.0,0.0,22.0,0.0,0.0,66.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,10108,Daniel,Reyes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,73.0,0.0,0.0,56.0,0.0,93.0,0.0
9998,10109,Jon,Johnson,21.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58.0,0.0,0.0


In [43]:
# Display the per-employee summary (employeeId, average_score, No_of_Trainings).
average_scores_df

Unnamed: 0,employeeId,average_score,No_of_Trainings
0,111,39.333333,3
1,112,30.333333,3
2,113,23.666667,3
3,114,58.333333,3
4,115,33.666667,3
...,...,...,...
9995,10106,54.333333,3
9996,10107,33.666667,3
9997,10108,74.000000,3
9998,10109,38.333333,3


In [44]:
# Example: Categorize scores into performance bands
def categorize_score(score, excellent_min=80, average_min=50):
    """Map a numeric score to a performance band.

    Parameters
    ----------
    score : float or int
        The score to classify.
    excellent_min : float, default 80
        Lowest score (inclusive) that counts as 'Excellent'.
    average_min : float, default 50
        Lowest score (inclusive) that counts as 'Average'.

    Returns
    -------
    str or None
        'Excellent', 'Average' or 'Poor'. Returns None when `score` is missing
        (NaN/None) so an absent score is not reported as 'Poor'.
    """
    # Every comparison with NaN is False, so without this guard a missing
    # score would fall through to the last branch and be labelled 'Poor'.
    if pd.isna(score):
        return None
    if score >= excellent_min:
        return 'Excellent'
    if score >= average_min:
        return 'Average'
    return 'Poor'

# Label every employee's average score with its performance band and store it
# as a new 'performance' column on the summary frame.
performance_band = average_scores_df['average_score'].map(categorize_score)
average_scores_df['performance'] = performance_band

average_scores_df


Unnamed: 0,employeeId,average_score,No_of_Trainings,performance
0,111,39.333333,3,Poor
1,112,30.333333,3,Poor
2,113,23.666667,3,Poor
3,114,58.333333,3,Average
4,115,33.666667,3,Poor
...,...,...,...,...
9995,10106,54.333333,3,Average
9996,10107,33.666667,3,Poor
9997,10108,74.000000,3,Average
9998,10109,38.333333,3,Poor
