    Transform the Data

In [2]:

import pandas as pd

# Load the five cleaned CSV tables from the ../02Prep_Layer folder.
# The paths are relative, so they resolve against the kernel's working directory.
users_cleaned_df = pd.read_csv('../02Prep_Layer/users_cleaned.csv')
domains_cleaned_df = pd.read_csv('../02Prep_Layer/domains_cleaned.csv')
trainings_cleaned_df = pd.read_csv('../02Prep_Layer/trainings_cleaned.csv')
training_assignments_cleaned_df = pd.read_csv('../02Prep_Layer/training_assignments_cleaned.csv')
scores_cleaned_df = pd.read_csv('../02Prep_Layer/scores_cleaned.csv')


In [3]:
# Summarise the scores table: column names, dtypes and non-null counts.
# (A notebook cell renders only its last expression, so this is the single
# statement in the cell that produces output.)
scores_cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   employeeId      30000 non-null  int64
 1   trainingId      30000 non-null  int64
 2   trainerId       30000 non-null  int64
 3   codeQuality     30000 non-null  int64
 4   communication   30000 non-null  int64
 5   problemSolving  30000 non-null  int64
 6   timeManagement  30000 non-null  int64
 7   value           30000 non-null  int64
dtypes: int64(8)
memory usage: 1.8 MB


    Data Type Conversion

In [4]:
# Identifier columns are cast to int, one (frame, column) pair at a time.
for frame, id_column in (
    (users_cleaned_df, 'userId'),
    (domains_cleaned_df, 'domainId'),
    (trainings_cleaned_df, 'trainingId'),
):
    frame[id_column] = frame[id_column].astype(int)

# Training start/end dates are parsed into pandas datetimes.
for date_column in ('startDate', 'endDate'):
    trainings_cleaned_df[date_column] = pd.to_datetime(trainings_cleaned_df[date_column])


In [50]:
# Merging Scores table with User Table
#
# Attach each employee's role to their score rows and keep only the columns
# used by the rest of the analysis.

# Final column order; 'value' is renamed to 'assignement_score' at the end.
# NOTE(review): 'assignement_score' is misspelled, but later cells and the
# exported CSV use this exact name, so it is kept unchanged here.
new_order = ['employeeId', 'role', 'trainingId', 'value', 'codeQuality','communication','problemSolving','timeManagement']

merged_scores_df = (
    scores_cleaned_df
    .merge(
        # Only the join key and 'role' are needed from the users table; pulling
        # in just these avoids suffix clashes with any other user columns.
        users_cleaned_df[['userId', 'role']],
        left_on='employeeId',
        right_on='userId',
        how='left',
        # Fail loudly if a userId appears twice in the users table: a left
        # join would otherwise silently duplicate score rows.
        validate='many_to_one',
    )
    .drop(columns='userId')
    .loc[:, new_order]
    .rename(columns={'value': 'assignement_score'})
)


merged_scores_df.to_csv('./Employees_Scores.csv',index=False)
merged_scores_df

Unnamed: 0,employeeId,role,trainingId,assignement_score,codeQuality,communication,problemSolving,timeManagement
0,221,EMPLOYEE,45,72,68,64,76,57
1,221,EMPLOYEE,17,42,90,68,57,19
2,221,EMPLOYEE,5,75,100,73,34,44
3,222,EMPLOYEE,1,71,58,68,51,53
4,222,EMPLOYEE,45,76,86,52,14,37
...,...,...,...,...,...,...,...,...
29995,10219,EMPLOYEE,24,90,16,59,18,53
29996,10219,EMPLOYEE,6,31,97,36,19,38
29997,10220,EMPLOYEE,38,45,71,84,43,38
29998,10220,EMPLOYEE,19,18,78,20,73,49


In [51]:
# Merging scores with users

# merged_scores_df = pd.merge(
#     scores_cleaned_df, 
#     users_cleaned_df[['userId', 'firstName', 'lastName', 'role']], 
#     left_on='employeeId', 
#     right_on='userId', 
#     how='left'
# )

# merged_scores_df

In [52]:
# Wide view of the scores: one row per employee, one column per training.
# Employee/training pairs without a score are filled with 0.
pivot_scores_df = pd.pivot_table(
    merged_scores_df,
    values='assignement_score',
    index=['employeeId'],
    columns='trainingId',
    aggfunc='mean',  # the pivot_table default, written out explicitly
    fill_value=0,
)
pivot_scores_df = pivot_scores_df.reset_index()


pivot_scores_df

trainingId,employeeId,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,221,0.0,0.0,0.0,0.0,75.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,72.0,0.0,0.0,0.0,0.0,0.0
1,222,71.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,76.0,0.0,0.0,0.0,0.0,0.0
2,223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,74.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,224,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,225,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,70.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,10216,0.0,0.0,0.0,22.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,10217,0.0,0.0,66.0,0.0,84.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,10218,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,10219,0.0,0.0,0.0,0.0,0.0,31.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,62.0,0.0,0.0,0.0


In [53]:
# Average score per employee
#
# One row per employee: the mean of their assignment scores and the count of
# non-null scores behind it, with numeric columns rounded to 2 decimals.

average_scores_employees_df = (
    merged_scores_df
    .groupby('employeeId')['assignement_score']
    .agg(average_score='mean', No_of_Trainings='count')
    .reset_index()
    .round(2)
)


# average_scores_employees_df = pd.merge(
#     average_scores_employees_df,
#     merged_scores_df[['employeeId', 'First Name', 'Last Name']],  # Select only relevant columns from users
#     left_on='employeeId',
#     right_on='employeeId',
#     how='left'
# )

# new_order = ['employeeId', 'First Name', 'Last Name', 'average_score', 'No_of_Trainings']
# average_scores_employees_df = average_scores_employees_df[new_order]

average_scores_employees_df

Unnamed: 0,employeeId,average_score,No_of_Trainings
0,221,63.00,3
1,222,68.67,3
2,223,60.33,3
3,224,39.33,3
4,225,60.67,3
...,...,...,...
9995,10216,24.67,3
9996,10217,70.33,3
9997,10218,27.33,3
9998,10219,61.00,3


In [9]:
def categorize_score(score, excellent_threshold=80, average_threshold=50):
    """Map a numeric score to a performance label.

    Parameters
    ----------
    score : int or float
        The score to classify.
    excellent_threshold : int or float, default 80
        Scores greater than or equal to this value are 'Excellent'.
    average_threshold : int or float, default 50
        Scores greater than or equal to this value (and below
        ``excellent_threshold``) are 'Average'.

    Returns
    -------
    str
        'Excellent', 'Average' or 'Poor'.

    Raises
    ------
    ValueError
        If ``average_threshold`` is greater than ``excellent_threshold``,
        which would make the 'Average' band unreachable.

    Notes
    -----
    A NaN score fails both ``>=`` comparisons and is therefore labelled 'Poor'.
    """
    if average_threshold > excellent_threshold:
        raise ValueError(
            "average_threshold must not exceed excellent_threshold"
        )

    # Guard clauses, highest band first.
    if score >= excellent_threshold:
        return 'Excellent'
    if score >= average_threshold:
        return 'Average'
    return 'Poor'

# Label every employee from their average score, then persist the table.
performance_labels = average_scores_employees_df['average_score'].map(categorize_score)
average_scores_employees_df['performance'] = performance_labels

average_scores_employees_df.to_csv('./Average_Scores_Per_employee.csv',index=False)
average_scores_employees_df


Unnamed: 0,employeeId,average_score,No_of_Trainings,performance
0,221,63.00,3,Average
1,222,68.67,3,Average
2,223,60.33,3,Average
3,224,39.33,3,Poor
4,225,60.67,3,Average
...,...,...,...,...
9995,10216,24.67,3,Poor
9996,10217,70.33,3,Average
9997,10218,27.33,3,Poor
9998,10219,61.00,3,Average


In [None]:
# Average score per training session
# One row per trainingId with the mean assignment score, rounded to 2 decimals.
avg_scores_training_df = (
    merged_scores_df
    .groupby('trainingId')['assignement_score']
    .mean()
    .reset_index(name='average_score')
    .round(2)
)

avg_scores_training_df.head()


In [11]:
# Top performers: the five employees with the highest average score.
top_performers_df = average_scores_employees_df.sort_values(by='average_score', ascending=False).head(5)

top_performers_df.to_csv('./top_performers.csv', index=False)

# Keep the frame as the cell's last expression so the notebook renders it
# (to_csv returns None, so ending the cell on it shows nothing).
top_performers_df



In [12]:
# Score distribution count: number of employees in each performance level.
# Naming the index and the value column explicitly keeps the result
# independent of the default labels reset_index() would otherwise produce.
score_distribution_df = (
    average_scores_employees_df['performance']
    .value_counts()
    .rename_axis('Performance Level')
    .reset_index(name='Count')
)

# Save to CSV
score_distribution_df.to_csv('./score_distribution.csv', index=False)

# Last expression of the cell, so the table is rendered.
score_distribution_df


In [13]:
# Employees with an average score below 50
# (the same cutoff categorize_score uses for the 'Poor' label).
low_performers_df = average_scores_employees_df[average_scores_employees_df['average_score'] < 50]

#Find a certain ID
# employee_id_to_find =5603
# employee_row = low_performers_df[low_performers_df['employeeId'] == employee_id_to_find]
# employee_row


# Save to CSV
low_performers_df.to_csv('./low_performers.csv', index=False)

# Last expression of the cell, so the filtered table is rendered.
low_performers_df


#   Domain Details

In [14]:
# Show the domain lookup table (domainId -> domainName) before it is joined to the trainings.
domains_cleaned_df

Unnamed: 0,domainId,domainName
0,1,Data Engineering
1,2,Machine Learning
2,3,Full Stack


In [48]:
# Domain overview: one row per domain with the number of trainings and the
# average, longest and shortest training duration (column 'duration(in days)').

# Each training row picks up its domain name. validate='many_to_one' fails
# loudly if a domainId is repeated in the domains table, which would otherwise
# duplicate training rows and inflate every statistic below.
merged_df = pd.merge(trainings_cleaned_df, domains_cleaned_df, on='domainId', validate='many_to_one')

# Grouping / join keys shared by every step below.
domain_keys = ['domainId', 'domainName']

#Number of Trainings in every domain
training_count = merged_df.groupby(domain_keys).size().reset_index(name='training_count')

# Mean duration per domain; .round() with no argument rounds to whole numbers.
duration_stats = merged_df.groupby(domain_keys)['duration(in days)'].mean().reset_index(name='avg_duration').round()

# Longest ('max') and shortest ('min') training per domain.
duration_extremes = merged_df.groupby(domain_keys)['duration(in days)'].agg(['max', 'min']).reset_index()

# NOTE(review): 'shortest_traning' is misspelled, but it is the header written
# to domain_overview.csv, so it is kept as-is to avoid breaking consumers.
overview_df = (
    training_count
    .merge(duration_stats, on=domain_keys)
    .merge(duration_extremes, on=domain_keys)
    .rename(columns={'max':'longest_training', 'min':'shortest_traning'})
)

overview_df.to_csv("./domain_overview.csv", index=False)

# Last expression of the cell, so the overview table is rendered.
overview_df


In [40]:
# Number of assignment rows whose (employeeId, trainingId) pair repeats an earlier row.
training_assignments_cleaned_df.duplicated(subset=['employeeId', 'trainingId']).sum()



0

### Domain Success Rate