    Transform the Data

In [23]:

import pandas as pd

# Load the five cleaned tables from ../02Prep_Layer. The assignment targets
# and the CSV paths below are listed in the same order.
(
    users_cleaned_df,
    domains_cleaned_df,
    trainings_cleaned_df,
    training_assignments_cleaned_df,
    scores_cleaned_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../02Prep_Layer/users_cleaned.csv',
        '../02Prep_Layer/domains_cleaned.csv',
        '../02Prep_Layer/trainings_cleaned.csv',
        '../02Prep_Layer/training_assignments_cleaned.csv',
        '../02Prep_Layer/scores_cleaned.csv',
    )
)


In [24]:
# A notebook cell only renders its final expression, so the three bare frame
# names that used to precede this call were evaluated and discarded without
# producing any output. Only the statement that actually prints something is
# kept: the column/dtype summary of the scores table.
scores_cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   employeeId        30000 non-null  int64
 1   trainingId        30000 non-null  int64
 2   trainerId         30000 non-null  int64
 3   codeQuality       30000 non-null  int64
 4   communication     30000 non-null  int64
 5   problemSolving    30000 non-null  int64
 6   timeManagement    30000 non-null  int64
 7   assignment_score  30000 non-null  int64
dtypes: int64(8)
memory usage: 1.8 MB


    Data Type Conversion

In [25]:
# Cast each table's identifier column to int.
for frame, id_column in (
    (users_cleaned_df, 'userId'),
    (domains_cleaned_df, 'domainId'),
    (trainings_cleaned_df, 'trainingId'),
):
    frame[id_column] = frame[id_column].astype(int)

# Parse the training start/end dates into datetime values.
for date_column in ('startDate', 'endDate'):
    trainings_cleaned_df[date_column] = pd.to_datetime(trainings_cleaned_df[date_column])


In [26]:
# Merge the scores table with the users table so every score row carries the
# attributes of the employee it belongs to (the join key userId duplicates
# employeeId, so it is dropped afterwards).
merged_scores_df = scores_cleaned_df.merge(
    users_cleaned_df, left_on='employeeId', right_on='userId', how='left'
).drop(columns='userId')

# A left join must not add rows. If it does, userId is not unique in the users
# table and every aggregate computed from this frame would be inflated.
assert len(merged_scores_df) == len(scores_cleaned_df), (
    'users_cleaned_df has duplicate userId values: the merge multiplied score rows'
)

# Keep only the columns used by the analysis, in reporting order.
new_order = ['employeeId', 'role', 'trainingId', 'assignment_score', 'codeQuality','communication','problemSolving','timeManagement']
merged_scores_df = merged_scores_df[new_order]

merged_scores_df.to_csv('./Employees_Scores.csv',index=False)
merged_scores_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   employeeId        30000 non-null  int64 
 1   role              30000 non-null  object
 2   trainingId        30000 non-null  int64 
 3   assignment_score  30000 non-null  int64 
 4   codeQuality       30000 non-null  int64 
 5   communication     30000 non-null  int64 
 6   problemSolving    30000 non-null  int64 
 7   timeManagement    30000 non-null  int64 
dtypes: int64(7), object(1)
memory usage: 1.8+ MB


In [27]:
# Merging scores with users

# merged_scores_df = pd.merge(
#     scores_cleaned_df, 
#     users_cleaned_df[['userId', 'firstName', 'lastName', 'role']], 
#     left_on='employeeId', 
#     right_on='userId', 
#     how='left'
# )

# merged_scores_df

In [28]:
# Wide view of the scores: one row per employee, one column per training.
# Each cell holds the mean assignment_score for that employee/training pair;
# pairs without any score are filled with 0.
score_matrix = merged_scores_df.pivot_table(
    values='assignment_score',
    index=['employeeId'],
    columns='trainingId',
    aggfunc='mean',  # pivot_table's default aggregation, spelled out
    fill_value=0,
)
pivot_scores_df = score_matrix.reset_index()

pivot_scores_df

trainingId,employeeId,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,221,0.0,0.0,4.0,0.0,0.0,0.0,0.0,25.0,0.0,...,0.0,0.0,0.0,0.0,0.0,70.0,0.0,0.0,0.0,0.0
1,222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,56.0
2,223,0.0,0.0,0.0,0.0,0.0,0.0,70.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,224,0.0,0.0,0.0,0.0,0.0,0.0,62.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,225,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,64.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,10216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,10217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,10218,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,10219,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Per-employee summary: the mean assignment score and the number of scores
# recorded for that employee, rounded to two decimals.
employee_score_groups = merged_scores_df.groupby('employeeId')['assignment_score']
average_scores_employees_df = (
    employee_score_groups
    .agg(average_score='mean', training_completed='count')
    .reset_index()
    .round(2)
)

average_scores_employees_df

Unnamed: 0,employeeId,average_score,training_completed
0,221,33.00,3
1,222,32.00,3
2,223,55.33,3
3,224,31.00,3
4,225,50.00,3
...,...,...,...
9995,10216,27.33,3
9996,10217,58.00,3
9997,10218,46.00,3
9998,10219,41.67,3


In [30]:
def categorize_score(score, excellent_threshold=80, average_threshold=50):
    """Map a numeric score to a performance label.

    Parameters
    ----------
    score : number
        The score to classify.
    excellent_threshold : number, default 80
        Minimum score (inclusive) labelled 'Excellent'.
    average_threshold : number, default 50
        Minimum score (inclusive) labelled 'Average'.

    Returns
    -------
    str
        'Excellent', 'Average' or 'Poor'. A NaN score compares false against
        both thresholds and is therefore labelled 'Poor'.
    """
    if score >= excellent_threshold:
        return 'Excellent'
    if score >= average_threshold:
        return 'Average'
    return 'Poor'

# Label every employee with the performance band of their average score,
# persist the enriched table, then show it.
performance_labels = average_scores_employees_df['average_score'].apply(categorize_score)
average_scores_employees_df['performance'] = performance_labels

average_scores_employees_df.to_csv('./Average_Scores_Per_employee.csv', index=False)
average_scores_employees_df


Unnamed: 0,employeeId,average_score,training_completed,performance
0,221,33.00,3,Poor
1,222,32.00,3,Poor
2,223,55.33,3,Average
3,224,31.00,3,Poor
4,225,50.00,3,Average
...,...,...,...,...
9995,10216,27.33,3,Poor
9996,10217,58.00,3,Average
9997,10218,46.00,3,Poor
9998,10219,41.67,3,Poor


In [31]:
# Mean assignment score of each training session, rounded to two decimals.
training_score_groups = merged_scores_df.groupby(['trainingId'])['assignment_score']
avg_scores_training_df = (
    training_score_groups
    .mean()
    .reset_index(name='average_score')
    .round(2)
)

avg_scores_training_df.head()


Unnamed: 0,trainingId,average_score
0,1,49.63
1,2,50.9
2,3,52.06
3,4,48.4
4,5,51.06


In [32]:
# Top performers: the five employees with the highest average score.
top_performers_df = average_scores_employees_df.sort_values(
    by='average_score', ascending=False
).head(5)

top_performers_df.to_csv('./top_performers.csv', index=False)

# The frame must be the cell's final expression to be rendered; placed before
# to_csv (which returns None) it produced no output at all.
top_performers_df



In [33]:
# Score distribution: number of employees in each performance band.
score_distribution_df = average_scores_employees_df['performance'].value_counts().reset_index()
# Name the two columns positionally: first the band label, then its count.
score_distribution_df.columns = ['Performance Level', 'Count']

# Save to CSV
score_distribution_df.to_csv('./score_distribution.csv', index=False)

# The frame must be the cell's final expression to be rendered; placed before
# to_csv (which returns None) it produced no output at all.
score_distribution_df


In [34]:
# Employees with an average score below 50
low_performers_df = average_scores_employees_df[average_scores_employees_df['average_score'] < 50]

# To look up a single employee, filter on the id, e.g.:
#   low_performers_df[low_performers_df['employeeId'] == 5603]

# Save to CSV
low_performers_df.to_csv('./low_performers.csv', index=False)

# The frame must be the cell's final expression to be rendered. The earlier
# `value_counts().sum()` call was removed because its result was discarded.
low_performers_df


#   Domain Details

In [35]:
# Display the domain lookup table loaded from domains_cleaned.csv.
domains_cleaned_df

Unnamed: 0,domainId,domainName
0,1,Data Engineering
1,2,Machine Learning
2,3,Full Stack


In [36]:
# Every training annotated with the name of its domain.
merged_df = trainings_cleaned_df.merge(domains_cleaned_df, on='domainId')

domain_keys = ['domainId', 'domainName']
durations_by_domain = merged_df.groupby(domain_keys)['duration(in days)']

# Number of trainings in every domain.
training_count = merged_df.groupby(domain_keys).size().reset_index(name='training_count')

# Mean training duration per domain, rounded to whole days.
duration_stats = durations_by_domain.mean().reset_index(name='avg_duration').round()

# Longest and shortest training per domain.
duration_extremes = durations_by_domain.agg(['max', 'min']).reset_index()

# Combine the three per-domain summaries into one overview table.
# NOTE(review): 'shortest_traning' is misspelled, but it is kept unchanged
# because the column name is part of this table's output schema.
overview_df = (
    training_count
    .merge(duration_stats, on=domain_keys)
    .merge(duration_extremes, on=domain_keys)
    .rename(columns={'max':'longest_training', 'min':'shortest_traning'})
)
overview_df




Unnamed: 0,domainId,domainName,training_count,avg_duration,longest_training,shortest_traning
0,1,Data Engineering,15,31.0,58,12
1,2,Machine Learning,15,34.0,60,10
2,3,Full Stack,20,29.0,60,10


In [37]:
# Count the assignment rows that repeat an earlier (employeeId, trainingId) pair.
assignment_keys = training_assignments_cleaned_df[['employeeId', 'trainingId']]
assignment_keys.duplicated().sum()


0

### Domain Success Rate

In [38]:
# Attach the domain of each score through its training, then the domain name.
score_merged_df = pd.merge(
    scores_cleaned_df, trainings_cleaned_df[['trainingId', 'domainId']], on='trainingId'
)
domain_merged_df = pd.merge(score_merged_df, domains_cleaned_df, on='domainId')

# Success rate = percentage of scores >= 60 within each domain. The mean of a
# boolean column is the fraction of True values, so no groupby.apply/lambda is
# needed (which also avoids the pandas 2.2 warning about apply operating on
# the grouping columns).
success_rate = (
    domain_merged_df.assign(passed=domain_merged_df['assignment_score'] >= 60)
    .groupby(['domainId', 'domainName'])['passed']
    .mean()
    .mul(100)
    .reset_index(name='success_rate')
    .round(2)
)

# Display the success rate
print(success_rate)

# overview_df is updated in place, so drop any success_rate column left by a
# previous run of this cell. Otherwise re-running would produce
# success_rate_x / success_rate_y instead of a single column.
overview_df = pd.merge(
    overview_df.drop(columns='success_rate', errors='ignore'),
    success_rate,
    on=['domainId', 'domainName'],
)
# overview_df.to_csv("./domain_overview.csv", index=False)



   domainId        domainName  success_rate
0         1  Data Engineering         41.15
1         2  Machine Learning         41.60
2         3        Full Stack         41.13


In [39]:
#Training Overview

# Rebuild the scores/users join with every column. Note that this reassigns
# merged_scores_df, replacing the reordered column subset built earlier.
merged_scores_df = scores_cleaned_df.merge(
    users_cleaned_df, left_on='employeeId', right_on='userId', how='left'
).drop(columns='userId')

# Highest score in each training together with the employee who obtained it.
# idxmax returns the first row holding the maximum, so ties resolve to the
# earliest row of the group.
top_score_rows = merged_scores_df.groupby('trainingId')['assignment_score'].idxmax()
highest_scores_df = merged_scores_df.loc[
    top_score_rows, ['trainingId', 'employeeId', 'assignment_score']
].rename(columns={'assignment_score': 'highest_score'})

# Success rate = percentage of scores >= 60 within each training. The mean of
# a boolean series is the fraction of True values, so no groupby.apply/lambda
# is needed (which also avoids the pandas 2.2 warning about apply operating on
# the grouping columns).
success_rate_df = (
    merged_scores_df['assignment_score'].ge(60)
    .groupby(merged_scores_df['trainingId'])
    .mean()
    .mul(100)
    .reset_index(name='success_rate')
    .round(2)
)

# Combine highest score, success rate and the per-training average score.
# avg_scores_training_df comes from the "Average score per training session"
# cell, which must have been run first.
training_performance_df = (
    highest_scores_df
    .merge(success_rate_df, on='trainingId')
    .merge(avg_scores_training_df, on='trainingId', how='left')
)

training_performance_df.to_csv('./training_overview.csv', index=False)
training_performance_df


Unnamed: 0,trainingId,employeeId,highest_score,success_rate,average_score
0,1,5100,100,40.07,49.63
1,2,5350,100,40.7,50.9
2,3,604,100,43.89,52.06
3,4,559,100,37.59,48.4
4,5,410,100,42.13,51.06
5,6,889,100,41.73,49.98
6,7,1428,100,42.02,51.26
7,8,5724,100,40.07,49.78
8,9,504,100,38.0,47.82
9,10,1159,100,41.33,50.03
