In [5]:
import random
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, accuracy_score
# Employee KPI report exported by the data-engineering warehouse step.
KPI_REPORT_CSV = '../data_engineering/data_warehouse/report/employee_performance_kpi.csv'
df = pd.read_csv(KPI_REPORT_CSV)


In [6]:
# Inspect the available columns before deriving any features from them.
df.columns

Index(['user_id', 'name', 'designation', 'course_id', 'course_title',
       'course_tag', 'course_duration', 'modules_completed', 'total_modules',
       'completion_percentage', 'course_score', 'performance_score'],
      dtype='object')

FEATURE ENGINEERING AND NORMALIZATION

In [7]:
# Learn the course_tag vocabulary first, then map every tag to its integer code.
label_encoder = LabelEncoder().fit(df['course_tag'])
df['course_tag_encoded'] = label_encoder.transform(df['course_tag'])

In [8]:
# Step 1: combine course progress and course grade into a single raw score.
df['raw_score'] = df['completion_percentage'] * df['course_score']

# Step 2: min-max normalize the raw score into the [0, 1] range:
#     normalized = (score - min) / (max - min)
max_score = df['raw_score'].max()
min_score = df['raw_score'].min()
score_range = max_score - min_score

if score_range == 0:
    # Every row has the same raw score, so the formula would be 0/0 (NaN for
    # every row). Fall back to a constant 0.0 instead of propagating NaN.
    df['normalized_score'] = 0.0
else:
    df['normalized_score'] = (df['raw_score'] - min_score) / score_range
df.head()

Unnamed: 0,user_id,name,designation,course_id,course_title,course_tag,course_duration,modules_completed,total_modules,completion_percentage,course_score,performance_score,course_tag_encoded,raw_score,normalized_score
0,0564dc8a-2c53-44ac-b2ec-499755de840f,Shawna Flores,UI/UX Designer,4f3eb0d3-dd92-4afe-9c6c-cd1634770a52,Polarized static conglomeration,Mobile Development,117.0,4.0,7.0,57.142857,73.0,51.4,5,4171.428571,0.421356
1,0564dc8a-2c53-44ac-b2ec-499755de840f,Shawna Flores,UI/UX Designer,5233bd5a-1fb4-4022-9ef4-9885ad9b99f4,Networked analyzing initiative,Software Testing,108.0,3.0,9.0,33.333333,60.0,51.4,6,2000.0,0.20202
2,0564dc8a-2c53-44ac-b2ec-499755de840f,Shawna Flores,UI/UX Designer,c388b418-ba14-4b9e-ae1f-b6ec6159ab06,Down-sized demand-driven paradigm,Generative AI,107.0,4.0,9.0,44.444444,26.0,51.4,4,1155.555556,0.116723
3,07f6381b-edfe-4f3e-b485-0fc0d9fb6e97,Ryan Humphrey,AI Specialist,8e577a9b-05fd-42e0-b632-df1326c8f571,Optimized system-worthy concept,Mobile Development,159.0,2.0,6.0,33.333333,45.0,70.307692,5,1500.0,0.151515
4,07f6381b-edfe-4f3e-b485-0fc0d9fb6e97,Ryan Humphrey,AI Specialist,a83b5073-16b7-4acd-b748-514eb1a25314,Public-key grid-enabled strategy,DevOps,106.0,2.0,7.0,28.571429,92.0,70.307692,3,2628.571429,0.265512


In [9]:
# Only the identifiers, display names and the normalized score are kept
# for the steps that follow.
required_columns = ['user_id', 'name', 'course_id', 'course_title', 'normalized_score']

df_final = df.loc[:, required_columns]
df_final.head()

len(df_final)

100

In [10]:
# drop_duplicates is a method that returns a new frame: it has to be called
# and its result kept, otherwise no rows are removed.
df_final = df_final.drop_duplicates()
len(df_final)

100

In [27]:
# Number of similar employees whose courses feed the recommendation.
MAX_NEIGHBORS = 5

# Average repeated (user, course) rows so the pivot has exactly one value per cell.
df_final_agg = df_final.groupby(['user_id', 'course_title'], as_index=False)['normalized_score'].mean()

# Pivot table: user_id as rows, course_title as columns, normalized_score as values.
# Courses a user never took are filled with 0, so "not taken" and a genuine
# score of 0 look the same to the similarity model.
df_pivot = df_final_agg.pivot(index='user_id', columns='course_title', values='normalized_score').fillna(0)

# Convert the pivot table to a sparse matrix
df_matrix = csr_matrix(df_pivot.values)

# Fit the NearestNeighbors model (brute-force search with cosine distance)
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(df_matrix)

# Randomly choose an employee for recommendations
query_index = np.random.choice(df_pivot.shape[0])
query_user_id = df_pivot.index[query_index]

# One name per user (first occurrence), built once instead of re-filtering
# df_final for every lookup.
user_names = df_final.drop_duplicates(subset='user_id').set_index('user_id')['name']
query_user_name = user_names.loc[query_user_id]
print(f"Query user ID: {query_user_id} (Name: {query_user_name})")

# Ask for one extra neighbour because the query user is normally returned too,
# but never more than the number of fitted users (kneighbors raises otherwise).
n_users = df_pivot.shape[0]
query_vector = df_pivot.iloc[[query_index]].values
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=min(MAX_NEIGHBORS + 1, n_users))

# Drop the query user by index rather than by position: with exact ties or an
# all-zero score row the self match is not guaranteed to come back first.
neighbours = [
    (int(idx), dist)
    for idx, dist in zip(indices.ravel(), distances.ravel())
    if idx != query_index
][:MAX_NEIGHBORS]

# Display nearest employees
print(f'\nNearest Employees for User ID {query_user_id} (Name: {query_user_name}):\n')
print('Self Match (Distance = 0):')
recommended_ids = []
for idx, dist in neighbours:
    recommended_id = df_pivot.index[idx]
    recommended_ids.append(recommended_id)
    user_name = user_names.loc[recommended_id]
    print(f'User ID {recommended_id} (Name: {user_name}), with distance of {dist:.4f}')

# Gather courses from nearest employees
# NOTE(review): a neighbour at cosine distance 1.0 shares no scored course with
# the query user; consider filtering such neighbours out - confirm intent.
user_courses = set(df_final.loc[df_final['user_id'] == query_user_id, 'course_title'])
all_courses = set(df_final.loc[df_final['user_id'].isin(recommended_ids), 'course_title'].unique())

# Determine courses to recommend
unique_courses = all_courses.difference(user_courses)

# Sets have no stable iteration order across kernel restarts; sort so that the
# printed list is reproducible.
if unique_courses:
    print(f'\nRecommended Courses for User ID {query_user_id} (Name: {query_user_name}) (not previously taken):\n')
    for course in sorted(unique_courses, key=str):
        print(course)
else:
    # If no unique courses, suggest any course from the nearest employees
    print(f'\nAll courses have been taken by User ID {query_user_id} (Name: {query_user_name}). Suggesting courses from nearest employees:\n')
    suggested_courses = sorted(all_courses, key=str)
    for course in suggested_courses:
        print(course)

# Optionally calculate RMSE (if needed)
def calculate_rmse(recommended_ids, actual_scores):
    """Return the RMSE of the recommended users' scores around their own mean.

    The "prediction" is simply the mean normalized score of the selected
    users, so the result is the root-mean-square deviation of those scores
    from that mean.

    Parameters
    ----------
    recommended_ids : iterable
        User ids whose rows should be evaluated.
    actual_scores : pandas.DataFrame
        Frame with at least 'user_id' and 'normalized_score' columns. The
        rows are read from this argument, not from a notebook global.

    Returns
    -------
    float
        The RMSE, or NaN when no row of ``actual_scores`` belongs to any of
        ``recommended_ids``.
    """
    relevant_scores = actual_scores[actual_scores['user_id'].isin(recommended_ids)]

    if relevant_scores.empty:
        return float('nan')  # Nothing to evaluate for these users

    y_true = relevant_scores['normalized_score'].to_numpy()
    y_pred = y_true.mean()  # Using the mean as a simple prediction

    # Against a constant prediction the RMSE is just the root of the mean
    # squared deviation; no per-row prediction list is needed.
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    return rmse

# Report the RMSE of the nearest employees' normalized scores around their mean.
rmse_value = calculate_rmse(recommended_ids, actual_scores=df_final)
print(f'\nRMSE: {rmse_value:.4f}')


Query user ID: a769e100-ae92-4a0a-96d0-628ce6c9c579 (Name: Bethany Pollard)

Nearest Employees for User ID a769e100-ae92-4a0a-96d0-628ce6c9c579 (Name: Bethany Pollard):

Self Match (Distance = 0):
User ID d7ced9c9-c125-45b2-91c7-241cf90e006d (Name: Hannah Patel), with distance of 0.5793
User ID 38995514-089d-469e-b7e0-d72ef1f642d3 (Name: Shirley Miller), with distance of 0.6210
User ID f884a6af-afcf-4dd1-ae84-8f5f02804ede (Name: Christopher Melendez), with distance of 0.6395
User ID 07f6381b-edfe-4f3e-b485-0fc0d9fb6e97 (Name: Ryan Humphrey), with distance of 0.7915
User ID 0d525492-03a0-4e66-87d9-f4d4df10aa24 (Name: Justin Potts), with distance of 1.0000

Recommended Courses for User ID a769e100-ae92-4a0a-96d0-628ce6c9c579 (Name: Bethany Pollard) (not previously taken):

Focused even-keeled instruction set
Public-key grid-enabled strategy
Future-proofed responsive model
Automated 5thgeneration task-force

RMSE: 0.1755
