In [17]:
import random
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, accuracy_score
df = pd.read_csv('../data_engineering/data_warehouse/report/employee_performance_kpi.csv')



In [18]:
df.columns

Index(['user_id', 'name', 'designation', 'course_id', 'course_title',
       'course_tag', 'modules_completed', 'total_modules',
       'completion_percentage', 'course_score', 'performance_score'],
      dtype='object')

FEATURE ENGINEERING AND NORMALIZING

In [19]:
label_encoder = LabelEncoder()
df['course_tag_encoded'] = label_encoder.fit_transform(df['course_tag'])

In [20]:
df['raw_score'] = df['completion_percentage'] * df['course_score']

# Step 2: Normalize the score
max_score = df['raw_score'].max()
min_score = df['raw_score'].min()

# Normalization formula: (score - min) / (max - min) * new_max
df['normalized_score'] = (df['raw_score'] - min_score) / (max_score - min_score) 
df.head()

Unnamed: 0,user_id,name,designation,course_id,course_title,course_tag,modules_completed,total_modules,completion_percentage,course_score,performance_score,course_tag_encoded,raw_score,normalized_score
0,66ffada401112f523e6dabeb,Vikas,Web Developer,66ffae1301112f523e6dac0b,HTML,Web Development,2,2,100,65,60,2,6500,0.722222
1,66ffada401112f523e6dabeb,Vikas,Web Developer,67036489a4a675180404ca9f,Docker,DevOps,0,2,0,86,60,1,0,0.0
2,66ffada401112f523e6dabeb,Vikas,Web Developer,67036772a4a675180404cd57,kUBERNETES,DevOps,2,2,100,68,60,1,6800,0.755556
3,66ffada401112f523e6dabeb,Vikas,Web Developer,6702683b8b415e1ee1ae0377,CSS,Web Development,0,2,0,94,60,2,0,0.0
4,66ffada401112f523e6dabeb,Vikas,Web Developer,670269c08b415e1ee1ae040e,Java Script,Web Development,2,2,100,80,60,2,8000,0.888889


In [21]:
required_columns= [
    'user_id','name','course_id','course_title','normalized_score'
]

df_final=df[required_columns]
df_final.head()

len(df_final)

32

In [22]:
df_final.drop_duplicates
len(df_final)

32

In [31]:
df_final_agg = df_final.groupby(['user_id', 'course_title'], as_index=False)['normalized_score'].mean()

# Create a pivot table: user_id as rows, course_title as columns, and normalized_score as values
df_pivot = df_final_agg.pivot(index='user_id', columns='course_title', values='normalized_score').fillna(0)

# Convert the pivot table to a sparse matrix
df_matrix = csr_matrix(df_pivot.values)

# Fit the NearestNeighbors model
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(df_matrix)

# Randomly choose an employee for recommendations
query_index = np.random.choice(df_pivot.shape[0])
query_user_id = df_pivot.index[query_index]

# Get the name of the querying user
query_user_name = df_final[df_final['user_id'] == query_user_id]['name'].values[0]
print(f"Query user ID: {query_user_id} (Name: {query_user_name})")

# Get recommendations
distances, indices = model_knn.kneighbors(df_pivot.iloc[query_index, :].values.reshape(1, -1), n_neighbors=6)

# Display nearest employees
print(f'\nNearest Employees for User ID {query_user_id} (Name: {query_user_name}):\n')
recommended_ids = []
for i in range(len(distances.flatten())):
    if i == 0:
        print('Self Match (Distance = 0):')
    else:
        recommended_id = df_pivot.index[indices.flatten()[i]]
        recommended_ids.append(recommended_id)
        user_name = df_final[df_final['user_id'] == recommended_id]['name'].values[0]  # Get the name
        print(f'User ID {recommended_id} (Name: {user_name}), with distance of {distances.flatten()[i]:.4f}')

# Gather courses from nearest employees
all_courses = set()
user_courses = set(df_final[df_final['user_id'] == query_user_id]['course_title'])

for user_id in recommended_ids:
    courses_taken = df_final[df_final['user_id'] == user_id]['course_title'].unique()
    all_courses.update(courses_taken)

# Determine courses to recommend
unique_courses = all_courses.difference(user_courses)

if unique_courses:
    print(f'\nRecommended Courses for User ID {query_user_id} (Name: {query_user_name}) (not previously taken):\n')
    for course in unique_courses:
        print(course)
else:
    # If no unique courses, suggest any course from the nearest employees
    print(f'\nAll courses have been taken by User ID {query_user_id} (Name: {query_user_name}). Suggesting courses from nearest employees:\n')
    suggested_courses = list(all_courses)
    for course in suggested_courses:
        print(course)

# Optionally calculate RMSE (if needed)
def calculate_rmse(recommended_ids, actual_scores):
    relevant_scores = df_final[df_final['user_id'].isin(recommended_ids)]

    if relevant_scores.empty:
        return float('nan')  # Return NaN if no relevant scores are found

    y_true = relevant_scores['normalized_score']
    y_pred = relevant_scores['normalized_score'].mean()  # Using the mean as a simple prediction

    rmse = np.sqrt(mean_squared_error(y_true, [y_pred] * len(y_true)))
    return rmse

# Calculate RMSE (if desired)
rmse_value = calculate_rmse(recommended_ids, df_final)
print(f'\nRMSE: {rmse_value:.4f}')


Query user ID: 670ce3905a9daf534693e06b (Name: Manish)

Nearest Employees for User ID 670ce3905a9daf534693e06b (Name: Manish):

Self Match (Distance = 0):
User ID 66ffbf2c016d5542daee3f00 (Name: Sai), with distance of 1.0000
User ID 670ce3905a9daf534693e065 (Name: Rohit), with distance of 1.0000
User ID 670ce3905a9daf534693e066 (Name: Neha), with distance of 1.0000
User ID 670ce3905a9daf534693e067 (Name: Amit), with distance of 1.0000
User ID 670ce3905a9daf534693e068 (Name: Sonia), with distance of 1.0000

Recommended Courses for User ID 670ce3905a9daf534693e06b (Name: Manish) (not previously taken):

Docker
HTML
Advanced SQL
Web Scraping
kUBERNETES
CSS

RMSE: 0.3717


In [36]:

users_df = pd.read_csv('../data_engineering/data_warehouse/staging/users_staging.csv')

users_df.columns


Index(['user_id', 'name', 'email', 'password', 'role', 'designation',
       'performance_score', 'created_at', '__v'],
      dtype='object')

In [37]:
def recommend_courses_for_user(user_id, df_final, users_df, model_knn, df_pivot):
    # Get user index
    if user_id not in df_pivot.index:
        # print(f"User ID {user_id} not found in pivot table")
        return []

    query_index = df_pivot.index.get_loc(user_id)

    # Get the name of the querying user
    query_user_name = df_final[df_final['user_id'] == user_id]['name'].values[0]
    print(f"\nQuery user ID: {user_id} (Name: {query_user_name})")

    # Get recommendations
    distances, indices = model_knn.kneighbors(df_pivot.iloc[query_index, :].values.reshape(1, -1), n_neighbors=6)

    # Display nearest employees
    print(f'\nNearest Employees for User ID {user_id} (Name: {query_user_name}):\n')
    recommended_ids = []
    for i in range(len(distances.flatten())):
        if i == 0:
            print('Self Match (Distance = 0):')
        else:
            recommended_id = df_pivot.index[indices.flatten()[i]]
            recommended_ids.append(recommended_id)
            user_name = df_final[df_final['user_id'] == recommended_id]['name'].values[0]  # Get the name
            print(f'User ID {recommended_id} (Name: {user_name}), with distance of {distances.flatten()[i]:.4f}')

    # Gather courses from nearest employees
    all_courses = set()
    user_courses = set(df_final[df_final['user_id'] == user_id]['course_title'])

    for user_id in recommended_ids:
        courses_taken = df_final[df_final['user_id'] == user_id]['course_title'].unique()
        all_courses.update(courses_taken)

    # Determine courses to recommend
    unique_courses = all_courses.difference(user_courses)

    if unique_courses:
        print(f'\nRecommended Courses for User ID {user_id} (Name: {query_user_name}) (not previously taken):\n')
        for course in unique_courses:
            print(course)
        return list(unique_courses)
    else:
        # If no unique courses, suggest any course from the nearest employees
        print(f'\nAll courses have been taken by User ID {user_id} (Name: {query_user_name}). Suggesting courses from nearest employees:\n')
        suggested_courses = list(all_courses)
        for course in suggested_courses:
            print(course)
        return suggested_courses

# Function to recommend courses for all users in users_df
def recommend_courses_for_all_users(users_df, df_final, df_pivot, model_knn):
    recommendations = {}
    for user_id in users_df['user_id']:
        recommended_courses = recommend_courses_for_user(user_id, df_final, users_df, model_knn, df_pivot)
        recommendations[user_id] = recommended_courses
    return recommendations

# Example usage:
all_user_recommendations = recommend_courses_for_all_users(users_df, df_final, df_pivot, model_knn)
print("\nCourse recommendations for all users:")
print(all_user_recommendations)


Query user ID: 66ffada401112f523e6dabeb (Name: Vikas)

Nearest Employees for User ID 66ffada401112f523e6dabeb (Name: Vikas):

Self Match (Distance = 0):
User ID 670ce3905a9daf534693e065 (Name: Rohit), with distance of 0.4736
User ID 66ffbf2c016d5542daee3f00 (Name: Sai), with distance of 0.4996
User ID 670ce3905a9daf534693e066 (Name: Neha), with distance of 1.0000
User ID 670ce3905a9daf534693e067 (Name: Amit), with distance of 1.0000
User ID 670ce3905a9daf534693e068 (Name: Sonia), with distance of 1.0000

Recommended Courses for User ID 670ce3905a9daf534693e068 (Name: Vikas) (not previously taken):

Web Scraping
Azure data factory
Advanced SQL

Query user ID: 66ffbf2c016d5542daee3f00 (Name: Sai)

Nearest Employees for User ID 66ffbf2c016d5542daee3f00 (Name: Sai):

Self Match (Distance = 0):
User ID 670ce3905a9daf534693e065 (Name: Rohit), with distance of 0.4575
User ID 670ce3905a9daf534693e066 (Name: Neha), with distance of 0.4635
User ID 66ffada401112f523e6dabeb (Name: Vikas), with di