In [2]:
import random
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, accuracy_score
df = pd.read_csv('../data_engineering/data_warehouse/report/employee_performance_kpi.csv')


In [3]:
df.columns

Index(['user_id', 'name', 'designation', 'course_id', 'course_title',
       'course_tag', 'course_duration', 'modules_completed', 'total_modules',
       'completion_percentage', 'course_score', 'performance_score'],
      dtype='object')

FEATURE ENGINEERING AND NORMALIZING

In [4]:
label_encoder = LabelEncoder()
df['course_tag_encoded'] = label_encoder.fit_transform(df['course_tag'])

In [5]:
df['raw_score'] = df['completion_percentage'] * df['course_score']

# Step 2: Normalize the score
max_score = df['raw_score'].max()
min_score = df['raw_score'].min()

# Normalization formula: (score - min) / (max - min) * new_max
df['normalized_score'] = (df['raw_score'] - min_score) / (max_score - min_score) 
df.head()

Unnamed: 0,user_id,name,designation,course_id,course_title,course_tag,course_duration,modules_completed,total_modules,completion_percentage,course_score,performance_score,course_tag_encoded,raw_score,normalized_score
0,0e102525-4073-488a-904e-c5f00a709555,Terry Pena,Software Tester,a7e95994-0a4e-45ca-b383-99d5be874b27,Vision-oriented regional Graphic Interface,UI/UX Design,175.0,4.0,9.0,44.444444,45.0,45.0,7,2000.0,0.26087
1,10411182-6a30-41fb-b24c-1242a632057e,Erik Owens,Data Scientist,a8faead4-39cd-4b23-88e8-26b235b2371b,Synergized coherent synergy,Data Engineering,132.0,5.0,6.0,83.333333,74.0,74.0,1,6166.666667,0.804348
2,16eda4ba-42cd-4b9f-9db9-3a0e0285cab5,Amanda Hayden,Data Scientist,8c614264-41a5-448f-a207-c2a6bc6ed5df,Configurable didactic core,Data Science,168.0,3.0,6.0,50.0,89.0,89.0,2,4450.0,0.580435
3,17c2a5fb-7682-45e3-9bd0-2ddc6431ef00,Carol Howard,Data Scientist,31762771-9880-49ad-95d7-6280e016c4c8,Reactive real-time superstructure,Data Science,119.0,6.0,8.0,75.0,92.0,92.0,2,6900.0,0.9
4,1c4b38aa-0361-49eb-a5bd-372a50bc990e,James Matthews,AI Specialist,8c614264-41a5-448f-a207-c2a6bc6ed5df,Configurable didactic core,Data Science,168.0,2.0,5.0,40.0,56.0,56.0,2,2240.0,0.292174


In [6]:
required_columns= [
    'user_id','name','course_id','course_title','normalized_score'
]

df_final=df[required_columns]
df_final.head()

len(df_final)

99

In [7]:
df_final.drop_duplicates
len(df_final)

99

In [8]:
df_final_agg = df_final.groupby(['user_id', 'course_title'], as_index=False)['normalized_score'].mean()

# Create a pivot table: user_id as rows, course_title as columns, and normalized_score as values
df_pivot = df_final_agg.pivot(index='user_id', columns='course_title', values='normalized_score').fillna(0)

# Convert the pivot table to a sparse matrix
df_matrix = csr_matrix(df_pivot.values)

# Fit the NearestNeighbors model
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(df_matrix)

# Randomly choose an employee for recommendations
query_index = np.random.choice(df_pivot.shape[0])
query_user_id = df_pivot.index[query_index]

# Get the name of the querying user
query_user_name = df_final[df_final['user_id'] == query_user_id]['name'].values[0]
print(f"Query user ID: {query_user_id} (Name: {query_user_name})")

# Get recommendations
distances, indices = model_knn.kneighbors(df_pivot.iloc[query_index, :].values.reshape(1, -1), n_neighbors=6)

# Display nearest employees
print(f'\nNearest Employees for User ID {query_user_id} (Name: {query_user_name}):\n')
recommended_ids = []
for i in range(len(distances.flatten())):
    if i == 0:
        print('Self Match (Distance = 0):')
    else:
        recommended_id = df_pivot.index[indices.flatten()[i]]
        recommended_ids.append(recommended_id)
        user_name = df_final[df_final['user_id'] == recommended_id]['name'].values[0]  # Get the name
        print(f'User ID {recommended_id} (Name: {user_name}), with distance of {distances.flatten()[i]:.4f}')

# Gather courses from nearest employees
all_courses = set()
user_courses = set(df_final[df_final['user_id'] == query_user_id]['course_title'])

for user_id in recommended_ids:
    courses_taken = df_final[df_final['user_id'] == user_id]['course_title'].unique()
    all_courses.update(courses_taken)

# Determine courses to recommend
unique_courses = all_courses.difference(user_courses)

if unique_courses:
    print(f'\nRecommended Courses for User ID {query_user_id} (Name: {query_user_name}) (not previously taken):\n')
    for course in unique_courses:
        print(course)
else:
    # If no unique courses, suggest any course from the nearest employees
    print(f'\nAll courses have been taken by User ID {query_user_id} (Name: {query_user_name}). Suggesting courses from nearest employees:\n')
    suggested_courses = list(all_courses)
    for course in suggested_courses:
        print(course)

# Optionally calculate RMSE (if needed)
def calculate_rmse(recommended_ids, actual_scores):
    relevant_scores = df_final[df_final['user_id'].isin(recommended_ids)]

    if relevant_scores.empty:
        return float('nan')  # Return NaN if no relevant scores are found

    y_true = relevant_scores['normalized_score']
    y_pred = relevant_scores['normalized_score'].mean()  # Using the mean as a simple prediction

    rmse = np.sqrt(mean_squared_error(y_true, [y_pred] * len(y_true)))
    return rmse

# Calculate RMSE (if desired)
rmse_value = calculate_rmse(recommended_ids, df_final)
print(f'\nRMSE: {rmse_value:.4f}')


Query user ID: 603b5980-472e-4c94-aa8b-3c66f80bdefb (Name: Rebecca Wilson)

Nearest Employees for User ID 603b5980-472e-4c94-aa8b-3c66f80bdefb (Name: Rebecca Wilson):

Self Match (Distance = 0):
User ID 603b5980-472e-4c94-aa8b-3c66f80bdefb (Name: Rebecca Wilson), with distance of 0.0000
User ID 48247a82-c81c-4e7f-a7ef-00d92a9c6104 (Name: Christina Zamora), with distance of 0.0000
User ID 80c9bae6-95a5-43fc-8ca6-bcd1d53b52a3 (Name: William Robinson), with distance of 0.0000
User ID 1c4b38aa-0361-49eb-a5bd-372a50bc990e (Name: James Matthews), with distance of 1.0000
User ID 10411182-6a30-41fb-b24c-1242a632057e (Name: Erik Owens), with distance of 1.0000

Recommended Courses for User ID 603b5980-472e-4c94-aa8b-3c66f80bdefb (Name: Rebecca Wilson) (not previously taken):

Synergized coherent synergy
Configurable didactic core

RMSE: 0.2304


In [10]:
users_df = pd.read_csv('../data_engineering/data_warehouse/staging/users_staging.csv')
users_df.columns


Index(['user_id', 'name', 'email', 'designation', 'created_at'], dtype='object')

In [14]:
def recommend_courses_for_user(user_id, df_final, users_df, model_knn, df_pivot):
    # Get user index
    if user_id not in df_pivot.index:
        # print(f"User ID {user_id} not found in pivot table")
        return []

    query_index = df_pivot.index.get_loc(user_id)

    # Get the name of the querying user
    query_user_name = df_final[df_final['user_id'] == user_id]['name'].values[0]
    print(f"\nQuery user ID: {user_id} (Name: {query_user_name})")

    # Get recommendations
    distances, indices = model_knn.kneighbors(df_pivot.iloc[query_index, :].values.reshape(1, -1), n_neighbors=6)

    # Display nearest employees
    print(f'\nNearest Employees for User ID {user_id} (Name: {query_user_name}):\n')
    recommended_ids = []
    for i in range(len(distances.flatten())):
        if i == 0:
            print('Self Match (Distance = 0):')
        else:
            recommended_id = df_pivot.index[indices.flatten()[i]]
            recommended_ids.append(recommended_id)
            user_name = df_final[df_final['user_id'] == recommended_id]['name'].values[0]  # Get the name
            print(f'User ID {recommended_id} (Name: {user_name}), with distance of {distances.flatten()[i]:.4f}')

    # Gather courses from nearest employees
    all_courses = set()
    user_courses = set(df_final[df_final['user_id'] == user_id]['course_title'])

    for user_id in recommended_ids:
        courses_taken = df_final[df_final['user_id'] == user_id]['course_title'].unique()
        all_courses.update(courses_taken)

    # Determine courses to recommend
    unique_courses = all_courses.difference(user_courses)

    if unique_courses:
        print(f'\nRecommended Courses for User ID {user_id} (Name: {query_user_name}) (not previously taken):\n')
        for course in unique_courses:
            print(course)
        return list(unique_courses)
    else:
        # If no unique courses, suggest any course from the nearest employees
        print(f'\nAll courses have been taken by User ID {user_id} (Name: {query_user_name}). Suggesting courses from nearest employees:\n')
        suggested_courses = list(all_courses)
        for course in suggested_courses:
            print(course)
        return suggested_courses

# Function to recommend courses for all users in users_df
def recommend_courses_for_all_users(users_df, df_final, df_pivot, model_knn):
    recommendations = {}
    for user_id in users_df['user_id']:
        recommended_courses = recommend_courses_for_user(user_id, df_final, users_df, model_knn, df_pivot)
        recommendations[user_id] = recommended_courses
    return recommendations

# Example usage:
all_user_recommendations = recommend_courses_for_all_users(users_df, df_final, df_pivot, model_knn)
print("\nCourse recommendations for all users:")
print(all_user_recommendations)


Query user ID: 2dd31992-9126-445e-ad42-779e8b16bb10 (Name: Heather Freeman)

Nearest Employees for User ID 2dd31992-9126-445e-ad42-779e8b16bb10 (Name: Heather Freeman):

Self Match (Distance = 0):
User ID e40bba8c-ae9b-4034-8a3b-d51b66db29f0 (Name: Robert Garcia), with distance of 0.5797
User ID 16eda4ba-42cd-4b9f-9db9-3a0e0285cab5 (Name: Amanda Hayden), with distance of 1.0000
User ID 0e102525-4073-488a-904e-c5f00a709555 (Name: Terry Pena), with distance of 1.0000
User ID 1c4b38aa-0361-49eb-a5bd-372a50bc990e (Name: James Matthews), with distance of 1.0000
User ID 1d190915-1796-4d7a-8781-caf004beb2c1 (Name: Mrs. Leslie Juarez), with distance of 1.0000

Recommended Courses for User ID 1d190915-1796-4d7a-8781-caf004beb2c1 (Name: Heather Freeman) (not previously taken):

Extended didactic approach
Configurable didactic core
Right-sized user-facing capability
Expanded full-range protocol
Compatible even-keeled Graphic Interface
Vision-oriented regional Graphic Interface

Query user ID: c6