In [48]:
import random
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
df = pd.read_csv('../data_engineering/data_warehouse/report/employee_performance_kpi.csv')

In [49]:
df.columns

Index(['user_id', 'name', 'designation', 'course_id', 'course_title',
       'course_tag', 'course_duration', 'modules_completed', 'total_modules',
       'completion_percentage', 'course_score', 'performance_score'],
      dtype='object')

FEATURE ENGINEERING AND NORMALIZING

In [50]:
label_encoder = LabelEncoder()
df['course_tag_encoded'] = label_encoder.fit_transform(df['course_tag'])

In [51]:
df['raw_score'] = df['completion_percentage'] * df['course_score']

# Step 2: Normalize the score
max_score = df['raw_score'].max()
min_score = df['raw_score'].min()

# Normalization formula: (score - min) / (max - min) * new_max
df['normalized_score'] = (df['raw_score'] - min_score) / (max_score - min_score) 
df.head()

Unnamed: 0,user_id,name,designation,course_id,course_title,course_tag,course_duration,modules_completed,total_modules,completion_percentage,course_score,performance_score,course_tag_encoded,raw_score,normalized_score
0,0e102525-4073-488a-904e-c5f00a709555,Terry Pena,Software Tester,a7e95994-0a4e-45ca-b383-99d5be874b27,Vision-oriented regional Graphic Interface,UI/UX Design,175.0,4.0,9.0,44.44,45.0,45.0,7,1999.8,0.260854
1,10411182-6a30-41fb-b24c-1242a632057e,Erik Owens,Data Scientist,a8faead4-39cd-4b23-88e8-26b235b2371b,Synergized coherent synergy,Data Engineering,132.0,5.0,6.0,83.33,74.0,74.0,1,6166.42,0.804348
2,16eda4ba-42cd-4b9f-9db9-3a0e0285cab5,Amanda Hayden,Data Scientist,8c614264-41a5-448f-a207-c2a6bc6ed5df,Configurable didactic core,Data Science,168.0,3.0,6.0,50.0,89.0,89.0,2,4450.0,0.580458
3,17c2a5fb-7682-45e3-9bd0-2ddc6431ef00,Carol Howard,Data Scientist,31762771-9880-49ad-95d7-6280e016c4c8,Reactive real-time superstructure,Data Science,119.0,6.0,8.0,75.0,92.0,92.0,2,6900.0,0.900036
4,1c4b38aa-0361-49eb-a5bd-372a50bc990e,James Matthews,AI Specialist,8c614264-41a5-448f-a207-c2a6bc6ed5df,Configurable didactic core,Data Science,168.0,2.0,5.0,40.0,56.0,56.0,2,2240.0,0.292186


In [52]:
required_columns= [
    'user_id','name','course_id','course_title','normalized_score'
]

df_final=df[required_columns]
df_final.head()

len(df_final)

99

In [53]:
df_final.drop_duplicates
len(df_final)

99

KNN ALGORITHM

In [54]:
df_final_agg = df_final.groupby(['user_id', 'course_title'], as_index=False)['normalized_score'].mean()

# Create a pivot table: user_id as rows, course_title as columns, and normalized_score as values
df_pivot = df_final_agg.pivot(index='user_id', columns='course_title', values='normalized_score').fillna(0)

# Convert the pivot table to a sparse matrix
df_matrix = csr_matrix(df_pivot.values)

# Fit the NearestNeighbors model
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(df_matrix)

# Randomly choose an employee for recommendations
query_index = np.random.choice(df_pivot.shape[0])
query_user_id = df_pivot.index[query_index]

# Get the name of the querying user
query_user_name = df_final[df_final['user_id'] == query_user_id]['name'].values[0]
print(f"Query user ID: {query_user_id} (Name: {query_user_name})")

# Get recommendations
distances, indices = model_knn.kneighbors(df_pivot.iloc[query_index, :].values.reshape(1, -1), n_neighbors=6)

# Display nearest employees
print(f'\nNearest Employees for User ID {query_user_id} (Name: {query_user_name}):\n')
recommended_ids = []
for i in range(len(distances.flatten())):
    if i == 0:
        print('Self Match (Distance = 0):')
    else:
        recommended_id = df_pivot.index[indices.flatten()[i]]
        recommended_ids.append(recommended_id)
        user_name = df_final[df_final['user_id'] == recommended_id]['name'].values[0]  # Get the name
        print(f'User ID {recommended_id} (Name: {user_name}), with distance of {distances.flatten()[i]:.4f}')

# Gather courses from nearest employees
all_courses = set()
user_courses = set(df_final[df_final['user_id'] == query_user_id]['course_title'])

for user_id in recommended_ids:
    courses_taken = df_final[df_final['user_id'] == user_id]['course_title'].unique()
    all_courses.update(courses_taken)

# Determine courses to recommend
unique_courses = all_courses.difference(user_courses)

if unique_courses:
    print(f'\nRecommended Courses for User ID {query_user_id} (Name: {query_user_name}) (not previously taken):\n')
    for course in unique_courses:
        print(course)
else:
    # If no unique courses, suggest any course from the nearest employees
    print(f'\nAll courses have been taken by User ID {query_user_id} (Name: {query_user_name}). Suggesting courses from nearest employees:\n')
    suggested_courses = list(all_courses)
    for course in suggested_courses:
        print(course)

# Optionally calculate RMSE (if needed)
def calculate_rmse(recommended_ids, actual_scores):
    relevant_scores = df_final[df_final['user_id'].isin(recommended_ids)]

    if relevant_scores.empty:
        return float('nan')  # Return NaN if no relevant scores are found

    y_true = relevant_scores['normalized_score']
    y_pred = relevant_scores['normalized_score'].mean()  # Using the mean as a simple prediction

    rmse = np.sqrt(mean_squared_error(y_true, [y_pred] * len(y_true)))
    return rmse

# Calculate RMSE (if desired)
rmse_value = calculate_rmse(recommended_ids, df_final)
print(f'\nRMSE: {rmse_value:.4f}')


Query user ID: 6c7d0eb6-cf70-47f0-a231-2ea13472a27d (Name: Crystal Doyle)

Nearest Employees for User ID 6c7d0eb6-cf70-47f0-a231-2ea13472a27d (Name: Crystal Doyle):

Self Match (Distance = 0):
User ID b857e691-da88-48fa-a89e-88b2b1ccdcb5 (Name: Eric Stevenson), with distance of 0.0000
User ID 769946e5-25bd-4af4-bc79-0fe0c030d392 (Name: Alicia Taylor), with distance of 0.0000
User ID 6c7d0eb6-cf70-47f0-a231-2ea13472a27d (Name: Crystal Doyle), with distance of 0.0000
User ID 0e102525-4073-488a-904e-c5f00a709555 (Name: Terry Pena), with distance of 0.0000
User ID b3a28440-12f4-4d6b-80dc-e60f91dca2f4 (Name: Christy Jordan), with distance of 0.2422

Recommended Courses for User ID 6c7d0eb6-cf70-47f0-a231-2ea13472a27d (Name: Crystal Doyle) (not previously taken):

Cross-group fresh-thinking knowledge user
Configurable didactic core

RMSE: 0.2543


In [55]:

users_df = pd.read_csv('../data_engineering/data_warehouse/staging/users_staging.csv')

users_df.columns


Index(['user_id', 'name', 'email', 'designation', 'created_at'], dtype='object')

In [56]:
def recommend_courses_for_user(user_id, df_final, users_df, model_knn, df_pivot):
    # Get user index
    if user_id not in df_pivot.index:
        # print(f"User ID {user_id} not found in pivot table")
        return []

    query_index = df_pivot.index.get_loc(user_id)

    # Get the name of the querying user
    query_user_name = df_final[df_final['user_id'] == user_id]['name'].values[0]
    print(f"\nQuery user ID: {user_id} (Name: {query_user_name})")

    # Get recommendations
    distances, indices = model_knn.kneighbors(df_pivot.iloc[query_index, :].values.reshape(1, -1), n_neighbors=6)

    # Display nearest employees
    print(f'\nNearest Employees for User ID {user_id} (Name: {query_user_name}):\n')
    recommended_ids = []
    for i in range(len(distances.flatten())):
        if i == 0:
            print('Self Match (Distance = 0):')
        else:
            recommended_id = df_pivot.index[indices.flatten()[i]]
            recommended_ids.append(recommended_id)
            user_name = df_final[df_final['user_id'] == recommended_id]['name'].values[0]  # Get the name
            print(f'User ID {recommended_id} (Name: {user_name}), with distance of {distances.flatten()[i]:.4f}')

    # Gather courses from nearest employees
    all_courses = set()
    user_courses = set(df_final[df_final['user_id'] == user_id]['course_title'])

    for user_id in recommended_ids:
        courses_taken = df_final[df_final['user_id'] == user_id]['course_title'].unique()
        all_courses.update(courses_taken)

    # Determine courses to recommend
    unique_courses = all_courses.difference(user_courses)

    if unique_courses:
        print(f'\nRecommended Courses for User ID {user_id} (Name: {query_user_name}) (not previously taken):\n')
        for course in unique_courses:
            print(course)
        return list(unique_courses)
    else:
        # If no unique courses, suggest any course from the nearest employees
        print(f'\nAll courses have been taken by User ID {user_id} (Name: {query_user_name}). Suggesting courses from nearest employees:\n')
        suggested_courses = list(all_courses)
        for course in suggested_courses:
            print(course)
        return suggested_courses

# Function to recommend courses for all users in users_df
def recommend_courses_for_all_users(users_df, df_final, df_pivot, model_knn):
    recommendations = {}
    for user_id in users_df['user_id']:
        recommended_courses = recommend_courses_for_user(user_id, df_final, users_df, model_knn, df_pivot)
        recommendations[user_id] = recommended_courses
    return recommendations

# Example usage:
all_user_recommendations = recommend_courses_for_all_users(users_df, df_final, df_pivot, model_knn)
print("\nCourse recommendations for all users:")
print(all_user_recommendations)


Query user ID: 2dd31992-9126-445e-ad42-779e8b16bb10 (Name: Heather Freeman)

Nearest Employees for User ID 2dd31992-9126-445e-ad42-779e8b16bb10 (Name: Heather Freeman):

Self Match (Distance = 0):
User ID e40bba8c-ae9b-4034-8a3b-d51b66db29f0 (Name: Robert Garcia), with distance of 0.5797
User ID b3a28440-12f4-4d6b-80dc-e60f91dca2f4 (Name: Christy Jordan), with distance of 1.0000
User ID b7154f0a-c47f-4f45-894d-4208cac1e1f8 (Name: Anthony Hunter), with distance of 1.0000
User ID b8320aaf-af61-4f59-a6f4-ed314015fac9 (Name: Christopher Jones), with distance of 1.0000
User ID b799be0a-4072-45d5-a159-80edc82c3f69 (Name: Travis Davis), with distance of 1.0000

Recommended Courses for User ID b799be0a-4072-45d5-a159-80edc82c3f69 (Name: Heather Freeman) (not previously taken):

Cross-group fresh-thinking knowledge user
Vision-oriented regional Graphic Interface
Extended didactic approach
Configurable didactic core
Expanded full-range protocol
Decentralized maximized data-warehouse
Right-sized

In [57]:
data = pd.read_csv("../data_engineering/data_warehouse/report/employee_performance_kpi.csv")
print(data.columns)
data.head()

Index(['user_id', 'name', 'designation', 'course_id', 'course_title',
       'course_tag', 'course_duration', 'modules_completed', 'total_modules',
       'completion_percentage', 'course_score', 'performance_score'],
      dtype='object')


Unnamed: 0,user_id,name,designation,course_id,course_title,course_tag,course_duration,modules_completed,total_modules,completion_percentage,course_score,performance_score
0,0e102525-4073-488a-904e-c5f00a709555,Terry Pena,Software Tester,a7e95994-0a4e-45ca-b383-99d5be874b27,Vision-oriented regional Graphic Interface,UI/UX Design,175.0,4.0,9.0,44.44,45.0,45.0
1,10411182-6a30-41fb-b24c-1242a632057e,Erik Owens,Data Scientist,a8faead4-39cd-4b23-88e8-26b235b2371b,Synergized coherent synergy,Data Engineering,132.0,5.0,6.0,83.33,74.0,74.0
2,16eda4ba-42cd-4b9f-9db9-3a0e0285cab5,Amanda Hayden,Data Scientist,8c614264-41a5-448f-a207-c2a6bc6ed5df,Configurable didactic core,Data Science,168.0,3.0,6.0,50.0,89.0,89.0
3,17c2a5fb-7682-45e3-9bd0-2ddc6431ef00,Carol Howard,Data Scientist,31762771-9880-49ad-95d7-6280e016c4c8,Reactive real-time superstructure,Data Science,119.0,6.0,8.0,75.0,92.0,92.0
4,1c4b38aa-0361-49eb-a5bd-372a50bc990e,James Matthews,AI Specialist,8c614264-41a5-448f-a207-c2a6bc6ed5df,Configurable didactic core,Data Science,168.0,2.0,5.0,40.0,56.0,56.0


COLLABORATIVE FILTERING 

In [58]:
# Encode course tags to recommend based on similarity
label_encoder = LabelEncoder()
data["course_tag_encoded"] = label_encoder.fit_transform(data["course_tag"])

# Create a pivot table of employees and courses, using course_score as values
pivot_table = data.pivot_table(
    index="user_id", columns="course_id", values="course_score"
).fillna(0)

# Calculate cosine similarity between employees based on their course performance
employee_similarity = cosine_similarity(pivot_table)
employee_similarity_df = pd.DataFrame(
    employee_similarity, index=pivot_table.index, columns=pivot_table.index
)


# Function to get top N similar employees
def get_top_similar_employees(user_id, top_n=3):
    similar_scores = employee_similarity_df[user_id]
    top_similar_employees = (
        similar_scores.sort_values(ascending=False).iloc[1 : top_n + 1].index
    )
    return top_similar_employees


# Get courses to recommend for an employee (those they have not already completed)
def recommend_courses(employee_id, top_n=3):
    # Get similar employees (for the sake of example, assuming 'get_top_similar_employees' works)
    similar_employees = get_top_similar_employees(employee_id, top_n)

    # Get courses completed by similar employees that the current employee hasn't completed
    employee_courses = set(data[data["user_id"] == employee_id]["course_id"])
    similar_employee_courses = data[data["user_id"].isin(similar_employees)][
        "course_id"
    ].unique()

    # Filter out courses already completed by the current employee
    recommended_courses = [
        course for course in similar_employee_courses if course not in employee_courses
    ]

    # Fetch course titles and IDs for recommendations
    recommended_course_details = data[data["course_id"].isin(recommended_courses)][
        ["course_title", "course_id"]
    ].drop_duplicates()

    return recommended_course_details


# Generate and print recommendations for all employees
for user_id in users_df["user_id"]:
    employee_id = user_id

    # Check if the employee exists in the similarity matrix
    if employee_id not in employee_similarity_df.index:
        print(f"Employee {employee_id} not found in similarity matrix.")
        continue

    # Get the employee's name
    employee_name = data[data["user_id"] == employee_id]["name"].iloc[0]

    # Generate recommendations
    try:
        recommendations = recommend_courses(employee_id)
        if recommendations.empty:
            print(
                f"No new course recommendations for Employee {employee_name} (ID: {employee_id})."
            )
        else:
            print(f"Recommendations for Employee {employee_name} (ID: {employee_id}):")
            for _, row in recommendations.iterrows():
                print(
                    f"  - Course Title: {row['course_title']}, Course ID: {row['course_id']}"
                )
            print(
                "\n"
            )  # Add a newline for better readability between employees' recommendations
    except Exception as e:
        print(f"Error generating recommendations for {employee_id}: {e}")

Employee f19bff61-81de-4e03-8147-008253a87032 not found in similarity matrix.
Employee 2f53cffa-1270-4f2e-9e01-726bc3bfed45 not found in similarity matrix.
Employee 1ce90841-2636-451c-bb17-b52b3e957885 not found in similarity matrix.
Recommendations for Employee Heather Freeman (ID: 2dd31992-9126-445e-ad42-779e8b16bb10):
  - Course Title: Reactive background matrices, Course ID: b4c47ef9-ab01-459e-8d3f-51b60dbea9ab
  - Course Title: Optional content-based matrix, Course ID: a78b1ec0-ce93-4128-868c-66f14f6c2956
  - Course Title: Reactive transitional database, Course ID: caf6a647-1ecb-40f5-bf37-025d5a3dac92
  - Course Title: Profit-focused reciprocal interface, Course ID: 00ac38e0-4297-47bd-bfc4-11e55ea46076
  - Course Title: Expanded full-range protocol, Course ID: 456db36d-de96-4dfa-865f-504165cce6c9
  - Course Title: Right-sized user-facing capability, Course ID: d42a0ca8-2308-4124-9397-81be2ba03c55
  - Course Title: Extended didactic approach, Course ID: 7aa5f280-dc8b-4b39-866d-ef08

CONTENT-BASED FILTERING

In [59]:
df = pd.read_csv("../data_engineering/data_warehouse/report/employee_performance_kpi.csv")
print(df.columns)
#df.head()

Index(['user_id', 'name', 'designation', 'course_id', 'course_title',
       'course_tag', 'course_duration', 'modules_completed', 'total_modules',
       'completion_percentage', 'course_score', 'performance_score'],
      dtype='object')


In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine course_tag and designation into a single feature
df["combined_tags"] = df["course_tag"] + " " + df["designation"]

# Replace spaces with underscores in the 'combined_tags' column for multi-word terms
df["combined_tags"] = df["combined_tags"].str.replace(" ", "_")

# Vectorizing the course tags and designations
vectorizer = TfidfVectorizer()
course_vectors = vectorizer.fit_transform(df["combined_tags"])

# Convert the vectors into a DataFrame
course_vectors_df = pd.DataFrame(
    course_vectors.toarray(),
    index=df["course_id"],
    columns=vectorizer.get_feature_names_out(),
)

# Create employee profiles based on their completed courses
employee_profiles = df.groupby("user_id")["combined_tags"].apply(lambda x: " ".join(x))

# Vectorize employee profiles
employee_vectors = vectorizer.transform(employee_profiles)

# Convert employee vectors to DataFrame
employee_vectors_df = pd.DataFrame(
    employee_vectors.toarray(),
    index=employee_profiles.index,
    columns=vectorizer.get_feature_names_out(),
)


# Compute cosine similarity between employees and courses
similarity_matrix = cosine_similarity(employee_vectors_df, course_vectors_df)

# Convert similarity matrix to DataFrame
similarity_df = pd.DataFrame(
    similarity_matrix, index=employee_profiles.index, columns=df["course_id"]
)

# View similarity scores
# print(similarity_df)


# Function to recommend top N courses based on similarity
def recommend_courses(employee_id, top_n=3):
    # Sort courses by similarity for the given employee
    sorted_courses = similarity_df.loc[employee_id].sort_values(ascending=False)

    # Filter out courses that the employee has already taken
    assigned_courses = df[df["user_id"] == employee_id]["course_id"].tolist()
    recommendations = sorted_courses.index.difference(assigned_courses)

    # Get top N recommended courses (course_id and course_title)
    top_recommendations = df[df["course_id"].isin(recommendations)].head(top_n)

    return top_recommendations[["course_id", "course_title"]]


# Generate and print recommendations for all employees
for employee_id in employee_profiles.index:
    # Get employee name
    employee_name = df[df["user_id"] == employee_id]["name"].iloc[0]

    # Get course recommendations
    recommendations = recommend_courses(employee_id)

    # Print employee name, employee_id, and course recommendations
    print(f"Recommendations for Employee: {employee_name} (ID: {employee_id})")
    for _, row in recommendations.iterrows():
        print(f"  - Course Title: {row['course_title']}, Course ID: {row['course_id']}")
    print("\n")  # Add a newline for better readability between employee recommendations

Recommendations for Employee: Terry Pena (ID: 0e102525-4073-488a-904e-c5f00a709555)
  - Course Title: Synergized coherent synergy, Course ID: a8faead4-39cd-4b23-88e8-26b235b2371b
  - Course Title: Configurable didactic core, Course ID: 8c614264-41a5-448f-a207-c2a6bc6ed5df
  - Course Title: Reactive real-time superstructure, Course ID: 31762771-9880-49ad-95d7-6280e016c4c8


Recommendations for Employee: Erik Owens (ID: 10411182-6a30-41fb-b24c-1242a632057e)
  - Course Title: Vision-oriented regional Graphic Interface, Course ID: a7e95994-0a4e-45ca-b383-99d5be874b27
  - Course Title: Configurable didactic core, Course ID: 8c614264-41a5-448f-a207-c2a6bc6ed5df
  - Course Title: Reactive real-time superstructure, Course ID: 31762771-9880-49ad-95d7-6280e016c4c8


Recommendations for Employee: Amanda Hayden (ID: 16eda4ba-42cd-4b9f-9db9-3a0e0285cab5)
  - Course Title: Vision-oriented regional Graphic Interface, Course ID: a7e95994-0a4e-45ca-b383-99d5be874b27
  - Course Title: Synergized coheren