In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds

In [3]:
courses = pd.read_csv("../data/courses.csv")

In [5]:
courses['course_id'] = np.arange(1, len(courses) + 1)

In [6]:
courses.tail()

Unnamed: 0,Title,Instructor,Keywords,Learn,Description,course_id
451,Azure Data scientist Associate DP100,,Azure DP100 Certification,"Learn basics of Python programming language, H...","Through this course, candidates for the Azure ...",452
452,Digital Marketing,Ankur Khanna,"Digital Marketing course, Digital Marketing Ce...",Basics of Digital Marketing and Business Land...,A digital marketing course is a comprehensive ...,453
453,GIT,Sourangshu Pal,"best course on GIT, Git course online, best co...","Git Introduction, Git Commands, Git Branching,...",The Git course will teach you how to utilise t...,454
454,Linux,Sourangshu Pal,"linux administration course, linux course fees...","Linux Introduction, Setting up Our Linux Space...",This Linux course looks at the tools and techn...,455
455,R Programming,Shlok Pandey,"r programming course, best r programming cours...","Introduction to basics of R, R Matrices and Ar...",R is a programming language used for statistic...,456


In [7]:
courses.isna().sum()

Title            0
Instructor     134
Keywords         3
Learn            6
Description      0
course_id        0
dtype: int64

In [8]:
courses['Instructor']=courses['Instructor'].fillna(' ')
courses['Keywords']=courses['Keywords'].fillna(' ')
courses['Learn']=courses['Learn'].fillna(' ')

In [9]:
courses.isna().sum()

Title          0
Instructor     0
Keywords       0
Learn          0
Description    0
course_id      0
dtype: int64

In [10]:
# Set random seed for reproducibility
np.random.seed(42)

# Parameters
num_users = 1000  # Number of unique users
num_courses = 456  # Number of unique courses
num_ratings = 10000  # Number of ratings to generate

# Generate random user IDs, course IDs, and ratings
user_ids = np.random.randint(1, num_users + 1, num_ratings)
course_ids = np.random.randint(1, num_courses + 1, num_ratings)
ratings = np.random.randint(1, 6, num_ratings)  # Ratings between 1 and 5

# Create the DataFrame
ratings_df = pd.DataFrame({
    'user_id': user_ids,
    'course_id': course_ids,
    'rating': ratings
})

# Remove duplicates: keep the first occurrence if duplicate exists
ratings_df = ratings_df.drop_duplicates(subset=['user_id', 'course_id'])

# Ensure the DataFrame has the required number of ratings
while len(ratings_df) < num_ratings:
    # Generate additional ratings
    additional_user_ids = np.random.randint(1, num_users + 1, num_ratings - len(ratings_df))
    additional_course_ids = np.random.randint(1, num_courses + 1, num_ratings - len(ratings_df))
    additional_ratings = np.random.randint(1, 6, num_ratings - len(ratings_df))

    # Create additional DataFrame and append
    additional_df = pd.DataFrame({
        'user_id': additional_user_ids,
        'course_id': additional_course_ids,
        'rating': additional_ratings
    })

    # Concatenate and remove duplicates again
    ratings_df = pd.concat([ratings_df, additional_df]).drop_duplicates(subset=['user_id', 'course_id'])

# Display a sample of the generated DataFrame
print(ratings_df.head())
print(f"Total unique ratings: {len(ratings_df)}")

   user_id  course_id  rating
0      103        442       3
1      436        279       2
2      861        251       5
3      271        310       4
4      107        208       3
Total unique ratings: 10000


In [11]:
# Set random seed for reproducibility
np.random.seed(42)

# Parameters
num_users = 1000  # Number of unique users

# Define IT roles and goals
roles = [
    'Data Scientist', 'Software Engineer', 'AI Specialist', 
    'Machine Learning Engineer', 'Data Analyst', 'DevOps Engineer', 
    'Cybersecurity Analyst', 'Database Administrator', 
    'Cloud Engineer', 'Business Intelligence Analyst'
]
goals = [
    'Learn ML', 'Improve Python', 'Deepen AI knowledge', 
    'Master SQL', 'Enhance Data Visualization', 'Boost Cloud Skills', 
    'Strengthen Cybersecurity', 'Optimize Databases', 
    'Advance in DevOps', 'Explore BI Tools'
]

# Generate random user data
user_ids = np.arange(1, num_users + 1)
user_roles = np.random.choice(roles, num_users)
user_goals = np.random.choice(goals, num_users)

# Create DataFrame
users = pd.DataFrame({
    'user_id': user_ids,
    'role': user_roles,
    'goal': user_goals
})

# Display a sample of the generated DataFrame
users.head()

Unnamed: 0,user_id,role,goal
0,1,Cybersecurity Analyst,Learn ML
1,2,Machine Learning Engineer,Optimize Databases
2,3,Database Administrator,Master SQL
3,4,Data Analyst,Master SQL
4,5,Cybersecurity Analyst,Enhance Data Visualization


In [12]:
# Aggregating interaction data
user_course_count = ratings_df.groupby('user_id').size().to_frame('course_count')
user_enrollments = ratings_df.groupby('user_id')['course_id'].nunique().to_frame('unique_courses')
user_avg_rating = ratings_df.groupby('user_id')['rating'].mean().to_frame('avg_rating')
user_total_interactions = ratings_df.groupby('user_id')['rating'].sum().to_frame('total_interactions')

In [13]:
# Update feature store with roles and goals
user_feature_store = users.set_index('user_id').join([user_course_count, user_enrollments, user_avg_rating, user_total_interactions])
user_feature_store = user_feature_store.fillna(0)

In [14]:
print("Updated User Feature Store:")
user_feature_store

Updated User Feature Store:


Unnamed: 0_level_0,role,goal,course_count,unique_courses,avg_rating,total_interactions
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Cybersecurity Analyst,Learn ML,16,16,3.625000,58
2,Machine Learning Engineer,Optimize Databases,9,9,3.000000,27
3,Database Administrator,Master SQL,9,9,3.333333,30
4,Data Analyst,Master SQL,5,5,2.600000,13
5,Cybersecurity Analyst,Enhance Data Visualization,8,8,2.250000,18
...,...,...,...,...,...,...
996,Business Intelligence Analyst,Deepen AI knowledge,7,7,2.857143,20
997,Business Intelligence Analyst,Strengthen Cybersecurity,10,10,3.400000,34
998,Database Administrator,Enhance Data Visualization,7,7,3.142857,22
999,Software Engineer,Improve Python,9,9,3.111111,28


In [15]:
# Combine the course title and description into a single text field
courses['combined_text'] = courses['Title'] + " " + courses['Description']+ " " + courses['Instructor'] + " " + courses['Learn'] + " " + courses['Keywords']

# Vectorize the course descriptions and user context using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(courses['combined_text'])

In [31]:
def match_courses_with_context(user_id, top_n=3):
    """
    Match courses based on user context using TF-IDF and cosine similarity.

    Parameters:
    - user_id (int): ID of the user for whom recommendations are generated.
    - top_n (int): Number of top recommendations to return.

    Returns:
    - List of course IDs (int) representing the top context-based recommendations.
    """
    try:
        # Create a user context string by combining role and goal
        user_context_str = users.loc[user_id, 'role'] + " " + users.loc[user_id, 'goal']
        
        # Transform the context string into a TF-IDF vector
        user_context_vector = tfidf_vectorizer.transform([user_context_str])
        
        # Compute cosine similarities between user context and course descriptions
        cosine_similarities = cosine_similarity(user_context_vector, tfidf_matrix).flatten()
        
        # Find the top-N most similar courses
        top_n_indices = cosine_similarities.argsort()[-top_n:][::-1]
        context_recommendations = courses.iloc[top_n_indices]
        
        # Return the course IDs for the top recommendations
        return context_recommendations['course_id'].tolist()
    
    except KeyError as e:
        print(f"Error: {e}")
        return []

    except Exception as e:
        print(f"An error occurred: {e}")
        return []

In [32]:
def item_item_recommendations(user_id, top_n=2):
    course_ratings = ratings.pivot(index='user_id', columns='course_id', values='rating').fillna(0)
    item_similarity = cosine_similarity(course_ratings.T)
    user_courses = ratings[ratings['user_id'] == user_id]['course_id'].tolist()
    
    similar_items = []
    for course in user_courses:
        similar_items.extend(item_similarity[course-101].argsort()[::-1][:top_n])
    
    return list(set(similar_items))[:top_n]

In [33]:
# Create the user-item interaction matrix
user_item_matrix = ratings_df.pivot(index='user_id', columns='course_id', values='rating').fillna(0)

In [34]:
# Function for User-User Collaborative Filtering
def user_user_recommendations(user_id, top_n=3):
    user_vector = user_item_matrix.loc[user_id].values.reshape(1, -1)
    user_similarity = cosine_similarity(user_vector, user_item_matrix)[0]
    
    similar_users = user_similarity.argsort()[-top_n-1:-1][::-1]
    recommended_items = user_item_matrix.iloc[similar_users].mean(axis=0).sort_values(ascending=False)
    
    recommended_courses = recommended_items.index[recommended_items > 0].tolist()
    return recommended_courses[:top_n]

In [35]:
# Function for Item-Item Collaborative Filtering
def item_item_recommendations(user_id, top_n=3):
    user_ratings = user_item_matrix.loc[user_id]
    rated_items = user_ratings[user_ratings > 0].index.tolist()
    
    item_similarity = cosine_similarity(user_item_matrix.T)
    item_recommendations = item_similarity.dot(user_ratings).flatten()
    
    recommended_items = pd.Series(item_recommendations, index=user_item_matrix.columns).sort_values(ascending=False)
    recommended_courses = recommended_items.index[~recommended_items.index.isin(rated_items)].tolist()
    return recommended_courses[:top_n]

In [36]:
# Function for Matrix Factorization using SVD
def svd_recommendations(user_id, top_n=3):
    U, sigma, Vt = svds(user_item_matrix, k=2)
    sigma = np.diag(sigma)
    user_factors = np.dot(np.dot(U, sigma), Vt)
    
    user_ratings = user_factors[user_id - 1]
    recommended_items = pd.Series(user_ratings, index=user_item_matrix.columns).sort_values(ascending=False)
    
    recommended_courses = recommended_items.index[recommended_items > 0].tolist()
    return recommended_courses[:top_n]

In [38]:
# Combine all methods into a hybrid recommendation system
def hybrid_recommendations_with_context_and_content(user_id, top_n=3):
    user_user_recs = user_user_recommendations(user_id, top_n)
    item_item_recs = item_item_recommendations(user_id, top_n)
    svd_recs = svd_recommendations(user_id, top_n)
    context_recs = match_courses_with_context(user_id, top_n)
    
    combined_recs = list(set(user_user_recs + item_item_recs + svd_recs + context_recs))
    recommended_courses = courses[courses['course_id'].isin(combined_recs)].head(top_n)
    
    return recommended_courses

# Test the hybrid system
user_id = 1
recommended_courses = hybrid_recommendations_with_context_and_content(user_id=user_id, top_n=10)
print(f"Hybrid Recommendations with Context and Content for User {user_id}:")
recommended_courses[['course_id', 'Title', 'Description']]

Hybrid Recommendations with Context and Content for User 1:


Unnamed: 0,course_id,Title,Description
16,17,Sensor Fault Prediction,The Air Pressure System (APS) is a critical co...
17,18,Text Summarization,This is an advanced NLP project where we take ...
18,19,The Ultimate Guide To OpenAI GPT-3 & Fine Tune...,Generative Pre-trained Transformer 3 (GPT-3; s...
29,30,Machine Learning Bootcamp Tech Neuron,In this Machine Learning Bootcamp you will lea...
31,32,Machine Learning Bootcamp,In this Machine Learning Bootcamp you will lea...
40,41,Pre Ethical Hacking Community Class,Worried about getting into Cyber Security Care...
44,45,C Sharp Programming,Learn the fundamentals of C# programming.
46,47,Complete iOS 16 Developer with Swift and 8 Apps,Learn iOS development with SwiftUI and buildin...
67,68,Pro Aptitude - Data Structures and Algorithms,This course is designed mostly for Data struct...
74,75,Pro Max Interview Preparation Edition 1,Pro Max Edition 1. These are interview prepara...


In [54]:
user_feature_store[user_feature_store.index==5]

Unnamed: 0_level_0,role,goal,course_count,unique_courses,avg_rating,total_interactions
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5,Cybersecurity Analyst,Enhance Data Visualization,8,8,2.25,18


In [53]:
def hybrid_recommendations_with_context_and_content(user_id, top_n=3, weights=None):
    """
    Generate hybrid course recommendations based on multiple methods with weighting.

    Parameters:
    - user_id (int): ID of the user to generate recommendations for.
    - top_n (int): Number of top recommendations to return.
    - weights (dict): Optional weights for different recommendation methods. 
                      Expected keys are 'user_user', 'item_item', 'svd', 'context'.

    Returns:
    - pd.DataFrame: DataFrame containing the top recommended courses.
    """
    if weights is None:
        # Default equal weights if none provided
        weights = {
            'user_user': 0.15,
            'item_item': 0.15,
            'svd': 0.3,
            'context': 0.4
        }

    # Generate recommendations from each method
    user_user_recs = user_user_recommendations(user_id, top_n)
    item_item_recs = item_item_recommendations(user_id, top_n)
    svd_recs = svd_recommendations(user_id, top_n)
    context_recs = match_courses_with_context(user_id, top_n)
    
    # Create a dictionary to hold scores for each course
    course_scores = {}

    # Function to update the score dictionary
    def update_scores(recommendations, weight):
        for course in recommendations:
            if course in course_scores:
                course_scores[course] += weight
            else:
                course_scores[course] = weight
    
    # Update scores based on each method's recommendations and corresponding weight
    update_scores(user_user_recs, weights.get('user_user', 1.0))
    update_scores(item_item_recs, weights.get('item_item', 1.0))
    update_scores(svd_recs, weights.get('svd', 1.0))
    update_scores(context_recs, weights.get('context', 1.0))
    
    # Sort the courses by their scores in descending order
    sorted_courses = sorted(course_scores.items(), key=lambda x: x[1], reverse=True)
    
    # Extract the top `top_n` courses
    top_course_ids = [course_id for course_id, score in sorted_courses[:top_n]]
    
    # Get the course details for the top recommendations
    recommended_courses = courses[courses['course_id'].isin(top_course_ids)]
    
    return recommended_courses

# Test the hybrid system
user_id = 5
recommended_courses = hybrid_recommendations_with_context_and_content(user_id=user_id, top_n=10)
print(f"Hybrid Recommendations with Context and Content for User {user_id}:")
recommended_courses[['course_id', 'Title', 'Description']]

Hybrid Recommendations with Context and Content for User 5:


Unnamed: 0,course_id,Title,Description
3,4,Mastering Databases,"This course is an introduction to databases, t..."
189,190,Data Analytics Bootcamp,Data analytics is a field that combines inform...
201,202,Power BI Foundations,Power BI is a luxury tool in the hands of busi...
352,353,Business Analytics Crash Course,Learn the power of using powerful visualizatio...
390,391,Business Analytics,Learn the power of using powerful visualizatio...
404,405,Business Analytics Masters with 3 month Intern...,Learn the power of using powerful visualizatio...
413,414,Stats for Beginners,If the goal of your career as a Data Scientist...
420,421,Tableau Foundation,Tableau is a powerful and fastest growing data...
421,422,PowerBI Foundation,Microsoft Power BI is a data and analytics rep...
450,451,Mern stack job preparation,This section is a Step by step guide to common...
