# User Based Collaborative Filtering

We will conduct user-based collaborative filtering on user-course interactions. Each interaction score combines the sentiment of the user's review with the rating the user gave to the course.

# Read data

### To reproduce, add a shortcut to our submission folder in your Google Drive

In [None]:
# Mount Google Drive at /content/drive (Colab only) so the submission folder can be reached
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Work from the submission folder so the relative dataset paths used below resolve
%cd /content/drive/My Drive/BT4222_Group_3_Submission

In [None]:
import pandas as pd
import numpy as np

# Training interactions; all_df is only used further down for its 'Course Name' column
df = pd.read_csv('cleaned_datasets/train.csv')
all_df = pd.read_csv('cleaned_datasets/final_users_courses.csv')

# Scale the demeaned rating by its largest magnitude so the result lies within [-1, 1]
demeaned_rating = df['Demeaned Rating']
max_abs_value_courses = demeaned_rating.abs().max()
df['Normalised Demeaned Rating'] = demeaned_rating.div(max_abs_value_courses)

# Sentiment Score using NLTK

In [None]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Single analyser instance reused for every review
sia = SentimentIntensityAnalyzer()


def _compound_sentiment(review):
    """Return the 'compound' polarity score of ``review`` (coerced to ``str`` first)."""
    return sia.polarity_scores(str(review))['compound']


# Score the text in the "Review" column, one value per row
df['sentiment_score'] = df['Review'].map(_compound_sentiment)

[nltk_data] Downloading package vader_lexicon to C:\Users\Quang
[nltk_data]     Anh\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
# Create a user-course interaction matrix (Train)
# Blend weights for the two signals that make up an interaction score
RATING_WEIGHT = 0.4
SENTIMENT_WEIGHT = 0.6

grouped_data_train = df.groupby('Reviewer')
user_course_interaction = {}

for user, user_data in grouped_data_train:
    # Compute all of this user's scores in one vectorised expression instead of
    # walking the rows with iterrows()
    scores = (RATING_WEIGHT * user_data['Normalised Demeaned Rating']
              + SENTIMENT_WEIGHT * user_data['sentiment_score'])
    # dict(zip(...)) keeps the last score when a user has several rows for the same
    # course, exactly like repeated assignment into a dict
    user_course_interaction[user] = dict(zip(user_data['Course Name'], scores))

# Rows are users, columns are courses; courses a user has no row for get a score of 0
user_course_matrix = pd.DataFrame.from_dict(user_course_interaction, orient="index").fillna(0)

user_course_matrix

Unnamed: 0,neural networks and deep learning,"improving deep neural networks: hyperparameter tuning, regularization and optimization",sequence models,what is data science?,programming for everybody getting started with python,supply chain principles,using python to interact with the operating system,structuring machine learning projects,the bits and bytes of computer networking,introduction to programming with matlab,...,write a feature length screenplay for film or television,writing and editing: word choice and word order,health behavior change: from evidence to action,community organizing for social justice,dentistry 101,mastering final cut pro,natural language processing with attention models,getting started with azure,web application technologies and django,high stakes leadership: leading in times of crisis
- a u s,0.584880,0.440700,0.00000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.00000,0.0,0.0
.,0.501420,0.000000,0.56148,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.00000,0.0,0.0
18-4n3 s k,0.594630,0.000000,0.00000,0.0,0.0,0.0,-0.047611,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.00000,0.0,0.0
18it042 c j,0.560940,0.493500,0.00000,0.0,0.0,0.0,0.000000,0.366900,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.00000,0.0,0.0
aadesh n,0.456479,0.322559,0.00000,0.0,0.0,0.0,0.000000,0.427321,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.00000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
melisa z,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.00000,0.0,0.0
amanpreet k,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.00000,0.0,0.0
colin h,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.625982,0.0,0.0,0.0,0.0,0.00000,0.0,0.00000,0.0,0.0
vallabhaneni s,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.26424,0.0,0.00000,0.0,0.0


## Obtain final user similarity matrix

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate the similarity between users
# Each row of user_course_matrix (one user's interaction scores) is compared with every other row
user_similarity = cosine_similarity(user_course_matrix)
# Label both axes with the user index so similarities can be looked up by user
user_similarity_df = pd.DataFrame(user_similarity, index=user_course_matrix.index, columns=user_course_matrix.index)

user_similarity_df

Unnamed: 0,- a u s,.,18-4n3 s k,18it042 c j,aadesh n,aakash a,aakash b,aakash g,aakash m,aakash s,...,jessica c p,lzaro v c j,silvana m c,allison h,carolyn m,melisa z,amanpreet k,colin h,vallabhaneni s,pragathi
- a u s,1.000000,0.531979,0.796113,0.895029,0.691150,0.299847,0.463747,0.559138,0.549008,0.315055,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.,0.531979,1.000000,0.663964,0.448891,0.376150,0.250074,0.386768,0.248363,0.457876,0.262758,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18-4n3 s k,0.796113,0.663964,1.000000,0.671771,0.562913,0.374239,0.578803,0.371679,0.685217,0.393221,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18it042 c j,0.895029,0.448891,0.671771,1.000000,0.850187,0.490037,0.391316,0.508768,0.463260,0.265848,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aadesh n,0.691150,0.376150,0.562913,0.850187,1.000000,0.496272,0.327904,0.511676,0.388190,0.222768,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
melisa z,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
amanpreet k,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
colin h,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
vallabhaneni s,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Scoring matrix

In [None]:
def predict_interactions(user_id, num_similar_users):
    """Predict one user's course scores from the users most similar to them.

    Reads the notebook-level ``user_similarity_df`` and ``user_course_matrix``.

    NOTE(review): the target user's own column entry is not removed before the
    top entries are taken, so the user is normally one of their own neighbours.

    :param user_id: Label of the target user (a column of ``user_similarity_df``).
    :param num_similar_users: How many of the highest-similarity users to average over.
    :returns: Pandas Series indexed by course with the similarity-weighted average
        of the selected users' interaction scores.
    """
    # Highest similarities first, truncated to the requested neighbourhood size
    ranked_similarities = user_similarity_df[user_id].sort_values(ascending=False)
    neighbour_sims = ranked_similarities.iloc[:num_similar_users]

    # Interaction rows of the selected neighbours
    neighbour_rows = user_course_matrix.loc[neighbour_sims.index]

    # Normalise the similarities so the weights sum to 1
    neighbour_weights = neighbour_sims / neighbour_sims.sum()

    # Weight each neighbour's row, then add the rows up course by course
    weighted_by_user = neighbour_rows.T * neighbour_weights
    return weighted_by_user.T.sum(axis=0)

# Create scoring matrix
# Number of most-similar users averaged for each prediction
NUM_SIMILAR_USERS = 10

# Collect one prediction row per user and build the frame once. Filling an empty
# frame row by row with .loc is slow and leaves every column with object dtype.
course_columns = user_course_matrix.columns
predicted_rows = [
    # reindex aligns each prediction to the course columns by label
    predict_interactions(user_id, NUM_SIMILAR_USERS).reindex(course_columns).to_numpy(dtype=float)
    for user_id in user_course_matrix.index
]
scoring_matrix = pd.DataFrame(predicted_rows, index=user_course_matrix.index, columns=course_columns, dtype=float)


def shift_and_scale_row_wise(df):
    """Min-max scale every row of ``df`` independently.

    Each row has its minimum subtracted and is then divided by its range
    (max - min), so the smallest value becomes 0 and the largest becomes 1.
    Rows whose values are all identical (including all-zero rows) have a range
    of 0; they are divided by 1 instead, which leaves them as all zeros.

    :param df: DataFrame of numeric values; it is not modified.
    :returns: New DataFrame with the same index and columns holding the scaled values.
    """
    row_min = df.min(axis=1)
    row_range = df.max(axis=1) - row_min

    # Avoid division by zero for constant rows
    row_range = row_range.mask(row_range == 0, 1)

    return df.sub(row_min, axis=0).div(row_range, axis=0)

# Shift and scale the values of the scoring matrix, one user (row) at a time
# Rows whose values are all identical come out as all zeros
scoring_matrix = shift_and_scale_row_wise(scoring_matrix)

scoring_matrix

Unnamed: 0,neural networks and deep learning,"improving deep neural networks: hyperparameter tuning, regularization and optimization",sequence models,what is data science?,programming for everybody getting started with python,supply chain principles,using python to interact with the operating system,structuring machine learning projects,the bits and bytes of computer networking,introduction to programming with matlab,...,write a feature length screenplay for film or television,writing and editing: word choice and word order,health behavior change: from evidence to action,community organizing for social justice,dentistry 101,mastering final cut pro,natural language processing with attention models,getting started with azure,web application technologies and django,high stakes leadership: leading in times of crisis
- a u s,1.0,0.753783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.,0.923574,0.005043,1.0,0.005043,0.005043,0.005043,0.005043,0.0,0.005043,0.005043,...,0.005043,0.005043,0.005043,0.005043,0.005043,0.005043,0.005043,0.005043,0.005043,0.005043
18-4n3 s k,1.0,0.010048,0.010048,0.010048,0.010048,0.010048,0.0,0.010048,0.010048,0.010048,...,0.010048,0.010048,0.010048,0.010048,0.010048,0.010048,0.010048,0.010048,0.010048,0.010048
18it042 c j,1.0,0.904644,0.0,0.0,0.0,0.0,0.0,0.702825,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aadesh n,1.0,0.398863,0.0,0.0,0.0,0.0,0.0,0.846745,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
melisa z,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
amanpreet k,0.087162,0.087162,0.087162,0.087162,0.087162,0.087162,0.087162,0.087162,0.087162,0.087162,...,0.087162,0.087162,0.087162,0.087162,0.087162,0.087162,0.087162,0.087162,0.087162,0.087162
colin h,0.0,0.0,0.124929,0.0,0.127383,0.0,0.0,0.0,0.109685,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
vallabhaneni s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144786,0.169973,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
# Create a dictionary to map each unique course name to a unique numerical ID
unique_courses = all_df['Course Name'].unique()
course_name_to_id = {course: i for i,course in enumerate(unique_courses, start=1)}

# Get the column names from your current matrix
current_columns = set(scoring_matrix.columns)

# Get the course names from the dictionary keys
course_names_from_dict = set(course_name_to_id.keys())

# Find the missing column names
missing_column_names = course_names_from_dict - current_columns

# List the missing names in their order of first appearance in all_df. Iterating the
# set directly gives an arbitrary order that can change between kernel sessions,
# which would make the column order of the matrix irreproducible.
missing_column_names_list = [course for course in course_name_to_id if course in missing_column_names]

# Append every missing course as an all-zero column in a single concat
if missing_column_names_list:
    zero_columns = pd.DataFrame(0, index=scoring_matrix.index, columns=missing_column_names_list)
    scoring_matrix = pd.concat([scoring_matrix, zero_columns], axis=1)

scoring_matrix

Unnamed: 0,neural networks and deep learning,"improving deep neural networks: hyperparameter tuning, regularization and optimization",sequence models,what is data science?,programming for everybody getting started with python,supply chain principles,using python to interact with the operating system,structuring machine learning projects,the bits and bytes of computer networking,introduction to programming with matlab,...,mastering final cut pro,natural language processing with attention models,getting started with azure,web application technologies and django,high stakes leadership: leading in times of crisis,how to manage a remote team,social work practice: advocating social justice and change,write your first novel,fundamentals of finance,fundamentals of machine learning for healthcare
- a u s,1.0,0.753783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
.,0.923574,0.005043,1.0,0.005043,0.005043,0.005043,0.005043,0.0,0.005043,0.005043,...,0.005043,0.005043,0.005043,0.005043,0.005043,0,0,0,0,0
18-4n3 s k,1.0,0.010048,0.010048,0.010048,0.010048,0.010048,0.0,0.010048,0.010048,0.010048,...,0.010048,0.010048,0.010048,0.010048,0.010048,0,0,0,0,0
18it042 c j,1.0,0.904644,0.0,0.0,0.0,0.0,0.0,0.702825,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
aadesh n,1.0,0.398863,0.0,0.0,0.0,0.0,0.0,0.846745,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
melisa z,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
amanpreet k,0.087162,0.087162,0.087162,0.087162,0.087162,0.087162,0.087162,0.087162,0.087162,0.087162,...,0.087162,0.087162,0.087162,0.087162,0.087162,0,0,0,0,0
colin h,0.0,0.0,0.124929,0.0,0.127383,0.0,0.0,0.0,0.109685,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
vallabhaneni s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144786,0.169973,0.0,...,1.0,0.0,0.0,0.0,0.0,0,0,0,0,0


In [None]:
# Zero out the score of every course a user already has in the training data

courses_taken = df.groupby('Reviewer')['Course Name'].agg(list)
user_courses_map = courses_taken.to_dict()

for reviewer, courses in user_courses_map.items():
    # Reviewers without a row in the scoring matrix are skipped
    if reviewer not in scoring_matrix.index:
        continue
    for course in courses:
        # Likewise skip courses that have no column
        if course not in scoring_matrix.columns:
            continue
        scoring_matrix.at[reviewer, course] = 0

# Evaluation

In [None]:
# Test interactions used as ground truth below; the first CSV column becomes the index
test_df = pd.read_csv('cleaned_datasets/test.csv', index_col=0)

In [None]:
# Inspect the scoring matrix after already-taken courses were set to 0
scoring_matrix

Unnamed: 0,neural networks and deep learning,"improving deep neural networks: hyperparameter tuning, regularization and optimization",sequence models,what is data science?,programming for everybody getting started with python,supply chain principles,using python to interact with the operating system,structuring machine learning projects,the bits and bytes of computer networking,introduction to programming with matlab,...,mastering final cut pro,natural language processing with attention models,getting started with azure,web application technologies and django,high stakes leadership: leading in times of crisis,how to manage a remote team,social work practice: advocating social justice and change,write your first novel,fundamentals of finance,fundamentals of machine learning for healthcare
- a u s,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
.,0,0.005043,0,0.005043,0.005043,0.005043,0.005043,0.0,0.005043,0.005043,...,0.005043,0.005043,0.005043,0.005043,0.005043,0,0,0,0,0
18-4n3 s k,0,0.010048,0.010048,0.010048,0.010048,0.010048,0,0.010048,0.010048,0.010048,...,0.010048,0.010048,0.010048,0.010048,0.010048,0,0,0,0,0
18it042 c j,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
aadesh n,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
melisa z,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
amanpreet k,0.087162,0.087162,0.087162,0.087162,0.087162,0.087162,0.087162,0.087162,0.087162,0.087162,...,0.087162,0.087162,0.087162,0.087162,0.087162,0,0,0,0,0
colin h,0.0,0.0,0.124929,0.0,0.127383,0.0,0.0,0.0,0.109685,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
vallabhaneni s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144786,0.169973,0.0,...,0,0.0,0.0,0.0,0.0,0,0,0,0,0


In [None]:
# Cast every column to float: the matrix shown above mixes integer 0s with float scores
# NOTE(review): presumably needed so nlargest() below operates on a numeric dtype — confirm
scoring_matrix = scoring_matrix.astype(float)

In [None]:
# Get top 10 recommendations for each user
# Result: Series indexed by user whose values are lists of the 10 highest-scoring course names
# NOTE(review): nlargest(10) always returns 10 names, so courses with a score of 0
# (including ones zeroed out as already taken) can appear when a user has fewer than 10 positive scores
top_n_recs = scoring_matrix.apply(lambda x: list(x.nlargest(10).index), axis=1)

In [None]:
# Group test_df by 'Reviewer' and aggregate the courses into a list
# Result: Series indexed by reviewer whose values are lists of that reviewer's test-set course names
actual_courses = test_df.groupby('Reviewer')['Course Name'].agg(list)

In [None]:
def evaluate_recommendations(recommendations, actual):
    """
    Evaluate recommendations using precision and recall for named reviewers.

    Precision and recall are computed per reviewer and then averaged over every
    reviewer in ``recommendations``. A reviewer with no entry in ``actual``
    contributes a precision and recall of 0. Course names are treated as sets on
    both sides, so a course listed more than once is counted a single time in
    the hits and in the denominators.

    :param recommendations: Pandas Series (or dict) where index is reviewer names and values are lists of recommended courses.
    :param actual: Pandas Series (or dict) where index is reviewer names and values are lists of actual courses.
    :returns: Tuple containing average precision and recall for all reviewers; (0, 0) when there are no reviewers.
    """
    precision_list = []
    recall_list = []

    # Iterate over each reviewer's recommendations and actual courses
    for reviewer, recs in recommendations.items():
        recommended = set(recs)
        relevant = set(actual.get(reviewer, []))
        hits = len(recommended & relevant)

        # Use the de-duplicated sizes so repeated names cannot deflate either metric
        precision_list.append(hits / len(recommended) if recommended else 0)
        recall_list.append(hits / len(relevant) if relevant else 0)

    average_precision = sum(precision_list) / len(precision_list) if precision_list else 0
    average_recall = sum(recall_list) / len(recall_list) if recall_list else 0

    return average_precision, average_recall

In [None]:
precision, recall = evaluate_recommendations(top_n_recs, actual_courses)
# F1 is undefined when precision and recall are both 0; report 0 instead of
# raising ZeroDivisionError
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

# Print the results
print(f"Precision: {round(precision, 4)}, Recall: {round(recall, 4)}, F1 Score: {round(f1_score, 4)}")

Precision: 0.0188, Recall: 0.1715, F1 Score: 0.034


### Export scoring matrix

In [None]:
from pathlib import Path

# Use a path relative to the working directory, like the 'cleaned_datasets/...' inputs;
# a leading '/' would point at the filesystem root instead of the submission folder
output_path = Path('score_matrices') / 'scoring_matrix_ubcf.csv'
# Create the output folder if it does not exist yet so to_csv cannot fail on a missing directory
output_path.parent.mkdir(parents=True, exist_ok=True)
scoring_matrix.to_csv(output_path)

### Sample recommendation

In [None]:
sample_user = "shreya v"

# Look up this user's ranked recommendations; only the first five are printed below
sample_user_recs = top_n_recs[sample_user]
taken_by_sample_user = user_courses_map[sample_user]
print(f"Course taken by {sample_user}:\n{taken_by_sample_user}")
print(f"Top 5 Recommendations for {sample_user}:")

first_five = sample_user_recs[:5]
for course_title in first_five:
    print(course_title)

Course taken by shreya v:
['introduction to machine learning']
Top 5 Recommendations for shreya v:
ai for everyone
data science math skills
data science methodology
programming for everybody getting started with python
neural networks and deep learning
