#### Loading of Datasets

In [None]:
# Mount Google Drive so that paths under /content/drive resolve.
# To reproduce, make our submission folder a shortcut in your drive
# (the dataset cell reads from MyDrive/BT4222_Group_3_Submission/...).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!sudo rm -rf /usr/local/lib/python3.8/dist-packages/OpenSSL
!sudo rm -rf /usr/local/lib/python3.8/dist-packages/pyOpenSSL-22.1.0.dist-info/

!wget https://repo.anaconda.com/miniconda/Miniconda3-py39_23.5.2-0-Linux-x86_64.sh
!chmod +x Miniconda3-py39_23.5.2-0-Linux-x86_64.sh

!bash ./Miniconda3-py39_23.5.2-0-Linux-x86_64.sh -b -f -p /usr/local
import sys
sys.path.append('/usr/local/lib/python3.9/site-packages/')
!pip3 install pyOpenSSL==22.0.0

# Installing the recommenders library.
# Ensure that you have python version <=3.9 when installing this.
!pip install recommenders

--2024-04-16 15:40:28--  https://repo.anaconda.com/miniconda/Miniconda3-py39_23.5.2-0-Linux-x86_64.sh
Resolving repo.anaconda.com (repo.anaconda.com)... 104.16.191.158, 104.16.32.241, 2606:4700::6810:20f1, ...
Connecting to repo.anaconda.com (repo.anaconda.com)|104.16.191.158|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 93409434 (89M) [application/x-sh]
Saving to: ‘Miniconda3-py39_23.5.2-0-Linux-x86_64.sh.2’


2024-04-16 15:40:28 (164 MB/s) - ‘Miniconda3-py39_23.5.2-0-Linux-x86_64.sh.2’ saved [93409434/93409434]

PREFIX=/usr/local
Unpacking payload ...

Installing base environment...


Downloading and Extracting Packages

Preparing transaction: - done
Executing transaction: | done
installation finished.
    You currently have a PYTHONPATH environment variable set. This may cause
    unexpected behavior when running the Python interpreter in Miniconda3.
    For best results, please verify that your PYTHONPATH only points to
    directories of packages th

In [None]:
import pandas as pd
import numpy as np

DATASET_DIR = '/content/drive/MyDrive/BT4222_Group_3_Submission/cleaned_datasets'

train = pd.read_csv(f'{DATASET_DIR}/train.csv')
test = pd.read_csv(f'{DATASET_DIR}/test.csv')
df = pd.read_csv(f'{DATASET_DIR}/final_users_courses.csv')

# Map each unique reviewer to a unique numerical ID (1-based, in order of first appearance in df)
unique_reviewers = df['Reviewer'].unique()
reviewer_to_id = {reviewer: i for i, reviewer in enumerate(unique_reviewers, start=1)}

# Map each unique course name to a unique numerical ID (1-based, in order of first appearance in df)
unique_courses = df['Course Name'].unique()
course_name_to_id = {course: i for i, course in enumerate(unique_courses, start=1)}


def _prepare_split(ratings, rating_scale, reviewer_ids, course_ids):
    """Build the modelling columns for one split (train or test).

    Args:
        ratings: frame with 'Reviewer', 'Course Name', 'Demeaned Rating' and 'Date' columns.
        rating_scale: value the demeaned ratings are divided by.
        reviewer_ids: dict mapping reviewer name -> numerical user ID.
        course_ids: dict mapping course name -> numerical item ID.

    Returns:
        A new frame with columns 'Reviewer', 'Course Name',
        'Normalised Demeaned Rating', 'Date', 'Timestamp', 'userId', 'itemId'.
        ``ratings`` itself is not modified.
    """
    # Explicit copy: the columns added below go onto an independent frame,
    # which avoids pandas' SettingWithCopyWarning on a column slice.
    prepared = ratings[['Reviewer', 'Course Name', 'Date']].copy()
    prepared.insert(2, 'Normalised Demeaned Rating', ratings['Demeaned Rating'] / rating_scale)
    # Convert Date to a POSIX timestamp (seconds)
    prepared['Timestamp'] = pd.to_datetime(prepared['Date']).apply(lambda x: x.timestamp())
    # Names that are absent from the mapping become NaN here.
    prepared['userId'] = prepared['Reviewer'].map(reviewer_ids)
    prepared['itemId'] = prepared['Course Name'].map(course_ids)
    return prepared


def _to_sar_frame(split):
    """Return a copy of a prepared split reduced to, and renamed for, the model columns."""
    sar_frame = split[['userId', 'itemId', 'Timestamp', 'Normalised Demeaned Rating', 'Course Name']].copy()
    sar_frame.columns = ['userID', 'itemID', 'timestamp', 'rating', 'Course Name']
    return sar_frame


# Create Normalised Demeaned Rating: each split is divided by its own largest absolute rating.
# NOTE(review): the test split is scaled with a statistic computed on the test split itself;
# consider reusing max_abs_value_courses_train for both splits so they share one scale.
max_abs_value_courses_train = train['Demeaned Rating'].abs().max()
max_abs_value_courses_test = test['Demeaned Rating'].abs().max()

train = _prepare_split(train, max_abs_value_courses_train, reviewer_to_id, course_name_to_id)
test = _prepare_split(test, max_abs_value_courses_test, reviewer_to_id, course_name_to_id)

# Working copies without Date / Reviewer, reordered and renamed to
# userID, itemID, timestamp, rating, Course Name
train_working_copy = _to_sar_frame(train)
test_working_copy = _to_sar_frame(test)

# Only the last expression of a cell is rendered automatically, so the train
# preview is displayed explicitly.
display(train_working_copy.head())
test_working_copy.head()


Unnamed: 0,userID,itemID,timestamp,rating,Course Name
0,20574,150,1591834000.0,0.0,sequence models
1,19220,132,1585872000.0,0.0,improving deep neural networks: hyperparameter...
2,17175,194,1572307000.0,0.0,natural language processing in tensorflow
3,17176,159,1586736000.0,0.088154,what is data science?
4,21654,140,1597190000.0,0.0,excel skills for business: essentials


#### Initialise Parameter (TOP_K)

In [None]:
# Top k items to recommend
# (the same constant is passed as the cutoff k to the ranking metrics)
TOP_K = 10

#### SAR Model (Jaccard Similarity)

In [None]:
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k, serendipity, diversity, catalog_coverage, distributional_coverage, novelty
from recommenders.models.sar import SAR

# Column-name settings, unpacked into the SAR constructor via **header.
header = dict(
    col_user="userID",
    col_item="itemID",
    col_rating="rating",
    col_timestamp="timestamp",
    col_prediction="prediction",
)

# Distinct users and items present in each split
train_users, train_items = (set(train_working_copy[col]) for col in ('userID', 'itemID'))
test_users, test_items = (set(test_working_copy[col]) for col in ('userID', 'itemID'))

# Users / items that occur in both the training and the test split
user_overlap = train_users & test_users
item_overlap = train_items & test_items

if not (user_overlap or item_overlap):
    print("The test set is entirely different from the training set.")
else:
    print("Warning: There is overlap between the training and test sets.")
    print("Number of overlapping users:", len(user_overlap))
    print("Number of overlapping items:", len(item_overlap))

Number of overlapping users: 22575
Number of overlapping items: 233


In [None]:
# Instantiating the model using the Jaccard similarity method
# The column names come from the `header` dict defined above.
# NOTE(review): per the recommenders SAR documentation, time_decay_coefficient is a
# half-life expressed in days and time_now=None falls back to the latest timestamp
# in the data -- confirm against the installed library version.
model = SAR(
    similarity_type="jaccard",
    time_decay_coefficient=30,
    time_now=None,
    timedecay_formula=True,
    **header
)

In [None]:
# Fitting the model on the training data and computing the matrices
# Exact duplicate rows are dropped first, so fit() sees each distinct row once.
train_working_copy = train_working_copy.drop_duplicates()
model.fit(train_working_copy)

# Overwrite the fitted item-similarity attribute with a dense 2-D array.
# NOTE(review): the .item() call implies that fit() left item_similarity as a
# single-element array wrapping a sparse matrix in this environment. This looks like a
# version-specific workaround -- confirm it is still needed, since .item() raises on
# a multi-element array.
model.item_similarity = model.item_similarity.item().tocsr().toarray()

In [None]:
# Predicting top k items for every user.
# We are not recommending items that have been rated by the user.
test_working_copy = test_working_copy.drop_duplicates()
# TOP_K is used instead of a literal 10 so that the number of recommendations
# cannot drift away from the cutoff k handed to the ranking metrics.
top_k = model.recommend_k_items(test_working_copy, top_k=TOP_K, remove_seen=True)

test_working_copy

  return self._with_data(self.data * other)


Unnamed: 0,userID,itemID,timestamp,rating,Course Name
0,20574,150,1.591834e+09,0.000000,sequence models
1,19220,132,1.585872e+09,0.000000,improving deep neural networks: hyperparameter...
2,17175,194,1.572307e+09,0.000000,natural language processing in tensorflow
3,17176,159,1.586736e+09,0.088154,what is data science?
4,21654,140,1.597190e+09,0.000000,excel skills for business: essentials
...,...,...,...,...,...
26021,12534,141,1.529107e+09,0.000000,excel skills for business: intermediate i
26022,1886,112,1.597795e+09,0.066116,indigenous canada
26023,6311,154,1.532563e+09,0.000000,mathematics for machine learning: linear algebra
26024,15161,161,1.554250e+09,0.000000,data analysis with python


#### Establishing User-Course Matrix (Cell Values of Prediction Scores)

In [None]:
# Users and items that occur in the test set but not in the recommendations
missing_userID = list(set(test_working_copy['userID']) - set(top_k['userID']))
missing_itemID = list(set(test_working_copy['itemID']) - set(top_k['itemID']))

# Obtain user-course matrix (With values of prediction)
user_course_matrix = top_k.pivot(index='userID', columns='itemID', values='prediction')

# Append every missing user (row) and missing item (column) in a single reindex,
# filled with 0. Existing labels keep their order and the new ones go at the end,
# exactly as when they were added one at a time, but without growing the frame
# label by label.
user_course_matrix = user_course_matrix.reindex(
    index=list(user_course_matrix.index) + missing_userID,
    columns=list(user_course_matrix.columns) + missing_itemID,
    fill_value=0,
)

# Fill NaN values (user/item pairs without a prediction) and +/-inf values with 0
user_course_matrix = user_course_matrix.fillna(0).replace([np.inf, -np.inf], 0)

# Replace itemID and userID with actual course names and reviewer.
# IDs were assigned with enumerate(..., start=1), hence the i-1 position lookup.
user_course_matrix.columns = [unique_courses[i-1] for i in user_course_matrix.columns]
user_course_matrix.index = [unique_reviewers[i-1] for i in user_course_matrix.index]

def shift_and_scale_row_wise(df):
    """Min-max scale every row of ``df`` independently.

    Each row has its minimum subtracted and is then divided by its range
    (max - min), so a row with differing values ends up spanning 0 to 1.
    A row whose values are all identical (an all-zero row included) has a
    range of zero; it is divided by 1 instead and therefore comes out as
    all zeros rather than NaN.

    Args:
        df: numeric DataFrame.

    Returns:
        A new DataFrame with the same index and columns as ``df``.
        ``df`` itself is not modified.
    """
    min_vals = df.min(axis=1)  # Minimum value for each row
    range_vals = df.max(axis=1) - min_vals  # Range (max-min) for each row

    # Avoid division by zero: constant rows are divided by 1 and stay at 0
    range_vals = range_vals.where(range_vals != 0, 1)

    # Subtract the row minimum, then divide by the row range
    return df.sub(min_vals, axis=0).div(range_vals, axis=0)

# Shift and scale the values of the user-course matrix
# (row-wise, so every user's scores are rescaled independently of the other users)
user_course_matrix = shift_and_scale_row_wise(user_course_matrix)

# Last expression of the cell: renders the scaled matrix as the cell output
user_course_matrix

#Export to CSV (Uncomment to export)
#user_course_matrix.to_csv('scoring_matrix_item_based_cf.csv')


Unnamed: 0,child nutrition and cooking,budgeting and scheduling projects,successful negotiation: essential strategies and skills,initiating and planning projects,chinese for beginners,terrorism and counterterrorism: comparing theory and practice,dog emotion and cognition,introductory human physiology,introduction to negotiation: a strategic playbook for becoming a principled and persuasive negotiator,management of fashion and luxury companies,...,excel fundamentals for data analysis,natural language processing with probabilistic models,web application technologies and django,natural language processing with sequence models,natural language processing with attention models,fundamentals of finance,fundamentals of machine learning for healthcare,social work practice: advocating social justice and change,how to manage a remote team,write your first novel
lorenia a,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
vijayakumar m n,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
swati s,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
camila g,0.917856,0.0,0.0,0.598679,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
monica d,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
md s i,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sumit p,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
christine m,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
juan s,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Argument Definition for Metrics Calculations

In [None]:
# Inputs shared by all ranking-based metrics: the test interactions and the
# model's top-k recommendations (positional), plus the column names and cutoff (keyword).
args = [test_working_copy, top_k]
kwargs = {
    "col_user": "userID",
    "col_item": "itemID",
    "col_rating": "rating",
    "col_prediction": "prediction",
    "relevancy_method": "top_k",
    "k": TOP_K,
}

#### Metrics Evaluation

In [None]:
#MAP, NDCG, Precision, Recall Metrics
# All four metrics receive the same positional inputs (args) and options (kwargs).
eval_map = map_at_k(*args, **kwargs)
print(f"MAP: {eval_map}")

eval_ndcg = ndcg_at_k(*args, **kwargs)
print(f"NDCG: {eval_ndcg}")

# Precision and recall are computed together and reported in a single print
eval_precision = precision_at_k(*args, **kwargs)
eval_recall = recall_at_k(*args, **kwargs)
print(f"Precision: {eval_precision} \nRecall: {eval_recall}")

MAP: 0.04373333718380118
NDCG: 0.0639658071428001
Precision: 0.014393435351519183 
Recall: 0.12261216092181614


#### Sample Recommendations

In [None]:
sample_user = "shreya v"
# Number of course names printed for the sample user
n_to_show = 5

# Get the top TOP_K recommendations for each user (highest score first)
top_n_recs = user_course_matrix.apply(lambda x: list(x.nlargest(TOP_K).index), axis=1)

# Get the recommendations for the sample user
sample_user_recs = top_n_recs[sample_user]

# The header reports the number of courses that are actually printed below
print(f"Top {n_to_show} Recommendations for {sample_user}:")

# nlargest breaks ties by column order, so when a user's scores are all
# identical the "top" courses are simply the first columns of the matrix.
# Say so instead of presenting them as a genuine ranking.
sample_user_scores = user_course_matrix.loc[sample_user]
if sample_user_scores.max() == sample_user_scores.min():
    print("Note: all scores for this user are identical, so the courses listed "
          "are in column order rather than a real ranking.")

for recc in sample_user_recs[:n_to_show]:
    print(recc)

Top 5 Recommendations for shreya v:
child nutrition and cooking
budgeting and scheduling projects
successful negotiation: essential strategies and skills
initiating and planning projects
chinese for beginners
