#### Importing Libraries

In [1]:
### To reproduce, make our submission folder a shortcut in your drive
# Mount Google Drive at /content/drive (Colab only); the dataset CSV loaded
# later in this notebook is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!sudo rm -rf /usr/local/lib/python3.8/dist-packages/OpenSSL
!sudo rm -rf /usr/local/lib/python3.8/dist-packages/pyOpenSSL-22.1.0.dist-info/

!wget https://repo.anaconda.com/miniconda/Miniconda3-py39_23.5.2-0-Linux-x86_64.sh
!chmod +x Miniconda3-py39_23.5.2-0-Linux-x86_64.sh

!bash ./Miniconda3-py39_23.5.2-0-Linux-x86_64.sh -b -f -p /usr/local
import sys
sys.path.append('/usr/local/lib/python3.9/site-packages/')
!pip3 install pyOpenSSL==22.0.0

# Installing the recommenders library.
# Ensure that you have python version <=3.9 when installing this.
!pip install recommenders

--2024-04-16 12:43:15--  https://repo.anaconda.com/miniconda/Miniconda3-py39_23.5.2-0-Linux-x86_64.sh
Resolving repo.anaconda.com (repo.anaconda.com)... 104.16.191.158, 104.16.32.241, 2606:4700::6810:bf9e, ...
Connecting to repo.anaconda.com (repo.anaconda.com)|104.16.191.158|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 93409434 (89M) [application/x-sh]
Saving to: ‘Miniconda3-py39_23.5.2-0-Linux-x86_64.sh.1’


2024-04-16 12:43:16 (162 MB/s) - ‘Miniconda3-py39_23.5.2-0-Linux-x86_64.sh.1’ saved [93409434/93409434]

PREFIX=/usr/local
Unpacking payload ...
                                                                                      
Installing base environment...


Downloading and Extracting Packages

Preparing transaction: - done
Executing transaction: | done
installation finished.
    You currently have a PYTHONPATH environment variable set. This may cause
    unexpected behavior when running the Python interpreter in Miniconda3.
    For best res

In [3]:
import sys
import os
import shutil

# Pandas and Numpy is used for efficient handling of arrays.
import pandas as pd
import numpy as np

!pip install recommenders
from recommenders.utils.timer import Timer
from recommenders.datasets.python_splitters import python_chrono_split

# importing the dataset
from recommenders.datasets import movielens
from recommenders.models.ncf.dataset import Dataset as NCFDataset

# Importing the NCF model class from the recommenders library
from recommenders.models.ncf.ncf_singlenode import NCF

# importing the evaluation metrics
from recommenders.evaluation.python_evaluation import (map_at_k, ndcg_at_k, precision_at_k, recall_at_k)
from recommenders.utils.constants import SEED as DEFAULT_SEED


# Record the interpreter and pandas versions this run was produced with.
print(f"System version: {sys.version}")
print(f"Pandas version: {pd.__version__}")

[0mSystem version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]
Pandas version: 2.0.3


#### Definition of Parameters

In [4]:
# Number of items to recommend per user; also used as `k` for the ranking metrics below.
TOP_K = 10

# Model parameters
# Number of training epochs passed to the NCF model (n_epochs)
EPOCHS = 25
# Mini-batch size passed to the NCF model (batch_size)
BATCH_SIZE = 256

# Seed shared by the dataset sampler and the model so runs are reproducible
SEED = DEFAULT_SEED  # Presumably: set to None for non-deterministic results (original comment was truncated - confirm)

#### Loading Dataset

In [6]:
import pandas as pd
import numpy as np

# Location of the cleaned user-course review data on the mounted Drive.
DATA_PATH = '/content/drive/MyDrive/BT4222_Group_3_Submission/cleaned_datasets/final_users_courses.csv'

df = pd.read_csv(DATA_PATH)

# Rescale the demeaned rating into [-1, 1] by dividing by its largest absolute value.
max_abs_value_courses = df['Demeaned Rating'].abs().max()
df['Normalised Demeaned Rating'] = df['Demeaned Rating'] / max_abs_value_courses

# Keep only the columns needed downstream. `.copy()` makes this an independent
# frame, so the column assignments below do not raise SettingWithCopyWarning.
df = df[['Reviewer', 'Course Name', 'Normalised Demeaned Rating', 'Date']].copy()

# Convert Date to a POSIX timestamp (seconds, float)
df['Timestamp'] = pd.to_datetime(df['Date']).apply(lambda x: x.timestamp())

# Map each unique reviewer to a numerical ID, numbered from 1 in order of first appearance
unique_reviewers = df['Reviewer'].unique()
reviewer_to_id = {reviewer: i for i, reviewer in enumerate(unique_reviewers, start=1)}

# Create a new column 'userId' holding the reviewer's numerical ID
df['userId'] = df['Reviewer'].map(reviewer_to_id)

# Map each unique course name to a numerical ID, numbered from 1 in order of first appearance
unique_courses = df['Course Name'].unique()
course_name_to_id = {course: i for i, course in enumerate(unique_courses, start=1)}

# Create a new column 'itemId' holding the course's numerical ID
df['itemId'] = df['Course Name'].map(course_name_to_id)

# Build the modelling frame: drop Date/Reviewer, reorder the columns and rename
# them to the userID / itemID / timestamp / rating names used in the rest of the notebook.
df_working_copy = (
    df[['userId', 'itemId', 'Timestamp', 'Normalised Demeaned Rating', 'Course Name']]
    .rename(columns={
        'userId': 'userID',
        'itemId': 'itemID',
        'Timestamp': 'timestamp',
        'Normalised Demeaned Rating': 'rating',
    })
)

df_working_copy.head()


Unnamed: 0,userID,itemID,timestamp,rating,Course Name
0,1,1,1438906000.0,0.0,child nutrition and cooking
1,2,2,1438906000.0,0.0,budgeting and scheduling projects
2,3,3,1438906000.0,0.045549,successful negotiation: essential strategies a...
3,2,4,1438906000.0,0.0,initiating and planning projects
4,4,5,1438992000.0,0.065476,chinese for beginners


#### Train-Test Split

In [7]:
# Chronological split of the interactions:
# 75% go to the training set and the remaining 25% to the test set.
train, test = python_chrono_split(df_working_copy, ratio=0.75)

In [8]:
# Filtering out users and items in the test set that do not appear in the training set.
# This is done so that we can see if our model has learnt user's previous item interactions and can recommend relevant items.

test = test[test["userID"].isin(train["userID"].unique())]
test = test[test["itemID"].isin(train["itemID"].unique())]

# Creating a test set which only contains the last interaction for each user.
# Sort by timestamp first so "last" really is the most recent interaction rather
# than whatever row happens to come last in the frame; the stable sort keeps the
# existing row order among equal timestamps.
leave_one_out_test = (
    test.sort_values("timestamp", kind="stable")
    .groupby("userID")
    .last()
    .reset_index()
)

In [9]:
# Persist the three splits as CSV files (without the index column); the NCF
# dataset loader is given these file paths rather than the in-memory frames.
train_file, test_file, leave_one_out_test_file = (
    "./train_ncf.csv",
    "./test_ncf.csv",
    "./leave_one_out_test_ncf.csv",
)

for split_frame, split_path in (
    (train, train_file),
    (test, test_file),
    (leave_one_out_test, leave_one_out_test_file),
):
    split_frame.to_csv(split_path, index=False)

#### Loading Train-Test Data

In [10]:
# Build the NCF dataset from the training CSV and the leave-one-out test CSV.
data = NCFDataset(
    train_file=train_file,
    test_file=leave_one_out_test_file,
    seed=SEED,
    overwrite_test_file_full=True,
)

100%|██████████| 22568/22568 [01:31<00:00, 246.62it/s]


#### Model Definition and Training

In [11]:
# Hyperparameters for the NeuMF variant of NCF, collected in one place.
ncf_hyperparameters = dict(
    model_type="NeuMF",
    n_factors=8,
    layer_sizes=[16, 8, 4],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=10,
    seed=SEED,
)

# User and item counts come from the dataset built above.
model = NCF(n_users=data.n_users, n_items=data.n_items, **ncf_hyperparameters)



In [12]:
# Train the model on the training data, timing the whole fit.
with Timer() as train_time:
    model.fit(data)

print(f"Took {train_time.interval} seconds for training.")

Took 13985.268113064 seconds for training.


In [13]:
# Score every (user, item) pair in the test set.
# One batched predict call replaces a separate model call per row via iterrows();
# it is the same `is_list=True` API used for the full prediction grid below.
test_users = test["userID"].tolist()
test_items = test["itemID"].tolist()
test_scores = model.predict(test_users, test_items, is_list=True)

predictions = pd.DataFrame({
    "userID": test_users,
    "itemID": test_items,
    # Cast to Python-float precision to match what the per-row predict returned.
    "prediction": np.asarray(test_scores, dtype=float),
})
predictions.head()

Unnamed: 0,userID,itemID,prediction
0,1,110,4.335981e-06
1,2,88,2.432524e-07
2,3,91,0.08660305
3,3,12,0.01495921
4,3,147,0.9043579


#### Obtain All Predictions

In [17]:
# Score every training user against every training item, then keep only the
# pairs the user has NOT already interacted with in the training set.
with Timer() as test_time:

    users, items, preds = [], [], []
    item = list(train.itemID.unique())
    for user_id in train.userID.unique():
        # One batched call per user: this user repeated against every item.
        user_batch = [user_id] * len(item)
        users.extend(user_batch)
        items.extend(item)
        preds.extend(list(model.predict(user_batch, item, is_list=True)))

    all_predictions = pd.DataFrame(data={"userID": users, "itemID": items, "prediction": preds})

    # Flag pairs that already occur in train using a merge indicator on the key
    # columns only. This does not depend on `rating` being non-null and does not
    # drag train's other columns into the result as all-NaN columns.
    seen_pairs = train[["userID", "itemID"]].drop_duplicates()
    merged = all_predictions.merge(seen_pairs, on=["userID", "itemID"], how="left", indicator=True)
    all_predictions = merged.loc[merged["_merge"] == "left_only", ["userID", "itemID", "prediction"]]


print("Took {} seconds for prediction.".format(test_time.interval))

Took 49.7440964180023 seconds for prediction.


#### Obtain User-Course Matrix (With Values of Prediction Scores)

In [18]:
# Obtain user-course matrix (With values of prediction)
user_course_matrix = all_predictions.pivot(index='userID', columns='itemID', values='prediction')

# Item IDs were assigned as 1..len(unique_courses); any ID without a column in the
# pivot gets an all-zero column, so every course appears in the matrix. The count
# comes from the data instead of a hard-coded upper bound.
all_item_ids = range(1, len(unique_courses) + 1)
missing_values = sorted(set(all_item_ids) - set(user_course_matrix.columns))
user_course_matrix = user_course_matrix.reindex(
    columns=list(user_course_matrix.columns) + missing_values,
    fill_value=0,
)

# Replace NaN (pairs without a prediction, e.g. courses already seen in training) with 0
user_course_matrix = user_course_matrix.fillna(0)

# Replace itemID and userID with actual course names and reviewer
user_course_matrix.columns = [unique_courses[i-1] for i in user_course_matrix.columns]
user_course_matrix.index = [unique_reviewers[i-1] for i in user_course_matrix.index]

# Min-max normalise each row (user) into [0, 1]. Rows whose scores are all equal
# have a zero range; their divisor is masked to NaN and the result is set to 0.
min_scores = user_course_matrix.min(axis=1)
max_scores = user_course_matrix.max(axis=1)
score_range = max_scores - min_scores
user_course_matrix = (
    user_course_matrix.sub(min_scores, axis=0)
    .div(score_range.where(score_range != 0), axis=0)
    .fillna(0)
)

# Export the user_course_matrix to a csv file
user_course_matrix.to_csv("scoring_matrix_ncf.csv")

#### Metrics Evaluation

In [19]:
# MAP@K of the recommendations against the held-out test interactions
eval_map = map_at_k(
    rating_true=test,
    rating_pred=all_predictions,
    col_prediction='prediction',
    k=TOP_K,
)
print("MAP: {}".format(eval_map))

MAP: 0.050462707817119146


In [20]:
# NDCG@K of the recommendations against the held-out test interactions
eval_ndcg = ndcg_at_k(
    rating_true=test,
    rating_pred=all_predictions,
    col_prediction='prediction',
    k=TOP_K,
)
print("NDCG: {}".format(eval_ndcg))

NDCG: 0.07364028538958929


In [21]:
# Precision@K of the recommendations against the held-out test interactions
eval_precision = precision_at_k(
    rating_true=test,
    rating_pred=all_predictions,
    col_prediction='prediction',
    k=TOP_K,
)
print("Precision: {}".format(eval_precision))

Precision: 0.019948599787309465


In [22]:
# Recall@K of the recommendations against the held-out test interactions
eval_recall = recall_at_k(
    rating_true=test,
    rating_pred=all_predictions,
    col_prediction='prediction',
    k=TOP_K,
)
print("Recall: {}".format(eval_recall))

Recall: 0.123044228086202


#### Sample Recommendation

In [23]:
sample_user = "shreya v"
# How many of the user's top recommendations to print; the heading and the
# slice below both use this value so they cannot drift apart.
N_SHOWN = 5

# Get the top TOP_K recommendations (course names, best first) for each user
top_n_recs = user_course_matrix.apply(lambda scores: list(scores.nlargest(TOP_K).index), axis=1)

# Get the recommendations for the sample user and print the first N_SHOWN
sample_user_recs = top_n_recs[sample_user]
print(f"Top {N_SHOWN} Recommendations for {sample_user}:")

for recc in sample_user_recs[:N_SHOWN]:
    print(recc)

Top 5 Recommendations for shreya v:
programming for everybody getting started with python
neural networks and deep learning
using databases with python
using python to access web data
algorithmic toolbox
