In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset and output paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install lightfm

Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp310-cp310-linux_x86_64.whl size=808328 sha256=7c62509063c5851929d5f39c02e559e886d15c7919b233df65880637a82e49bd
  Stored in directory: /root/.cache/pip/wheels/4f/9b/7e/0b256f2168511d8fa4dae4fae0200fdbd729eb424a912ad636
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17


In [None]:
import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
from lightfm.cross_validation import random_train_test_split
from sklearn.model_selection import ParameterGrid

In [None]:
# Load the cleaned training reviews from the mounted Drive folder.
train = pd.read_csv('/content/drive/My Drive/BT4222_Group_3_Submission/cleaned_datasets/train.csv')

### Create Item_Id

In [None]:
# The course name doubles as the item identifier.
# NOTE(review): courses that share a name (e.g. across institutions) would collapse into one item — confirm names are unique.
train['Item_id'] = train['Course Name']

In [None]:
# Show the columns of the training frame, including the newly added Item_id.
print(train.columns)

Index(['Review', 'Reviewer', 'Individual Rating', 'Course Name', 'Institution',
       'Overall Ratings', 'Level', 'Duration', 'Num of Reviews',
       'Course Skills', 'Description', 'Date', 'Demeaned Rating', 'Popularity',
       'Item_id'],
      dtype='object')


### Get Unique Reviewers and Courses


In [None]:
# Keep one row per course: the first row seen for each Item_id.
unique_items_train = train.drop_duplicates(subset='Item_id')

# Distinct reviewer and course identifiers, in order of first appearance.
train_users = pd.unique(train['Reviewer'])
train_items = pd.unique(unique_items_train['Item_id'])

### Create Item Features

In [None]:
# Columns that describe a single review or merely identify the course are not used as item features.
ignore_cols = ['Review', 'Reviewer', 'Individual Rating', 'Course Name', 'Institution', 'Date', 'Item_id']
feature_cols = [col for col in unique_items_train.columns if col not in ignore_cols]

# Every distinct "column:value" label that occurs on any course row.
item_features = {
    f"{col}:{row[col]}"
    for _, row in unique_items_train.iterrows()
    for col in feature_cols
}

### Fit Dataset with Users, Items and Item Features

In [None]:
# Register all user ids, item ids and item-feature labels with the LightFM Dataset
# so that interactions and feature matrices can be built against them below.
dataset = Dataset()
dataset.fit(
    users=train_users,
    items=train_items,
    item_features=item_features)

### Create Item Feature Matrix

In [None]:
# Feature-bearing columns, i.e. everything not listed in ignore_cols.
feature_columns = [col for col in unique_items_train.columns if col not in ignore_cols]

# One (item_id, ["column:value", ...]) pair per course row.
item_features_list = [
    (row['Item_id'], [f"{col}:{row[col]}" for col in feature_columns])
    for _, row in unique_items_train.iterrows()
]

# normalize=True asks the Dataset to normalise each item's feature weights.
item_features_matrix = dataset.build_item_features(item_features_list, normalize=True)

### Build User and Item Interactions

In [None]:
# Collapse repeated reviews so each (reviewer, course) pair contributes one implicit interaction.
unique_data = train.loc[:, ['Reviewer', 'Item_id']].drop_duplicates()

# build_interactions returns (interactions, weights); the weights matrix is discarded here.
interactions, _ = dataset.build_interactions(unique_data.to_numpy())

### Split Interactions into Train, Eval and Test Interaction Sets

In [None]:
# Two-stage split: first hold out 10% of all interactions as the test set, then hold out
# 1/9 of the remaining 90% (about 10% of the total) as the eval set -> roughly 80/10/10.
# Both stages use a fixed seed so the split is reproducible.
train_eval_interactions, test_interactions = random_train_test_split(interactions, test_percentage=0.1, random_state=42)
train_interactions, eval_interactions = random_train_test_split(train_eval_interactions, test_percentage=1/9, random_state=42)

### Check for Overlapping Interactions in Each Set

In [None]:
# An element-wise product is non-zero only where both matrices hold an interaction,
# so its nnz counts the (user, item) pairs shared between the two splits.
overlap_checks = (
    ("Number of overlapping train-eval interactions:", eval_interactions),
    ("Number of overlapping train-test interactions:", test_interactions),
)

for message, held_out in overlap_checks:
    intersection = train_interactions.multiply(held_out).nnz
    print(message, intersection)

Number of overlapping train-eval interactions: 0
Number of overlapping train-test interactions: 0


In [None]:
# Print the (n_users, n_items) shape of each split matrix.
for split_matrix in (train_interactions, eval_interactions, test_interactions):
    print(split_matrix.shape)

(22575, 234)
(22575, 234)
(22575, 234)


### Hyperparameter Tuning, Maximising the F1-Score

In [None]:
# Search space for the grid search: 3 x 3 x 2 = 18 LightFM configurations.
# NOTE(review): per the LightFM docs, learning_rate is the initial rate for the adagrad schedule;
# confirm whether varying it has any effect on the adadelta candidates.
param_grid = {
    'no_components': [20, 40, 60],
    'learning_rate': [0.05, 0.1, 0.15],
    'learning_schedule': ['adagrad', 'adadelta']
}

def evaluate_model_f1(params):
    """Train one LightFM candidate on the train split and return its F1@10 on the eval split.

    The candidate is built with the same loss ('warp') and seed (42) as the final
    model, so the grid search ranks configurations of the model that is actually
    deployed and gives the same ranking on every run. Entries in ``params``
    override these defaults.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``LightFM`` (e.g. one element of ``ParameterGrid``).

    Returns
    -------
    float
        Harmonic mean of mean precision@10 and mean recall@10 on
        ``eval_interactions``; 0.0 when both metrics are zero.
    """
    # Defaults first so that an explicit 'loss' / 'random_state' in params still wins.
    model = LightFM(**{'loss': 'warp', 'random_state': 42, **params})
    model.fit(train_interactions, epochs=20, verbose=True, item_features=item_features_matrix)

    # Passing train_interactions makes the metrics ignore items already seen in training.
    metric_kwargs = dict(
        model=model,
        test_interactions=eval_interactions,
        train_interactions=train_interactions,
        k=10,
        item_features=item_features_matrix,
    )
    precision = precision_at_k(**metric_kwargs).mean()
    recall = recall_at_k(**metric_kwargs).mean()

    # Avoid 0/0 (NaN) when the model retrieves no relevant item at all.
    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)

In [None]:
# Exhaustive search over param_grid, keeping the configuration with the highest eval F1.
best_f1, best_params = 0, {}

for candidate in ParameterGrid(param_grid):
    candidate_f1 = evaluate_model_f1(candidate)
    # Strict '>' keeps the earliest candidate on ties and never selects a NaN score.
    if candidate_f1 > best_f1:
        best_f1, best_params = candidate_f1, candidate

print('Best Parameters:', best_params)
print('Best F1:', best_f1)

Epoch: 100%|██████████| 20/20 [00:05<00:00,  3.84it/s]
Epoch: 100%|██████████| 20/20 [00:21<00:00,  1.07s/it]
Epoch: 100%|██████████| 20/20 [00:11<00:00,  1.70it/s]
Epoch: 100%|██████████| 20/20 [00:04<00:00,  4.34it/s]
Epoch: 100%|██████████| 20/20 [00:09<00:00,  2.07it/s]
Epoch: 100%|██████████| 20/20 [00:13<00:00,  1.44it/s]
Epoch: 100%|██████████| 20/20 [00:04<00:00,  4.64it/s]
Epoch: 100%|██████████| 20/20 [00:05<00:00,  3.58it/s]
Epoch: 100%|██████████| 20/20 [00:09<00:00,  2.14it/s]
Epoch: 100%|██████████| 20/20 [00:04<00:00,  4.08it/s]
Epoch: 100%|██████████| 20/20 [00:09<00:00,  2.19it/s]
Epoch: 100%|██████████| 20/20 [00:14<00:00,  1.39it/s]
Epoch: 100%|██████████| 20/20 [00:03<00:00,  5.73it/s]
Epoch: 100%|██████████| 20/20 [00:05<00:00,  3.63it/s]
Epoch: 100%|██████████| 20/20 [00:09<00:00,  2.14it/s]
Epoch: 100%|██████████| 20/20 [00:05<00:00,  3.58it/s]
Epoch: 100%|██████████| 20/20 [00:08<00:00,  2.32it/s]
Epoch: 100%|██████████| 20/20 [00:14<00:00,  1.41it/s]


Best Parameters: {'learning_rate': 0.05, 'learning_schedule': 'adagrad', 'no_components': 20}
Best F1: 0.044483410012403246


### Build Model with Best Params

In [None]:
# Final model: the best hyperparameters from the grid search, with WARP loss and a fixed seed.
model = LightFM(**best_params, loss='warp', random_state=42)

### Train Model using Train Interactions


In [None]:
# Fit on the training split only (eval and test interactions stay held out),
# using the item feature matrix built above.
model.fit(train_interactions,
            epochs=20,
            verbose=True,
            item_features=item_features_matrix)

Epoch: 100%|██████████| 20/20 [00:04<00:00,  4.07it/s]


<lightfm.lightfm.LightFM at 0x7f53921cae90>

### Test Evaluation Scores (Precision at K, Recall at K, F1-Score at K)

In [None]:
# Arguments shared by both ranking metrics; train_interactions is passed so that
# items already seen in training are left out of each user's ranking.
test_metric_kwargs = dict(
    model=model,
    test_interactions=test_interactions,
    train_interactions=train_interactions,
    k=10,
    item_features=item_features_matrix,
)

test_precision = precision_at_k(**test_metric_kwargs).mean()
print(test_precision)

test_recall = recall_at_k(**test_metric_kwargs).mean()
print(test_recall)

# F1@10: harmonic mean of the two averaged metrics.
test_f1 = 2 * (test_precision * test_recall) / (test_precision + test_recall)
print(test_f1)

0.044041272
0.35447058100970397
0.07834816071868564


### Get Scoring Matrix and Specific Top K Recommendations

---



In [None]:
n_users, n_items = train_interactions.shape

# Build the full user x item grid in row-major order:
# user 0 paired with every item, then user 1 with every item, and so on.
scoring_user_ids = np.repeat(np.arange(n_users, dtype=np.int32), n_items)
scoring_item_ids = np.tile(np.arange(n_items, dtype=np.int32), n_users)

# The model was fitted and evaluated with item_features_matrix, so prediction must use the
# same matrix; without it LightFM substitutes an identity feature matrix and the learned
# metadata-feature embeddings are left out of the scores.
scores = model.predict(
    user_ids=scoring_user_ids,
    item_ids=scoring_item_ids,
    item_features=item_features_matrix,
)
scores_matrix = scores.reshape(n_users, n_items)

# Apply global min-max normalization to scale the scores between 0 and 1
min_score = scores_matrix.min()
max_score = scores_matrix.max()
score_range = max_score - min_score
if score_range > 0:
    normalized_scores_matrix = (scores_matrix - min_score) / score_range
else:
    # Every score is identical: avoid dividing by zero and report a flat matrix.
    normalized_scores_matrix = np.zeros_like(scores_matrix)

# Reverse mappings: from dataset indices back to original IDs
user_id_map, _, item_id_map, _ = dataset.mapping()
reverse_user_map = {v: k for k, v in user_id_map.items()}
reverse_item_map = {v: k for k, v in item_id_map.items()}

sorted_user_ids = [reverse_user_map[i] for i in range(n_users)]
sorted_item_ids = [reverse_item_map[j] for j in range(n_items)]

# Rows are reviewers and columns are courses, both labelled with their original IDs.
df_score = pd.DataFrame(data=normalized_scores_matrix, index=sorted_user_ids, columns=sorted_item_ids)

# Determine the top N recommendations for each user (highest score first)
N = 10
top_items_per_user = np.argsort(-normalized_scores_matrix, axis=1)[:, :N]

# Print out the top N recommendations for each user
for user_idx in range(n_users):
    user_id = reverse_user_map[user_idx]
    top_item_ids = [reverse_item_map[item_idx] for item_idx in top_items_per_user[user_idx]]
    print(f"Top {N} recommendations for {user_id}: {top_item_ids}")

Output hidden; open in https://colab.research.google.com to view.

### Save Score Matrix to Google Drive

In [None]:
# Persist the normalized user x item score matrix (index = reviewer, columns = course) as CSV on Drive.
df_score.to_csv('/content/drive/My Drive/BT4222_Group_3_Submission/score_matrices/scoring_matrix_implicit.csv')