In [2]:
!pip install torch_geometric
!pip install torch
!pip install torch_sparse
!pip install torch_scatter

Collecting torch_geometric
  Using cached torch_geometric-2.2.0-py3-none-any.whl
Collecting scikit-learn
  Using cached scikit_learn-1.2.1-cp310-cp310-macosx_12_0_arm64.whl (8.4 MB)
Collecting psutil>=5.8.0
  Using cached psutil-5.9.4-cp38-abi3-macosx_11_0_arm64.whl (244 kB)
Collecting jinja2
  Using cached Jinja2-3.1.2-py3-none-any.whl (133 kB)
Collecting MarkupSafe>=2.0
  Using cached MarkupSafe-2.1.2-cp310-cp310-macosx_10_9_universal2.whl (17 kB)
Collecting joblib>=1.1.1
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, psutil, MarkupSafe, joblib, scikit-learn, jinja2, torch_geometric
Successfully installed MarkupSafe-2.1.2 jinja2-3.1.2 joblib-1.2.0 psutil-5.9.4 scikit-learn-1.2.1 threadpoolctl-3.1.0 torch_geometric-2.2.0
Collecting torch
  Using cached torch-1.13.1-cp310-none-macosx_11_0_arm64.whl (53.2 MB)
Collecting typing-extensions
  Down

In [1]:
from torch_geometric.nn.models.lightgcn import LightGCN
import pandas as pd
import os
from tqdm import tqdm
import torch

  from .autonotebook import tqdm as notebook_tqdm


## Load Data
We can begin by loading in the user review data. For each user, we have a subset of the movies that they reviewed. We'll load each of the CSVs as dataframes, and store a dict of user IDs corresponding to their dataframes.

In [2]:
# for now we will use the first 10k rows of the data, set to None to use all data
AMOUNT_TO_LOAD = 100
EMBEDDING_DIM = 64

In [3]:
user_reviews_dir = 'user_reviews'
user_review_data = dict()

for filename in tqdm(os.listdir(user_reviews_dir)):
    if AMOUNT_TO_LOAD is not None and len(user_review_data) >= AMOUNT_TO_LOAD:
        break
    try:
        user_review_data[filename] = pd.read_csv(os.path.join(user_reviews_dir, filename), encoding='unicode_escape')
    except pd.errors.EmptyDataError:
        print(f'Empty file: {filename}')
        pass

  0%|          | 100/63111 [00:00<01:17, 810.19it/s]


Now let's split the data into training, validation, and test sets. Since this is a recommender, we're gonna split by removing some of the user's reviews.

For every user, so long as the user has more than 5 reviews, remove one review for the validation set and one review for the test set.

In [4]:
print(list(user_review_data.keys())[0])

asel82_reviews.csv


In [5]:
train_reviews = []
validation_reviews = []
test_reviews = []
for user_id, reviews in tqdm(user_review_data.items()):
    if len(reviews) > 5:
        # randomly remove one review from the user's reviews for the test set and one for the validation set
        reviews_to_remove = reviews.sample(2)
        # test data
        test_review_data = reviews_to_remove.iloc[0].to_dict()
        test_review_data['user_id'] = user_id
        test_reviews.append(test_review_data)
        # validation data
        validation_review_data = reviews_to_remove.iloc[1].to_dict()
        validation_review_data['user_id'] = user_id
        validation_reviews.append(validation_review_data)
        # train data
        train_review_data = reviews.drop(reviews_to_remove.index).to_dict('records')
        for review in train_review_data:
            review['user_id'] = user_id
        
        if (user_id == "ahmad97_reviews.csv"):
          print(len(train_review_data), len(reviews_to_remove))
        train_reviews.extend(train_review_data)
    else:
        # if the user has less than 5 reviews, we will use all of them for training
        train_review_data = reviews.to_dict('records')
        for review in train_review_data:
            review['user_id'] = user_id
        train_reviews.extend(train_review_data)

print(f'Train reviews: {len(train_reviews)}')
print(f'Validation reviews: {len(validation_reviews)}')
print(f'Test reviews: {len(test_reviews)}')

100%|██████████| 100/100 [00:00<00:00, 948.65it/s]

518 2
Train reviews: 45290
Validation reviews: 96
Test reviews: 96





In [6]:
train_reviews[0]

{'movie_title': 'All Too Well: The Short Film',
 'movie_rating': 4.0,
 'movie_id': 807762,
 'film_slug': '/film/all-too-well-the-short-film/',
 'user_id': 'asel82_reviews.csv'}

## Build the Model
Now that we have the training data, let's construct the model to train.

In [7]:
num_train_users = len(set([review['user_id'] for review in train_reviews]))
num_train_items = len(set([review['movie_id'] for review in train_reviews]))
num_nodes = num_train_users + num_train_items
print(f'Number of train users: {num_train_users}')
print(f'Number of train items: {num_train_items}')
print(f'Number of nodes: {num_nodes}')

Number of train users: 100
Number of train items: 12618
Number of nodes: 12718


In [8]:
# Let's map users to ids
movie_id_to_movie_name = dict()
for review in train_reviews:
    movie_id_to_movie_name[review['movie_id']] = review['movie_title']

user_to_id = dict()
for i, user_id in enumerate(set([review['user_id'] for review in train_reviews])):
    user_to_id[user_id] = i

# Let's map movies to ids
movie_to_id = dict()
for i, movie_id in enumerate(set([review['movie_id'] for review in train_reviews])):
    movie_to_id[movie_id] = i + num_train_users

# Let's map ids to users
id_to_user = dict()
for user_id, index in user_to_id.items():
    id_to_user[index] = user_id

# Let's map ids to movies
id_to_movie = dict()
for movie_id, index in movie_to_id.items():
    id_to_movie[index] = movie_id

In [9]:
# Let's remove any data in our validation and test sets that have ids that are not in our training set
# Before removal:
print(f'Validation reviews: {len(validation_reviews)}')
print(f'Test reviews: {len(test_reviews)}')

# Removal
validation_reviews = [review for review in validation_reviews if review['user_id'] in user_to_id and review['movie_id'] in movie_to_id]
test_reviews = [review for review in test_reviews if review['user_id'] in user_to_id and review['movie_id'] in movie_to_id]

# After removal:
print(f'Validation reviews: {len(validation_reviews)}')
print(f'Test reviews: {len(test_reviews)}')

Validation reviews: 96
Test reviews: 96
Validation reviews: 88
Test reviews: 83


In [10]:
import random

def convert_review_to_edge(review):
    user_id = user_to_id[review['user_id']]
    movie_id = movie_to_id[review['movie_id']]
    edge_weight = review['movie_rating']
    if (edge_weight < 3.5 and edge_weight > 2.5):
        return None, None
    edge = (user_id, movie_id)
    edge_weight = review['movie_rating']
    return edge, edge_weight

def shuffle_edges_and_edge_weights(edges, edge_weights):
    c = list(zip(edges, edge_weights))
    random.shuffle(c)
    return zip(*c)

def convert_reviews_to_edges(reviews):
    edges = []
    edge_weights = []
    for review in tqdm(reviews):
        edge, edge_weight = convert_review_to_edge(review)
        if edge is not None:
            edges.append(edge)
            edge_weights.append(edge_weight)
    
    # Reformat the edges to be a tensor
    edges = torch.tensor(edges, dtype=torch.long).t().contiguous()
    return edges, edge_weights

In [11]:
# Now let's create the edges between users and movies.
# The id of the user will be the index of the user in the user_to_id dict
# The id of the movie will be the index of the movie in the movie_to_id dict + the number of users

train_edges, train_edge_weights = convert_reviews_to_edges(train_reviews)
validation_edges, validation_edge_weights = convert_reviews_to_edges(validation_reviews)

print(f'Train edges: {train_edges.shape[1]}')
print(f'Validation edges: {validation_edges.shape[1]}')

100%|██████████| 45290/45290 [00:00<00:00, 2479442.77it/s]
100%|██████████| 88/88 [00:00<00:00, 792057.41it/s]

Train edges: 38675
Validation edges: 77





In [12]:
import torch_geometric.data as data

# create the graph
train_graph = data.Data(
    edge_index=train_edges,
    edge_attr=torch.tensor(train_edge_weights),
    num_nodes=num_nodes
)

validation_graph = data.Data(
    edge_index=validation_edges,
    edge_attr=torch.tensor(validation_edge_weights),
    num_nodes=num_nodes
)

In [13]:
train_graph.validate(raise_on_error=True)
validation_graph.validate(raise_on_error=True)

True

In [14]:
def resample_edges(positive_edges, negative_edges):
    # For every user, determine add fake negative edges until we have the same number of positive edges
    # We will do this by randomly creating negative edges for each user
    additional_negative_edges = []
    for user_id in range(num_train_users):
        # Get the positive edges for this user
        user_positive_edges = positive_edges[:, positive_edges[0] == user_id]
        # Get the negative edges for this user
        user_negative_edges = negative_edges[:, negative_edges[0] == user_id]
        # Determine how many negative edges we need to add
        num_negative_edges_to_add = user_positive_edges.shape[1] - user_negative_edges.shape[1]
        if (num_negative_edges_to_add <= 0):
            num_negative_edges_to_remove = -num_negative_edges_to_add
            # choose the negative edges to keep
            negative_edges_to_keep = torch.randint(user_negative_edges.shape[1], (user_negative_edges.shape[1] - num_negative_edges_to_remove,))
            # remove all the negative edges for this user
            negative_edges = negative_edges[:, negative_edges[0] != user_id]
            # add the negative edges to keep back to the negative edges
            negative_edges = torch.cat([negative_edges, user_negative_edges[:, negative_edges_to_keep]], dim=1)
        else:
            # Create the negative edges
            negative_edges_to_add = torch.tensor([[user_id] * num_negative_edges_to_add, torch.randint(num_train_users, num_train_items, (num_negative_edges_to_add,))], dtype=torch.long)
            # Add the negative edges to the list of additional negative edges
            additional_negative_edges.append(negative_edges_to_add)
    # Concatenate the additional negative edges
    additional_negative_edges = torch.cat(additional_negative_edges, dim=1)
    # Concatenate the additional negative edges with the existing negative edges
    negative_edges = torch.cat([negative_edges, additional_negative_edges], dim=1)
    return positive_edges, negative_edges
        
        

In [15]:
def compute_precision_at_k_memory_efficient(model, num_items, num_users, positive_edges, k=5):
    model.eval()
    # select a random subset of 1000 users
    users = torch.randperm(num_users)[:1000]
    # for each user, use a heapq to keep track of the top k items
    top_k_items = [list() for _ in range(1000)]
    print("Created top k items")
    with torch.no_grad():
        # we're going to go over all possible item, user pairs, but we're going to do it in batches
        for user_id in tqdm(users):
            all_edges_for_user = torch.tensor([(user_id, item_id) for item_id in range(num_items)], dtype=torch.long).t().contiguous()
            top_k_items[user_id] = model(all_edges_for_user).topk(k=k, dim=0)[1].tolist()
        # Check how many of the top k items are in the positive edges
        num_correct = 0
        for user_id, positive_items in enumerate(positive_edges):
            # increment num_correct if the edge (user_id, item_id) is in the positive edges
            num_correct += len(set(top_k_items[user_id]) & set(positive_items.tolist()))
        precision_at_k = num_correct / (num_users * k)
    return precision_at_k

    

In [16]:
# Let's put this on tensorboard
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

In [17]:
import torch.nn as nn

class BPRLoss(nn.Module):
    def __init__(self):
        super(BPRLoss, self).__init__()
    
    def forward(self, model, train_positive_edges, train_negative_edges, num_users):
        loss = torch.tensor(0.0, requires_grad=True)
        for user_id in range(num_users):
            # get all the positive and negative edges for this user
            user_positive_edges = train_positive_edges[:, train_positive_edges[0] == user_id]
            user_negative_edges = train_negative_edges[:, train_negative_edges[0] == user_id]
            if (user_positive_edges.shape[1] == 0 or user_negative_edges.shape[1] == 0):
                continue
            # compute the embeddings for all the positive and negative edges
            positive_edge_embeddings = model.get_embedding(user_positive_edges)
            negative_edge_embeddings = model.get_embedding(user_negative_edges)
            # compute the pairwise differences
            pairwise_differences = positive_edge_embeddings.unsqueeze(1) - negative_edge_embeddings
            # for each pairwise difference we want -log(sigmoid(x))
            # we can do this by doing log(1 + exp(-x))
            user_loss = torch.log(1 + torch.exp(-pairwise_differences)).sum() / (user_positive_edges.shape[1] * user_negative_edges.shape[1])
            loss = loss + user_loss
        loss = loss / num_users
        return loss

In [20]:
model = LightGCN(num_nodes=num_nodes, embedding_dim=EMBEDDING_DIM, num_layers=3)
optim = torch.optim.Adam(model.parameters(), lr=0.001)
loss = BPRLoss()

train_positive_edges = train_graph.edge_index[:, train_graph.edge_attr >= 3.5]
train_negative_edges = train_graph.edge_index[:, train_graph.edge_attr <= 2.5]

BATCH_SIZE = 64

train_positive_edges, train_negative_edges = resample_edges(train_positive_edges, train_negative_edges)

for epoch in tqdm(range(1000)):
    model.train()

    num_batches = train_positive_edges.shape[1] // BATCH_SIZE
    for i in range(0, train_positive_edges.shape[1], BATCH_SIZE):
        positive_edges = train_positive_edges[:, i:i+BATCH_SIZE]
        negative_edges = train_negative_edges[:, i:i+BATCH_SIZE]
        positive_ranks = model(positive_edges)
        negative_ranks = model(negative_edges)
        train_loss = loss(model, positive_edges, negative_edges, num_train_users)
        print("Epoch: {}, Batch: {}, Loss: {}".format(epoch, i // BATCH_SIZE, train_loss))
        writer.add_scalar('Loss/train', train_loss, epoch * num_batches + i // BATCH_SIZE)
        optim.zero_grad()
        train_loss.backward()
        optim.step()

    if (epoch % 100 == 0):
        # Let's print the top ranked movies for a few users
        model.eval()
        for user_id in range(10):
            all_edges_for_user = torch.tensor([(user_id, item_id) for item_id in range(num_train_users, num_nodes)], dtype=torch.long).t().contiguous()
            top_k_items_and_scores = model(all_edges_for_user).topk(k=5, dim=0)
            top_k_items = top_k_items_and_scores[1].tolist()
            top_k_scores = top_k_items_and_scores[0].tolist()
            top_k_movies = [id_to_movie[item_id] for item_id in top_k_items]
            top_k_movie_names = [movie_id_to_movie_name[movie_id] for movie_id in top_k_movies]
            print(f'User: {id_to_user[user_id]}, Top 5 items: {top_k_movie_names}, Top 5 scores: {top_k_scores}')

        # precision_at_k = compute_precision_at_k_memory_efficient(model, num_train_items, num_train_users, train_positive_edges)
        # print(f'Epoch: {epoch}, Precision at k: {precision_at_k}')
        # writer.add_scalar('Precision at k', precision_at_k, epoch)


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch: 0, Batch: 0, Loss: 5294.31494140625
Epoch: 0, Batch: 1, Loss: 0.0
Epoch: 0, Batch: 2, Loss: 0.0
Epoch: 0, Batch: 3, Loss: 0.0
Epoch: 0, Batch: 4, Loss: 0.0
Epoch: 0, Batch: 5, Loss: 0.0
Epoch: 0, Batch: 6, Loss: 0.0
Epoch: 0, Batch: 7, Loss: 0.0
Epoch: 0, Batch: 8, Loss: 0.0
Epoch: 0, Batch: 9, Loss: 0.0
Epoch: 0, Batch: 10, Loss: 0.0
Epoch: 0, Batch: 11, Loss: 0.0
Epoch: 0, Batch: 12, Loss: 0.0
Epoch: 0, Batch: 13, Loss: 0.0
Epoch: 0, Batch: 14, Loss: 0.0
Epoch: 0, Batch: 15, Loss: 0.0
Epoch: 0, Batch: 16, Loss: 0.0
Epoch: 0, Batch: 17, Loss: 0.0
Epoch: 0, Batch: 18, Loss: 0.0
Epoch: 0, Batch: 19, Loss: 0.0
Epoch: 0, Batch: 20, Loss: 0.0
Epoch: 0, Batch: 21, Loss: 0.0
Epoch: 0, Batch: 22, Loss: 0.0
Epoch: 0, Batch: 23, Loss: 0.0
Epoch: 0, Batch: 24, Loss: 0.0
Epoch: 0, Batch: 25, Loss: 0.0
Epoch: 0, Batch: 26, Loss: 0.0
Epoch: 0, Batch: 27, Loss: 0.0
Epoch: 0, Batch: 28, Loss: 0.0
Epoch: 0, Batch: 29, Loss: 0.0
Epoch: 0, Batch: 30, Loss: 0.0
Epoch: 0, Batch: 31, Loss: 0.0
Epoch

: 

: 