In [2]:
!pip install torch_geometric
!pip install torch
!pip install torch_sparse
!pip install torch_scatter

Collecting torch_geometric
  Using cached torch_geometric-2.2.0-py3-none-any.whl
Collecting scikit-learn
  Using cached scikit_learn-1.2.1-cp310-cp310-macosx_12_0_arm64.whl (8.4 MB)
Collecting psutil>=5.8.0
  Using cached psutil-5.9.4-cp38-abi3-macosx_11_0_arm64.whl (244 kB)
Collecting jinja2
  Using cached Jinja2-3.1.2-py3-none-any.whl (133 kB)
Collecting MarkupSafe>=2.0
  Using cached MarkupSafe-2.1.2-cp310-cp310-macosx_10_9_universal2.whl (17 kB)
Collecting joblib>=1.1.1
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, psutil, MarkupSafe, joblib, scikit-learn, jinja2, torch_geometric
Successfully installed MarkupSafe-2.1.2 jinja2-3.1.2 joblib-1.2.0 psutil-5.9.4 scikit-learn-1.2.1 threadpoolctl-3.1.0 torch_geometric-2.2.0
Collecting torch
  Using cached torch-1.13.1-cp310-none-macosx_11_0_arm64.whl (53.2 MB)
Collecting typing-extensions
  Down

In [2]:
from torch_geometric.nn.models.lightgcn import LightGCN
import pandas as pd
import os
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Load Data
We can begin by loading in the user review data. For each user, we have a subset of the movies that they reviewed. We'll load each of the CSVs as dataframes, and store a dict of user IDs corresponding to their dataframes.

In [3]:
# for now we will use the first 10k rows of the data, set to None to use all data
AMOUNT_TO_LOAD = 10000

In [6]:
user_reviews_dir = 'user_reviews'
user_review_data = dict()

for filename in tqdm(os.listdir(user_reviews_dir)):
    if AMOUNT_TO_LOAD is not None and len(user_review_data) >= AMOUNT_TO_LOAD:
        break
    try:
        user_review_data[filename] = pd.read_csv(os.path.join(user_reviews_dir, filename), encoding='unicode_escape')
    except pd.errors.EmptyDataError:
        print(f'Empty file: {filename}')
        pass

Empty file: sebastian823_reviews.csv


Now let's split the data into training, validation, and test sets. Since this is a recommender, we're gonna split by removing some of the user's reviews.

For every user, so long as the user has more than 5 reviews, remove one review for the validation set and one review for the test set.

In [13]:
print(list(user_review_data.keys())[0])

asel82_reviews.csv


In [20]:
train_reviews = []
validation_reviews = []
test_reviews = []
for user_id, reviews in tqdm(user_review_data.items()):
    if len(reviews) > 5:
        # randomly remove one review from the user's reviews for the test set and one for the validation set
        reviews_to_remove = reviews.sample(2)
        # test data
        test_review_data = reviews_to_remove.iloc[0].to_dict()
        test_review_data['user_id'] = user_id
        test_reviews.append(test_review_data)
        # validation data
        validation_review_data = reviews_to_remove.iloc[1].to_dict()
        validation_review_data['user_id'] = user_id
        validation_reviews.append(validation_review_data)
        # train data
        train_review_data = reviews.drop(reviews_to_remove.index).to_dict('records')
        for review in train_review_data:
            review['user_id'] = user_id
        train_reviews.extend(train_review_data)
    else:
        # if the user has less than 5 reviews, we will use all of them for training
        train_review_data = reviews.to_dict('records')
        for review in train_review_data:
            review['user_id'] = user_id
        train_reviews.extend(train_review_data)

print(f'Train reviews: {len(train_reviews)}')
print(f'Validation reviews: {len(validation_reviews)}')
print(f'Test reviews: {len(test_reviews)}')

100%|██████████| 10000/10000 [00:11<00:00, 884.43it/s]

Train reviews: 5322200
Validation reviews: 9857
Test reviews: 9857





In [21]:
train_reviews[0]

{'movie_title': 'All Too Well: The Short Film',
 'movie_rating': 4.0,
 'movie_id': 807762,
 'film_slug': '/film/all-too-well-the-short-film/',
 'user_id': 'asel82_reviews.csv'}

## Build the Model
Now that we have the training data, let's construct the model to train.

In [23]:
num_train_users = len(set([review['user_id'] for review in train_reviews]))
num_train_items = len(set([review['movie_id'] for review in train_reviews]))
num_nodes = num_train_users + num_train_items
print(f'Number of train users: {num_train_users}')
print(f'Number of train items: {num_train_items}')
print(f'Number of nodes: {num_nodes}')

Number of train users: 10000
Number of train items: 159981
Number of nodes: 169981


In [24]:
model = LightGCN(
    num_nodes=num_nodes,
    embedding_dim=64,
    num_layers=3
)

In [None]:
from torch.nn import Embedding
# Let's create two embedding models, one for users and one for items
user_embedding = Embedding(num_embeddings=num_train_users, embedding_dim=64)
item_embedding = Embedding(num_embeddings=num_train_items, embedding_dim=64)