<a href="https://colab.research.google.com/github/tomasonjo/blogs/blob/master/pyg2neo/Movie_recommendations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentence_transformers neo4j

Collecting sentence_transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 953 kB/s 
[?25hCollecting neo4j
  Downloading neo4j-4.4.1.tar.gz (89 kB)
[K     |████████████████████████████████| 89 kB 6.9 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 45.9 MB/s 
[?25hCollecting tokenizers>=0.10.3
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 20.3 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 43.5 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.4 MB/s 
Collecting sacremoses
  Do

In [2]:
!pip install torch-scatter torch-sparse torch-cluster torch-geometric -f https://data.pyg.org/whl/torch-1.10.0+cpu.html

Looking in links: https://data.pyg.org/whl/torch-1.10.0+cpu.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-1.10.0%2Bcpu/torch_scatter-2.0.9-cp37-cp37m-linux_x86_64.whl (291 kB)
[K     |████████████████████████████████| 291 kB 4.9 MB/s 
[?25hCollecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-1.10.0%2Bcpu/torch_sparse-0.6.12-cp37-cp37m-linux_x86_64.whl (639 kB)
[K     |████████████████████████████████| 639 kB 43.1 MB/s 
[?25hCollecting torch-cluster
  Downloading https://data.pyg.org/whl/torch-1.10.0%2Bcpu/torch_cluster-1.5.9-cp37-cp37m-linux_x86_64.whl (328 kB)
[K     |████████████████████████████████| 328 kB 37.2 MB/s 
[?25hCollecting torch-geometric
  Downloading torch_geometric-2.0.3.tar.gz (370 kB)
[K     |████████████████████████████████| 370 kB 5.3 MB/s 
Collecting rdflib
  Downloading rdflib-6.1.1-py3-none-any.whl (482 kB)
[K     |████████████████████████████████| 482 kB 49.4 MB/s 
Collecting yacs
  Downloading yacs-0.1.8-py3-

In [3]:
import torch
import pandas as pd
from torch.nn import Linear
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer

import torch_geometric.transforms as T
from torch_geometric.nn import SAGEConv, to_hetero

from torch_geometric.data import HeteroData
from torch_geometric.transforms import ToUndirected, RandomLinkSplit

# Prefer a CUDA device when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [4]:
# Show which device (CUDA or CPU) was selected for this run.
print(device)

cpu


In [5]:
from neo4j import GraphDatabase

import os

# Connection settings can be overridden through environment variables so that
# credentials do not have to be edited into the notebook itself. The fallback
# values are the ones this notebook was originally written with.
# NOTE(review): the fallback password is still stored in the notebook; confirm
# it is a disposable demo credential, otherwise remove the default.
url = os.environ.get('NEO4J_URL', 'bolt://3.86.43.255:7687')
user = os.environ.get('NEO4J_USER', 'neo4j')
password = os.environ.get('NEO4J_PASSWORD', 'company-science-journals')

# Single driver instance shared by every query helper in this notebook.
driver = GraphDatabase.driver(url, auth=(user, password))

def fetch_data(query):
    """Run a Cypher query and return the result as a pandas DataFrame.

    Args:
        query: Cypher statement executed in a fresh session of the
            module-level ``driver``.

    Returns:
        A DataFrame with one row per returned record; the column names are
        the keys reported by the query result.
    """
    with driver.session() as session:
        result = session.run(query)
        # Materialise the records while the session is still open.
        rows = [record.values() for record in result]
        column_names = result.keys()
        return pd.DataFrame(rows, columns=column_names)


In [6]:
def load_node(cypher, index_col, encoders=None, **kwargs):
    """Fetch nodes with a Cypher query and build their features and index mapping.

    Args:
        cypher: Cypher query handed to ``fetch_data``.
        index_col: Name of the returned column that identifies a node.
        encoders: Optional dict mapping a column name to a callable. Each
            callable receives that column and must return a tensor with one
            row per node; the tensors are concatenated along the last
            dimension.
        **kwargs: Accepted for call-site compatibility; not used.

    Returns:
        A tuple ``(x, mapping)``. ``x`` is the concatenated feature tensor, or
        ``None`` when no encoders are given. ``mapping`` maps each node id to
        its consecutive integer index, which is also its row in ``x``.
    """
    df = fetch_data(cypher).set_index(index_col)
    # Keep only the first row per id. The mapping holds one entry per unique
    # id, so encoding duplicated rows would leave the rows of `x` out of step
    # with the indices stored in `mapping`.
    df = df[~df.index.duplicated(keep='first')]
    mapping = {index: i for i, index in enumerate(df.index)}

    x = None
    if encoders is not None:
        xs = [encoder(df[col]) for col, encoder in encoders.items()]
        x = torch.cat(xs, dim=-1)

    return x, mapping

In [7]:
def load_edge(cypher, src_index_col, src_mapping, dst_index_col, dst_mapping,
                  encoders=None, **kwargs):
    """Fetch relationships with a Cypher query and build edge tensors.

    Args:
        cypher: Cypher query handed to ``fetch_data``.
        src_index_col: Column holding the source node ids.
        src_mapping: Dict translating source node ids to integer indices.
        dst_index_col: Column holding the destination node ids.
        dst_mapping: Dict translating destination node ids to integer indices.
        encoders: Optional dict mapping a column name to a callable that turns
            that column into a tensor; results are concatenated along the
            last dimension.
        **kwargs: Accepted for call-site compatibility; not used.

    Returns:
        A tuple ``(edge_index, edge_attr)``. ``edge_index`` stacks the source
        indices (first row) over the destination indices (second row);
        ``edge_attr`` is ``None`` when no encoders are given.

    Raises:
        KeyError: If an id returned by the query is missing from its mapping.
    """
    frame = fetch_data(cypher)

    # Translate raw ids into the consecutive indices used for the node tensors.
    source_ids = list(map(src_mapping.__getitem__, frame[src_index_col]))
    target_ids = list(map(dst_mapping.__getitem__, frame[dst_index_col]))
    edge_index = torch.tensor([source_ids, target_ids])

    if encoders is None:
        return edge_index, None

    encoded_columns = []
    for column, encoder in encoders.items():
        encoded_columns.append(encoder(frame[column]))
    return edge_index, torch.cat(encoded_columns, dim=-1)

In [8]:
class SequenceEncoder(object):
    """Encode a column of raw strings into embeddings.

    The embeddings come from a ``SentenceTransformer`` model that is loaded
    once when the encoder is constructed.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2', device=None):
        self.device = device
        self.model = SentenceTransformer(model_name, device=device)

    def __call__(self, df):
        """Return the embeddings of ``df.values`` as a tensor on the CPU."""
        # No gradients are needed; the model is only used for inference here.
        with torch.no_grad():
            embeddings = self.model.encode(
                df.values,
                show_progress_bar=True,
                convert_to_tensor=True,
                device=self.device,
            )
        return embeddings.cpu()

In [9]:
class GenresEncoder(object):
    """Multi-hot encode a column of ``sep``-separated genre strings.

    Each distinct genre found in the column becomes one output column. The
    columns are ordered by sorted genre name, so the layout of the feature
    matrix is the same on every run (iterating a plain ``set`` of strings
    gives no such guarantee).
    """

    def __init__(self, sep='|'):
        self.sep = sep

    def __call__(self, df):
        """Return a float tensor of shape ``[len(df), number of genres]``.

        Entry ``[i, j]`` is 1 when row ``i`` lists genre ``j`` and 0 otherwise.
        """
        # Split every row once and reuse the pieces for both passes.
        split_rows = [value.split(self.sep) for value in df.values]
        genres = sorted({genre for row in split_rows for genre in row})
        mapping = {genre: i for i, genre in enumerate(genres)}

        x = torch.zeros(len(df), len(mapping))
        for i, row in enumerate(split_rows):
            for genre in row:
                x[i, mapping[genre]] = 1
        return x

In [33]:
class IdentityEncoder(object):
    """Convert a column's raw values into a PyTorch tensor.

    Args:
        dtype: Passed to ``Tensor.to`` after the conversion (default ``None``).
    """

    def __init__(self, dtype=None):
        self.dtype = dtype

    def __call__(self, df):
        """Return ``df.values`` as a tensor converted with ``self.dtype``."""
        raw_values = df.values
        tensor = torch.from_numpy(raw_values)
        return tensor.to(self.dtype)

In [34]:
# No encoders are passed for users, so `user_x` is None and only the
# userId -> index mapping is used further down.
user_query = "MATCH (u:User) RETURN u.userId AS userId"
user_x, user_mapping = load_node(user_query, index_col='userId')


In [12]:
# One row per movie; its genres are joined into a single '|'-separated string
# so that GenresEncoder can split them again.
movie_query = """
MATCH (m:Movie)-[:IN_GENRE]->(genre:Genre)
WITH m, collect(genre.name) AS genres_list
RETURN m.movieId AS movieId, m.title AS title, apoc.text.join(genres_list, '|') AS genres
"""

# Movie features = title embedding concatenated with multi-hot genres.
movie_encoders = {
    'title': SequenceEncoder(),
    'genres': GenresEncoder(),
}
movie_x, movie_mapping = load_node(
    movie_query, index_col='movieId', encoders=movie_encoders)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/286 [00:00<?, ?it/s]

In [35]:
# One row per RATED relationship between a user and a movie.
rating_query = """
MATCH (u:User)-[r:RATED]->(m:Movie) 
RETURN u.userId AS userId, m.movieId AS movieId, r.rating AS rating
"""

# The rating becomes the edge label.
# NOTE(review): casting to torch.long truncates any fractional ratings;
# confirm the stored ratings are integral.
rating_encoders = {'rating': IdentityEncoder(dtype=torch.long)}
edge_index, edge_label = load_edge(
    rating_query,
    src_index_col='userId', src_mapping=user_mapping,
    dst_index_col='movieId', dst_mapping=movie_mapping,
    encoders=rating_encoders,
)

In [36]:
data = HeteroData()

# Users do not have any features, so give every user a one-hot (identity)
# feature vector for message passing.
num_users = len(user_mapping)
data['user'].x = torch.eye(num_users, device=device)

# Movies use the encoded title/genre features built earlier.
data['movie'].x = movie_x

# Rating edges and their labels.
rates = data['user', 'rates', 'movie']
rates.edge_index = edge_index
rates.edge_label = edge_label

data.to(device)

HeteroData(
  [1muser[0m={ x=[671, 671] },
  [1mmovie[0m={ x=[9125, 404] },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 100004],
    edge_label=[100004]
  }
)

In [31]:
import os.path as osp
from torch_geometric.datasets import MovieLens
# Load the MovieLens dataset from torch_geometric.datasets alongside the
# Neo4j-derived graph; the name and the dataset object get separate variables.
dataset_name = 'MyLens'
path = osp.join('.', 'data', dataset_name)
dataset = MovieLens(path, model_name='all-MiniLM-L6-v2')
first_graph = dataset[0]
data_nice = first_graph.to(device)

In [37]:
# 1. Add the reverse ('movie', 'rev_rates', 'user') relation so that messages
#    can also flow from movies back to users.
data = ToUndirected()(data)
del data['movie', 'rev_rates', 'user'].edge_label  # Remove "reverse" label.

# 2. Perform a link-level split into training, validation, and test edges.
# Seed the torch RNG first so the random split (and the steps that follow it)
# is reproducible from a fresh kernel.
SEED = 42
torch.manual_seed(SEED)
transform = RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,  # No negative edges are sampled.
    edge_types=[('user', 'rates', 'movie')],
    rev_edge_types=[('movie', 'rev_rates', 'user')],
)
train_data, val_data, test_data = transform(data)

In [38]:
class GNNEncoder(torch.nn.Module):
    """Two stacked ``SAGEConv`` layers with a ReLU in between.

    Args:
        hidden_channels: Output size of the first convolution.
        out_channels: Output size of the second convolution.
    """

    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        # Input sizes are given as (-1, -1) rather than concrete numbers.
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        """Apply both convolutions to ``x`` over ``edge_index``."""
        hidden = self.conv1(x, edge_index)
        hidden = hidden.relu()
        return self.conv2(hidden, edge_index)


class EdgeDecoder(torch.nn.Module):
    """Predict one scalar per (user, movie) pair from node embeddings.

    Args:
        hidden_channels: Size of each node embedding; the concatenated pair
            therefore has ``2 * hidden_channels`` features.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        """Return a 1-D tensor with one prediction per column of ``edge_label_index``.

        Args:
            z_dict: Dict with ``'user'`` and ``'movie'`` embedding tensors.
            edge_label_index: Two rows of indices: users first, movies second.
        """
        user_idx, movie_idx = edge_label_index
        user_z = z_dict['user'][user_idx]
        movie_z = z_dict['movie'][movie_idx]
        pair_features = torch.cat((user_z, movie_z), dim=-1)

        hidden = torch.relu(self.lin1(pair_features))
        scores = self.lin2(hidden)
        # Drop the trailing singleton dimension: one score per edge.
        return scores.view(-1)

In [39]:
class Model(torch.nn.Module):
    """Heterogeneous GNN encoder followed by an edge-level decoder.

    Args:
        hidden_channels: Embedding size used by both encoder and decoder.
        metadata: Optional graph metadata passed to ``to_hetero``. When
            omitted, ``data.metadata()`` of the notebook-level ``data`` object
            is used, so existing ``Model(hidden_channels=...)`` calls behave
            as before.
    """

    def __init__(self, hidden_channels, metadata=None):
        super().__init__()
        if metadata is None:
            # Backward-compatible default: read it from the global graph.
            metadata = data.metadata()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, metadata, aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        """Encode all nodes, then decode a score for each edge in ``edge_label_index``."""
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)

In [40]:
# Build the model with 32-dimensional embeddings, then move it to the device.
model = Model(hidden_channels=32)
model = model.to(device)

In [42]:
# Per-label weights for the loss: count how often each label value occurs in
# the training edges, then divide the largest count by every count so that
# rarer labels get proportionally larger weights.
label_counts = torch.bincount(train_data['user', 'movie'].edge_label)
weight = label_counts.max() / label_counts

def weighted_mse_loss(pred, target, weight=None):
    """Mean squared error with an optional per-label weight.

    Args:
        pred: Predicted values.
        target: Target labels; also used to index into ``weight``.
        weight: Optional tensor indexed by label value. When ``None`` every
            sample has weight 1.

    Returns:
        The mean of ``weight * (pred - target) ** 2`` as a scalar tensor.
    """
    if weight is None:
        sample_weight = 1.
    else:
        # Look up each sample's weight by its label.
        sample_weight = weight[target].to(pred.dtype)
    squared_error = (pred - target.to(pred.dtype)).pow(2)
    return (sample_weight * squared_error).mean()

In [43]:
# Due to lazy initialization, we need to run one model step so the number
# of parameters can be inferred:
# The forward pass runs under no_grad and its result is discarded; it is only
# here to trigger that initialization.
with torch.no_grad():
    model.encoder(train_data.x_dict, train_data.edge_index_dict)

# Create the optimizer only after the warm-up pass above, so that
# model.parameters() is read once the lazy layers have been initialized.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [44]:
def train():
    """Run one full-batch optimisation step on ``train_data``.

    Uses the notebook-level ``model``, ``optimizer``, ``train_data`` and
    ``weight`` objects.

    Returns:
        The weighted MSE training loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    supervision = train_data['user', 'movie']
    pred = model(
        train_data.x_dict,
        train_data.edge_index_dict,
        supervision.edge_label_index,
    )
    loss = weighted_mse_loss(pred, supervision.edge_label, weight)

    loss.backward()
    optimizer.step()
    return float(loss)

In [46]:
@torch.no_grad()
def test(data):
    """Compute the RMSE of the model's predictions on one data split.

    Args:
        data: A split whose ``('user', 'movie')`` edges carry
            ``edge_label_index`` and ``edge_label``.

    Returns:
        The root mean squared error as a Python float. Predictions are
        clamped to the range [0, 5] before the error is computed.
    """
    model.eval()
    supervision = data['user', 'movie']
    raw_pred = model(data.x_dict, data.edge_index_dict,
                     supervision.edge_label_index)
    pred = raw_pred.clamp(min=0, max=5)
    target = supervision.edge_label.float()
    return float(F.mse_loss(pred, target).sqrt())

In [47]:
# Train for epochs 1..299, reporting the loss and the RMSE on every split
# after each optimisation step.
for epoch in range(1, 300):
    loss = train()
    train_rmse, val_rmse, test_rmse = map(
        test, (train_data, val_data, test_data))
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
          f'Val: {val_rmse:.4f}, Test: {test_rmse:.4f}')

Epoch: 001, Loss: 18.1004, Train: 3.2379, Val: 3.2330, Test: 3.2382
Epoch: 002, Loss: 16.2975, Train: 2.9616, Val: 2.9584, Test: 2.9636
Epoch: 003, Loss: 13.7349, Train: 2.4912, Val: 2.4913, Test: 2.4967
Epoch: 004, Loss: 10.1835, Train: 1.7573, Val: 1.7625, Test: 1.7685
Epoch: 005, Loss: 6.8211, Train: 1.1065, Val: 1.1074, Test: 1.1145
Epoch: 006, Loss: 8.0671, Train: 1.1376, Val: 1.1337, Test: 1.1404
Epoch: 007, Loss: 9.3780, Train: 1.1089, Val: 1.1123, Test: 1.1197
Epoch: 008, Loss: 7.4036, Train: 1.3432, Val: 1.3500, Test: 1.3569
Epoch: 009, Loss: 6.2444, Train: 1.6741, Val: 1.6793, Test: 1.6857
Epoch: 010, Loss: 6.5007, Train: 1.8877, Val: 1.8915, Test: 1.8975
Epoch: 011, Loss: 7.0878, Train: 1.9598, Val: 1.9631, Test: 1.9691
Epoch: 012, Loss: 7.3351, Train: 1.9107, Val: 1.9145, Test: 1.9206
Epoch: 013, Loss: 7.1261, Train: 1.7668, Val: 1.7719, Test: 1.7783
Epoch: 014, Loss: 6.6227, Train: 1.5553, Val: 1.5622, Test: 1.5692
Epoch: 015, Loss: 6.1015, Train: 1.3245, Val: 1.3332, Test

In [50]:
# Score every movie for a single user.
# `user_id` is the node index of the user (a value of `user_mapping`), not the
# raw userId stored in the database.
user_id = 0
num_movies = len(movie_mapping)

# Pair the chosen user with every movie index. The index tensors are created
# on the same device as the model and graph.
row = torch.full((num_movies,), user_id, dtype=torch.long, device=device)
col = torch.arange(num_movies, device=device)
edge_label_index = torch.stack([row, col], dim=0)

# Inference only: switch to eval mode and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    pred = model(data.x_dict, data.edge_index_dict,
                 edge_label_index)
# Clamp to the rating range and move to the CPU so the scores can be turned
# into NumPy values later, regardless of where the model ran.
pred = pred.clamp(min=0, max=5).cpu()

In [60]:
# Invert movieId -> index so predicted indices can be reported as movie ids.
reverse_movie_mapping = {index: movie_id
                         for movie_id, index in movie_mapping.items()}

# Convert once, outside the loop. `.cpu()` is required before `.numpy()` when
# the predictions live on a GPU, and is a no-op on the CPU.
scores = pred.detach().cpu().numpy()

# Print every movie whose predicted score is above 4.
for i, score in enumerate(scores):
    if score > 4:
        movie_id = reverse_movie_mapping[i]
        print(score, movie_id)

4.0598617 110352
4.0120296 108979
4.1322947 62331
4.185063 64695
4.194876 71433
4.548223 34435
4.172068 26265
4.475247 31923
4.508934 8341
4.0386944 8591
4.7643223 8537
5.0 7087
4.2244244 7700
4.04606 7771
4.1852484 7840
4.4977345 5300
4.2502284 5069
4.2183247 4688
4.347323 4933
4.8019185 4567
4.5926213 4591
4.1736975 3926
4.191746 2893
4.5188985 2880
4.401515 2625
4.417151 107559
4.054242 106762
5.0 83603
5.0 72612
4.0232973 66335
4.2175603 67429
4.2942324 31184
4.03687 6713
4.6466603 2810
4.0433006 110461
4.1122184 8761
4.749738 755
5.0 114265
5.0 110645
4.23756 109483
4.144233 108514
4.9258256 143859
4.746245 148956
4.2014356 141886
4.0177484 129313
4.3455296 128846
5.0 82931
4.3412914 80717
4.3096323 79469
4.4034314 77907
4.2927575 72781
4.4668264 99030
4.025861 93855
4.258225 91947
5.0 91690
4.854241 90061
4.061634 60128
4.9872293 61361
4.0194736 64321
4.299729 64660
4.4593635 70201
4.393086 48817
4.2616 49013
4.309868 52668
5.0 54328
4.5755296 55566
4.885948 39416
4.2300286 45000