In [39]:
import numpy as np
import torch
import pandas as pd
import torch.nn.functional as F
from torch.nn import Linear, MSELoss
import torch_geometric.transforms as T
from torch_geometric.nn import SAGEConv, to_hetero, GCNConv
from torch_geometric.transforms import RandomLinkSplit, ToUndirected
from torch_geometric.data import HeteroData
from torch_geometric.nn import GATv2Conv, RGCNConv, HeteroConv, GINConv
from torch_geometric.utils.dropout import *

data_folder = "../data/interim/"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [40]:
device

device(type='cuda')

In [41]:
users = pd.read_csv(data_folder + "users.csv")
items = pd.read_csv(data_folder + "items.csv")
ratings = pd.read_csv(data_folder + "ratings.csv")
genres = pd.read_csv("../data/ml-100k/u.genre", delimiter="|", names=["name","index"])

In [42]:

src = ratings["user_id"] - 1
dst = ratings["item_id"] - 1
attrs = ratings["rating"]

edge_index = torch.tensor([src, dst], dtype=torch.int64)
edge_attr = torch.tensor(attrs)


In [43]:
from sentence_transformers import SentenceTransformer

def SequenceEncoder(movie_titles , model_name=None):
    model = SentenceTransformer(model_name, device=device)
    title_embeddings = model.encode(movie_titles, show_progress_bar=True,
                              convert_to_tensor=True, device=device)
    
    return title_embeddings.to("cpu")

item_title = SequenceEncoder(items["movie_title"], model_name='all-MiniLM-L6-v2')
item_genres = torch.tensor(items[genres.name].to_numpy(), dtype=torch.bool)
item_release_year = torch.tensor(items["release_year"].to_numpy()[:,np.newaxis], dtype=torch.int32)

item_x = torch.cat((item_title, item_genres), dim=-1).float()
# item_x = item_genres.float()

Batches:   0%|          | 0/53 [00:00<?, ?it/s]

In [44]:
occupations = [i for i in users.keys() if i.startswith("occupation_")]  # Extract occupation columns
user_ages = torch.tensor(users["age"].to_numpy()[:,np.newaxis], dtype=torch.uint8)  # Convert user ages to tensor
user_sex = torch.tensor(users[["male", "female"]].to_numpy(), dtype=torch.bool)  # Convert user sex to tensor
user_occupation = torch.tensor(users[occupations].to_numpy(), dtype=torch.bool)  # Convert user occupations to tensor
user_x = torch.cat((user_ages, user_sex, user_occupation), dim=-1).float()  # Concatenate user features and convert to float
data = HeteroData()  # Create a new HeteroData object
data['user'].x = user_x  # Set user features in the HeteroData object
data['item'].x = item_x  # Set item features in the HeteroData object
data['user', 'rates', 'item'].edge_index = edge_index  # Set edge index for user-item ratings
data['user', 'rates', 'item'].edge_label = edge_attr  # Set edge label for user-item ratings


In [45]:
data = ToUndirected()(data)
del data['item', 'rev_rates', 'user'].edge_label  # Remove "reverse" label.
data = data.to(device)
# Perform a link-level split into training, validation, and test edges.
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('user', 'rates', 'item')],
    rev_edge_types=[('item', 'rev_rates', 'user')],
)(data)

In [46]:
weight = torch.bincount(train_data['user', 'rates', 'item'].edge_label)
weight = weight.max() / weight
def weighted_mse_loss(pred, target, weight=None):
    weight = 1. if weight is None else weight[target].to(pred.dtype)
    return (weight * (pred - target.to(pred.dtype)).pow(2)).mean()

In [47]:
class GNNEncoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GATv2Conv((-1, -1), 32, add_self_loops=False)
        self.conv2 = GATv2Conv((-1, -1), 32, add_self_loops=False)
        
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)
        
    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict['user'][row], z_dict['item'][col]], dim=-1)
        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)

class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder()
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)
        
    def forward(self, x_dict, edge_index_dict, edge_label_index):
        edge_label_index, mask = dropout_edge(edge_label_index, p=0.25, training=self.training)
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index), mask

model = Model(hidden_channels=32).to(device)
with torch.no_grad():
    model.encoder(train_data.x_dict, train_data.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [48]:
def train():
    # Set the model to training mode
    model.train()
    # Zero the gradients
    optimizer.zero_grad()
    # Make a prediction using the model
    pred, mask = model(train_data.x_dict, train_data.edge_index_dict,
                 train_data['user', 'rates', 'item'].edge_label_index)
    # Get the target values
    target = train_data['user', 'rates', 'item'].edge_label
    # Calculate the weighted mean squared error loss
    loss = weighted_mse_loss(pred, target[mask], weight)
    # Backpropagate the loss
    loss.backward()
    # Update the model's parameters
    optimizer.step()
    # Return the loss as a float
    return float(loss)

# Decorator to indicate that the following function does not require gradients
@torch.no_grad()
def test(data):
    # Set the model to evaluation mode
    model.eval()
    # Make a prediction using the model
    pred, _ = model(data.x_dict, data.edge_index_dict,
                 data['user', 'rates', 'item'].edge_label_index)
    # Clamp the predictions between 0 and 5
    pred = pred.clamp(min=0, max=5)
    # Get the target values
    target = data['user', 'rates', 'item'].edge_label.float()
    # Calculate the root mean squared error
    rmse = F.mse_loss(pred, target).sqrt()
    # Return the root mean squared error as a float
    return float(rmse)

# Loop through 1000 epochs
for epoch in range(1, 1000):
    # Train the model and get the loss
    loss = train()
    # Test the model on the training data
    train_rmse = test(train_data)
    # Test the model on the validation data
    val_rmse = test(val_data)
    # Test the model on the test data
    test_rmse = test(test_data)
    # Print the epoch number, loss, and evaluation metrics
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
          f'Val: {val_rmse:.4f}, Test: {test_rmse:.4f}')

Epoch: 001, Loss: 11.9672, Train: 2.7084, Val: 2.6878, Test: 2.7096
Epoch: 002, Loss: 9.8070, Train: 2.4416, Val: 2.4217, Test: 2.4430
Epoch: 003, Loss: 7.9835, Train: 2.1786, Val: 2.1596, Test: 2.1800
Epoch: 004, Loss: 6.4571, Train: 1.9222, Val: 1.9047, Test: 1.9239
Epoch: 005, Loss: 5.1961, Train: 1.6810, Val: 1.6654, Test: 1.6829
Epoch: 006, Loss: 4.2980, Train: 1.4653, Val: 1.4526, Test: 1.4676
Epoch: 007, Loss: 3.6560, Train: 1.2902, Val: 1.2817, Test: 1.2930
Epoch: 008, Loss: 3.3795, Train: 1.1718, Val: 1.1687, Test: 1.1753
Epoch: 009, Loss: 3.3941, Train: 1.1210, Val: 1.1236, Test: 1.1252
Epoch: 010, Loss: 3.6748, Train: 1.1193, Val: 1.1261, Test: 1.1240
Epoch: 011, Loss: 3.9619, Train: 1.1313, Val: 1.1403, Test: 1.1363
Epoch: 012, Loss: 4.1571, Train: 1.1366, Val: 1.1463, Test: 1.1417
Epoch: 013, Loss: 4.2568, Train: 1.1314, Val: 1.1404, Test: 1.1364
Epoch: 014, Loss: 4.2433, Train: 1.1215, Val: 1.1289, Test: 1.1263
Epoch: 015, Loss: 4.0219, Train: 1.1160, Val: 1.1211, Test: 1

In [49]:
from tqdm import tqdm



from tqdm import tqdm

total_users = len(users)
total_movies = len(items)
movie_recs = []
for user_id in tqdm(range(0, total_users)):
    user_row = torch.tensor([user_id] * total_movies)
    all_movie_ids = torch.arange(total_movies)
    edge_label_index = torch.stack([user_row, all_movie_ids], dim=0)
    pred, _ = model(data.x_dict, data.edge_index_dict,
             edge_label_index)
    rec_movie_ids = (pred > 4.2).nonzero(as_tuple=True) 
    top_ten_recs = [{'id': i+1, 'pred': pred[i].item()} for i in rec_movie_ids[0].tolist()[:20]]
    top_ten_recs.sort(key=lambda x: x['pred'], reverse=True)
    # top_ten_recs = [rec_movies + 1 for rec_movies in rec_movie_ids[0].tolist()] 
    
    # top_ten_recs.sort(key=lambda x: x, reverse=True)
    movie_recs.append({'user': user_id + 1, 'rec_movies': top_ten_recs})

100%|██████████| 943/943 [00:03<00:00, 237.83it/s]


In [50]:
movie_recs


[{'user': 1,
  'rec_movies': [{'id': 1156, 'pred': 5.463738918304443},
   {'id': 912, 'pred': 4.647536754608154},
   {'id': 701, 'pred': 4.602782249450684},
   {'id': 1299, 'pred': 4.3971710205078125},
   {'id': 1064, 'pred': 4.383208274841309},
   {'id': 528, 'pred': 4.335470199584961},
   {'id': 483, 'pred': 4.3347978591918945},
   {'id': 427, 'pred': 4.331717491149902},
   {'id': 653, 'pred': 4.30929708480835},
   {'id': 604, 'pred': 4.283228874206543},
   {'id': 513, 'pred': 4.268141746520996},
   {'id': 187, 'pred': 4.261445045471191},
   {'id': 1124, 'pred': 4.248292922973633},
   {'id': 484, 'pred': 4.237596035003662},
   {'id': 192, 'pred': 4.2332611083984375},
   {'id': 318, 'pred': 4.227624893188477},
   {'id': 654, 'pred': 4.223434925079346},
   {'id': 963, 'pred': 4.220477104187012},
   {'id': 603, 'pred': 4.2115373611450195},
   {'id': 489, 'pred': 4.2077131271362305}]},
 {'user': 2,
  'rec_movies': [{'id': 187, 'pred': 4.556572914123535},
   {'id': 134, 'pred': 4.48900890

In [51]:
movie_ids = ratings[ratings.user_id == 2].iterrows()
def get_movie_ids_already_rated(user_id):
    users_ratings = ratings[ratings["user_id"] == user_id]
    return users_ratings["item_id"].unique()
for i in movie_ids:
    _, row = i
    item = items[items["movie_id"] == row.item_id]
    mean_rating = ratings[ratings["item_id"] == row.item_id]["rating"].mean()
    if(row.rating>4):
        print(item["movie_title"].tolist()[0], row.rating, str(mean_rating)[:3])

Shall We Dance? (1996) 5 4.2
Star Wars (1977) 5 4.3
As Good As It Gets (1997) 5 4.1
Titanic (1997) 5 4.2
Kolya (1996) 5 3.9
Emma (1996) 5 3.7
Wings of the Dove, The (1997) 5 3.6
Fargo (1996) 5 4.1
Godfather, The (1972) 5 4.2
Secrets & Lies (1996) 5 4.2
Good Will Hunting (1997) 5 4.2
Sense and Sensibility (1995) 5 4.0
L.A. Confidential (1997) 5 4.1


In [68]:

already_rated_items: np.ndarray = get_movie_ids_already_rated(2)
for i in movie_recs[2 - 1]["rec_movies"]:
    if np.any(already_rated_items == i["id"]):
        # print(i)
        continue
    movie = items[items["movie_id"] == i["id"]]
    movie_id = movie["movie_id"].tolist()[0]
    mean_rating = ratings[ratings["item_id"] == movie_id]["rating"].mean()
    rated = ratings[ratings["item_id"] == movie_id]["user_id"].notnull().sum()
    print(movie["movie_title"].tolist()[0],i["pred"], str(mean_rating)[:3], rated)

Godfather: Part II, The (1974) 4.556572914123535 4.1 209
Citizen Kane (1941) 4.489008903503418 4.2 198
Apocalypse Now (1979) 4.414886474609375 4.0 221
Shawshank Redemption, The (1994) 4.403262138366699 4.4 283
Taxi Driver (1976) 4.3678717613220215 4.1 182
Empire Strikes Back, The (1980) 4.35827112197876 4.2 367
Raiders of the Lost Ark (1981) 4.33610200881958 4.2 420
Hoop Dreams (1994) 4.300160884857178 4.0 117
Wallace & Gromit: The Best of Aardman Animation (1996) 4.284985542297363 4.4 67
Blade Runner (1982) 4.274228572845459 4.1 275
Silence of the Lambs, The (1991) 4.261291027069092 4.2 390
Gone with the Wind (1939) 4.25250244140625 3.8 171
Princess Bride, The (1987) 4.237517356872559 4.1 324
Wizard of Oz, The (1939) 4.22615909576416 4.0 246
GoodFellas (1990) 4.217157363891602 3.9 226
Psycho (1960) 4.208418846130371 4.1 239
Jean de Florette (1986) 4.208373546600342 4.1 64
12 Angry Men (1957) 4.201621055603027 4.3 125


# References
https://medium.com/arangodb/integrate-arangodb-with-pytorch-geometric-to-build-recommendation-systems-dd69db688465