In [1]:
import numpy as np
import torch
import pandas as pd
import torch.nn.functional as F
from torch.nn import Linear, MSELoss
import torch_geometric.transforms as T
from torch_geometric.nn import SAGEConv, to_hetero, GCNConv
from torch_geometric.transforms import RandomLinkSplit, ToUndirected
from torch_geometric.data import HeteroData
from torch_geometric.nn import GATv2Conv, RGCNConv, HeteroConv, GINConv
from torch_geometric.utils.dropout import *

data_folder = "../data/interim/"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
device

device(type='cuda')

In [3]:
users = pd.read_csv(data_folder + "users.csv")
items = pd.read_csv(data_folder + "items.csv")
ratings = pd.read_csv(data_folder + "ratings.csv")
genres = pd.read_csv("../data/ml-100k/u.genre", delimiter="|", names=["name","index"])

In [4]:

src = ratings["user_id"] - 1
dst = ratings["item_id"] - 1
attrs = ratings["rating"]

edge_index = torch.tensor([src, dst], dtype=torch.int64)
edge_attr = torch.tensor(attrs)


In [5]:
from sentence_transformers import SentenceTransformer

def SequenceEncoder(movie_titles , model_name=None):
    model = SentenceTransformer(model_name, device=device)
    title_embeddings = model.encode(movie_titles, show_progress_bar=True,
                              convert_to_tensor=True, device=device)
    
    return title_embeddings.to("cpu")

item_title = SequenceEncoder(items["movie_title"], model_name='all-MiniLM-L6-v2')
item_genres = torch.tensor(items[genres.name].to_numpy(), dtype=torch.bool)
item_release_year = torch.tensor(items["release_year"].to_numpy()[:,np.newaxis], dtype=torch.int32)

item_x = torch.cat((item_title, item_genres), dim=-1).float()
# item_x = item_genres.float()

Batches:   0%|          | 0/53 [00:00<?, ?it/s]

In [6]:
occupations = [i for i in users.keys() if i.startswith("occupation_")]  # Extract occupation columns
user_ages = torch.tensor(users["age"].to_numpy()[:,np.newaxis], dtype=torch.uint8)  # Convert user ages to tensor
user_sex = torch.tensor(users[["male", "female"]].to_numpy(), dtype=torch.bool)  # Convert user sex to tensor
user_occupation = torch.tensor(users[occupations].to_numpy(), dtype=torch.bool)  # Convert user occupations to tensor
user_x = torch.cat((user_ages, user_sex, user_occupation), dim=-1).float()  # Concatenate user features and convert to float
data = HeteroData()  # Create a new HeteroData object
data['user'].x = user_x  # Set user features in the HeteroData object
data['item'].x = item_x  # Set item features in the HeteroData object
data['user', 'rates', 'item'].edge_index = edge_index  # Set edge index for user-item ratings
data['user', 'rates', 'item'].edge_label = edge_attr  # Set edge label for user-item ratings


In [7]:
data = ToUndirected()(data)
del data['item', 'rev_rates', 'user'].edge_label  # Remove "reverse" label.
data = data.to(device)
# Perform a link-level split into training, validation, and test edges.
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('user', 'rates', 'item')],
    rev_edge_types=[('item', 'rev_rates', 'user')],
)(data)

In [8]:
weight = torch.bincount(train_data['user', 'rates', 'item'].edge_label)
weight = weight.max() / weight
def weighted_mse_loss(pred, target, weight=None):
    weight = 1. if weight is None else weight[target].to(pred.dtype)
    return (weight * (pred - target.to(pred.dtype)).pow(2)).mean()

In [9]:
class GNNEncoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GATv2Conv((-1, -1), 32, add_self_loops=False)
        self.conv2 = GATv2Conv((-1, -1), 32, add_self_loops=False)
        
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)
        
    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict['user'][row], z_dict['item'][col]], dim=-1)
        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)

class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder()
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)
        
    def forward(self, x_dict, edge_index_dict, edge_label_index):
        edge_label_index, mask = dropout_edge(edge_label_index, p=0.25, training=self.training)
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index), mask

model = Model(hidden_channels=32).to(device)
with torch.no_grad():
    model.encoder(train_data.x_dict, train_data.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [10]:
def train():
    # Set the model to training mode
    model.train()
    # Zero the gradients
    optimizer.zero_grad()
    # Make a prediction using the model
    pred, mask = model(train_data.x_dict, train_data.edge_index_dict,
                 train_data['user', 'rates', 'item'].edge_label_index)
    # Get the target values
    target = train_data['user', 'rates', 'item'].edge_label
    # Calculate the weighted mean squared error loss
    loss = weighted_mse_loss(pred, target[mask], weight)
    # Backpropagate the loss
    loss.backward()
    # Update the model's parameters
    optimizer.step()
    # Return the loss as a float
    return float(loss)

# Decorator to indicate that the following function does not require gradients
@torch.no_grad()
def test(data):
    # Set the model to evaluation mode
    model.eval()
    # Make a prediction using the model
    pred, _ = model(data.x_dict, data.edge_index_dict,
                 data['user', 'rates', 'item'].edge_label_index)
    # Clamp the predictions between 0 and 5
    pred = pred.clamp(min=0, max=5)
    # Get the target values
    target = data['user', 'rates', 'item'].edge_label.float()
    # Calculate the root mean squared error
    rmse = F.mse_loss(pred, target).sqrt()
    # Return the root mean squared error as a float
    return float(rmse)

# Loop through 1000 epochs
for epoch in range(1, 1000):
    # Train the model and get the loss
    loss = train()
    # Test the model on the training data
    train_rmse = test(train_data)
    # Test the model on the validation data
    val_rmse = test(val_data)
    # Test the model on the test data
    test_rmse = test(test_data)
    # Print the epoch number, loss, and evaluation metrics
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
          f'Val: {val_rmse:.4f}, Test: {test_rmse:.4f}')

Epoch: 001, Loss: 19.9797, Train: 3.5936, Val: 3.5965, Test: 3.5995
Epoch: 002, Loss: 17.6858, Train: 3.3886, Val: 3.3916, Test: 3.3952
Epoch: 003, Loss: 15.6118, Train: 3.1871, Val: 3.1902, Test: 3.1945
Epoch: 004, Loss: 13.7765, Train: 2.9915, Val: 2.9946, Test: 2.9995
Epoch: 005, Loss: 12.0884, Train: 2.8179, Val: 2.8210, Test: 2.8266
Epoch: 006, Loss: 10.7187, Train: 2.6756, Val: 2.6785, Test: 2.6845
Epoch: 007, Loss: 9.6255, Train: 2.5384, Val: 2.5410, Test: 2.5474
Epoch: 008, Loss: 8.7347, Train: 2.4014, Val: 2.4037, Test: 2.4104
Epoch: 009, Loss: 7.8586, Train: 2.2657, Val: 2.2676, Test: 2.2746
Epoch: 010, Loss: 7.0735, Train: 2.1330, Val: 2.1344, Test: 2.1417
Epoch: 011, Loss: 6.3410, Train: 2.0054, Val: 2.0060, Test: 2.0136
Epoch: 012, Loss: 5.7748, Train: 1.8825, Val: 1.8826, Test: 1.8903
Epoch: 013, Loss: 5.2760, Train: 1.7659, Val: 1.7651, Test: 1.7735
Epoch: 014, Loss: 4.9177, Train: 1.6575, Val: 1.6561, Test: 1.6650
Epoch: 015, Loss: 4.6727, Train: 1.5604, Val: 1.5586, Te

In [11]:
from tqdm import tqdm



from tqdm import tqdm

total_users = len(users)
total_movies = len(items)
movie_recs = []
for user_id in tqdm(range(0, total_users)):
    user_row = torch.tensor([user_id] * total_movies)
    all_movie_ids = torch.arange(total_movies)
    edge_label_index = torch.stack([user_row, all_movie_ids], dim=0)
    pred, _ = model(data.x_dict, data.edge_index_dict,
             edge_label_index)
    rec_movie_ids = (pred > 4.2).nonzero(as_tuple=True) 
    top_ten_recs = [{'id': i+1, 'pred': pred[i].item()} for i in rec_movie_ids[0].tolist()[:20]]
    top_ten_recs.sort(key=lambda x: x['pred'], reverse=True)
    # top_ten_recs = [rec_movies + 1 for rec_movies in rec_movie_ids[0].tolist()] 
    
    # top_ten_recs.sort(key=lambda x: x, reverse=True)
    movie_recs.append({'user': user_id + 1, 'rec_movies': top_ten_recs})

100%|██████████| 943/943 [00:03<00:00, 243.48it/s]


In [12]:
movie_recs


[{'user': 1,
  'rec_movies': [{'id': 701, 'pred': 4.417820453643799},
   {'id': 1673, 'pred': 4.379718780517578},
   {'id': 1674, 'pred': 4.335036277770996},
   {'id': 1500, 'pred': 4.277186393737793},
   {'id': 1154, 'pred': 4.269808769226074},
   {'id': 1299, 'pred': 4.256359100341797},
   {'id': 1156, 'pred': 4.222225189208984},
   {'id': 1614, 'pred': 4.213552951812744}]},
 {'user': 2,
  'rec_movies': [{'id': 114, 'pred': 4.5021891593933105},
   {'id': 172, 'pred': 4.473435401916504},
   {'id': 187, 'pred': 4.463823318481445},
   {'id': 134, 'pred': 4.430544376373291},
   {'id': 50, 'pred': 4.408908367156982},
   {'id': 192, 'pred': 4.384017467498779},
   {'id': 174, 'pred': 4.366206169128418},
   {'id': 194, 'pred': 4.32538366317749},
   {'id': 169, 'pred': 4.285211563110352},
   {'id': 181, 'pred': 4.245748519897461},
   {'id': 23, 'pred': 4.243741035461426},
   {'id': 64, 'pred': 4.235982418060303},
   {'id': 183, 'pred': 4.235084533691406},
   {'id': 178, 'pred': 4.233325481414

In [13]:
movie_ids = ratings[ratings.user_id == 2].iterrows()
def get_movie_ids_already_rated(user_id):
    users_ratings = ratings[ratings["user_id"] == user_id]
    return users_ratings["item_id"].unique()
for i in movie_ids:
    _, row = i
    item = items[items["movie_id"] == row.item_id]
    mean_rating = ratings[ratings["item_id"] == row.item_id]["rating"].mean()
    if(row.rating>4):
        print(item["movie_title"].tolist()[0], row.rating, str(mean_rating)[:3])

Shall We Dance? (1996) 5 4.2
Star Wars (1977) 5 4.3
As Good As It Gets (1997) 5 4.1
Titanic (1997) 5 4.2
Kolya (1996) 5 3.9
Emma (1996) 5 3.7
Wings of the Dove, The (1997) 5 3.6
Fargo (1996) 5 4.1
Godfather, The (1972) 5 4.2
Secrets & Lies (1996) 5 4.2
Good Will Hunting (1997) 5 4.2
Sense and Sensibility (1995) 5 4.0
L.A. Confidential (1997) 5 4.1


In [14]:

already_rated_items: np.ndarray = get_movie_ids_already_rated(user_id)
for i in movie_recs[2 - 1]["rec_movies"]:
    movie = items[items["movie_id"] == i["id"]]
    movie_id = movie["movie_id"].tolist()[0]
    if movie_id in already_rated_items:
        continue
    mean_rating = ratings[ratings["item_id"] == movie_id]["rating"].mean()
    rated = ratings[ratings["item_id"] == movie_id]["user_id"].notnull().sum()
    print(movie["movie_title"].tolist()[0],i["pred"], str(mean_rating)[:3], rated)

Wallace & Gromit: The Best of Aardman Animation (1996) 4.5021891593933105 4.4 67
Godfather: Part II, The (1974) 4.463823318481445 4.1 209
Citizen Kane (1941) 4.430544376373291 4.2 198
Raging Bull (1980) 4.384017467498779 4.1 116
Sting, The (1973) 4.32538366317749 4.0 241
Wrong Trousers, The (1993) 4.285211563110352 4.4 118
Return of the Jedi (1983) 4.245748519897461 4.0 507
Taxi Driver (1976) 4.243741035461426 4.1 182
Shawshank Redemption, The (1994) 4.235982418060303 4.4 283
12 Angry Men (1957) 4.233325481414795 4.3 125
Amadeus (1984) 4.220446586608887 4.1 276
Manon of the Spring (Manon des sources) (1986) 4.219354152679443 4.1 58
Silence of the Lambs, The (1991) 4.215931415557861 4.2 390
Gone with the Wind (1939) 4.215660095214844 3.8 171
Apocalypse Now (1979) 4.206822872161865 4.0 221
Princess Bride, The (1987) 4.201714038848877 4.1 324


# References
https://medium.com/arangodb/integrate-arangodb-with-pytorch-geometric-to-build-recommendation-systems-dd69db688465