In [86]:
import numpy as np
import torch
import pandas as pd
import torch.nn.functional as F
from torch.nn import Linear, MSELoss
import torch_geometric.transforms as T
from torch_geometric.nn import SAGEConv, to_hetero, GCNConv
from torch_geometric.transforms import RandomLinkSplit, ToUndirected
from torch_geometric.data import HeteroData
from torch_geometric.nn import GATv2Conv, RGCNConv, HeteroConv, GINConv
from torch_geometric.utils.dropout import *

data_folder = "../data/interim/"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [87]:
device

device(type='cuda')

In [88]:
users = pd.read_csv(data_folder + "users.csv")
items = pd.read_csv(data_folder + "items.csv")
ratings = pd.read_csv(data_folder + "ratings.csv")
genres = pd.read_csv("../data/ml-100k/u.genre", delimiter="|", names=["name","index"])

In [89]:

src = ratings["user_id"] - 1
dst = ratings["item_id"] - 1
attrs = ratings["rating"]

edge_index = torch.tensor([src, dst], dtype=torch.int64)
edge_attr = torch.tensor(attrs)


In [90]:
from sentence_transformers import SentenceTransformer

def SequenceEncoder(movie_titles , model_name=None):
    model = SentenceTransformer(model_name, device=device)
    title_embeddings = model.encode(movie_titles, show_progress_bar=True,
                              convert_to_tensor=True, device=device)
    
    return title_embeddings.to("cpu")

item_title = SequenceEncoder(items["movie_title"], model_name='all-MiniLM-L6-v2')
item_genres = torch.tensor(items[genres.name].to_numpy(), dtype=torch.bool)
item_release_year = torch.tensor(items["release_year"].to_numpy()[:,np.newaxis], dtype=torch.int32)

item_x = torch.cat((item_title, item_genres), dim=-1).float()
# item_x = item_genres.float()

Batches:   0%|          | 0/53 [00:00<?, ?it/s]

In [91]:
occupations = [i for i in users.keys() if i.startswith("occupation_")]  # Extract occupation columns
user_ages = torch.tensor(users["age"].to_numpy()[:,np.newaxis], dtype=torch.uint8)  # Convert user ages to tensor
user_sex = torch.tensor(users[["male", "female"]].to_numpy(), dtype=torch.bool)  # Convert user sex to tensor
user_occupation = torch.tensor(users[occupations].to_numpy(), dtype=torch.bool)  # Convert user occupations to tensor
user_x = torch.cat((user_ages, user_sex, user_occupation), dim=-1).float()  # Concatenate user features and convert to float
data = HeteroData()  # Create a new HeteroData object
data['user'].x = user_x  # Set user features in the HeteroData object
data['item'].x = item_x  # Set item features in the HeteroData object
data['user', 'rates', 'item'].edge_index = edge_index  # Set edge index for user-item ratings
data['user', 'rates', 'item'].edge_label = edge_attr  # Set edge label for user-item ratings


In [92]:
data = ToUndirected()(data)
del data['item', 'rev_rates', 'user'].edge_label  # Remove "reverse" label.
data = data.to(device)
# Perform a link-level split into training, validation, and test edges.
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('user', 'rates', 'item')],
    rev_edge_types=[('item', 'rev_rates', 'user')],
)(data)

In [93]:
weight = torch.bincount(train_data['user', 'rates', 'item'].edge_label)
weight = weight.max() / weight
def weighted_mse_loss(pred, target, weight=None):
    weight = 1. if weight is None else weight[target].to(pred.dtype)
    return (weight * (pred - target.to(pred.dtype)).pow(2)).mean()

In [94]:
class GNNEncoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GATv2Conv((-1, -1), 32, add_self_loops=False)
        self.conv2 = GATv2Conv((-1, -1), 32, add_self_loops=False)
        
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)
        
    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict['user'][row], z_dict['item'][col]], dim=-1)
        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)

class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder()
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)
        
    def forward(self, x_dict, edge_index_dict, edge_label_index):
        edge_label_index, mask = dropout_edge(edge_label_index, p=0.25, training=self.training)
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index), mask

model = Model(hidden_channels=32).to(device)
with torch.no_grad():
    model.encoder(train_data.x_dict, train_data.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [95]:
def train():
    # Set the model to training mode
    model.train()
    # Zero the gradients
    optimizer.zero_grad()
    # Make a prediction using the model
    pred, mask = model(train_data.x_dict, train_data.edge_index_dict,
                 train_data['user', 'rates', 'item'].edge_label_index)
    # Get the target values
    target = train_data['user', 'rates', 'item'].edge_label
    # Calculate the weighted mean squared error loss
    loss = weighted_mse_loss(pred, target[mask], weight)
    # Backpropagate the loss
    loss.backward()
    # Update the model's parameters
    optimizer.step()
    # Return the loss as a float
    return float(loss)

# Decorator to indicate that the following function does not require gradients
@torch.no_grad()
def test(data):
    # Set the model to evaluation mode
    model.eval()
    # Make a prediction using the model
    pred, _ = model(data.x_dict, data.edge_index_dict,
                 data['user', 'rates', 'item'].edge_label_index)
    # Clamp the predictions between 0 and 5
    pred = pred.clamp(min=0, max=5)
    # Get the target values
    target = data['user', 'rates', 'item'].edge_label.float()
    # Calculate the root mean squared error
    rmse = F.mse_loss(pred, target).sqrt()
    # Return the root mean squared error as a float
    return float(rmse)

# Loop through 1000 epochs
for epoch in range(1, 1000):
    # Train the model and get the loss
    loss = train()
    # Test the model on the training data
    train_rmse = test(train_data)
    # Test the model on the validation data
    val_rmse = test(val_data)
    # Test the model on the test data
    test_rmse = test(test_data)
    # Print the epoch number, loss, and evaluation metrics
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
          f'Val: {val_rmse:.4f}, Test: {test_rmse:.4f}')

Epoch: 001, Loss: 20.9727, Train: 3.7007, Val: 3.7005, Test: 3.6972
Epoch: 002, Loss: 18.9162, Train: 3.5384, Val: 3.5385, Test: 3.5342
Epoch: 003, Loss: 17.0646, Train: 3.3960, Val: 3.3965, Test: 3.3911
Epoch: 004, Loss: 15.6615, Train: 3.2554, Val: 3.2562, Test: 3.2496
Epoch: 005, Loss: 14.3275, Train: 3.1332, Val: 3.1344, Test: 3.1266
Epoch: 006, Loss: 13.2257, Train: 3.0223, Val: 3.0239, Test: 3.0151
Epoch: 007, Loss: 12.2586, Train: 2.9089, Val: 2.9109, Test: 2.9010
Epoch: 008, Loss: 11.3522, Train: 2.7956, Val: 2.7982, Test: 2.7872
Epoch: 009, Loss: 10.4791, Train: 2.7322, Val: 2.7352, Test: 2.7233
Epoch: 010, Loss: 10.0031, Train: 2.6734, Val: 2.6767, Test: 2.6641
Epoch: 011, Loss: 9.6013, Train: 2.6102, Val: 2.6139, Test: 2.6006
Epoch: 012, Loss: 9.1224, Train: 2.5430, Val: 2.5470, Test: 2.5331
Epoch: 013, Loss: 8.6774, Train: 2.4720, Val: 2.4765, Test: 2.4618
Epoch: 014, Loss: 8.2168, Train: 2.3975, Val: 2.4024, Test: 2.3869
Epoch: 015, Loss: 7.7362, Train: 2.3197, Val: 2.3251

In [96]:
from tqdm import tqdm



from tqdm import tqdm

total_users = len(users)
total_movies = len(items)
movie_recs = []
for user_id in tqdm(range(0, total_users)):
    user_row = torch.tensor([user_id] * total_movies)
    all_movie_ids = torch.arange(total_movies)
    edge_label_index = torch.stack([user_row, all_movie_ids], dim=0)
    pred, _ = model(data.x_dict, data.edge_index_dict,
             edge_label_index)
    rec_movie_ids = (pred > 4.2).nonzero(as_tuple=True) 
    top_ten_recs = [{'id': i+1, 'pred': pred[i].item()} for i in rec_movie_ids[0].tolist()[:20]]
    top_ten_recs.sort(key=lambda x: x['pred'], reverse=True)
    # top_ten_recs = [rec_movies + 1 for rec_movies in rec_movie_ids[0].tolist()] 
    
    # top_ten_recs.sort(key=lambda x: x, reverse=True)
    movie_recs.append({'user': user_id + 1, 'rec_movies': top_ten_recs})

100%|██████████| 943/943 [00:03<00:00, 236.16it/s]


In [97]:
movie_recs


[{'user': 1,
  'rec_movies': [{'id': 701, 'pred': 4.926219940185547},
   {'id': 1614, 'pred': 4.796771049499512},
   {'id': 1299, 'pred': 4.739033222198486},
   {'id': 1431, 'pred': 4.417767524719238},
   {'id': 1124, 'pred': 4.417666435241699},
   {'id': 653, 'pred': 4.377163410186768},
   {'id': 1451, 'pred': 4.363922119140625},
   {'id': 513, 'pred': 4.353892803192139},
   {'id': 603, 'pred': 4.315402030944824},
   {'id': 483, 'pred': 4.298532009124756},
   {'id': 484, 'pred': 4.295830249786377},
   {'id': 169, 'pred': 4.285086631774902},
   {'id': 611, 'pred': 4.270317554473877},
   {'id': 494, 'pred': 4.254264831542969},
   {'id': 474, 'pred': 4.249527931213379},
   {'id': 479, 'pred': 4.24490213394165},
   {'id': 607, 'pred': 4.238849639892578},
   {'id': 114, 'pred': 4.230071544647217},
   {'id': 478, 'pred': 4.207766532897949},
   {'id': 1154, 'pred': 4.200404167175293}]},
 {'user': 2,
  'rec_movies': [{'id': 483, 'pred': 4.489522457122803},
   {'id': 484, 'pred': 4.48682069778

In [98]:
movie_ids = ratings[ratings.user_id == 2].iterrows()
for i in movie_ids:
    _, row = i
    item = items[items["movie_id"] == row.item_id]
    mean_rating = ratings[ratings["item_id"] == row.item_id]["rating"].mean()
    if(row.rating>4):
        print(item["movie_title"].tolist()[0], row.rating, str(mean_rating)[:3])

Shall We Dance? (1996) 5 4.2
Star Wars (1977) 5 4.3
As Good As It Gets (1997) 5 4.1
Titanic (1997) 5 4.2
Kolya (1996) 5 3.9
Emma (1996) 5 3.7
Wings of the Dove, The (1997) 5 3.6
Fargo (1996) 5 4.1
Godfather, The (1972) 5 4.2
Secrets & Lies (1996) 5 4.2
Good Will Hunting (1997) 5 4.2
Sense and Sensibility (1995) 5 4.0
L.A. Confidential (1997) 5 4.1


In [100]:
for i in movie_recs[2 - 1]["rec_movies"]:
    movie = items[items["movie_id"] == i["id"]]
    movie_id = movie["movie_id"].tolist()[0]
    mean_rating = ratings[ratings["item_id"] == movie_id]["rating"].mean()
    rated = ratings[ratings["item_id"] == movie_id]["user_id"].notnull().sum()
    print(movie["movie_title"].tolist()[0],i["pred"], str(mean_rating)[:3], rated)

Casablanca (1942) 4.489522457122803 4.4 243
Maltese Falcon, The (1941) 4.486820697784424 4.2 138
Wrong Trousers, The (1993) 4.476077556610107 4.4 118
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963) 4.440518379211426 4.2 194
Vertigo (1958) 4.435892105102539 4.2 179
Wallace & Gromit: The Best of Aardman Animation (1996) 4.421061038970947 4.4 67
Philadelphia Story, The (1940) 4.398756504058838 4.1 104
Citizen Kane (1941) 4.335771083831787 4.2 198
Notorious (1946) 4.329782485961914 4.1 52
Shawshank Redemption, The (1994) 4.327103614807129 4.4 283
To Kill a Mockingbird (1962) 4.298089027404785 4.2 219
Star Wars (1977) 4.287441253662109 4.3 583
Raiders of the Lost Ark (1981) 4.272298812866211 4.2 420
Princess Bride, The (1987) 4.261465072631836 4.1 324
To Catch a Thief (1955) 4.2556562423706055 4.0 50
Some Like It Hot (1959) 4.239623546600342 3.9 128
Apartment, The (1960) 4.234771251678467 3.9 63
North by Northwest (1959) 4.233880996704102 4.2 179
Sabrina (1954) 4