<a href="https://colab.research.google.com/github/zzf7ktx/test_git/blob/main/ai_workshop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data

In [123]:
data_url = 'https://files.grouplens.org/datasets/movielens/ml-1m.zip'
data_url_1 = 'https://files.grouplens.org/datasets/movielens/ml-100k.zip'

In [73]:
import zipfile
import numpy as np
import pandas as pd

class MovieLens1m:
  def __init__(self, url: str):
    self.url = url
    self.__download()
    self.__read_users()
    self.__read_movies()
    self.__read_ratings()
    !rm ml-1m.zip
    !rm -rf ml-1m

  def __download(self):
    print('Downloading')
    zip_path = 'ml-1m.zip'
    !wget {self.url} -O {zip_path}

    print('Extracting')
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
      zip_ref.extractall()
    self.base_path = 'ml-1m'

  def __read_ratings(self, filename='ratings.dat'):
    print('Reading ratings')
    headers = ['u_nodes', 'v_nodes', 'ratings', 'timestamp']
    dtypes = {
      'u_nodes': np.int64, 'v_nodes': np.int64,
      'ratings': np.float32, 'timestamp': np.float64
    }
    sep = r'\:\:'
    file_path = self.base_path + '/' + filename
    self.ratings_df = pd.read_csv(file_path, sep=sep, header=None,
                       names=headers,
                       converters=dtypes, engine='python',
                       encoding='latin-1')

  def __read_users(self, filename='users.dat'):
    print('Reading users')
    headers = ['user_id', 'gender', 'age', 'occupation', 'zip-code']
    sep = r'\:\:'
    file_path = self.base_path + '/' + filename
    self.users_df = pd.read_csv(file_path, sep=sep,header = None,
                       names = headers, engine= 'python',
                       encoding='latin-1')

  def __read_movies(self, filename='movies.dat'):
    print('Reading movies')
    headers = ['movie_id', 'title', 'genre']
    sep = r'\:\:'
    file_path = self.base_path + '/' + filename
    self.movies_df = pd.read_csv(file_path, sep=sep,header = None,
                       names = headers, engine= 'python',
                       encoding='latin-1')

class MovieLens100k:
  def __init__(self, url: str):
    self.url = url
    self.__download()
    self.__read_users()
    self.__read_movies()
    self.__read_ratings()
    !rm ml-100k.zip
    !rm -rf ml-100k

  def __download(self):
    print('Downloading')
    zip_path = 'ml-100k.zip'
    !wget {self.url} -O {zip_path}

    print('Extracting')
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
      zip_ref.extractall()
    self.base_path = 'ml-100k'

  def __read_ratings(self, filename='u.data'):
    print('Reading ratings')
    headers = ['u_nodes', 'v_nodes', 'ratings', 'timestamp']
    dtypes = {
      'u_nodes': np.int64, 'v_nodes': np.int64,
      'ratings': np.float32, 'timestamp': np.float64
    }
    sep = r'\t'
    file_path = self.base_path + '/' + filename
    self.ratings_df = pd.read_csv(file_path, sep=sep, header=None,
                       names=headers,
                       converters=dtypes, engine='python',
                       encoding='latin-1')

  def __read_users(self, filename='u.user'):
    print('Reading users')
    headers = ['user id', 'age', 'gender', 'occupation', 'zip code']
    sep = r'|'
    file_path = self.base_path + '/' + filename
    self.users_df = pd.read_csv(file_path, sep=sep,header = None,
                       names = headers, engine= 'python',
                       encoding='latin-1')

  def __read_movies(self, filename='u.item'):
    print('Reading movies')
    headers = ['movie id', 'movie title', 'release date', 'video release date',
               'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation',
               'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
               'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
               'Thriller', 'War', 'Western']
    sep = r'|'
    file_path = self.base_path + '/' + filename
    self.movies_df = pd.read_csv(file_path, sep=sep,header = None,
                       names = headers, engine= 'python',
                       encoding='latin-1')

In [124]:
dataset = MovieLens100k(data_url_1)

Downloading
--2025-07-08 07:53:00--  https://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2025-07-08 07:53:02 (3.20 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]

Extracting
Reading users
Reading movies
Reading ratings


# Model

## IGMC

### Install

In [4]:
%%capture
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.6.0+cu124.html
!pip install torch-geometric

In [5]:
from __future__ import print_function
import numpy as np
import random
import pandas as pd
from tqdm import tqdm
import os, sys, pdb, math, time
from copy import deepcopy
import multiprocessing as mp
import networkx as nx
import argparse
import scipy.io as sio
import scipy.sparse as ssp
import torch
from torch_geometric.data import Data, Dataset, InMemoryDataset
import torch.multiprocessing
torch.multiprocessing.set_sharing_strategy('file_system')
import pickle as pkl
import scipy.sparse as sp
from torch_geometric.data import DataLoader
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Construct data

In [163]:
def subgraph_extraction_labeling(ind, Arow, Acol, h=1, sample_ratio=1.0,
                                 max_nodes_per_hop = None, u_features = None,
                                 v_features = None, class_values = None, y =1):
  # extract the h-hop enclosing subgraph around link 'ind'
  u_nodes, v_nodes = [ind[0]], [ind[1]]
  u_dist, v_dist = [0], [0]
  u_visited, v_visited = set([ind[0]]), set([ind[1]])
  u_fringe, v_fringe = set([ind[0]]), set([ind[1]])
  for dist in range(1, h+1):
    v_fringe, u_fringe = neighbors(u_fringe, Arow), neighbors(v_fringe, Acol)
    u_fringe = u_fringe - u_visited
    v_fringe = v_fringe - v_visited
    u_visited = u_visited.union(u_fringe)
    v_visited = v_visited.union(v_fringe)
    if sample_ratio < 1.0:
      u_fringe = random.sample(list(u_fringe), int(sample_ratio*len(u_fringe)))
      v_fringe = random.sample(list(v_fringe), int(sample_ratio*len(v_fringe)))
    if max_nodes_per_hop is not None:
      if max_nodes_per_hop < len(u_fringe):
        u_fringe = random.sample(list(u_fringe), max_nodes_per_hop)
      if max_nodes_per_hop < len(v_fringe):
        v_fringe = random.sample(list(v_fringe), max_nodes_per_hop)
    if len(u_fringe)  == 0 and len(v_fringe) == 0:
      break

    u_nodes = u_nodes + list(u_fringe)
    v_nodes = v_nodes + list(v_fringe)
    u_dist = u_dist + [dist]*len(u_fringe)
    v_dist = v_dist + [dist]*len(v_fringe)

  subgraph = Arow[u_nodes][:, v_nodes]
  # remove link between target nodes
  subgraph[0, 0] = 0

  # prepare pyg graph constructor input
  u, v, r = ssp.find(subgraph) #r is 1, 2,... (rating labels + 1)
  v += len(u_nodes)
  r = r - 1 # transform r back to rating label
  num_nodes = len(u_nodes) + len(v_nodes)
  node_labels = [x*2 for x in u_dist] + [x*2+1 for x in v_dist]
  max_node_label = 2*h + 1
  y = class_values[y]

  # get node features
  if u_features is not None:
    u_features = u_features[u_nodes]
  if v_features is not None:
    v_features = v_features[v_nodes]
  node_features = None
  if False:
    # directly use padded node features
    if u_features is not None and v_features is not None:
      u_extended = np.concatenate(
          [u_features, np.zeros([u_features.shape[0], v_features.shape[1]])], 1
      )
      v_extended = np.concatenate(
          [np.zeros([v_features.shape[0], u_features.shape[1]]), v_features], 1
      )
      node_features = np.concatenate([u_extended, v_extended], 0)
  if False:
    # use identity features (one-hot encodings of node idxes)
    u_ids = one_hot(u_nodes, Arow.shape[0] + Arow.shape[1])
    v_ids = one_hot([x+Arow.shape[0] for x in v_nodes], Arow.shape[0] + Arow.shape[1])

    node_ids = np.concatenate([u_ids, v_ids], 0)
    # node_features = np.concatenate([node_features, node_ids], 1)
    node_features = node_ids

  if True:
    # only output node features for the target user and item
    if u_features is not None and v_features is not None:
      # Convert sparse matrices to dense NumPy arrays before returning
      node_features = [u_features[0].todense(), v_features[0].todense()]

  return u, v, r, node_labels, max_node_label, y, node_features

def construct_pyg_graph(u, v, r, node_labels, max_node_label, y, node_features):
  u, v = torch.LongTensor(u), torch.LongTensor(v)
  r = torch.LongTensor(r)
  edge_index = torch.stack([torch.cat([u, v]), torch.cat([v, u])], 0)
  edge_type = torch.cat([r, r])
  x = torch.FloatTensor(one_hot(node_labels, max_node_label+1))
  y = torch.FloatTensor([y])
  data = Data(x, edge_index, edge_type = edge_type, y = y)

  if node_features is not None:
    if type(node_features) == list:
      u_feature, v_feature = node_features
      data.u_feature = torch.FloatTensor(u_feature).unsqueeze(0)
      data.v_feature = torch.FloatTensor(v_feature).unsqueeze(0)
    else:
      x2 = torch.FloatTensor(node_features)
      data.x = torch.cat([data.x, x2], 1)
  return data

def neighbors(fringe, A):
  # find all 1-hop neighbors of nodes in fringe from A
  return set(A[list(fringe)].indices)

def one_hot(idx, length):
  idx = np.array(idx)
  x = np.zeros([len(idx), length])
  x[np.arange(len(idx)), idx] = 1.0
  return x

class SparseRowIndexer:
  def __init__(self, csr_matrix):
    data = []
    indices = []
    indptr = []

    for row_start, row_end in zip(csr_matrix.indptr[:-1], csr_matrix.indptr[1:]):
      data.append(csr_matrix.data[row_start:row_end])
      indices.append(csr_matrix.indices[row_start:row_end])
      indptr.append(row_end - row_start)  #nnz of the row

    self.data = np.array(data, dtype = object)
    self.indices = np.array(indices, dtype = object)
    self.indptr = np.array(indptr, dtype = object)
    self.shape = csr_matrix.shape
  def __getitem__(self, row_selector):
    indices = np.concatenate(self.indices[row_selector])
    data = np.concatenate(self.data[row_selector])
    indptr = np.append(0, np.cumsum(self.indptr[row_selector]))
    shape = [indptr.shape[0] - 1, self.shape[1]]
    return ssp.csr_matrix((data, indices, indptr), shape = shape)

class SparseColIndexer:
  def __init__(self, csc_matrix):
    data = []
    indices = []
    indptr = []

    for col_start, col_end in zip(csc_matrix.indptr[:-1], csc_matrix.indptr[1:]):
      data.append(csc_matrix.data[col_start:col_end])
      indices.append(csc_matrix.data[col_start:col_end])
      indptr.append(col_end - col_start)

    self.data = np.array(data, dtype = object)
    self.indices = np.array(indices, dtype = object)
    self.indptr = np.array(indptr, dtype = object)
    self.shape = csc_matrix.shape

  def __getitem__(self, col_selector):
    indices = np.concatenate(self.indices[col_selector])
    data = np.concatenate(self.data[col_selector])
    indptr = np.append(0, np.cumsum(self.indptr[col_selector]))

    shape = [self.shape[0], indptr.shape[0] - 1]
    return ssp.csc_matrix((data, indices, indptr), shape = shape)


def links2subgraphs(Arow, Acol, links, labels, h=1, sample_ratio = 1.0,
                    max_nodes_per_hop = None, u_features = None, v_features = None,
                    class_values = None, parallel = True):
  # Extract enclosing subgraphs
  print('Enclosing subgraph extraction begins...')
  g_list = []
  if not parallel:
    with tqdm(total = len(links[0])) as pbar:
      for i, j, g_label in zip(links[0], links[1], labels):
        tmp = subgraph_extraction_labeling((i, j), Arow, Acol, h, sample_ratio,
                                           max_nodes_per_hop, u_features, v_features,
                                           class_values, g_label)
        data = construct_pyg_graph(*tmp)
        g_list.append(data)
        pbar.update(1)
  else:
    start = time.time()
    pool = mp.Pool(mp.cpu_count())
    results = pool.starmap_async(
        subgraph_extraction_labeling,
        [
         ((i,j), Arow, Acol, h, sample_ratio, max_nodes_per_hop, u_features,
          v_features, class_values, g_label)
         for i, j, g_label in zip(links[0], links[1], labels)
        ]
    )
    remaining = results._number_left
    pbar = tqdm(total = remaining)
    while True:
      pbar.update(remaining - results._number_left)
      if results.ready(): break
      remaining = results._number_left
      time.sleep(1)
    results = results.get()
    pool.close()
    pbar.close()
    end = time.time()
    print("Time elapsed for subgraph extraction: {}s".format(end-start))
    print("Tranforming to pytorch_geometric graphs...")
    g_list = []
    pbar = tqdm(total = len(results))
    while results:
      tmp = results.pop()
      g_list.append(construct_pyg_graph(*tmp))
      pbar.update(1)
    pbar.close()
    end2 = time.time()
    print("Time elapsed for transforming to pytorch_geometric graphs: {}s".format(end2-end))
  return g_list

class MyDataset(InMemoryDataset):
  def __init__(self, root, A, links, labels, h, sample_ratio, max_nodes_per_hop,
               u_features, v_features, class_values, max_num = None, parallel=True):
    self.Arow = SparseRowIndexer(A)
    self.Acol = SparseColIndexer(A.tocsc())
    self.links = links
    self.labels = labels
    self.h = h
    self.sample_ratio = sample_ratio
    self.max_nodes_per_hop = max_nodes_per_hop
    self.u_features = u_features
    self.v_features = v_features
    self.class_values = class_values
    self.parallel = parallel
    self.max_num = max_num
    if max_num is not None:
      np.random.seed(123)
      num_links = len(links[0])
      perm = np.random.permutation(num_links)
      perm = perm[:max_num]
      self.links = (links[0][perm], links[1][perm])
      self.labels = labels[perm]
    super(MyDataset, self).__init__(root)
    self.data, self.slices = torch.load(self.processed_paths[0])

  @property
  def processed_file_names(self):
    name = 'data.pt'
    if self.max_num is not None:
      name = 'data_{}.pt'.format(self.max_num)
    return [name]

  def process(self):
    # Extract enclosing subgraphs and save to disk
    data_list = links2subgraphs(self.Arow, self.Acol, self.links, self.labels, self.h,
                                self.sample_ratio, self.max_nodes_per_hop,
                                self.u_features, self.v_features,
                                self.class_values, self.parallel)

    data, slices = self.collate(data_list)
    torch.save((data, slices), self.processed_paths[0])
    del data_list

class MyDynamicDataset(Dataset):
  def __init__(self, root, A, links, labels, h, sample_ratio, max_nodes_per_hop,
                u_features, v_features, class_values, max_num=None):
    super(MyDynamicDataset, self).__init__(root)
    self.Arow = SparseRowIndexer(A)
    self.Acol = SparseColIndexer(A.tocsc())
    self.links = links
    self.labels = labels
    self.h = h
    self.sample_ratio = sample_ratio
    self.max_nodes_per_hop = max_nodes_per_hop
    self.u_features = u_features
    self.v_features = v_features
    self.class_values = class_values
    if max_num is not None:
      np.random.seed(123)
      num_links = len(links[0])
      perm = np.random.permutation(num_links)
      perm = perm[:max_num]
      self.links = (links[0][perm], links[1][perm])
      self.labels = labels[perm]

  def len(self):
    return len(self.links[0])

  def get(self, idx):
    i, j = self.links[0][idx], self.links[1][idx]
    g_label = self.labels[idx]
    tmp = subgraph_extraction_labeling(
      (i, j), self.Arow, self.Acol, self.h, self.sample_ratio, self.max_nodes_per_hop,
      self.u_features, self.v_features, self.class_values, g_label
    )
    return construct_pyg_graph(*tmp)

### Define Model

In [125]:
import torch.nn as nn
from torch.nn import Linear, Conv1d
from torch_geometric.nn import GCNConv, RGCNConv, global_sort_pool, global_add_pool
from torch_geometric.utils import dropout_adj
import pdb

class GNN(torch.nn.Module):
  # a base GNN class, GCN message passing + sum_pooling
  def __init__(self, dataset, gconv = GCNConv, latent_dim=[32, 32, 32, 1],
               regression = False, adj_dropout = 0.2,
               force_undirected = False):
    super(GNN, self).__init__()
    self.regression = regression
    self.adj_dropout = adj_dropout
    self.force_undirected = force_undirected
    self.convs = torch.nn.ModuleList()
    self.convs.append(gconv(4, latent_dim[0]))
    for i in range(0, len(latent_dim)-1):
      self.convs.append(gconv(latent_dim[i], latent_dim[i+1]))
    self.lin1 = Linear(sum(latent_dim), 128)
    if self.regression:
      self.lin2 = Linear(128, 1)
    else:
      self.lin2 = Linear(128, dataset.num_classes)

  def reset_parameters(self):
    for conv in self.convs:
      conv.reset_parameters()
    self.lin1.reset_parameters()
    self.lin2.reset_parameters()

  def forward(self, data):
    x, edge_index, batch = data.x, data.edge_index, data.batch
    if self.adj_dropout > 0:
      edge_index, edge_type = dropout_adj(edge_index, edge_type, p=self.adj_dropout,
                                          force_undirected=self.force_undirected,
                                          num_nodes=len(x), training=self.training
        )
    concat_states = []
    for conv in self.convs:
      x = torch.tanh(conv(x, edge_index))
      concat_states.append(x)
    concat_states = torch.cat(concat_states, 1)
    x = global_add_pool(concat_states, batch)
    x = F.relu(self.lin1(x))
    x = F.dropout(x, p=0.5, training=self.training)
    x = self.lin2(x)
    if self.regression:
      return x[:, 0]
    else:
      return F.log_softmax(x, dim=-1)

  def __repr__(self):
    return self.__class__.__name__

class IGMC(GNN):
  # The GNN model of Inductive Graph-based Matrix Completion.
  # Use RGCN convolution + center-nodes readout.
  def __init__(self, dataset, gconv=RGCNConv, latent_dim=[32, 32, 32, 32],
               num_relations=5, num_bases=2, regression=False, adj_dropout=0.2,
               force_undirected=False, side_features=False, n_side_features=0,
               multiply_by=1):
    super(IGMC, self).__init__(dataset, GCNConv, latent_dim,
                               regression, adj_dropout, force_undirected
    )
    self.multiply_by = multiply_by
    self.convs = torch.nn.ModuleList()
    self.convs.append(gconv(4, latent_dim[0], num_relations, num_bases))
    for i in range(0, len(latent_dim)-1):
      self.convs.append(gconv(latent_dim[i], latent_dim[i+1], num_relations, num_bases))
    self.lin1 = Linear(2*sum(latent_dim), 128)
    self.side_features = side_features
    if side_features:
      self.lin1 = Linear(2*sum(latent_dim)+n_side_features, 128)

  def forward(self, data):
    start = time.time()
    x, edge_index, edge_type, batch = data.x, data.edge_index, data.edge_type, data.batch
    if self.adj_dropout > 0:
      edge_index, edge_mask = dropout_edge(edge_index, p=self.adj_dropout,
                                           force_undirected=self.force_undirected,
                                           training=self.training)
      edge_type = data.edge_type[edge_mask]
    concat_states = []
    for conv in self.convs:
      x = torch.tanh(conv(x, edge_index, edge_type))
      concat_states.append(x)
    concat_states = torch.cat(concat_states, 1)

    users = data.x[:, 0] == 1
    items = data.x[:, 1] == 1
    x = torch.cat([concat_states[users], concat_states[items]], 1)
    if self.side_features:
      x = torch.cat([x, data.u_feature, data.v_feature], 1)

    x = F.relu(self.lin1(x))
    x = F.dropout(x, p=0.5, training=self.training)
    x = self.lin2(x)
    if self.regression:
      return x[:, 0] * self.multiply_by
    else:
      return F.log_softmax(x, dim=-1)

### Training utilities

In [10]:
import torch.nn.functional as F
from sklearn.model_selection import StratifiedKFold
from torch.optim import Adam
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

In [126]:
def train(model, optimizer, loader, device, regression = False, ARR = 0, show_progress= False, epoch = None):
  model.train()
  total_loss = 0
  if show_progress:
    pbar = tqdm(loader)
  else:
    pbar = loader
  for data in pbar:
    optimizer.zero_grad()
    data = data.to(device)
    out = model(data)
    if regression:
      loss = F.mse_loss(out, data.y.view(-1))
    else:
      loss = F.nll_loss(out, data.y.view(-1))
    if show_progress:
      pbar.set_description('Epoch {}, batch loss: {}'.format(epoch, loss.item()))
    if ARR != 0:
      for gconv in model.convs:
        w = torch.matmul(gconv.comp,
                         gconv.weight.view(gconv.num_bases, -1)
        ).view(gconv.num_relations, gconv.in_channels, gconv.out_channels)
        reg_loss = torch.sum((w[1:, :, :] - w[:-1, :, :])**2)
        loss += ARR * reg_loss
    loss.backward()
    total_loss += loss.item() * num_graphs(data)
    optimizer.step()
  return total_loss / len(loader.dataset)

def eval_loss(model, loader, device, regression=False, show_progress= False):
  model.eval()
  loss = 0
  if show_progress:
    print('Testing begins ...')
    pbar = tqdm(loader)
  else:
    pbar = loader
  for data in pbar:
    data = data.to(device)
    with torch.no_grad():
      out = model(data)
    if regression:
      loss += F.mse_loss(out, data.y.view(-1), reduction='sum').item()
    else:
      loss += F.nll_loss(out, data.y.view(-1), reduction='sum').item()
    torch.cuda.empty_cache()
  return loss / len(loader.dataset)

def eval_rmse(model, loader, device, show_progress = False):
  mse_loss = eval_loss(model, loader, device, True, show_progress)
  rmse = math.sqrt(mse_loss)
  return rmse

def eval_loss_ensemble(model, checkpoints, loader, device, regression = False, show_progress = False):
  loss = 0
  Outs = []
  ys = []
  for i, checkpoint in enumerate(checkpoints):
    if show_progress:
      print('Testing begins...')
      pbar = tqdm(loader)
    else:
      pbar = loader
    model.load_state_dict(torch.load(checkpoint))
    model.eval()
    outs = []
    if i == 0:
      ys = []
    for data in pbar:
      data = data.to(device)
      if i == 0:
        ys.append(data.y.view(-1))
      with torch.no_grad():
        out = model(data)
        outs.append(out)
    if i == 0:
      ys = torch.cat(ys, 0)
    outs = torch.cat(outs, 0).view(-1 , 1)
    Outs.append(outs)
  Outs = torch.cat(Outs, 1).mean(1)
  if regression:
    loss += F.mse_loss(Outs, ys, reduction='sum').item()
  else:
    loss += F.nll_loss(Outs, ys, reduction='sum').item()

  return loss / len(loader.dataset)

def eval_rmse_ensemble(model, checkpoints, loader, device, show_progress = False):
  mse_loss = eval_loss_ensemble(model ,checkpoints, loader, device, True, show_progress = False)
  rmse = math.sqrt(mse_loss)
  return rmse

In [127]:
def train_multiple_epochs(train_dataset, test_dataset, model,
                          epochs, batch_size, lr, lr_decay_factor,
                          lr_decay_step_size, weight_decay, ARR = 0,
                          test_freq = 1, logger = None, continue_from = None,
                          res_dir = None, cpu_only = False):
  rmses = []

  if train_dataset.__class__.__name__ == 'MyDynamicDataset':
    num_workers = mp.cpu_count()
  else:
    num_workers = 2

  train_loader = DataLoader(train_dataset, batch_size, shuffle = True, num_workers = num_workers)

  if test_dataset.__class__.__name__ == 'MyDynamicDataset':
    num_workers = mp.cpu_count()
  else:
    num_workers = 2

  test_loader = DataLoader(test_dataset, batch_size, shuffle = False, num_workers = num_workers)

  model.to(device).reset_parameters()
  optimizer = Adam(model.parameters(), lr = lr, weight_decay = weight_decay)
  start_epoch = 1
  if continue_from is not None:
    if(cpu_only):
      model.load_state_dict(
        torch.load(
            os.path.join(res_dir, 'model_checkpoint{}.pth'.format(continue_from)),
            map_location=torch.device('cpu')
            )
      )
      optimizer.load_state_dict(
          torch.load(
              os.path.join(res_dir, 'optimizer_checkpoint{}.pth'.format(continue_from)),
              map_location=torch.device('cpu')
              )
      )
    else:
      model.load_state_dict(
        torch.load(os.path.join(res_dir, 'model_checkpoint{}.pth'.format(continue_from)))
      )
      optimizer.load_state_dict(
          torch.load(os.path.join(res_dir, 'optimizer_checkpoint{}.pth'.format(continue_from)))
      )

    start_epoch = continue_from + 1
    epochs -= continue_from
  if torch.cuda.is_available():
    torch.cuda.synchronize()

  # batch_pbar = len(train_dataset) >= 100000
  batch_pbar = True
  t_start = time.perf_counter()
  if not batch_pbar:
    pbar = tqdm(range(start_epoch, epochs + start_epoch))
  else:
    pbar = range(start_epoch, epochs + start_epoch)
  for epoch in pbar:
    train_loss = train(model, optimizer, train_loader, device, regression = True,
                       ARR = ARR, show_progress = batch_pbar, epoch=epoch)
    if epoch % test_freq == 0:
      rmses.append(eval_rmse(model, test_loader, device, show_progress = batch_pbar))
    else:
      rmses.append(np.nan)
    eval_info = {
      'epoch': epoch,
      'train_loss': train_loss,
      'test_rmse': rmses[-1],
    }

    if not batch_pbar:
      pbar.set_description(
          'Epoch {}, train loss {:.6f}, test rmse {:.6f}'.format(*eval_info.values())
      )
    else:
      print('Epoch {}, train loss {:.6f}, test rmse {:.6f}'.format(*eval_info.values()))

    if epoch % lr_decay_step_size == 0:
      for param_group in optimizer.param_groups:
        param_group['lr'] = lr_decay_factor * param_group['lr']

    if logger is not None:
      logger(eval_info, model, optimizer)

  t_end = time.perf_counter()
  duration = t_end - t_start

  print('Final Test RMSE: {:.6f}, Duration: {:.6f}'.format(rmses[-1], duration))

  return rmses[-1]

def test_once(test_dataset, model, batch_size, logger = None, ensemble = False, checkpoints = None):
  test_loader = DataLoader(test_dataset, batch_size, shuffle = False)
  model.to(device)
  t_start = time.perf_counter()
  if ensemble and checkpoints:
    rmse = eval_rmse_ensemble(model, checkpoints, test_loader, device, show_progress = True)
  else:
    rmse = eval_rmse(model, test_loader, device, show_progress = True)

  t_end = time.perf_counter()
  duration = t_end - t_start
  print('Test Once RMSE: {:.6f}, Duration: {:.6f}'.format(rmse, duration))
  epoch_info = 'test_once' if not ensemble else 'ensemble'
  eval_info = {
      'epoch': epoch_info,
      'train_loss': 0,
      'test_rmse': rmse,
  }

  if logger is not None:
    logger(eval_info, None, None)

  return rmse

def num_graphs(data):
  if data.batch is not None:
    return data.num_graphs
  else:
    return data.x.size(0)

### Load data and train model

In [129]:
import numpy as np
import pandas as pd
import random
import scipy.sparse as sp

def map_data(data):
  """
  Map data to proper indices in case they are not in a continues [0,N) range
  """
  uniq = list(set(data))
  id_dict = {old: new for new, old in enumerate(sorted(uniq))}
  data = np.array([id_dict[x] for x in data])
  n = len(uniq)

  return data, id_dict, n

def process_data(dataset, seed = 1234, verbose = True):
  num_users, num_items, u_nodes_ratings, v_nodes_ratings, ratings, u_features, v_features = None, None, None, None, None, None, None

  # shuffle here like cf-nade paper with python's own random class
  # make sure to convert to list, otherwise random.shuffle acts weird on it without a warning
  data_array = dataset.ratings_df.values.tolist()
  random.seed(seed)
  random.shuffle(data_array)
  data_array = np.array(data_array)

  u_nodes_ratings = data_array[:, 0].astype(np.int32)
  v_nodes_ratings = data_array[:, 1].astype(np.int32)
  ratings = data_array[:, 2].astype(np.float32)

  u_nodes_ratings, u_dict, num_users = map_data(u_nodes_ratings)
  v_nodes_ratings, v_dict, num_items = map_data(v_nodes_ratings)

  u_nodes_ratings, v_nodes_ratings = u_nodes_ratings.astype(np.int64), v_nodes_ratings.astype(np.int32)
  ratings = ratings.astype(np.float64)

  # Movie features (genres)
  genre_headers = dataset.movies_df.columns.values[6:]
  num_genres = genre_headers.shape[0]

  v_features = np.zeros((num_items, num_genres), dtype=np.float32)
  for movie_id, g_vec in zip(dataset.movies_df['movie id'].values.tolist(), dataset.movies_df[genre_headers].values.tolist()):
    # Check if movie_id was listed in ratings file and therefore in mapping dictionary
    if movie_id in v_dict.keys():
      v_features[v_dict[movie_id], :] = g_vec

  # User features
  sep = r'|'
  occupation = set(dataset.users_df['occupation'].values.tolist())

  gender_dict = {'M': 0., 'F': 1.}
  occupation_dict = {f: i for i, f in enumerate(occupation, start=2)}

  num_feats = 2 + len(occupation_dict)

  u_features = np.zeros((num_users, num_feats), dtype=np.float32)
  for _, row in dataset.users_df.iterrows():
    u_id = row['user id']
    if u_id in u_dict.keys():
      # age
      u_features[u_dict[u_id], 0] = row['age']
      # gender
      u_features[u_dict[u_id], 1] = gender_dict[row['gender']]
      # occupation
      u_features[u_dict[u_id], occupation_dict[row['occupation']]] = 1.

  u_features = sp.csr_matrix(u_features)
  v_features = sp.csr_matrix(v_features)

  return num_users, num_items, u_nodes_ratings, v_nodes_ratings, ratings, u_features, v_features

In [130]:
def create_trainvaltest_split(dataset, seed=1212, testing=False,
               verbose=True, rating_map=None,
               post_rating_map=None, ratio=1.0):
  num_users, num_items, u_nodes, v_nodes, ratings, u_features, v_features = process_data(dataset, seed=seed, verbose=verbose)

  if rating_map is not None:
    for i, x in enumerate(ratings):
      ratings[i] = rating_map[x]

  rating_dict = {r: i for i, r in enumerate(np.sort(np.unique(ratings)).tolist())}

  # number of test and validation edges

  print("Spliting the dataset ...")
  num_test = int(np.ceil(ratings.shape[0] * 0.1))
  num_val = int(np.ceil(ratings.shape[0] * 0.9 * 0.05))
  num_train = ratings.shape[0] - num_val - num_test

  pairs_nonzero = np.vstack([u_nodes, v_nodes]).transpose()

  train_pairs_idx = pairs_nonzero[0:int(num_train*ratio)]
  val_pairs_idx = pairs_nonzero[num_train:num_train + num_val]
  test_pairs_idx = pairs_nonzero[num_train + num_val:]

  u_test_idx, v_test_idx = test_pairs_idx.transpose()
  u_val_idx, v_val_idx = val_pairs_idx.transpose()
  u_train_idx, v_train_idx = train_pairs_idx.transpose()

  # create labels
  all_labels = np.array([rating_dict[r] for r in ratings], dtype=np.int32)
  train_labels = all_labels[0:int(num_train*ratio)]
  val_labels = all_labels[num_train:num_train + num_val]
  test_labels = all_labels[num_train + num_val:]

  if testing:
    u_train_idx = np.hstack([u_train_idx, u_val_idx])
    v_train_idx = np.hstack([v_train_idx, v_val_idx])
    train_labels = np.hstack([train_labels, val_labels])

  class_values = np.sort(np.unique(ratings))

  # make training adjacency matrix
  if post_rating_map is None:
    data = train_labels + 1.
  else:
    data = np.array([post_rating_map[r] for r in class_values[train_labels]]) + 1.
  data = data.astype(np.float32)

  rating_mx_train = sp.csr_matrix((data, [u_train_idx, v_train_idx]),
                                  shape=[num_users, num_items], dtype=np.float32)

  return u_features, v_features, rating_mx_train, train_labels, u_train_idx, v_train_idx, val_labels, u_val_idx, v_val_idx, test_labels, u_test_idx, v_test_idx, class_values

In [135]:
u_features, v_features, adj_train, train_labels, train_u_indices, train_v_indices, val_labels, val_u_indices, val_v_indices, test_labels, test_u_indices, test_v_indices, class_values = create_trainvaltest_split(dataset, 1212, False, True, None, None, 1.0)

Spliting the dataset ...


In [16]:
!mkdir ml-1m ml-1m/train ml-1m/test ml-1m/val

In [119]:
dataset_class = 'MyDynamicDataset'
train_indices = (train_u_indices, train_v_indices)
val_indices = (val_u_indices, val_v_indices)
test_indices = (test_u_indices, test_v_indices)
train_graphs = eval(dataset_class)(
    'ml-1m/train',
    adj_train,
    train_indices,
    train_labels,
    1,
    1.0,
    100,
    u_features,
    v_features,
    class_values,
    max_num=None
)

test_graphs = eval(dataset_class)(
    'ml-1m/test',
    adj_train,
    test_indices,
    test_labels,
    1,
    1.0,
    100,
    u_features,
    v_features,
    class_values,
    max_num= None
)

val_graphs = eval(dataset_class)(
    'ml-1m/val',
    adj_train,
    val_indices,
    val_labels,
    1,
    1.0,
    100,
    u_features,
    v_features,
    class_values,
    max_num= None
)

In [137]:
train_dataset = eval('MyDynamicDataset')(root='ml-1m/train', A=adj_train,
    links=(train_u_indices, train_v_indices), labels=train_labels, h=1, sample_ratio=1.0,
    max_nodes_per_hop=200, u_features=None, v_features=None, class_values=class_values)
val_dataset = eval('MyDynamicDataset')(root='ml-1m/val', A=adj_train,
    links=(val_u_indices, val_v_indices), labels=val_labels, h=1, sample_ratio=1.0,
    max_nodes_per_hop=200, u_features=None, v_features=None, class_values=class_values)
test_dataset = eval('MyDynamicDataset')(root='ml-1m/test', A=adj_train,
    links=(test_u_indices, test_v_indices), labels=test_labels, h=1, sample_ratio=1.0,
    max_nodes_per_hop=200, u_features=None, v_features=None, class_values=class_values)

len(train_dataset), len(val_dataset), len(test_dataset)

(85500, 4500, 10000)

In [138]:
num_relations = len(class_values)
multiply_by = 1

model = IGMC(train_graphs, latent_dim = [32, 32, 32, 32],
             num_relations = num_relations, num_bases = 4,
             regression = True, adj_dropout = 0.2, force_undirected = False,
             side_features = False, n_side_features = 0, multiply_by = 1
)

In [140]:
def logger(info, model, optimizer):
  epoch, train_loss, test_rmse = info['epoch'], info['train_loss'], info['test_rmse']
  with open(os.path.join('ml-1m/train/', 'log.txt'), 'a') as f:
    f.write('Epoch {}, train loss {:.4f}, test rmse {:.6f}\n'.format(epoch, train_loss, test_rmse))
  if type(epoch) == int and epoch % args['save_interval'] == 0:
    print('Saving model states...')
    model_name = os.path.join('ml-1m/train/', 'model_checkpoint{}.pth'.format(epoch))
    optimizer_name = os.path.join(
        'ml-1m/train/', 'optimizer_checkpoint{}.pth'.format(epoch)
    )
    if model is not None:
      torch.save(model.state_dict(), model_name)
    if optimizer is not None:
      torch.save(optimizer.state_dict(), optimizer_name)

In [141]:
args = {
    'save_interval': 1
}
train_multiple_epochs(
        train_dataset,
        val_dataset,
        model,
        4,
        50,
        1e-3,
        lr_decay_factor = 0.1 ,
        lr_decay_step_size = 20,
        weight_decay = 0,
        ARR = 0.001,
        test_freq = 1,
        logger = logger,
        continue_from = None,
        res_dir= 'ml-1m/train',
        cpu_only = True
    )

Epoch 1, batch loss: 1.214111566543579: 100%|██████████| 1710/1710 [10:11<00:00,  2.79it/s]


Testing begins ...


  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
100%|██████████| 90/90 [00:25<00:00,  3.54it/s]


Epoch 1, train loss 1.455307, test rmse 0.963161
Saving model states...


Epoch 2, batch loss: 0.6509255170822144: 100%|██████████| 1710/1710 [10:17<00:00,  2.77it/s]


Testing begins ...


  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
100%|██████████| 90/90 [00:25<00:00,  3.51it/s]


Epoch 2, train loss 1.155056, test rmse 1.001294
Saving model states...


Epoch 3, batch loss: 1.1597026586532593: 100%|██████████| 1710/1710 [10:03<00:00,  2.83it/s]


Testing begins ...


  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
100%|██████████| 90/90 [00:26<00:00,  3.44it/s]


Epoch 3, train loss 1.069708, test rmse 0.982991
Saving model states...


Epoch 4, batch loss: 1.1350752115249634: 100%|██████████| 1710/1710 [10:04<00:00,  2.83it/s]


Testing begins ...


  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
100%|██████████| 90/90 [00:25<00:00,  3.48it/s]

Epoch 4, train loss 1.015289, test rmse 0.946835
Saving model states...
Final Test RMSE: 0.946835, Duration: 2540.536522





0.9468349097745234

 # Evaluation

In [None]:
def process_data(dataset, seed = 1234, verbose = True):
  num_users, num_items, u_nodes_ratings, v_nodes_ratings, ratings, u_features, v_features = None, None, None, None, None, None, None

  # shuffle here like cf-nade paper with python's own random class
  # make sure to convert to list, otherwise random.shuffle acts weird on it without a warning
  data_array = dataset.ratings_df.values.tolist()
  random.seed(seed)
  random.shuffle(data_array)
  data_array = np.array(data_array)

  u_nodes_ratings = data_array[:, 0].astype(np.int32)
  v_nodes_ratings = data_array[:, 1].astype(np.int32)
  ratings = data_array[:, 2].astype(np.float32)

  u_nodes_ratings, u_dict, num_users = map_data(u_nodes_ratings)
  v_nodes_ratings, v_dict, num_items = map_data(v_nodes_ratings)

  u_nodes_ratings, v_nodes_ratings = u_nodes_ratings.astype(np.int64), v_nodes_ratings.astype(np.int32)
  ratings = ratings.astype(np.float64)

  # Movie features (genres)
  genre_headers = dataset.movies_df.columns.values[6:]
  num_genres = genre_headers.shape[0]

  v_features = np.zeros((num_items, num_genres), dtype=np.float32)
  for movie_id, g_vec in zip(dataset.movies_df['movie id'].values.tolist(), dataset.movies_df[genre_headers].values.tolist()):
    # Check if movie_id was listed in ratings file and therefore in mapping dictionary
    if movie_id in v_dict.keys():
      v_features[v_dict[movie_id], :] = g_vec

  # User features
  sep = r'|'
  occupation = set(dataset.users_df['occupation'].values.tolist())

  gender_dict = {'M': 0., 'F': 1.}
  occupation_dict = {f: i for i, f in enumerate(occupation, start=2)}

  num_feats = 2 + len(occupation_dict)

  u_features = np.zeros((num_users, num_feats), dtype=np.float32)
  for _, row in dataset.users_df.iterrows():
    u_id = row['user id']
    if u_id in u_dict.keys():
      # age
      u_features[u_dict[u_id], 0] = row['age']
      # gender
      u_features[u_dict[u_id], 1] = gender_dict[row['gender']]
      # occupation
      u_features[u_dict[u_id], occupation_dict[row['occupation']]] = 1.

  u_features = sp.csr_matrix(u_features)
  v_features = sp.csr_matrix(v_features)

  return num_users, num_items, u_nodes_ratings, v_nodes_ratings, ratings, u_features, v_features

In [None]:
test_once(test_graphs, model, batch_size=10)



Testing begins ...


  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intX

Test Once RMSE: 1.039139, Duration: 419.114947





1.0391391149251126

In [142]:
model.to(device)

IGMC

In [152]:
ppp = DataLoader(test_graphs, batch_size=50)



In [162]:
import numpy as np
def get_user_recommendations(user_id, model, dataset, adj_matrix, u_features, v_features, class_values, h=1, sample_ratio=1.0, max_nodes_per_hop=100):
  """
  Get movie recommendations for a specific user.

  Args:
    user_id (int): The original user ID (before mapping).
    model: The trained IGMC model.
    dataset: The MovieLens dataset object.
    adj_matrix (scipy.sparse.csr_matrix): The training adjacency matrix.
    u_features (scipy.sparse.csr_matrix): User features matrix.
    v_features (scipy.sparse.csr_matrix): Movie features matrix.
    class_values (numpy.ndarray): Array of possible rating values.
    h (int): Number of hops for subgraph extraction.
    sample_ratio (float): Sampling ratio for subgraph nodes.
    max_nodes_per_hop (int): Maximum number of nodes per hop.

  Returns:
    list: A list of recommended movie titles.
  """
  # Map the original user ID to the internal ID used in the model
  data_array = dataset.ratings_df.values.tolist()
  u_nodes_ratings = np.array(data_array)[:, 0].astype(np.int32)
  _, u_dict, _ = map_data(u_nodes_ratings)

  if user_id not in u_dict:
    print(f"User ID {user_id} not found in the dataset.")
    return []

  internal_user_id = u_dict[user_id]

  # Get all movies the user has NOT rated
  rated_movies = dataset.ratings_df[dataset.ratings_df['u_nodes'] == user_id]['v_nodes'].values.tolist()
  all_movie_ids = dataset.movies_df['movie id'].values.tolist()
  unrated_movie_ids = [movie_id for movie_id in all_movie_ids if movie_id not in rated_movies]

  # Map original unrated movie IDs to internal IDs
  data_array = dataset.ratings_df.values.tolist()
  v_nodes_ratings = np.array(data_array)[:, 1].astype(np.int32)
  _, v_dict, _ = map_data(v_nodes_ratings)

  internal_unrated_movie_ids = [v_dict[movie_id] for movie_id in unrated_movie_ids if movie_id in v_dict]
  original_unrated_movie_ids = [movie_id for movie_id in unrated_movie_ids if movie_id in v_dict] # Keep track of original IDs

  if not internal_unrated_movie_ids:
    print(f"User ID {user_id} has rated all movies or no unrated movies found in the dataset.")
    return []

  # Create link indices and dummy labels for the unrated movies
  links_to_predict = (np.array([internal_user_id] * len(internal_unrated_movie_ids)), np.array(internal_unrated_movie_ids))
  dummy_labels = np.zeros(len(internal_unrated_movie_ids), dtype=np.int32) # Labels don't matter for prediction

  # Create a dynamic dataset for the links to predict
  predict_dataset = MyDynamicDataset(
      root='ml-1m/predict',
      A=adj_matrix,
      links=links_to_predict,
      labels=dummy_labels,
      h=h,
      sample_ratio=sample_ratio,
      max_nodes_per_hop=max_nodes_per_hop,
      u_features=u_features,
      v_features=v_features,
      class_values=class_values,
      max_num=None
  )

  predict_loader = DataLoader(test_dataset, batch_size=50, shuffle=False, num_workers=2)

  model.eval()
  predictions = []
  with torch.no_grad():
    for data in predict_loader:
      data = data.to(device)
      out = model(data)
      predictions.extend(out.cpu().numpy())

  # Get the predicted ratings and sort movies by predicted rating
  predicted_ratings = np.array(predictions)
  sorted_movie_indices = np.argsort(predicted_ratings)[::-1] # Descending order

  # Get the top N recommended movie original IDs
  top_n = 10 # You can adjust this
  top_movie_internal_ids = np.array(internal_unrated_movie_ids)[sorted_movie_indices[:top_n]]
  top_movie_original_ids = [original_unrated_movie_ids[internal_unrated_movie_ids.index(internal_id)] for internal_id in top_movie_internal_ids]

  # Get the movie titles
  recommended_movie_titles = dataset.movies_df[dataset.movies_df['movie id'].isin(top_movie_original_ids)]['movie title'].values.tolist()

  return recommended_movie_titles

# Example usage: Get recommendations for user with original ID 1
user_id_to_recommend = 1
recommendations = get_user_recommendations(user_id_to_recommend, model, dataset, adj_train, u_features, v_features, class_values)

print(f"Recommendations for user {user_id_to_recommend}:")
for movie_title in recommendations:
  print(f"- {movie_title}")


TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/worker.py", line 349, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
            ~~~~~~~~~~~~^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch_geometric/data/dataset.py", line 291, in __getitem__
    data = self.get(self.indices()[idx])
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-100-3201907978.py", line 147, in get
    return construct_pyg_graph(*tmp)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-100-3201907978.py", line 294, in construct_pyg_graph
    data.u_feature = torch.FloatTensor(u_feature).unsqueeze(0)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/scipy/sparse/_base.py", line 425, in __len__
    raise TypeError("sparse array length is ambiguous; use getnnz()"
TypeError: sparse array length is ambiguous; use getnnz() or shape[0]


# Dump and save the model

In [None]:
import joblib

joblib.dump(model, 'model.pkl')

In [166]:
# Example usage: Get recommendations for user with original ID 1
user_id_to_recommend = 2
recommendations = get_user_recommendations(user_id_to_recommend, model, dataset, adj_train, u_features, v_features, class_values)

print(f"Recommendations for user {user_id_to_recommend}:")
for movie_title in recommendations:
  print(f"- {movie_title}")

  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])


Recommendations for user 2:
- 2001: A Space Odyssey (1968)
- Fish Called Wanda, A (1988)
- Monty Python and the Holy Grail (1974)
- Wrong Trousers, The (1993)
- Princess Bride, The (1987)
- Raiders of the Lost Ark (1981)
- Psycho (1960)
- Blues Brothers, The (1980)
- This Is Spinal Tap (1984)
- Close Shave, A (1995)


In [171]:
movie1 = dataset.movies_df[dataset.movies_df['movie id'] == 1]

In [176]:
# prompt: get first movie by id and return dict

movie1.to_dict('records')[0]

{'movie id': 1,
 'movie title': 'Toy Story (1995)',
 'release date': '01-Jan-1995',
 'video release date': nan,
 'IMDb URL': 'http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)',
 'unknown': 0,
 'Action': 0,
 'Adventure': 0,
 'Animation': 1,
 'Childrens': 1,
 'Comedy': 1,
 'Crime': 0,
 'Documentary': 0,
 'Drama': 0,
 'Fantasy': 0,
 'Film-Noir': 0,
 'Horror': 0,
 'Musical': 0,
 'Mystery': 0,
 'Romance': 0,
 'Sci-Fi': 0,
 'Thriller': 0,
 'War': 0,
 'Western': 0}

# Task
Explain how to get recommendations using the trained model and necessary data components (`adj_train`, `u_features`, `v_features`) in a new Python environment, assuming these components are not directly available. Provide example Python code to demonstrate the process.

## Save necessary components

### Subtask:
Save the trained model, the training adjacency matrix, user features, item features, the MovieLens dataset object, and class values to files using appropriate serialization methods.


**Reasoning**:
Save the trained model, adjacency matrix, user features, item features, dataset object, and class values to files.



In [177]:
import joblib
import scipy.sparse as sp
import numpy as np

# Save the trained model
joblib.dump(model, 'trained_model.pkl')

# Save the training adjacency matrix
sp.save_npz('adj_train.npz', adj_train)

# Save the user features
sp.save_npz('u_features.npz', u_features)

# Save the item features
sp.save_npz('v_features.npz', v_features)

# Save the dataset object
joblib.dump(dataset, 'dataset.pkl')

# Save the class values
np.save('class_values.npy', class_values)

## Load components and get recommendations in a new environment

### Subtask:
Provide example code demonstrating how to load the saved components in a separate Python script or notebook and then use the `get_user_recommendations` function with these loaded components to get recommendations.


**Reasoning**:
The next step is to provide the example code for loading the saved components and using the `get_user_recommendations` function in a new environment. This involves importing necessary libraries and the custom functions/classes, loading the saved files, defining the recommendation function, and then calling it.



**Reasoning**:
The previous code block failed due to an `IndentationError` in the `SparseColIndexer` class definition. I need to correct the indentation in the `__getitem__` method of this class. I will regenerate the entire code block with the corrected indentation.



In [182]:
import joblib
import scipy.sparse as sp
import numpy as np
import torch
from torch_geometric.data import Data, Dataset, InMemoryDataset, DataLoader
import torch.nn as nn
from torch.nn import Linear
from torch_geometric.nn import GCNConv, RGCNConv, global_sort_pool, global_add_pool
from torch_geometric.utils import dropout_adj, dropout_edge
import torch.nn.functional as F
import time
import random
import os
import multiprocessing as mp
from tqdm import tqdm


# Define the custom classes and functions from the original notebook
def subgraph_extraction_labeling(ind, Arow, Acol, h=1, sample_ratio=1.0,
                                 max_nodes_per_hop = None, u_features = None,
                                 v_features = None, class_values = None, y =1):
  # extract the h-hop enclosing subgraph around link 'ind'
  u_nodes, v_nodes = [ind[0]], [ind[1]]
  u_dist, v_dist = [0], [0]
  u_visited, v_visited = set([ind[0]]), set([ind[1]])
  u_fringe, v_fringe = set([ind[0]]), set([ind[1]])
  for dist in range(1, h+1):
    v_fringe, u_fringe = neighbors(u_fringe, Arow), neighbors(v_fringe, Acol)
    u_fringe = u_fringe - u_visited
    v_fringe = v_fringe - v_visited
    u_visited = u_visited.union(u_fringe)
    v_visited = v_visited.union(v_fringe)
    if sample_ratio < 1.0:
      u_fringe = random.sample(list(u_fringe), int(sample_ratio*len(u_fringe)))
      v_fringe = random.sample(list(v_fringe), int(sample_ratio*len(v_fringe)))
    if max_nodes_per_hop is not None:
      if max_nodes_per_hop < len(u_fringe):
        u_fringe = random.sample(list(u_fringe), max_nodes_per_hop)
      if max_nodes_per_hop < len(v_fringe):
        v_fringe = random.sample(list(v_fringe), max_nodes_per_hop)
    if len(u_fringe)  == 0 and len(v_fringe) == 0:
      break

    u_nodes = u_nodes + list(u_fringe)
    v_nodes = v_nodes + list(v_fringe)
    u_dist = u_dist + [dist]*len(u_fringe)
    v_dist = v_dist + [dist]*len(v_fringe)

  subgraph = Arow[u_nodes][:, v_nodes]
  # remove link between target nodes
  subgraph[0, 0] = 0

  # prepare pyg graph constructor input
  u, v, r = ssp.find(subgraph) #r is 1, 2,... (rating labels + 1)
  v += len(u_nodes)
  r = r - 1 # transform r back to rating label
  num_nodes = len(u_nodes) + len(v_nodes)
  node_labels = [x*2 for x in u_dist] + [x*2+1 for x in v_dist]
  max_node_label = 2*h + 1
  y = class_values[y]

  # get node features
  if u_features is not None:
    u_features = u_features[u_nodes]
  if v_features is not None:
    v_features = v_features[v_nodes]
  node_features = None

  if True:
    # only output node features for the target user and item
    if u_features is not None and v_features is not None:
      # Convert sparse matrices to dense NumPy arrays before returning
      node_features = [u_features[0].todense(), v_features[0].todense()]

  return u, v, r, node_labels, max_node_label, y, node_features

def construct_pyg_graph(u, v, r, node_labels, max_node_label, y, node_features):
  u, v = torch.LongTensor(u), torch.LongTensor(v)
  r = torch.LongTensor(r)
  edge_index = torch.stack([torch.cat([u, v]), torch.cat([v, u])], 0)
  edge_type = torch.cat([r, r])
  x = torch.FloatTensor(one_hot(node_labels, max_node_label+1))
  y = torch.FloatTensor([y])
  data = Data(x, edge_index, edge_type = edge_type, y = y)

  if node_features is not None:
    if type(node_features) == list:
      u_feature, v_feature = node_features
      data.u_feature = torch.FloatTensor(u_feature).unsqueeze(0)
      data.v_feature = torch.FloatTensor(v_feature).unsqueeze(0)
    else:
      x2 = torch.FloatTensor(node_features)
      data.x = torch.cat([data.x, x2], 1)
  return data

def neighbors(fringe, A):
  # find all 1-hop neighbors of nodes in fringe from A
  return set(A[list(fringe)].indices)

def one_hot(idx, length):
  idx = np.array(idx)
  x = np.zeros([len(idx), length])
  x[np.arange(len(idx)), idx] = 1.0
  return x

class SparseRowIndexer:
  def __init__(self, csr_matrix):
    data = []
    indices = []
    indptr = []

    for row_start, row_end in zip(csr_matrix.indptr[:-1], csr_matrix.indptr[1:]):
      data.append(csr_matrix.data[row_start:row_end])
      indices.append(csr_matrix.indices[row_start:row_end])
      indptr.append(row_end - row_start)  #nnz of the row

    self.data = np.array(data, dtype = object)
    self.indices = np.array(indices, dtype = object)
    self.indptr = np.array(indptr, dtype = object)
    self.shape = csr_matrix.shape
  def __getitem__(self, row_selector):
    indices = np.concatenate(self.indices[row_selector])
    data = np.concatenate(self.data[row_selector])
    indptr = np.append(0, np.cumsum(self.indptr[row_selector]))
    shape = [indptr.shape[0] - 1, self.shape[1]]
    return ssp.csr_matrix((data, indices, indptr), shape = shape)

class SparseColIndexer:
  def __init__(self, csc_matrix):
    data = []
    indices = []
    indptr = []

    for col_start, col_end in zip(csc_matrix.indptr[:-1], csc_matrix.indptr[1:]):
      data.append(csc_matrix.data[col_start:col_end])
      indices.append(csc_matrix.data[col_start:col_end])
      indptr.append(col_end - col_start)

    self.data = np.array(data, dtype = object)
    self.indices = np.array(indices, dtype = object)
    self.indptr = np.array(indptr, dtype = object)
    self.shape = csc_matrix.shape

  def __getitem__(self, col_selector):
    indices = np.concatenate(self.indices[col_selector])
    data = np.concatenate(self.data[col_selector])
    indptr = np.append(0, np.cumsum(self.indptr[col_selector]))

    shape = [self.shape[0], indptr.shape[0] - 1]
    return ssp.csc_matrix((data, indices, indptr), shape = shape)


def links2subgraphs(Arow, Acol, links, labels, h=1, sample_ratio = 1.0,
                    max_nodes_per_hop = None, u_features = None, v_features = None,
                    class_values = None, parallel = True):
  # Extract enclosing subgraphs
  print('Enclosing subgraph extraction begins...')
  g_list = []
  if not parallel:
    with tqdm(total = len(links[0])) as pbar:
      for i, j, g_label in zip(links[0], links[1], labels):
        tmp = subgraph_extraction_labeling((i, j), Arow, Acol, h, sample_ratio,
                                           max_nodes_per_hop, u_features, v_features,
                                           class_values, g_label)
        data = construct_pyg_graph(*tmp)
        g_list.append(data)
        pbar.update(1)
  else:
    start = time.time()
    pool = mp.Pool(mp.cpu_count())
    results = pool.starmap_async(
        subgraph_extraction_labeling,
        [
         ((i,j), Arow, Acol, h, sample_ratio, max_nodes_per_hop, u_features,
          v_features, class_values, g_label)
         for i, j, g_label in zip(links[0], links[1], labels)
        ]
    )
    remaining = results._number_left
    pbar = tqdm(total = remaining)
    while True:
      pbar.update(remaining - results._number_left)
      if results.ready(): break
      remaining = results._number_left
      time.sleep(1)
    results = results.get()
    pool.close()
    pbar.close()
    end = time.time()
    print("Time elapsed for subgraph extraction: {}s".format(end-start))
    print("Tranforming to pytorch_geometric graphs...")
    g_list = []
    pbar = tqdm(total = len(results))
    while results:
      tmp = results.pop()
      g_list.append(construct_pyg_graph(*tmp))
      pbar.update(1)
    pbar.close()
    end2 = time.time()
    print("Time elapsed for transforming to pytorch_geometric graphs: {}s".format(end2-end))
  return g_list


class MyDynamicDataset(torch.utils.data.Dataset):
  def __init__(self, root, A, links, labels, h, sample_ratio, max_nodes_per_hop,
                u_features, v_features, class_values, max_num=None):
    super(MyDynamicDataset, self).__init__()
    self.Arow = SparseRowIndexer(A)
    self.Acol = SparseColIndexer(A.tocsc())
    self.links = links
    self.labels = labels
    self.h = h
    self.sample_ratio = sample_ratio
    self.max_nodes_per_hop = max_nodes_per_hop
    self.u_features = u_features
    self.v_features = v_features
    self.class_values = class_values
    if max_num is not None:
      np.random.seed(123)
      num_links = len(links[0])
      perm = np.random.permutation(num_links)
      perm = perm[:max_num]
      self.links = (links[0][perm], links[1][perm])
      self.labels = labels[perm]

  def __len__(self):
    return len(self.links[0])

  def __getitem__(self, idx):
    i, j = self.links[0][idx], self.links[1][idx]
    g_label = self.labels[idx]
    tmp = subgraph_extraction_labeling(
      (i, j), self.Arow, self.Acol, self.h, self.sample_ratio, self.max_nodes_per_hop,
      self.u_features, self.v_features, self.class_values, g_label
    )
    return construct_pyg_graph(*tmp)

class GNN(torch.nn.Module):
  # a base GNN class, GCN message passing + sum_pooling
  def __init__(self, dataset, gconv = GCNConv, latent_dim=[32, 32, 32, 1],
               regression = False, adj_dropout = 0.2,
               force_undirected = False):
    super(GNN, self).__init__()
    self.regression = regression
    self.adj_dropout = adj_dropout
    self.force_undirected = force_undirected
    self.convs = torch.nn.ModuleList()
    self.convs.append(gconv(4, latent_dim[0]))
    for i in range(0, len(latent_dim)-1):
      self.convs.append(gconv(latent_dim[i], latent_dim[i+1]))
    self.lin1 = Linear(sum(latent_dim), 128)
    if self.regression:
      self.lin2 = Linear(128, 1)
    else:
      self.lin2 = Linear(128, dataset.num_classes)

  def reset_parameters(self):
    for conv in self.convs:
      conv.reset_parameters()
    self.lin1.reset_parameters()
    self.lin2.reset_parameters()

  def forward(self, data):
    x, edge_index, batch = data.x, data.edge_index, data.batch
    if self.adj_dropout > 0:
      edge_index, edge_type = dropout_adj(edge_index, edge_type, p=self.adj_dropout,
                                          force_undirected=self.force_undirected,
                                          num_nodes=len(x), training=self.training
        )
    concat_states = []
    for conv in self.convs:
      x = torch.tanh(conv(x, edge_index))
      concat_states.append(x)
    concat_states = torch.cat(concat_states, 1)
    x = global_add_pool(concat_states, batch)
    x = F.relu(self.lin1(x))
    x = F.dropout(x, p=0.5, training=self.training)
    x = self.lin2(x)
    if self.regression:
      return x[:, 0]
    else:
      return F.log_softmax(x, dim=-1)

  def __repr__(self):
    return self.__class__.__name__


class IGMC(GNN):
  # The GNN model of Inductive Graph-based Matrix Completion.
  # Use RGCN convolution + center-nodes readout.
  def __init__(self, dataset, gconv=RGCNConv, latent_dim=[32, 32, 32, 32],
               num_relations=5, num_bases=2, regression=False, adj_dropout=0.2,
               force_undirected=False, side_features=False, n_side_features=0,
               multiply_by=1):
    super(IGMC, self).__init__(dataset, GCNConv, latent_dim,
                               regression, adj_dropout, force_undirected
    )
    self.multiply_by = multiply_by
    self.convs = torch.nn.ModuleList()
    self.convs.append(gconv(4, latent_dim[0], num_relations, num_bases))
    for i in range(0, len(latent_dim)-1):
      self.convs.append(gconv(latent_dim[i], latent_dim[i+1], num_relations, num_bases))
    self.lin1 = Linear(2*sum(latent_dim), 128)
    self.side_features = side_features
    if side_features:
      self.lin1 = Linear(2*sum(latent_dim)+n_side_features, 128)

  def forward(self, data):
    x, edge_index, edge_type, batch = data.x, data.edge_index, data.edge_type, data.batch
    if self.adj_dropout > 0:
      edge_index, edge_mask = dropout_edge(edge_index, p=self.adj_dropout,
                                           force_undirected=self.force_undirected,
                                           training=self.training)
      edge_type = data.edge_type[edge_mask]
    concat_states = []
    for conv in self.convs:
      x = torch.tanh(conv(x, edge_index, edge_type))
      concat_states.append(x)
    concat_states = torch.cat(concat_states, 1)

    users = data.x[:, 0] == 1
    items = data.x[:, 1] == 1
    x = torch.cat([concat_states[users], concat_states[items]], 1)
    if self.side_features:
      x = torch.cat([x, data.u_feature, data.v_feature], 1)

    x = F.relu(self.lin1(x))
    x = F.dropout(x, p=0.5, training=self.training)
    x = self.lin2(x)
    if self.regression:
      return x[:, 0] * self.multiply_by
    else:
      return F.log_softmax(x, dim=-1)

def map_data(data):
  """
  Map data to proper indices in case they are not in a continues [0,N) range
  """
  uniq = list(set(data))
  id_dict = {old: new for new, old in enumerate(sorted(uniq))}
  data = np.array([id_dict[x] for x in data])
  n = len(uniq)

  return data, id_dict, n

# Define the get_user_recommendations function
def get_user_recommendations(user_id, model, dataset, adj_matrix, u_features, v_features, class_values, h=1, sample_ratio=1.0, max_nodes_per_hop=100):
  """
  Get movie recommendations for a specific user.

  Args:
    user_id (int): The original user ID (before mapping).
    model: The trained IGMC model.
    dataset: The MovieLens dataset object.
    adj_matrix (scipy.sparse.csr_matrix): The training adjacency matrix.
    u_features (scipy.sparse.csr_matrix): User features matrix.
    v_features (scipy.sparse.csr_matrix): Movie features matrix.
    class_values (numpy.ndarray): Array of possible rating values.
    h (int): Number of hops for subgraph extraction.
    sample_ratio (float): Sampling ratio for subgraph nodes.
    max_nodes_per_hop (int): Maximum number of nodes per hop.

  Returns:
    list: A list of recommended movie titles.
  """
  # Map the original user ID to the internal ID used in the model
  data_array = dataset.ratings_df.values.tolist()
  u_nodes_ratings = np.array(data_array)[:, 0].astype(np.int32)
  _, u_dict, _ = map_data(u_nodes_ratings)

  if user_id not in u_dict:
    print(f"User ID {user_id} not found in the dataset.")
    return []

  internal_user_id = u_dict[user_id]

  # Get all movies the user has NOT rated
  rated_movies = dataset.ratings_df[dataset.ratings_df['u_nodes'] == user_id]['v_nodes'].values.tolist()
  all_movie_ids = dataset.movies_df['movie id'].values.tolist()
  unrated_movie_ids = [movie_id for movie_id in all_movie_ids if movie_id not in rated_movies]

  # Map original unrated movie IDs to internal IDs
  data_array = dataset.ratings_df.values.tolist()
  v_nodes_ratings = np.array(data_array)[:, 1].astype(np.int32)
  _, v_dict, _ = map_data(v_nodes_ratings)

  internal_unrated_movie_ids = [v_dict[movie_id] for movie_id in unrated_movie_ids if movie_id in v_dict]
  original_unrated_movie_ids = [movie_id for movie_id in unrated_movie_ids if movie_id in v_dict] # Keep track of original IDs


  if not internal_unrated_movie_ids:
    print(f"User ID {user_id} has rated all movies or no unrated movies found in the dataset.")
    return []

  # Create link indices and dummy labels for the unrated movies
  links_to_predict = (np.array([internal_user_id] * len(internal_unrated_movie_ids)), np.array(internal_unrated_movie_ids))
  dummy_labels = np.zeros(len(internal_unrated_movie_ids), dtype=np.int32) # Labels don't matter for prediction

  # Create a dynamic dataset for the links to predict
  # Create a dummy root directory for the dynamic dataset
  dummy_root = 'ml-1m/predict_temp'
  os.makedirs(dummy_root, exist_ok=True)

  predict_dataset = MyDynamicDataset(
      root=dummy_root,
      A=adj_matrix,
      links=links_to_predict,
      labels=dummy_labels,
      h=h,
      sample_ratio=sample_ratio,
      max_nodes_per_hop=max_nodes_per_hop,
      u_features=u_features,
      v_features=v_features,
      class_values=class_values,
      max_num=None
  )

  predict_loader = DataLoader(predict_dataset, batch_size=50, shuffle=False, num_workers=2)

  model.eval()
  predictions = []
  with torch.no_grad():
    for data in predict_loader:
      data = data.to(device)
      out = model(data)
      predictions.extend(out.cpu().numpy())

  # Get the predicted ratings and sort movies by predicted rating
  predicted_ratings = np.array(predictions)
  sorted_movie_indices = np.argsort(predicted_ratings)[::-1] # Descending order

  # Get the top N recommended movie original IDs
  top_n = 10 # You can adjust this
  top_movie_internal_ids = np.array(internal_unrated_movie_ids)[sorted_movie_indices[:top_n]]
  top_movie_original_ids = [original_unrated_movie_ids[internal_unrated_movie_ids.index(internal_id)] for internal_id in top_movie_internal_ids]

  # Get the movie titles
  recommended_movie_titles = loaded_dataset.movies_df[loaded_dataset.movies_df['movie id'].isin(top_movie_original_ids)]['movie title'].values.tolist()

  return recommended_movie_titles


# Load the saved components
try:
  loaded_model = joblib.load('trained_model.pkl')
  loaded_adj_train = sp.load_npz('adj_train.npz')
  loaded_u_features = sp.load_npz('u_features.npz')
  loaded_v_features = sp.load_npz('v_features.npz')
  loaded_dataset = joblib.load('dataset.pkl')
  loaded_class_values = np.load('class_values.npy')

  # Set the device
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  loaded_model.to(device)

  # Example usage: Get recommendations for user with original ID 1
  user_id_to_recommend = 1
  recommendations = get_user_recommendations(
      user_id_to_recommend,
      loaded_model,
      loaded_dataset,
      loaded_adj_train,
      loaded_u_features,
      loaded_v_features,
      loaded_class_values
  )

  print(f"Recommendations for user {user_id_to_recommend}:")
  for movie_title in recommendations:
    print(f"- {movie_title}")

except FileNotFoundError as e:
  print(f"Error loading file: {e}. Please ensure all necessary files have been saved.")
except Exception as e:
  print(f"An error occurred: {e}")

Recommendations for user 1:
- L.A. Confidential (1997)
- Close Shave, A (1995)
- Trainspotting (1996)
- Casablanca (1942)
- Maltese Falcon, The (1941)
- Sunset Blvd. (1950)
- To Catch a Thief (1955)
- East of Eden (1955)
- Treasure of the Sierra Madre, The (1948)
- Big Sleep, The (1946)


## Summary:

### Data Analysis Key Findings

*   The trained model (`trained_model.pkl`), training adjacency matrix (`adj_train.npz`), user features (`u_features.npz`), item features (`v_features.npz`), dataset object (`dataset.pkl`), and class values (`class_values.npy`) were successfully saved to disk.
*   The necessary custom classes and functions from the original training environment were successfully defined within the new script.
*   The saved components were successfully loaded into the new Python environment.
*   The `get_user_recommendations` function was successfully used with the loaded components to generate movie recommendations for a specific user (user ID 1).
*   The generated recommendations for user ID 1 were successfully printed to the console.

### Insights or Next Steps

*   The provided code demonstrates a complete pipeline for saving model and data components and then using them for inference (recommendations) in a separate environment. This process is crucial for deploying trained models.
*   For a production environment, consider packaging the necessary code components (classes and functions) into a reusable library or module to avoid redefining them in every inference script.
