In [17]:
# Standard library imports
import random
import time
from pathlib import Path

# Third-party imports
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader
import torch_geometric
from torch_geometric.data import Dataset, Data
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.utils import degree

from tqdm.notebook import tqdm
from sklearn import preprocessing as pp
from sklearn.model_selection import train_test_split
import scipy.sparse as sp

In [12]:
# --- Data location and compute device ---
BASE_PATH = Path('../../data/movie-lens/ml-1m')
# Use the GPU when torch reports one, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# --- Experiment settings ---
# Ratings at or above this value become 1 when binarized, the rest become 0.
RATING_THRESHOLD = 3.0
EMBEDDING_DIM = 64
N_USERS = 200
N_ITEMS = 500

In [15]:
class MovieLensDataset(Dataset):
    """Single-graph dataset built from the ``::``-separated MovieLens tables.

    On construction the three ``.dat`` files under ``root_dir`` are read, the
    ratings are pivoted into a dense user x movie matrix (missing entries
    filled with 0) and the result is wrapped in one ``Data`` object kept on
    ``self.data``.

    Args:
        root_dir: Directory holding ``movies.dat``, ``users.dat`` and
            ``ratings.dat``. It is joined with ``/``, so it must support that
            operator (e.g. ``pathlib.Path``).
        transform: Optional callable applied once to the ``Data`` object at
            the end of ``_to_graph``; its return value replaces ``self.data``.
        n_users: Stored on the instance, then overwritten by ``_to_graph``
            with the row count of the rating matrix.
        n_items: Stored on the instance, then overwritten by ``_to_graph``
            with the column count of the rating matrix.

    NOTE(review): ``n_users`` / ``n_items`` currently do not subset the data;
    confirm whether a subsetting rule was intended.
    """

    def __init__(self, root_dir, transform=None, n_users=100, n_items=200):
        self.root_dir = root_dir
        self.n_users = n_users
        self.n_items = n_items
        self.transform = transform
        self._load()
        self._to_graph()

    def _read_table(self, path, cols, nrows=None, usecols=None):
        """Read one header-less, ``::``-separated, latin-1 encoded table.

        Args:
            path: File to read.
            cols: Column names assigned to the parsed columns.
            nrows: Optional cap on the number of rows read.
            usecols: Optional subset of columns to parse.

        Returns:
            The parsed ``pandas.DataFrame``.
        """
        df = pd.read_table(
            path,
            sep='::',
            header=None,
            engine='python',  # the multi-character separator needs the python engine
            encoding='latin-1',
            usecols=usecols,
            names=cols,
            nrows=nrows,
        )
        return df

    def _load(self):
        """Read the movies, users and ratings tables and merge them.

        Sets ``self.movies``, ``self.users``, ``self.ratings`` (first three
        columns of ``ratings.dat`` only) and ``self.df``, the merge of all
        three on their shared column names.
        """
        self.movies = self._read_table(
            path=self.root_dir/'movies.dat',
            cols=['movie_id', 'title', 'genres'],
        )
        self.users = self._read_table(
            path=self.root_dir/'users.dat',
            cols=['user_id', 'gender', 'age', 'occupation', 'zip'],
        )
        self.ratings = self._read_table(
            path=self.root_dir/'ratings.dat',
            usecols=[0, 1, 2],
            cols=['user_id', 'movie_id', 'rating'],
        )
        self.df = pd.merge(
            pd.merge(self.ratings, self.users),
            self.movies,
        )

    def len(self):
        """Return the number of graphs in the dataset, which is always one."""
        return 1

    def get(self, idx):
        """Return the graph stored at ``idx``; only index 0 exists."""
        return self[idx]

    def __len__(self):
        return self.len()

    def __getitem__(self, idx):
        """Return the single ``Data`` graph.

        Raises:
            IndexError: If ``idx`` is outside ``[0, len(self))``. Raising
                ``IndexError`` (rather than asserting) lets plain iteration
                over the dataset terminate.
        """
        if not 0 <= idx < self.len():
            raise IndexError('Index out of range')
        return self.data

    def _to_graph(self):
        """Pivot the ratings into a dense matrix and wrap it in ``Data``.

        Sets ``self.adj_mat`` (tensor on ``DEVICE``), overwrites
        ``self.n_users`` / ``self.n_items`` with its shape, and sets
        ``self.data``. ``raw_edge_index`` is a clone taken before the optional
        transform runs, while ``adj_mat`` inside ``Data`` is the same tensor
        as ``self.adj_mat``.
        """
        # Keep only ratings whose user appears in the users table, then drop
        # the extra user columns again before pivoting.
        self.adj_mat = pd.pivot_table(
            data=self.ratings.merge(
                self.users,
                left_on='user_id',
                right_on='user_id',
            )[self.ratings.columns],
            index='user_id',
            columns='movie_id',
            values='rating',
        )
        # Unrated (user, movie) cells become 0.
        self.adj_mat = self.adj_mat.fillna(0)
        self.adj_mat = torch.tensor(self.adj_mat.values, device=DEVICE)
        self.n_users, self.n_items = self.adj_mat.shape

        self.data = Data(
            adj_mat=self.adj_mat,
            raw_edge_index=self.adj_mat.clone(),
            ratings=self.ratings,
            users=self.users['user_id'],
            items=self.movies['movie_id'],
        )

        if self.transform:
            self.data = self.transform(self.data)

    def _split(self, ratio=0.8):
        """Draw random train / validation / test masks over the rating matrix.

        Args:
            ratio: Target fraction of cells left visible in the training mask.

        Returns:
            ``(train_mask, val_mask, test_mask)``, each of shape
            ``(n_users, n_items)``. ``train_mask`` is 1 except at the randomly
            hidden cells, ``val_mask`` is ``train_mask`` with a random sample
            of those hidden cells set back to 1, and ``test_mask`` is all
            ones.
        """
        n_edges = self.n_users * self.n_items
        # Cells are drawn with replacement, so the number of distinct hidden
        # cells can be smaller than this count.
        num_train_replaced = round((1-ratio) * n_edges)
        num_val_show = round((1-ratio) * n_edges)

        user_mask = np.random.randint(0, self.n_users, num_train_replaced)
        movie_mask = np.random.randint(0, self.n_items, num_train_replaced)

        train_mask = torch.ones(self.n_users, self.n_items)
        train_mask[user_mask, movie_mask] = 0

        val_mask = train_mask.clone()
        if num_train_replaced > 0:
            # Draw positions into the hidden-pair list once and use them for
            # both axes, so every revealed cell is one that training hid.
            shown = np.random.choice(num_train_replaced, num_val_show)
            val_mask[user_mask[shown], movie_mask[shown]] = 1

        test_mask = torch.ones_like(train_mask)

        return train_mask, val_mask, test_mask

In [18]:
def transform_ratings(data):
    """Binarize ``data['adj_mat']`` at the notebook-wide ``RATING_THRESHOLD``.

    Thin adapter so the threshold does not have to be passed by the caller;
    returns whatever ``binarize`` returns.
    """
    binarized = binarize(data, RATING_THRESHOLD)
    return binarized

def binarize(data, thresh):
    """Turn ``data['adj_mat']`` into a 0/1 matrix, in place.

    Args:
        data: Mapping-like object with an ``'adj_mat'`` entry that supports
            element-wise comparison and boolean-mask assignment.
        thresh: Entries ``>= thresh`` become 1, entries ``< thresh`` become 0.

    Returns:
        The same ``data`` object, with ``'adj_mat'`` holding the modified
        matrix.
    """
    ratings = data['adj_mat']
    # Compute both masks before writing. Otherwise, with thresh <= 0, the
    # entries just zeroed would satisfy ">= thresh" and be flipped to 1.
    below = ratings < thresh
    at_or_above = ratings >= thresh
    ratings[below] = 0
    ratings[at_or_above] = 1
    data['adj_mat'] = ratings
    return data

# Build the dataset from the files under BASE_PATH; transform_ratings is
# applied to the resulting graph.
# NOTE(review): n_users / n_items are passed here, but the class replaces them
# with the shape of the full rating matrix. Confirm whether subsetting was intended.
ds = MovieLensDataset(
    BASE_PATH,
    transform=transform_ratings,
    n_items=N_ITEMS,
    n_users=N_USERS,
)