In [109]:
# Standard library imports
import random
import time
from pathlib import Path
import itertools

# Third-party imports
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader
import torch_geometric
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.utils import degree

from tqdm.notebook import tqdm
from sklearn import preprocessing as pp
from sklearn.model_selection import train_test_split
import scipy.sparse as sp

In [85]:
# --- Notebook configuration -------------------------------------------------
# Everything a reader might want to tune lives in this cell.
BASE_PATH = Path('data/movie-lens/ml-latest-small')
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
RATING_THRESHOLD = 3.
N_USERS = 200
N_ITEMS = 500
EMBEDDING_DIM = 64
SEED = 42

# Seed every random number generator this notebook imports, not just NumPy,
# so that a Restart & Run All reproduces the same shuffles and initial weights.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [214]:
# Read the four CSV tables shipped under BASE_PATH.
# The rating timestamp is discarded immediately after loading.
df_ratings = pd.read_csv(BASE_PATH / 'ratings.csv').drop(columns='timestamp')
df_movies = pd.read_csv(BASE_PATH / 'movies.csv')
df_links = pd.read_csv(BASE_PATH / 'links.csv')
df_tags = pd.read_csv(BASE_PATH / 'tags.csv')

In [87]:
# Preview the first two ratings rows.
# NOTE(review): the load cell drops `timestamp`, so a fresh run will not show that column.
df_ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [94]:
# Preview the raw movies table. `df_items` is only created further down by
# `process_movies`, so referencing it here fails on a fresh top-to-bottom run.
df_movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [95]:
# Preview the first two user-supplied tag rows.
df_tags.head(2)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996


In [195]:
# Preview the first two movie rows (genres are still one pipe-separated string here).
df_movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [206]:
# Expand the pipe-separated `genres` column into one indicator column per genre plus a
# concatenated `feature` string; `x` receives the sorted list of genre names.
# NOTE(review): `process_movies` is defined in the "Utility Functions" cell further down,
# so that cell has to be executed before this one.
df_items, x = process_movies(df_movies)
df_items[['movieId', 'feature']].head(10)

Unnamed: 0,movieId,feature
0,1,111100010000000000
1,2,101000010000000000
2,3,100000000010000
3,4,100100000010000
4,5,100000000000000
5,6,1000010000000000100
6,7,100000000010000
7,8,101000000000000000
8,9,1000000000000000000
9,10,1100000000000000100


### Utility Functions

In [291]:
def map_data(data):
    """Re-index arbitrary ids onto the contiguous range ``0 .. n-1``.

    Distinct values of ``data`` are ranked in ascending order and each value
    is replaced by its rank.

    Args:
        data: iterable of hashable, mutually comparable ids.

    Returns:
        tuple ``(mapped, id_dict, n)`` where ``mapped`` is a NumPy array holding
        the new index of every element of ``data`` (same order), ``id_dict``
        maps original id -> new index, and ``n`` is the number of distinct ids.
    """
    ordered_ids = sorted(set(data))
    id_dict = dict(zip(ordered_ids, range(len(ordered_ids))))
    mapped = np.array([id_dict[value] for value in data])
    return mapped, id_dict, len(ordered_ids)

def shuffle_df(df):
    """Return the rows of ``df`` in random order with a fresh ``0..n-1`` index.

    Every input row appears exactly once in the result. Drawing row positions
    with ``np.random.randint`` would sample *with replacement*, which silently
    duplicates some rows and drops others; duplicated rows could then end up
    in both the train and the test partition. A permutation avoids that.

    Args:
        df: DataFrame to shuffle. It is not modified.

    Returns:
        A new DataFrame with the same rows and columns as ``df``, reordered
        using NumPy's global random state.
    """
    order = np.random.permutation(df.shape[0])
    return df.iloc[order, :].reset_index(drop=True)

def get_nodes(df_ratings):
    """Shuffle the ratings table and turn it into remapped node index arrays.

    The first three columns are read *by position* (0, 1, 2) and are treated
    as user id, item id and rating respectively.

    Args:
        df_ratings: DataFrame whose first three columns are read as described above.

    Returns:
        tuple ``(rated_users, rated_users_dict, num_users,
        rated_items, rated_items_dict, num_items, ratings)``: for users and
        for items the output of ``map_data`` (remapped indices, original-id ->
        index dict, number of distinct ids), followed by the rating column,
        which is returned as-is in shuffled order.
    """
    shuffled = shuffle_df(df_ratings).values
    user_col, item_col, ratings = shuffled[:, 0], shuffled[:, 1], shuffled[:, 2]

    rated_users, rated_users_dict, num_users = map_data(user_col)
    rated_items, rated_items_dict, num_items = map_data(item_col)

    return (
        rated_users, rated_users_dict, num_users,
        rated_items, rated_items_dict, num_items,
        ratings,
    )

def get_features(df_items, idx_map, sparse=True):
    """Build the item feature matrix, with rows addressed by remapped item index.

    Args:
        df_items: DataFrame with a ``movieId`` column and a ``feature`` column
            holding one sequence of 0/1 flags per movie (e.g. the string
            ``'0101'``). All feature sequences are expected to have the same
            length as the first one.
        idx_map: dict mapping ``movieId`` -> row index in the output matrix.
            Movies that are not in ``idx_map`` are skipped.
        sparse: when true (default) the result is a ``scipy.sparse.csr_matrix``,
            otherwise a dense ``float32`` NumPy array.

    Returns:
        Matrix of shape ``(len(df_items), feature_length)``; rows that no
        movie was mapped to stay all-zero.
    """
    n_items = df_items.shape[0]
    feature_col = df_items['feature']
    # Read the first row by position so a frame whose index does not contain
    # the label 0 (e.g. after filtering) still works; an empty frame gets width 0.
    f_len = len(feature_col.iloc[0]) if n_items else 0
    features = np.zeros((n_items, f_len), dtype=np.float32)

    for movie_id, feature_vec in zip(df_items['movieId'].tolist(), feature_col.tolist()):
        row = idx_map.get(movie_id)
        if row is not None:
            # Convert each flag explicitly instead of relying on NumPy's
            # implicit string -> float cast on assignment.
            features[row, :] = [float(flag) for flag in feature_vec]

    if sparse:
        features = sp.csr_matrix(features)

    return features

def process_movies(df_movies):
    """One-hot encode the pipe-separated ``genres`` column of a movies table.

    For every distinct genre (sorted alphabetically) a 0/1 indicator column is
    added, plus a ``feature`` column holding the same flags concatenated into
    a string such as ``'0110'``.

    The input frame is left untouched: the encoding is written to a copy, so
    re-running earlier cells that display ``df_movies`` still shows the raw
    table.

    Args:
        df_movies: DataFrame with a string column ``genres`` in which genre
            labels are separated by ``'|'``.

    Returns:
        tuple ``(df_items, genres)``: a new DataFrame with the original
        columns minus ``genres`` followed by one column per genre and the
        ``feature`` column, and the sorted list of genre names.
    """
    # Split each genre string once into a set of exact labels. Testing
    # membership on the raw string would be a substring match (e.g. a genre
    # 'Sci' would wrongly match 'Sci-Fi').
    genres_per_movie = [set(entry.split('|')) for entry in df_movies['genres'].values]
    genres = sorted(set().union(*genres_per_movie))

    encoded_rows = []
    for movie_genres in genres_per_movie:
        flags = [1 if genre in movie_genres else 0 for genre in genres]
        encoded_rows.append(flags + [''.join(str(flag) for flag in flags)])

    # Work on a copy so the caller's DataFrame does not grow extra columns.
    df_items = df_movies.copy()
    df_items[genres + ['feature']] = encoded_rows

    return df_items.drop(labels='genres', axis=1), genres

def split(data, rating_dict, ratio=0.8):
    """Cut aligned user/item/rating arrays into a leading train part and a trailing test part.

    No shuffling happens here: the first ``int(n * ratio)`` samples become the
    training set and the remainder the test set.

    Args:
        data: tuple ``(rated_users, rated_items, ratings)`` of equally long
            1-D arrays.
        rating_dict: dict mapping every rating value to an integer class label.
        ratio: fraction of samples assigned to the training set.

    Returns:
        tuple ``(user_train_idx, item_train_idx, user_test_idx, item_test_idx,
        train_labels, test_labels)``; the label arrays have dtype ``int32``.
    """
    rated_users, rated_items, ratings = data
    cut = int(rated_items.shape[0] * ratio)

    # Row 0 holds user indices, row 1 item indices; slice both along the sample axis.
    pairs = np.vstack([rated_users, rated_items])
    user_train_idx, item_train_idx = pairs[:, :cut]
    user_test_idx, item_test_idx = pairs[:, cut:]

    labels = np.array([rating_dict[rating] for rating in ratings], dtype=np.int32)

    return (
        user_train_idx, item_train_idx,
        user_test_idx, item_test_idx,
        labels[:cut], labels[cut:],
    )

In [284]:
# Shuffle the ratings and remap raw user / movie ids to contiguous indices.
(rated_users, rated_users_dict, num_users,
 rated_items, rated_items_dict, num_items, ratings) = get_nodes(df_ratings)

# Genre feature matrix whose rows are addressed by the remapped item index.
item_features = get_features(df_items, rated_items_dict)

# Integer class label for every distinct rating value, in ascending order
# (np.unique already returns its values sorted).
rating_dict = {rating: label for label, rating in enumerate(np.unique(ratings).tolist())}

In [271]:
# Sanity check: the three arrays returned by get_nodes should all have one entry per rating.
rated_users.shape, rated_items.shape, ratings.shape

((100836,), (100836,), (100836,))

In [290]:
# Bundle the aligned user / item / rating arrays and cut them into train and test
# partitions; split() is called with its default ratio of 0.8.
samples = (rated_users, rated_items, ratings)
user_train_idx, item_train_idx, user_test_idx, item_test_idx, train_labels, test_labels = split(samples, rating_dict)
# Display the partition sizes as a quick check.
user_train_idx.shape, user_test_idx.shape, train_labels.shape

((80668,), (20168,), (80668,))