In [1]:
# Standard library imports
import random
import time
from pathlib import Path
import itertools

# Third-party imports
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.special import softmax
pd.set_option('display.max_colwidth', None)

import torch
from torch.optim import Adam
from torch.nn import Linear
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch_geometric
from torch_geometric.data import Data, Dataset, InMemoryDataset
from torch_geometric.loader import DataLoader
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.utils import degree, dropout_adj
from torch_geometric.nn import RGCNConv, GCNConv

from helper.utils import *



In [2]:
BASE_PATH = Path('data/movie-lens/ml-latest-small')
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
SEED = 42
EPOCHS=2
BATCH_SIZE=512
LR=1e-3
LR_DECAY_STEP = 20
LR_DECAY_VALUE = 10

np.random.seed(SEED)
torch.manual_seed(SEED)
device = torch.device('cpu')
if torch.cuda.is_available():
    torch.cuda.manual_seed(123)
    torch.cuda.synchronize()
    device = torch.device('cuda')
device

device(type='cpu')

In [3]:
df_movies = pd.read_csv(BASE_PATH/'movies.csv')
df_links = pd.read_csv(BASE_PATH/'links.csv')
df_tags = pd.read_csv(BASE_PATH/'tags.csv')

In [4]:
df_movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [5]:
df_tags.head(2)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996


In [6]:
df_movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [7]:
def myround(x, base=0.5):
    return base * round(x/base)

In [8]:
def to_range(x, t_min, t_max):
    x_min = x.min()
    x_max = x.max()
    y = ((x - x_min)/(x_max - x_min)) * (t_max - t_min) + t_min
    return y

In [9]:
def transform_ratings(df_ratings):
    user_mean_ratings = df_ratings.groupby('userId').mean().drop('movieId', axis=1).reset_index()
    temp = pd.merge(df_ratings, user_mean_ratings, on='userId')
    z = temp['rating_x'] - temp['rating_y']
    temp['rating'] = myround(to_range(z, 0, 5))
    # temp['rating'] += temp['rating'].min()
    return temp.drop(labels=['rating_x', 'rating_y'], axis=1)

# df_ratings = pd.read_csv(BASE_PATH/'ratings.csv').drop(labels='timestamp', axis=1)
# df_ratings = transform_ratings(df_ratings)

In [10]:
user_features = np.zeros((5, 5))

In [11]:
user_features[0, 3] = 1

In [12]:
user_features

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

### Utility Functions

In [13]:
def map_data(data):
    uniq = list(set(data))
    id_dict = {old: new for new, old in enumerate(sorted(uniq))}
    data = np.array([id_dict[x] for x in data])
    n = len(uniq)

    return data, id_dict, n

def shuffle_df(df):
    rand_idx = np.random.randint(0, df.shape[0], df.shape[0])
    df = df.iloc[rand_idx, :].reset_index(drop=True)
    return df

def get_nodes(df_ratings):
    df_ratings = shuffle_df(df_ratings)
    rated_users = df_ratings.values[:, 0]
    rated_items = df_ratings.values[:, 1]
    ratings = df_ratings.values[:, 2]
    
    rated_users, rated_users_dict, num_users = map_data(rated_users)
    rated_items, rated_items_dict, num_items = map_data(rated_items)
    
    return rated_users, rated_users_dict, num_users, rated_items, rated_items_dict, num_items, ratings

def get_user_features(df_ratings, df_items, genres, genres_map, rated_users_dict, n, sparse=False):
    n_users = len(df_ratings.userId.unique())
    f_len = len(genres_map)
    x = pd.merge(df_ratings, df_items).drop(labels=['title', 'movieId', 'rating', 'feature'], axis=1)
    x = x.groupby('userId')[genres].mean()
    y = x.apply(lambda x: pd.Series((x.nlargest(n))), axis=1).notna().reset_index()
    
    user_features = np.zeros((n_users, f_len))
    for i in range(y.shape[0]):
        temp = [0] * f_len
        for col in y.columns:
            if y.loc[i, col] == True:
                if col in genres_map:
                    user_features[rated_users_dict[y.loc[i, 'userId']], genres_map[col]] = 1
        
    if sparse:
        user_features = sp.csr_matrix(item_features)
    
    return user_features

def get_item_features(df_items, idx_map, sparse=False):
    n_items = df_items.shape[0]
    f_len = len(list(df_items.loc[0, 'feature']))
    item_features = np.zeros((n_items, f_len), dtype=np.float32)
    
    for movie_id, feature_vec in df_items[['movieId', 'feature']].values.tolist():
        if movie_id in idx_map:
            item_features[idx_map[movie_id], :] = list(feature_vec)
            
    if sparse:
        item_features = sp.csr_matrix(item_features)
            
    return item_features

def process_movies(df_movies):
    genres = set()
    lists = df_movies.genres.values
    mp = {}
    
    def encode(movie_genres):
        res = []
        for idx, genre in enumerate(genres):
            if genre in movie_genres:
                res.append(1)
            else:
                res.append(0)
            mp[genre] = idx
        return res + [''.join([str(i) for i in res])]
    
    for idx, lis in enumerate(lists):
        for genre in lis.split('|'):
            genres.add(genre)
            
    genres = sorted(genres)    
    df_movies[genres + ['feature']] = df_movies.apply(lambda x: encode(x.genres), 1).values.tolist()
        
    return df_movies.drop(labels='genres', axis=1), genres, mp

def split(data, rating_dict, ratio=0.8):
    rated_users, rated_items, ratings = data
    n = rated_items.shape[0]
    n_train = int(n * ratio)
    stacked = np.vstack([rated_users, rated_items]).T
    train_pairs_idx = stacked[:n_train]
    test_pairs_idx = stacked[n_train:]
    
    user_train_idx, item_train_idx = train_pairs_idx.transpose()
    user_test_idx, item_test_idx = test_pairs_idx.transpose()
    
    labels = np.array([rating_dict[r] for r in ratings], dtype=np.int32)
    train_labels = labels[:n_train]
    test_labels = labels[n_train:]
    
    return user_train_idx, item_train_idx, user_test_idx, item_test_idx, train_labels, test_labels

In [14]:
class MovieLensDataset(Dataset):
    def __init__(self, adj_mat, links, labels, h, sample_ratio, max_nodes_per_hop, 
                 u_features, v_features, class_values, max_num=None, root='data/movie-lens/ml-latest-small/'):
        super(MovieLensDataset, self).__init__(root)
        self.Arow = SparseRowIndexer(adj_mat)
        self.Acol = SparseColIndexer(adj_mat.tocsc())
        self.links = links
        self.labels = labels
        self.h = h
        self.sample_ratio = sample_ratio
        self.max_nodes_per_hop = max_nodes_per_hop
        self.u_features = u_features
        self.v_features = v_features
        self.class_values = class_values
        if max_num is not None:
            np.random.seed(123)
            num_links = len(links[0])
            perm = np.random.permutation(num_links)
            perm = perm[:max_num]
            self.links = (links[0][perm], links[1][perm])
            self.labels = labels[perm]

    def len(self):
        return len(self.links[0])

    def get(self, idx):
        i, j = self.links[0][idx], self.links[1][idx]
        g_label = self.labels[idx]
        tmp = subgraph_extraction_labeling(
            (i, j), self.Arow, self.Acol, self.h, self.sample_ratio, self.max_nodes_per_hop, 
            self.u_features, self.v_features, self.class_values, g_label
        )
        return construct_pyg_graph(*tmp)
    
class IGMC(torch.nn.Module):
    def __init__(self):
        super(IGMC, self).__init__()
        self.rel_graph_convs = torch.nn.ModuleList()
        self.rel_graph_convs.append(RGCNConv(in_channels=4, out_channels=32, num_relations=5, num_bases=4))
        self.rel_graph_convs.append(RGCNConv(in_channels=32, out_channels=32, num_relations=5, num_bases=4))
        self.rel_graph_convs.append(RGCNConv(in_channels=32, out_channels=32, num_relations=5, num_bases=4))
        self.rel_graph_convs.append(RGCNConv(in_channels=32, out_channels=32, num_relations=5, num_bases=4))
        self.linear_layer1 = Linear(256, 128)
        self.linear_layer2 = Linear(128, 1)

    def reset_parameters(self):
        self.linear_layer1.reset_parameters()
        self.linear_layer2.reset_parameters()
        for i in self.rel_graph_convs:
            i.reset_parameters()

    def forward(self, data):
        num_nodes = len(data.x)
        edge_index_dr, edge_type_dr = dropout_adj(data.edge_index, data.edge_type, p=0.2, num_nodes=num_nodes, training=self.training)

        out = data.x
        h = []
        for conv in self.rel_graph_convs:
            out = conv(out, edge_index_dr, edge_type_dr)
            out = torch.tanh(out)
            h.append(out)
        h = torch.cat(h, 1)
        h = [h[data.x[:, 0] == True], h[data.x[:, 1] == True]]
        g = torch.cat(h, 1)
        out = self.linear_layer1(g)
        out = F.relu(out)
        out = F.dropout(out, p=0.5, training=self.training)
        out = self.linear_layer2(out)
        out = out[:,0]
        return out

In [52]:
class GNN(torch.nn.Module):
    # a base GNN class, GCN message passing + sum_pooling
    def __init__(self, dataset, gconv=GCNConv, latent_dim=[32, 32, 32, 1], 
                 regression=False, adj_dropout=0.2, force_undirected=False):
        super(GNN, self).__init__()
        self.regression = regression
        self.adj_dropout = adj_dropout 
        self.force_undirected = force_undirected
        # self.convs = torch.nn.ModuleList()
        # self.convs.append(gconv(dataset.num_features, latent_dim[0]))
        # for i in range(0, len(latent_dim)-1):
        #     self.convs.append(gconv(latent_dim[i], latent_dim[i+1]))
        self.lin1 = Linear(sum(latent_dim), 128)
        if self.regression:
            self.lin2 = Linear(128, 1)
        else:
            self.lin2 = Linear(128, dataset.num_classes)

    def reset_parameters(self):
        for conv in self.convs:
            conv.reset_parameters()
        self.lin1.reset_parameters()
        self.lin2.reset_parameters()

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        if self.adj_dropout > 0:
            edge_index, edge_type = dropout_adj(
                edge_index, edge_type, p=self.adj_dropout, 
                force_undirected=self.force_undirected, num_nodes=len(x), 
                training=self.training
            )
        concat_states = []
        for conv in self.convs:
            x = torch.tanh(conv(x, edge_index))
            concat_states.append(x)
        concat_states = torch.cat(concat_states, 1)
        x = global_add_pool(concat_states, batch)
        x = F.relu(self.lin1(x))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin2(x)
        if self.regression:
            return x[:, 0]
        else:
            return F.log_softmax(x, dim=-1)

    def __repr__(self):
        return self.__class__.__name__
    
class IGMC(GNN):
    # The GNN model of Inductive Graph-based Matrix Completion. 
    # Use RGCN convolution + center-nodes readout.
    def __init__(
        self, dataset, gconv=RGCNConv, latent_dim=[32, 32, 32, 32, 32], 
        num_relations=5, num_bases=2, regression=False, adj_dropout=0.2, 
        force_undirected=False, side_features=False, n_side_features=0, 
        multiply_by=1,
    ):
        super(IGMC, self).__init__(
            dataset, RGCNConv, latent_dim, regression, adj_dropout, force_undirected
        )
        self.multiply_by = multiply_by
        self.convs = torch.nn.ModuleList()
        self.convs.append(gconv(dataset.num_features, latent_dim[0], num_relations, num_bases))
        for i in range(0, len(latent_dim)-1):
            self.convs.append(gconv(latent_dim[i], latent_dim[i+1], num_relations, num_bases))
        self.lin1 = Linear(2*sum(latent_dim), 128)
        self.side_features = side_features
        if side_features:
            self.lin1 = Linear(2*sum(latent_dim)+n_side_features, 128)

    def forward(self, data):
        start = time.time()
        x, edge_index, edge_type, batch = data.x, data.edge_index, data.edge_type, data.batch
        if self.adj_dropout > 0:
            edge_index, edge_type = dropout_adj(
                edge_index, edge_type, p=self.adj_dropout, 
                force_undirected=self.force_undirected, num_nodes=len(x), 
                training=self.training
            )
        concat_states = []
        for conv in self.convs:
            x = torch.tanh(conv(x, edge_index, edge_type))
            concat_states.append(x)
        concat_states = torch.cat(concat_states, 1)

        users = data.x[:, 0] == 1
        items = data.x[:, 1] == 1
        x = torch.cat([concat_states[users], concat_states[items]], 1)
        if self.side_features:
            x = torch.cat([x, data.u_feature, data.v_feature], 1)

        x = F.relu(self.lin1(x))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin2(x)
        if self.regression:
            return x[:, 0] * self.multiply_by
        else:
            return F.log_softmax(x, dim=-1)

In [53]:
df_items, genres, genres_mp  = process_movies(df_movies)

df_ratings = pd.read_csv(BASE_PATH/'ratings.csv').drop(labels='timestamp', axis=1)
# df_ratings = transform_ratings(df_ratings)

rated_users, rated_users_dict, num_users, rated_items, rated_items_dict, num_items, ratings = get_nodes(df_ratings)
item_features = get_item_features(df_items, rated_items_dict, sparse=False)
user_features = get_user_features(df_ratings, df_items, genres, genres_mp, rated_users_dict, n=2, sparse=False)
class_values = np.sort(np.unique(ratings))
rating_dict = {r: i for i, r in enumerate(np.sort(np.unique(ratings)).tolist())}

samples = (rated_users, rated_items, ratings)
user_train_idx, item_train_idx, user_test_idx, item_test_idx, train_labels, test_labels = split(samples, rating_dict)

data = train_labels + 1.
data = data.astype(np.float32)
adj_mat = sp.csr_matrix(
    (data, [user_train_idx, item_train_idx]), 
    shape=[num_users, num_items], 
    dtype=np.float32,
)

In [54]:
train_dataset = MovieLensDataset(
    root='data/movie-lens/ml-latest-small/',
    adj_mat=adj_mat,
    links=(user_train_idx, item_train_idx),
    labels=train_labels,
    h=1,
    sample_ratio=1,
    max_nodes_per_hop=200,
    # u_features=None,
    # v_features=None,
    u_features=user_features,
    v_features=item_features,
    class_values=class_values,
)

test_dataset = MovieLensDataset(
    adj_mat=adj_mat,
    links=(user_test_idx, item_test_idx),
    labels=test_labels,
    h=1,
    sample_ratio=1,
    max_nodes_per_hop=200,
    # u_features=None,
    # v_features=None,
    u_features=user_features,
    v_features=item_features,
    class_values=class_values,
)

train_loader = DataLoader(train_dataset, BATCH_SIZE, shuffle=True, num_workers=0)
test_loader = DataLoader(test_dataset, BATCH_SIZE, shuffle=False, num_workers=0)

In [55]:
# model = IGMC()

num_relations = len(class_values)
n_features = user_features.shape[1] + item_features.shape[1]
model = IGMC(
    train_dataset, 
    num_relations=num_relations, 
    num_bases=4, 
    regression=True, 
    adj_dropout=True, 
    force_undirected=True, 
    side_features=False, 
    n_side_features=n_features, 
    multiply_by=1
)

In [56]:
model.to(device)
model.reset_parameters()
optimizer = Adam(model.parameters(), lr=LR, weight_decay=0)

for epoch in range(1, EPOCHS+1):
    model.train()
    train_loss_all = 0
    for idx, train_batch in enumerate(train_loader):
        optimizer.zero_grad()
        train_batch = train_batch.to(device)
        # print(train_batch.x.shape)
        y_pred = model(train_batch)
        y_true = train_batch.y
        train_loss = F.mse_loss(y_pred, y_true)
        if idx % 20 == 0:
            print(round(train_loss.item(), 2), end=' ')
        train_loss.backward()
        train_loss_all += BATCH_SIZE * float(train_loss)
        optimizer.step()
        torch.cuda.empty_cache()
        
    train_loss_all = train_loss_all / len(train_loader.dataset)
    
    print('epoch', epoch,'; train loss', train_loss_all)

    if epoch % LR_DECAY_STEP == 0:
        for param_group in optimizer.param_groups:
            param_group['lr'] = param_group['lr'] / LR_DECAY_VALUE

12.85 2.6 1.59 1.42 1.53 1.46 1.35 1.53 epoch 1 ; train loss 1.999062024448153
1.53 1.36 1.46 1.31 1.4 1.33 1.52 1.46 epoch 2 ; train loss 1.4232135514886246


In [57]:
model.eval()
test_loss = 0
for test_batch in test_loader:
    test_batch = test_batch.to(device)
    with torch.no_grad():
        y_pred = model(test_batch)
    y_true = test_batch.y
    test_loss += F.mse_loss(y_pred, y_true, reduction='sum')
    # torch.cuda.empty_cache()
mse_loss = float(test_loss) / len(test_loader.dataset)

print('test MSE loss', mse_loss)
print('test RMSE loss', math.sqrt(mse_loss))

test MSE loss 6.310193205201309
test RMSE loss 2.5120097940098303
