In [1]:
import torch
import pandas as pd
import numpy as np
import argparse
import random

In [2]:
df_item2id = pd.read_csv('data/movie/item_index2entity_id.txt', sep='\t', header=None, names=['item','id'])
df_kg = pd.read_csv('data/movie/kg.txt', sep='\t', header=None, names=['head','relation','tail'])
df_rating = pd.read_csv('data/movie/ratings.csv')

In [3]:
parser = argparse.ArgumentParser()

parser.add_argument('--dataset', type=str, default='movie', help='which dataset to use')
parser.add_argument('--aggregator', type=str, default='sum', help='which aggregator to use')
parser.add_argument('--n_epochs', type=int, default=10, help='the number of epochs')
parser.add_argument('--neighbor_sample_size', type=int, default=4, help='the number of neighbors to be sampled')
parser.add_argument('--dim', type=int, default=32, help='dimension of user and entity embeddings')
parser.add_argument('--n_iter', type=int, default=2, help='number of iterations when computing entity representation')
parser.add_argument('--batch_size', type=int, default=65536, help='batch size')
parser.add_argument('--l2_weight', type=float, default=1e-7, help='weight of l2 regularization')
parser.add_argument('--lr', type=float, default=2e-2, help='learning rate')
parser.add_argument('--ratio', type=float, default=1, help='size of training dataset')

args = parser.parse_args(['--lr', '2e-2'])

In [89]:
class KGCN(torch.nn.Module):
    def __init__(self, num_user, num_ent, num_rel, kg, args):
        super(KGCN, self).__init__()
        self.usr = torch.nn.Embedding(num_user, args.dim)
        self.ent = torch.nn.Embedding(num_ent, args.dim)
        self.rel = torch.nn.Embedding(num_rel, args.dim)
        self.kg = kg
        if args.aggregator == 'concat':
            self.agg_weight = torch.nn.Linear(2 * args.dim, args.dim, bias=True)
        else:
            self.agg_weight = torch.nn.Linear(args.dim, args.dim, bias=True)        
        self.aggregator = args.aggregator
        self.n_iter = args.n_iter
        self.neighbor_sample_size = args.neighbor_sample_size
        
    def forward(self, u, v):
        '''
        u, v as an index (integer)
        vector operation should be executed with embedding ex) self.usr[u]
        '''
        m = self._get_receptive(v)
        e_u_dict = {x: [self.ent[x] if i == 0 else None for i in range(self.n_iter+1)] for x in m[0]}
        # e_u_dict { 3: [[1,2,3,4,5], None, None]}
        for h in range(1, self.n_iter+1):
            for e in m[h]:
                e_u_neighbor = self._message_passing(u, e)
                e_u_dict[e][h] = self._aggregate(e_u_neighbor, e_u_dict[e][h-1])
        v_u = e_u_dict[v][self.n_iter]
        return torch.dot(self.usr[u], v_u)
    
    def _get_receptive(self, v):
        '''
        get receptive field inwardly
        '''
        m = [[] for _ in range(self.n_iter+1)]
        m[self.n_iter].append(v)
        for h in range(self.n_iter-1, -1, -1): # from H-1 to 0
            m[h] = m[h+1]
            for e in m[h+1]:
                # TODO
                # need to set operation to remove duplication???
                m[h].extend(self._get_neighbors(e))
        return m
    
    def _get_neighbors(self, e):
        '''
        return neighbors and relations
        '''
        if len(self.kg[e]) >= self.neighbor_sample_size:
            return [entity for relation, entity in random.sample(self.kg[e], self.neighbor_sample_size)]
        else:
            return [entity for relation, entity in random.choices(self.kg[e], self.neighbor_sample_size)]
        
    def _message_passing(self, u, e):
        # array of (relation, entity_tail)
        neighbors = self._get_neighbors(e)
        relations = [relation for relation, entity in neighbors]
        weights = self._get_weight(u, relations)
        return sum([weight * self.ent[entity] for (relation, entity), weight in zip(neighbors, weights)])
    
    def _get_weight(self, u, relations):
        pi_u_r = [torch.dot(self.usr[u], self.rel[r]) for r in relations]
        pi_u_r = F.softmax(pi_u_r)
        return pi_u_r
    
    def _aggregate(self, v, v_u):
        '''
        Return v^u vector after aggregate v and v^u (sampled)
        Equation 4,5,6 in the paper
        '''
        if self.aggregator == 'sum':
            v_u = self.agg_weight(v + v_u)
        elif self.aggregator == 'concat':
            v_u = self.agg_weight(torch.cat((v, v_u)))
        else:
            v_u = self.agg_weight(v_u)
        return torch.nn.Relu(v_u)

In [9]:
df_rating_final = df_rating[df_rating['movieId'].isin(df_item2id['item'])]
df_rating_final.reset_index(inplace=True)
df_rating_final

Unnamed: 0,index,userId,movieId,rating,timestamp
0,0,1,2,3.5,1112486027
1,1,1,29,3.5,1112484676
2,2,1,32,3.5,1112484819
3,4,1,50,3.5,1112484580
4,5,1,112,3.5,1094785740
...,...,...,...,...,...
13469146,20000257,138493,68319,4.5,1260209720
13469147,20000259,138493,69526,4.5,1259865108
13469148,20000260,138493,69644,3.0,1260209457
13469149,20000261,138493,70286,5.0,1258126944


In [10]:
from sklearn.preprocessing import LabelEncoder
users = np.array(list(set(df_rating_final['userId'])))
entities = np.array(list(set(df_rating_final['movieId']) | set(df_kg['head']) | set(df_kg['tail'])))
relations = np.array(list(set(df_kg['relation'])))

In [11]:
user_encoder = LabelEncoder()
entity_encoder = LabelEncoder()
relation_encoder = LabelEncoder()

In [12]:
user_encoder.fit(users)
entity_encoder.fit(entities)
relation_encoder.fit(relations)

LabelEncoder()

In [78]:
df_kg['head'] = entity_encoder.transform(df_kg['head'])
df_kg['tail'] = entity_encoder.transform(df_kg['tail'])
df_kg['relation'] = relation_encoder.transform(df_kg['relation'])

In [83]:
kg = dict()
for i in range(len(df_kg)):
    head = df_kg.iloc[i]['head']
    relation = df_kg.iloc[i]['relation']
    tail = df_kg.iloc[i]['tail']
    if head in kg:
        kg[head].append((relation, tail))
    else:
        kg[head] = [(relation, tail)]
    if tail in kg:
        kg[tail].append((relation, head))
    else:
        kg[tail] = [(relation, head)]

In [13]:
# df_rating update
df_dataset = pd.DataFrame()
df_dataset['userId'] = user_encoder.transform(df_rating_final['userId'])
df_dataset['movieId'] = user_encoder.transform(df_rating_final['movieId'])
df_dataset['label'] = df_rating_final['rating'].apply(lambda x: 0 if x < 4.0 else 1)

In [14]:
df_dataset

Unnamed: 0,userId,movieId,label
0,0,1,0
1,0,28,0
2,0,31,0
3,0,49,0
4,0,111,0
...,...,...,...
13469146,138492,68318,1
13469147,138492,69525,1
13469148,138492,69643,0
13469149,138492,70285,1


In [15]:
num_user = len(user_encoder.classes_)
num_entity = len(entity_encoder.classes_)
num_relation = len(relation_encoder.classes_)

In [57]:
from sklearn.model_selection import train_test_split

x, x_test, y, y_test = train_test_split(df_dataset, df_dataset['label'], test_size=0.2,train_size=0.8)

In [58]:
class KGCNDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        user_id = np.array(self.df.iloc[idx]['userId'])
        movie_id = np.array(self.df.iloc[idx]['movieId'])
        label = np.array(self.df.iloc[idx]['label'])
        return user_id, movie_id, label

In [59]:
train_dataset = KGCNDataset(x)
test_dataset = KGCNDataset(x_test)

In [60]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1)

In [91]:
net = KGCN(num_user, num_entity, num_relation, kg, args)

import torch.optim as optim

criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [93]:
for i, (user_id, movie_id, label) in enumerate(train_loader):
    if i > 10:
        break
    loss = 0
    u = user_id.numpy()[0]
    v = movie_id.numpy()[0]
    target = label.numpy()[0]
    print(net(u, v))
    
    

TypeError: 'int' object is not iterable