In [1]:
import csv

In [2]:
def read_reddit():
    interactions = []
    with open('./data/reddit.csv') as csvfile:
        datareader = csv.reader(csvfile)
        for i, row in enumerate(datareader):
            if i==0:
                continue
            interactions.append((row[0], row[1], float(row[2])))
    print("Read interactions")
    return interactions

def read_select(interactions, start=0, end=0.8):
    user_to_item = {}
    item_to_user = {}
    for i, (user_id, item_id, tstamp) in enumerate(interactions):
        if i < start * len(interactions) or i > end * len(interactions):
            continue
        if user_id not in user_to_item:
            user_to_item[user_id] = []
        user_to_item[user_id].append(item_id)

        if item_id not in item_to_user:
            item_to_user[item_id] = []
        item_to_user[item_id].append(item_id)
    print("Generated user_to_item lists")
    return user_to_item, item_to_user

interactions = read_reddit()

Read interactions


In [3]:
### Training time
user_to_item, item_to_user = read_select(interactions, 0, 0.8)
print(len(user_to_item.keys()))
print(len(item_to_user.keys()))

Generated user_to_item lists
9946
981


In [4]:
import networkx as nx
from tqdm import tqdm, trange, tqdm_notebook, tnrange

In [5]:
itemG = nx.Graph()
with tnrange(len(item_to_user.keys())) as item_progressbar:    
    for i, item1 in zip(item_progressbar, item_to_user.keys()):
        item_progressbar.set_description("Processed {}/{} items.".format(i, len(item_to_user.keys())))
        itemG.add_node(item1)
        nbrs = set()
        for user in item_to_user[item1]:
            nbrs.update(user_to_item[user])
        for item2 in nbrs:
            itemG.add_edge(item1, item2)
            itemG.add_edge(item2, item1)
    print("Constructed Item Graph")

HBox(children=(IntProgress(value=0, max=981), HTML(value='')))


Constructed Item Graph


In [6]:
userG = nx.Graph()
with tnrange(len(user_to_item.keys())) as user_progressbar:    
    for i, user1 in zip(user_progressbar, user_to_item.keys()):
        user_progressbar.set_description("Processed {}/{} users.".format(i, len(user_to_item.keys())))
        userG.add_node(user1)
        nbrs = set()
        for item in user_to_item[user1]:
            nbrs.update(item_to_user[item])
        for user2 in nbrs:
            userG.add_edge(user1, user2)
            userG.add_edge(user2, user1)
print("Constructed User Graph")

HBox(children=(IntProgress(value=0, max=9946), HTML(value='')))


Constructed User Graph


Jodie Original Performance

*** Validation performance of epoch 9 ***  
Mean Reciprocal Rank: 0.7058091714668168  
Recall@10: 0.8078816268867574  


*** Test performance of epoch 9 ***  
Mean Reciprocal Rank: 0.7253162947486189  
Recall@10: 0.8497434753513272  


Node2Vec + Vector space transformation  
*** Validation performance of epoch 10 ***  
Mean Reciprocal Rank: 0.007162006871968379  
Recall@10: 0.018335935757305376  


*** Test performance of epoch 10 ***  
Mean Reciprocal Rank: 0.00823016722486954  
Recall@10: 0.02526581902000149  

Node2Vec + Item embedding  
*** Validation performance of epoch 10 ***  
Mean Reciprocal Rank: 0.005439007385994447  
Recall@10: 0.004580266190794854  


*** Test performance of epoch 10 ***  
Mean Reciprocal Rank: 0.005632637876430475  
Recall@10: 0.005219718938211019  


In [7]:
list(userG.nodes())[:10]

['0', '72', '15', '1', '503', '347', '2', '3', '432', '4']

In [8]:
list(itemG.nodes())[:10]

['0', '72', '15', '1', '503', '347', '2', '3', '432', '4']

usage: node2vec  
   -i:Input graph path (default:'graph/karate.edgelist')  
   -o:Output graph path (default:'emb/karate.emb')  
   -d:Number of dimensions. Default is 128 (default:128)  
   -l:Length of walk per source. Default is 80 (default:80)  
   -r:Number of walks per source. Default is 10 (default:10)  
   -k:Context size for optimization. Default is 10 (default:10)  
   -e:Number of epochs in SGD. Default is 1 (default:1)  
   -p:Return hyperparameter. Default is 1 (default:1)  
   -q:Inout hyperparameter. Default is 1 (default:1)  
   -v Verbose output.   
   -dr Graph is directed.   
   -w Graph is weighted.   
   -ow Output random walks instead of embeddings.   

In [9]:
### Use node2vec on item graph
nx.write_edgelist(itemG, '/tmp/v2020_elist_item.edgelist')
!/home/viresh16118/bin/node2vec -i:'/tmp/v2020_elist_item.edgelist' -o:'/home/viresh16118/repos/jodie_orig/node2vec_embedding/items_2.emb'


An algorithmic framework for representational learning on graphs. [Aug 13 2019]
Input graph path (-i:)=/tmp/v2020_elist_item.edgelist
Output graph path (-o:)=/home/viresh16118/repos/jodie_orig/node2vec_embedding/items_2.emb
Number of dimensions. Default is 128 (-d:)=128
Length of walk per source. Default is 80 (-l:)=80
Number of walks per source. Default is 10 (-r:)=10
Context size for optimization. Default is 10 (-k:)=10
Number of epochs in SGD. Default is 1 (-e:)=1
Return hyperparameter. Default is 1 (-p:)=1
Inout hyperparameter. Default is 1 (-q:)=1
Verbose output. (-v)=NO
Graph is directed. (-dr)=NO
Graph is weighted. (-w)=NO
Output random walks instead of embeddings. (-ow)=NO


In [10]:
### Use node2vec on user graph
nx.write_edgelist(userG, '/tmp/v2020_elist_user.edgelist')
!/home/viresh16118/bin/node2vec -i:'/tmp/v2020_elist_user.edgelist' -o:'/home/viresh16118/repos/jodie_orig/node2vec_embedding/users_2.emb'


An algorithmic framework for representational learning on graphs. [Aug 13 2019]
Input graph path (-i:)=/tmp/v2020_elist_user.edgelist
Output graph path (-o:)=/home/viresh16118/repos/jodie_orig/node2vec_embedding/users_2.emb
Number of dimensions. Default is 128 (-d:)=128
Length of walk per source. Default is 80 (-l:)=80
Number of walks per source. Default is 10 (-r:)=10
Context size for optimization. Default is 10 (-k:)=10
Number of epochs in SGD. Default is 1 (-e:)=1
Return hyperparameter. Default is 1 (-p:)=1
Inout hyperparameter. Default is 1 (-q:)=1
Verbose output. (-v)=NO
Graph is directed. (-dr)=NO
Graph is weighted. (-w)=NO
Output random walks instead of embeddings. (-ow)=NO


In [11]:
### load embeddings
from gensim.models import KeyedVectors
user_embeddings = KeyedVectors.load_word2vec_format("./node2vec_embedding/users_2.emb", binary=False)
item_embeddings = KeyedVectors.load_word2vec_format("./node2vec_embedding/items_2.emb", binary=False)
print(len(user_embeddings.vocab), len(item_embeddings.vocab), len(user_embeddings['0']))

9946 981 128


In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class Net(nn.Module):
    def __init__(self, esz, tsz, isz):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(esz + tsz + esz, 50)
        self.fc2 = nn.Linear(50, isz)

    def forward(self, x, t, iminus):
        x = F.relu(self.fc1(torch.cat([x, t, iminus], dim=1)))
        x = self.fc2(x)
        return x

user_embsize = 128
item_embsize = 128
t_size = 1
net = Net(user_embsize, t_size, item_embsize)
print("Constructed network")

Constructed network


In [102]:
urnd = torch.rand(128)
t = torch.rand(1)
irnd = torch.rand(128)
urnd.unsqueeze(0).size()
# net(urnd, t).size()

torch.Size([1, 128])

In [14]:
learning_rate = 1e-4
train_idx = int(0.8 * len(interactions))
batch_size = 200
epochs = 10

loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

running_losses = []
last_interacted_with = {}

with tnrange(epochs) as pbar1:
    for epch in range(epochs):
        pbar1.set_description("Epoch {}/{}".format(epch, epochs))

        running_loss = 0
        batches = np.array_split(interactions, len(interactions)/batch_size)
        with tnrange(len(batches)) as pbar2:
            for i, batch in enumerate(batches):
                pbar2.set_description("Batch {}/{}".format(i, len(batches)))
                # extract from batches
                try:
                    users = torch.Tensor(user_embeddings[batch[:, 0]])
                except:
                    users = torch.Tensor([user_embeddings[k] if k in user_to_item.keys() else np.zeros(128) for k in batch[:,0]])
                
                last_items = [last_interacted_with.get(k, str(len(item_to_user)-1)) for k in batch[:, 0]]
                last_items = torch.Tensor([item_embeddings[k] if k in item_to_user.keys() else np.zeros(128) for k in last_items])
                
                try:
                    items = torch.Tensor(item_embeddings[batch[:, 1]])
                except:
                    items = torch.Tensor([item_embeddings[k] if k in item_to_user.keys() else np.zeros(128) for k in batch[:,0]])
                times = torch.Tensor(batch[:, 2].astype(np.float)).unsqueeze(1)
                
                for u, i in zip(batch[:, 0], batch[:, 1]):
                    last_interacted_with[u] = i

                pred = net(users, times, last_items)

                optimizer.zero_grad()
                loss = loss_fn(pred, items)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()
            print("Average batch loss: {}".format(running_loss / (len(interactions)/ batch_size)))
            running_losses.append(running_loss)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3362), HTML(value='')))

Average batch loss: 10294.443377721102


HBox(children=(IntProgress(value=0, max=3362), HTML(value='')))

Average batch loss: 45.643398868449324


HBox(children=(IntProgress(value=0, max=3362), HTML(value='')))

Average batch loss: 44.853548440332894


HBox(children=(IntProgress(value=0, max=3362), HTML(value='')))

Average batch loss: 44.44859980847469


HBox(children=(IntProgress(value=0, max=3362), HTML(value='')))

Average batch loss: 45.11477737910807


HBox(children=(IntProgress(value=0, max=3362), HTML(value='')))

Average batch loss: 45.64386489955609


HBox(children=(IntProgress(value=0, max=3362), HTML(value='')))

Average batch loss: 44.490684999985504


HBox(children=(IntProgress(value=0, max=3362), HTML(value='')))

Average batch loss: 44.24392561029856


HBox(children=(IntProgress(value=0, max=3362), HTML(value='')))

Average batch loss: 43.75824873707294


HBox(children=(IntProgress(value=0, max=3362), HTML(value='')))

Average batch loss: 45.932748507797626



Compute validation and testing mrr and recall@10

In [18]:
test_idx = 0.9 * len(interactions)
validation_ranks = []
test_ranks = []
last_interacted_with = {}
with tnrange(len(interactions[train_idx:])) as pbar3:
    for i, (user_id, item_id, timestamp) in enumerate(interactions[train_idx:]):
        pbar3.set_description("Interaction {}/{}".format(i, len(interactions) - train_idx))
        # extract from batches
        usere = torch.Tensor(user_embeddings[user_id] if user_id in user_to_item.keys() else np.zeros(128)).unsqueeze(0)
        iteme = torch.Tensor(item_embeddings[item_id] if item_id in item_to_user.keys() else np.zeros(128)).unsqueeze(0)
        times = torch.Tensor([timestamp]).unsqueeze(0)
        lastid = last_interacted_with.get(user_id, str(len(item_to_user)-1))
        laste = torch.Tensor(item_embeddings[lastid] if lastid in item_to_user.keys() else np.zeros(128)).unsqueeze(0)
        
        with torch.no_grad():
            pred = net(usere, times, laste)
        
        last_interacted_with[user_id] = item_id
        
        euclidean_dists = nn.PairwiseDistance()(pred.repeat(len(item_to_user.keys()), 1), torch.Tensor(item_embeddings[sorted(item_to_user.keys())]))
        
        try:
            iindex = sorted(item_to_user.keys()).index(item_id)
        except:
            iindex = None
            
        if iindex is None:
            true_item_dist = np.inf
        else:
            true_item_dist = euclidean_dists[iindex]

            
        eucl_smaller = (euclidean_dists < true_item_dist).numpy()
        item_rank = np.sum(eucl_smaller) + 1
        
        if i < test_idx:
            validation_ranks.append(item_rank)
        else:
            test_ranks.append(item_rank)

HBox(children=(IntProgress(value=0, max=134490), HTML(value='')))




In [196]:
len(interactions)

672447

In [186]:
sorted(item_to_user.keys()).index('2')

112

In [19]:
raw_ranks = validation_ranks.copy()
vnum = int(0.9*len(interactions) - train_idx)
tnum = len(raw_ranks) - vnum

vranks = raw_ranks[:vnum]
tranks = raw_ranks[vnum:]
print(len(vranks), len(tranks))

67245 67245


In [20]:
performance_dict = dict()
ranks = vranks
mrr = np.mean([1.0 / r for r in ranks])
rec10 = sum(np.array(ranks) <= 10)*1.0 / len(ranks)
performance_dict['validation'] = [mrr, rec10]

ranks = tranks
mrr = np.mean([1.0 / r for r in ranks])
rec10 = sum(np.array(ranks) <= 10)*1.0 / len(ranks)
performance_dict['test'] = [mrr, rec10]

metrics = ['Mean Reciprocal Rank', 'Recall@10']
print('\n\n*** Validation performance of epoch {} ***'.format(epochs))
for i in range(len(metrics)):
    print((metrics[i] + ': ' + str(performance_dict['validation'][i])))


print('\n\n*** Test performance of epoch {} ***'.format(epochs))
for i in range(len(metrics)):
    print((metrics[i] + ': ' + str(performance_dict['test'][i])))




*** Validation performance of epoch 10 ***
Mean Reciprocal Rank: 0.005439007385994447
Recall@10: 0.004580266190794854


*** Test performance of epoch 10 ***
Mean Reciprocal Rank: 0.005632637876430475
Recall@10: 0.005219718938211019


In [200]:
len(test_ranks)

0

In [201]:
len(validation_ranks)

134490

In [202]:
test_idx

605202.3