In [1]:
import torch
import torch.nn as nn
from torch.nn import init
from torch.autograd import Variable
import pickle
import numpy as np
import time
import random
from collections import defaultdict
from socialrec.UV_Encoders import UV_Encoder
from socialrec.UV_Aggregators import UV_Aggregator
from socialrec.Social_Encoders import Social_Encoder
from socialrec.Social_Aggregators import Social_Aggregator
import torch.nn.functional as F
import torch.utils.data
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt
import datetime
import argparse
import os
import sqlite3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  LabelEncoder

In [2]:
torch.__version__

'1.6.0'

# Reference

GraphRec: Graph Neural Networks for Social Recommendation. 
Wenqi Fan, Yao Ma, Qing Li, Yuan He, Eric Zhao, Jiliang Tang, and Dawei Yin. 
In Proceedings of the 28th International Conference on World Wide Web (WWW), 2019. Preprint[https://arxiv.org/abs/1902.07243]


In [3]:
# Graph Rec

In [4]:
class GraphRec(nn.Module):

    def __init__(self, enc_u, enc_v_history, r2e):
        super(GraphRec, self).__init__()
        self.enc_u = enc_u
        self.enc_v_history = enc_v_history
        self.embed_dim = enc_u.embed_dim

        self.w_ur1 = nn.Linear(self.embed_dim, self.embed_dim)
        self.w_ur2 = nn.Linear(self.embed_dim, self.embed_dim)
        self.w_vr1 = nn.Linear(self.embed_dim, self.embed_dim)
        self.w_vr2 = nn.Linear(self.embed_dim, self.embed_dim)
        self.w_uv1 = nn.Linear(self.embed_dim * 2, self.embed_dim)
        self.w_uv2 = nn.Linear(self.embed_dim, 16)
        self.w_uv3 = nn.Linear(16, 1)
        self.r2e = r2e
        self.bn1 = nn.BatchNorm1d(self.embed_dim, momentum=0.5)
        self.bn2 = nn.BatchNorm1d(self.embed_dim, momentum=0.5)
        self.bn3 = nn.BatchNorm1d(self.embed_dim, momentum=0.5)
        self.bn4 = nn.BatchNorm1d(16, momentum=0.5)
        self.criterion = nn.MSELoss()

    def forward(self, nodes_u, nodes_v):
        embeds_u = self.enc_u(nodes_u)
        embeds_v = self.enc_v_history(nodes_v)

        x_u = F.relu(self.bn1(self.w_ur1(embeds_u)))
        x_u = F.dropout(x_u, training=self.training)
        x_u = self.w_ur2(x_u)
        x_v = F.relu(self.bn2(self.w_vr1(embeds_v)))
        x_v = F.dropout(x_v, training=self.training)
        x_v = self.w_vr2(x_v)

        x_uv = torch.cat((x_u, x_v), 1)
        x = F.relu(self.bn3(self.w_uv1(x_uv)))
        x = F.dropout(x, training=self.training)
        x = F.relu(self.bn4(self.w_uv2(x)))
        x = F.dropout(x, training=self.training)
        scores = self.w_uv3(x)
        return scores.squeeze()

    def loss(self, nodes_u, nodes_v, labels_list):
        scores = self.forward(nodes_u, nodes_v)
        return self.criterion(scores, labels_list)


In [5]:
# Train Loop

In [6]:

def train(model, device, train_loader, optimizer, epoch, best_rmse, best_mae):
    model.train()
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        batch_nodes_u, batch_nodes_v, labels_list = data
        optimizer.zero_grad()
        loss = model.loss(batch_nodes_u.to(device), batch_nodes_v.to(device), labels_list.to(device))
        loss.backward(retain_graph=True)
        optimizer.step()
        running_loss += loss.item()
        if i % 100 == 0:
            print('[%d, %5d] loss: %.3f, The best rmse/mae: %.6f / %.6f' % (
                epoch, i, running_loss / 100, best_rmse, best_mae))
            running_loss = 0.0
    return 0


def test(model, device, test_loader):
    model.eval()
    tmp_pred = []
    target = []
    with torch.no_grad():
        for test_u, test_v, tmp_target in test_loader:
            test_u, test_v, tmp_target = test_u.to(device), test_v.to(device), tmp_target.to(device)
            val_output = model.forward(test_u, test_v)
            tmp_pred.append(list(val_output.data.cpu().numpy()))
            target.append(list(tmp_target.data.cpu().numpy()))
    tmp_pred = np.array(sum(tmp_pred, []))
    target = np.array(sum(target, []))
    expected_rmse = sqrt(mean_squared_error(tmp_pred, target))
    mae = mean_absolute_error(tmp_pred, target)
    return expected_rmse, mae

In [7]:
# Params
batch_size=128
embed_dim=64
lr=0.001
test_batch_size=1000
epochs=100

In [8]:
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
use_cuda = False
if torch.cuda.is_available():
    use_cuda = True
device = torch.device("cuda" if use_cuda else "cpu")

In [9]:
user_encoder = LabelEncoder()
venue_encoder = LabelEncoder()

In [10]:
#Data

In [11]:
conn = sqlite3.connect('/home/vikash/scb-test/fsdata.db/fsdata.db')


In [12]:
checkins=pd.read_sql_query("select user_id,venue_id from checkins",conn)

In [13]:
max_checkins=checkins.groupby(['user_id']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)

In [14]:
top_10_checkins=list(max_checkins[:10].user_id)

In [15]:
top_10_checkins

[1348362,
 1900906,
 1326476,
 386648,
 1365850,
 467043,
 651415,
 304865,
 8622,
 439413]

In [16]:
socialgraph=pd.read_sql_query("select first_user_id,second_user_id from socialgraph  where first_user_id>1 and second_user_id>1", conn)

In [17]:
socialgraph

Unnamed: 0,first_user_id,second_user_id
0,2,38
1,38,2
2,2,39
3,39,2
4,2,40
...,...,...
27098409,456244,97074
27098410,97074,186390
27098411,186390,97074
27098412,97074,143776


In [18]:
max_checkins_friends=socialgraph[socialgraph['first_user_id'].isin(top_10_checkins)]

In [19]:
friends=max_checkins_friends['second_user_id']

In [20]:
total_users=np.union1d(friends, top_10_checkins)

In [21]:
#social_adj_lists_orig=socialgraph.groupby('first_user_id')['second_user_id'].apply(list).to_dict()

In [22]:
#ratings

In [23]:
ratings=pd.read_sql_query("select user_id,venue_id,rating from ratings", conn)

In [24]:
ratings

Unnamed: 0,user_id,venue_id,rating
0,1,1,5
1,1,51,4
2,1,51,2
3,1,51,5
4,1,52,5
...,...,...,...
2809575,2153498,91385,2
2809576,2153499,783,2
2809577,2153500,91385,2
2809578,2153501,68691,2


In [25]:
ratings_filtered=ratings[ratings['user_id'].isin(total_users)]

In [26]:
user_encoder.fit(total_users)
venue_encoder.fit(ratings_filtered.venue_id)

LabelEncoder()

In [27]:
user_encoder

LabelEncoder()

In [28]:
ratings_filtered['user_id']=user_encoder.transform(ratings_filtered['user_id'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_filtered['user_id']=user_encoder.transform(ratings_filtered['user_id'])


In [29]:
ratings_filtered['venue_id']=venue_encoder.transform(ratings_filtered['venue_id'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_filtered['venue_id']=venue_encoder.transform(ratings_filtered['venue_id'])


In [54]:
ratings_filtered.user_id.unique()

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  23,  24,  25,  26,  27,
        28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,
        42,  43,  44,  45,  46,  47,  48,  50,  51,  52,  53,  54,  55,
        56,  57,  58,  59,  60,  61,  63,  64,  65,  66,  67,  68,  69,
        70,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
        84,  85,  86,  87,  88,  89,  90,  93,  94,  95,  97,  98,  99,
       100, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
       114, 115, 116, 117, 119, 121, 123, 124, 125, 127, 128, 129, 131,
       132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 145,
       146, 147, 149, 150, 153, 155, 156, 158, 161, 163, 164, 165, 167,
       168, 169, 170, 171, 172, 173, 174, 180, 183, 184, 186, 187, 188,
       189, 190, 191, 192, 193, 194, 195, 198, 199, 200, 201, 202, 204,
       205, 206, 208, 212, 216, 217, 218, 219, 220, 221, 222, 22

In [31]:
#social graph
max_checkins_friends

Unnamed: 0,first_user_id,second_user_id
17993,8622,38
121117,8622,47
208795,8622,50
296083,8622,56
395857,8622,58
...,...,...
26624499,439413,13372
26624677,439413,13372
26624802,439413,13372
26624972,439413,13372


In [32]:
max_checkins_friends['first_user_id']=user_encoder.transform(max_checkins_friends['first_user_id'])
max_checkins_friends['second_user_id']=user_encoder.transform(max_checkins_friends['second_user_id'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  max_checkins_friends['first_user_id']=user_encoder.transform(max_checkins_friends['first_user_id'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  max_checkins_friends['second_user_id']=user_encoder.transform(max_checkins_friends['second_user_id'])


In [33]:
social_adj_lists = max_checkins_friends.groupby('first_user_id')['second_user_id'].apply(list).to_dict()

In [34]:
# history_u_lists, history_ur_lists:  user's ratings history (item set in training set), and his/her rating score (dict)

In [35]:
history_u_lists=ratings_filtered.groupby('user_id')['venue_id'].apply(list).to_dict()

In [36]:
history_ur_lists=ratings_filtered.groupby('user_id')['rating'].apply(list).to_dict()

In [37]:
# history_v_lists, history_vr_lists:  user set (in training set) who have interacted with the item, and rating score (dict)

In [38]:
history_v_lists=ratings_filtered.groupby('venue_id')['user_id'].apply(list).to_dict()

In [39]:
history_vr_lists=ratings_filtered.groupby('venue_id')['rating'].apply(list).to_dict()

In [40]:
# rating unique values

In [41]:
ratings_filtered.rating.unique()

array([4, 3, 5, 2])

In [42]:
ratings_list = {
    2:0,3:1,4:2,5:3
}

In [43]:
ratings_list

{2: 0, 3: 1, 4: 2, 5: 3}

In [44]:
#train test split

In [45]:
X_train, X_test = train_test_split(ratings_filtered, test_size=0.20, random_state=42)

In [46]:
train_u, train_v, train_r = list(X_train.user_id),list(X_train.venue_id),list(X_train.rating)

In [47]:
test_u, test_v, test_r = list(X_test.user_id),list(X_test.venue_id),list(X_test.rating)

In [48]:
#data loader

In [49]:
trainset = torch.utils.data.TensorDataset(torch.LongTensor(train_u), torch.LongTensor(train_v),
                                              torch.FloatTensor(train_r))
testset = torch.utils.data.TensorDataset(torch.LongTensor(test_u), torch.LongTensor(test_v),
                                             torch.FloatTensor(test_r))
train_loader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(testset, batch_size=test_batch_size, shuffle=True)
num_users = history_u_lists.__len__()
num_items = history_v_lists.__len__()
num_ratings = ratings_list.__len__()

In [50]:
#embeddings

In [51]:
u2e = nn.Embedding(num_users, embed_dim).to(device)
v2e = nn.Embedding(num_items, embed_dim).to(device)
r2e = nn.Embedding(num_ratings, embed_dim).to(device)

In [52]:
# user feature
# features: user * rating
agg_u_history = UV_Aggregator(v2e, r2e, u2e, embed_dim, cuda=device, uv=True)
enc_u_history = UV_Encoder(u2e, embed_dim, history_u_lists, history_ur_lists, agg_u_history, cuda=device, uv=True)
# neighobrs
agg_u_social = Social_Aggregator(lambda nodes: enc_u_history(nodes).t(), u2e, embed_dim, cuda=device)
enc_u = Social_Encoder(lambda nodes: enc_u_history(nodes).t(), embed_dim, social_adj_lists, agg_u_social,
                       base_model=enc_u_history, cuda=device)

# item feature: user * rating
agg_v_history = UV_Aggregator(v2e, r2e, u2e, embed_dim, cuda=device, uv=False)
enc_v_history = UV_Encoder(v2e, embed_dim, history_v_lists, history_vr_lists, agg_v_history, cuda=device, uv=False)

In [53]:
# model
graphrec = GraphRec(enc_u, enc_v_history, r2e).to(device)
optimizer = torch.optim.RMSprop(graphrec.parameters(), lr=lr, alpha=0.9)

best_rmse = 9999.0
best_mae = 9999.0
endure_count = 0

for epoch in range(1, epochs + 1):

    train(graphrec, device, train_loader, optimizer, epoch, best_rmse, best_mae)
    expected_rmse, mae = test(graphrec, device, test_loader)
    # please add the validation set to tune the hyper-parameters based on your datasets.

    # early stopping (no validation set in toy dataset)
    if best_rmse > expected_rmse:
        best_rmse = expected_rmse
        best_mae = mae
        endure_count = 0
    else:
        endure_count += 1
    print("rmse: %.4f, mae:%.4f " % (expected_rmse, mae))

    if endure_count > 5:
        break


IndexError: index 4 is out of bounds for dimension 0 with size 4