In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.parameter as P
import torch.optim as optim
import numpy as np
import random

In [11]:
use_gpu = False
if torch.cuda.is_available() and use_gpu:
    device = 'cuda'
else:
    device = 'cpu'
print('Using device: ', device)

Using device:  cpu


In [12]:
class Dataset(torch.utils.data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, data):
        'Initialization'
        self.data = data

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.data)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        data_item = self.data[index]

        # Load data and get label
        X = data_item[:2]
        X = torch.tensor(X).to(device)
        y = data_item[2]
        y = torch.tensor(y).to(device).float()

        return X, y

In [15]:
userIds = set()
movieIds = set()
triplets = []

# Parameters
params = {'batch_size': 64,
          'shuffle': True,
         'drop_last': True}

file1 = open('../../ml-100k/u.data', 'r')
for line in file1.readlines():
    uid, mid, rating, timestamp = line.split('	')
    userIds.add(int(uid))
    movieIds.add(int(mid))
    triplets.append([uid,mid,rating])
    
random.shuffle(triplets)

triplets = np.array(triplets, dtype='int')

train_val_split = int(len(triplets)*0.95)
train_triplets = triplets[:train_val_split]
val_triplets = triplets[train_val_split:]

test_split = int(len(val_triplets)/2)

test_triplets = val_triplets[:test_split]
val_triplets = val_triplets[test_split:]


training_set = Dataset(train_triplets)
val_set = Dataset(val_triplets)
test_set = Dataset(test_triplets)

train_gen = torch.utils.data.DataLoader(training_set, **params)
val_gen = torch.utils.data.DataLoader(val_set, **params)
test_gen = torch.utils.data.DataLoader(test_set, **params)

numUsers = max(userIds)
numItems = max(movieIds)

In [17]:
def weights_init_uniform(m):
    classname = m.__class__.__name__
    
    # for every Linear layer in a model..
    if classname.find('Linear') != -1:
        weight_range = 4.0 * pow(6, 0.5) / pow(m.in_features + m.out_features, 0.5)
        # apply a paper distribution to the weights and a bias=0
        m.weight.data.uniform_(-1 * weight_range, weight_range)
        m.bias.data.fill_(0)


In [18]:
class FCMF(nn.Module):
    '''
    Base class for Fully-Connected Matrix Factorization networks
    '''

    def __init__ (self, N, M, D, D_, K, layers):
        '''
        variable definitions taken from paper: https://arxiv.org/pdf/1511.06443.pdf
        
        @param N:  Number of users
        @param M:  Number of items
        @param D:  size of latent-feature vectors
        @param D_: num rows in latent-features matrices
        @param K:  num cols in latent-feature matrices
        
        @param layers: list of hidden layer sizes; does not include input or output
        '''
        
        assert (min(N,M,D,D_,K) > 0), "Params must be nonzero and positive"
        assert (len(layers) > 0),     "Must have nonzero hidden layers"
        
        ########################################################################
        
        super(FCMF, self).__init__()

        
        self.N, self.M, self.D, self.D_, self.K = N, M, D, D_, K

        
        self.userLatentVectors = P.Parameter(torch.rand(N,D, requires_grad=True))
        self.itemLatentVectors = P.Parameter(torch.rand(M,D, requires_grad=True))

        
        self.userLatentMatrices = P.Parameter(torch.rand(N,D_,K, requires_grad=True))
        self.itemLatentMatrices = P.Parameter(torch.rand(M,D_,K, requires_grad=True))

        
        linear_inputs = [2*D + D_] + layers
        linear_outputs = layers + [1]

        self.layers = nn.ModuleList([nn.Linear(i,o) for (i,o) in zip(linear_inputs, linear_outputs)])
        #Initialize weights as specified in paper
        self.apply(weights_init_uniform)
        

        
    def forward(self, x):
        '''
        @param x: let this be a tensor of size (X, 2): (user index, item index)
        
        WARNING: 
            - forward currently does not account for user/items outside of training data
            - mitigations include returning smart averages    
        '''   
        #Get 100 user and item indices
        userIndices, itemIndices = x[:,0].long(), x[:,1].long()
        
        
        #Select the 10 dimensional rows for each user and item        
        userLatMats = self.userLatentMatrices[userIndices]
        itemLatMats = self.itemLatentMatrices[itemIndices]

        #Take the product of these and sum it to get the feature U'n,1 * V'm,1 + ... + U'n,D * V'm,D
        latentDotProducts = torch.sum(userLatMats * itemLatMats, dim=-1)
        
        x = torch.cat([
            #D
            self.userLatentVectors[userIndices],
            #D
            self.itemLatentVectors[itemIndices],
            #D'
            latentDotProducts
        ], dim=1)
        
        

        
        for l in self.layers[:-1]:
            x = torch.sigmoid(l(x))
        
        # TODO: should last layer go through a sigmoid? NO!
        return self.layers[-1](x)
    
    def gradAll(self):
        self._setGrads(True, True, True, True, True)
    
    def gradNetwork(self):
        self._setGrads(False, False, False, False, True)

    def gradLatent(self):
        self._setGrads(True, True, True, True, False)

    def _setGrads(self, userVec, itemVec, userMat, itemMat, net):
        self.userLatentVectors.requires_grad = userVec
        self.itemLatentVectors.requires_grad = itemVec
        
        self.userLatentMatrices.requires_grad = userMat
        self.itemLatentMatrices.requires_grad = itemMat
        
        self.layers.requires_grad = net
        
        
        

        

In [21]:
def getBatches(X, Y, usersPerBatch=100):
    '''
    batchSize = min(N - start, usersPerBatch) * M
    '''
    N = X.shape[0]
    
    start = 0
    while start < N:

        if start+usersPerBatch + 1 < N:
            batch_x = torch.tensor(X[start:start+usersPerBatch]).to(device)
            batch_y = torch.tensor(Y[start:start+usersPerBatch]).to(device).float()
            start += usersPerBatch
            yield (batch_x, batch_y)

        else:
            batch_x = torch.tensor(X[start:]).to(device)
            batch_y = torch.tensor(Y[start:]).to(device).float()
            start += usersPerBatch
            yield (batch_x, batch_y)
            
def trainEpoch(opt, criterion, model):
    model.train()
    opt.zero_grad()
    totalLoss = 0

    for batch_x, batch_y in train_gen:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        optimizer.zero_grad()
        pred_y = model(batch_x)
        #RMSE in order to compare to paper
        loss = pow(criterion(batch_y, pred_y.flatten()), 0.5)
        totalLoss += loss
        loss.backward()
        optimizer.step()

    return totalLoss/len(train_gen)

def evaluate(criterion, model):
    model.eval()
    loss = 0
    accuracy = 0
    for batch_x, batch_y in val_gen:
        pred_y = model(batch_x)
        loss += pow(criterion(batch_y, pred_y.flatten()), 0.5)
        matching = (torch.round(pred_y.detach()).flatten() == batch_y.flatten()).type(torch.uint8).sum()
        matching = matching
        accuracy += matching
    return loss/len(val_gen), accuracy.item()/len(val_set)


def calculate_test_loss(criterion, model):
    model.eval()
    loss = 0
    accuracy = 0
    for batch_x, batch_y in test_gen:
        pred_y = model(batch_x)
        loss += pow(criterion(batch_y, pred_y.flatten()), 0.5)
        matching = (torch.round(pred_y.detach()).flatten() == batch_y.flatten()).type(torch.uint8).sum()
        matching = matching
        accuracy += matching
    return loss/len(test_gen)


In [26]:
# Paper uses RMSE as objective and RMSProp optimizer


test_losses = []
Ks = []
K = 1
while K <= 50:
    fc3 = FCMF(numUsers+1, numItems+1 ,10,60,K,[50, 50, 50]).to(device)
    criterion = nn.MSELoss() 
    optimizer = optim.RMSprop(fc3.parameters(), lr=0.001)
    epochs = 0
    max_epochs = 25
    best_test_loss = 10000
    while epochs < max_epochs:
        fc3.gradAll()
        fc3.gradLatent()
        loss = trainEpoch(optimizer, criterion, fc3)
        fc3.gradNetwork()
        loss += trainEpoch(optimizer, criterion, fc3)
        val_loss, val_acc = evaluate(criterion, fc3)
        test_loss = calculate_test_loss(criterion, fc3)
        if epochs % 5 == 0:
            print("Epoch {} Train Loss: {} Val Loss: {} Test Loss: {}".format(epochs,loss, val_loss, test_loss))
        if test_loss < best_test_loss:
            best_test_loss = test_loss
        epochs+=1
    test_losses.append(best_test_loss)
    Ks.append(K)
    print("K: {} Best Test Loss: {}".format(K, best_test_loss))
    if K == 1:
        K = 5
    else:
        K += 5



Epoch 0 Train Loss: 1.9696478843688965 Val Loss: 0.9828413724899292 Test Loss: 0.9759077429771423
Epoch 5 Train Loss: 1.6964131593704224 Val Loss: 0.9713844060897827 Test Loss: 0.9574007987976074
Epoch 10 Train Loss: 1.4302937984466553 Val Loss: 1.0696138143539429 Test Loss: 1.0417087078094482
Epoch 15 Train Loss: 1.171729564666748 Val Loss: 1.0844494104385376 Test Loss: 1.0602811574935913
Epoch 20 Train Loss: 0.9651563167572021 Val Loss: 1.1333884000778198 Test Loss: 1.109114170074463
K: 1 Best Test Loss: 0.9399153590202332
Epoch 0 Train Loss: 1.9267122745513916 Val Loss: 0.9671722650527954 Test Loss: 0.9557816982269287
Epoch 5 Train Loss: 1.3767738342285156 Val Loss: 1.038909673690796 Test Loss: 1.0429307222366333
Epoch 10 Train Loss: 0.8900201320648193 Val Loss: 1.1045626401901245 Test Loss: 1.116472601890564
Epoch 15 Train Loss: 0.5983258485794067 Val Loss: 1.1232383251190186 Test Loss: 1.1369060277938843
Epoch 20 Train Loss: 0.462705135345459 Val Loss: 1.1294316053390503 Test Loss

In [43]:
f = open("K_sweep_5-50", "w")
for l,k in zip(test_losses, Ks):
    f.write("{},{}\n".format(k,l))
f.close()