In [1]:
import torch
import scipy
import numpy as np
from torch.utils.data import Dataset, DataLoader
import random
from tqdm.notebook import tqdm, trange
import torch.nn
from scipy.sparse import coo_matrix
from sklearn.model_selection import train_test_split

train_path = "../train.csv"
n_factors = 64
learning_rate = 0.0005
wd =  0.0005
batch_size = 128
n_epochs = 8
use_BPR = True
n_ng = 4

In [2]:
class RatingsDataset(Dataset):
    def __init__(self, ratings_list, ratings_set, n_users = 4454, n_items = 3260, n_ng = 4, is_training=None):
        """
        Ratings is a scipy sparse matrix, list of keys (each of which have val of 1)
        """
        self.ratings_list = ratings_list
        if is_training:
            self.ratings_list = ratings_list * n_ng
        self.ratings_set = ratings_set
        self.n_users = n_users
        self.n_items = n_items
        self.n_ng = n_ng
        self.is_training = is_training
        self.ratings_with_ng = []
        if self.is_training:
            self.get_ng()
            
            
    def __len__(self):
        if self.is_training:
            return len(self.ratings_with_ng)
        else:
            return len(self.ratings_list)
    
    def get_ng(self):
        for user, pos_item in self.ratings_list:
            for _ in range(n_ng):
                neg_item = np.random.randint(self.n_items)
                while (user, neg_item) in self.ratings_set:
                    neg_item = np.random.randint(self.n_items)
                self.ratings_with_ng.append((user, pos_item, neg_item))
        
    
    def __getitem__(self, idx):
        if self.is_training:
            return self.ratings_with_ng[idx]
        return self.ratings_list[idx]

In [3]:
class MF(torch.nn.Module):## Model for task 1
    def __init__(self, n_factors = 16, n_users = 4454, n_items = 3260):
        super(MF, self).__init__()
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        
    def forward(self, user, item):
        return (self.user_factors(user) * self.item_factors(item)).sum(1)

In [4]:
ratings_list = []
n_users = 0
n_items = 0
with open(train_path) as f:
    for no, line in enumerate(f):
        if no == 0: continue
        line = line.split(",")
        id = int(line[0])
        items = line[1].split()
        for item in items:
            ratings_list.append((id, int(item)))
            if id > n_users:
                n_users  = id
            if int(item) > n_items:
                n_items = int(item)


n_users += 1
n_items += 1

ratings_set = set(ratings_list)
            

In [5]:
train_ratings_list, val_ratings_list =  train_test_split(ratings_list, test_size=0.1)

print(len(train_ratings_list))
print(len(val_ratings_list))

train_dataset = RatingsDataset(train_ratings_list, ratings_set, n_ng = n_ng, is_training = True)
val_dataset = RatingsDataset(val_ratings_list, ratings_set, is_training = False)

print(len(train_dataset))
print(len(val_dataset))

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=0)

278169
30908
4450704
30908


In [None]:
model = MF()
if not use_BPR:
    criterion = torch.nn.BCEWithLogitsLoss()
else:
    criterion = None
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay= wd)


best_loss = 100

t = trange(n_epochs, desc='{} / {}: {}'.format(0, n_epochs, 1), leave=True)
for e in t:
    model.train()
    
    if not use_BPR:
        for batch_num, batch in enumerate(train_loader):
            user, pos_item, neg_item = batch
            
            
            pos_label = torch.ones(pos_item.shape)
            neg_label = torch.zeros(neg_item.shape)

            optimizer.zero_grad() 
            pos_pred = model(user, pos_item)
            loss = criterion(pos_pred, pos_label)
#             pos_losses.append(loss)
            loss.backward()
            optimizer.step()

            optimizer.zero_grad() 
            neg_pred = model(user, neg_item)
            loss = criterion(neg_pred, neg_label)
#             neg_losses.append(loss)
            loss.backward()
            optimizer.step()
    else:
        for batch_num, batch in enumerate(train_loader):
            user, pos_item, neg_item = batch
            pos_label = torch.ones(pos_item.shape)
            neg_label = torch.zeros(neg_item.shape)
            
            optimizer.zero_grad()
            pos_pred = model(user, pos_item)
            neg_pred = model(user, neg_item)
            
            loss = -(pos_pred-neg_pred).sigmoid().log().sum()
#             pos_losses.append(loss)
            loss.backward()
            optimizer.step()
            
    val_acc_sum = 0
    val_acc_cnt = 0
    val_loss_sum = 0
    model.eval()
    for batch_num, batch in enumerate(val_loader):
        user, pos_item = batch
        neg_item = np.random.randint(n_items)
        while (user, neg_item) in ratings_set:
            neg_item = np.random.randint(n_items)
        pos_pred = model(user, pos_item)
        neg_pred = model(user, torch.tensor(neg_item))
        val_loss_sum += -(pos_pred-neg_pred).sigmoid().log().sum()
        val_acc_cnt += user.shape[0]
    loss = val_loss_sum.item() / val_acc_cnt
    print("val loss: {:.4f}".format(loss))
    if loss < best_loss:
        best_loss = loss
        torch.save(model.state_dict(), "model-{}.pt".format(n_factors))

HBox(children=(FloatProgress(value=0.0, description='0 / 8: 1', max=8.0, style=ProgressStyle(description_width…

val loss: 0.7965


In [None]:
print(best_loss)
model.load_state_dict(torch.load("model-{}.pt".format(n_factors)))
model.eval()

answer = [[] for i in range(n_users)]

for user in tqdm(range(n_users)):
    model_items = [item for item in range(n_items) if (user, item) not in ratings_set]
    model_users = [user] * len(model_items)
    
    preds = model(torch.tensor(model_users), torch.tensor(model_items))
    indices = torch.topk(preds, 50, sorted=True).indices.tolist()
    answer[user] = [model_items[i] for i in indices]

print(answer[user])

In [None]:
with open("submission.csv", 'w') as f:
        f.write('UserId,ItemId\n')
        for user in range(len(answer)):
            f.write('{},{}\n'.format(str(user), ' '.join(str(a) for a in answer[user])))