In [8]:
import torch
import scipy
import numpy as np
from torch.utils.data import Dataset, DataLoader
import random
from tqdm.notebook import tqdm, trange
import torch.nn
from scipy.sparse import coo_matrix
from sklearn.model_selection import train_test_split

# Training CSV: a header row, then one "user_id,item item ..." row per line.
train_path = "../train.csv"
# Embedding width handed to the MF model.
n_factors = 512
# Step size and weight decay handed to the optimizer.
learning_rate = 0.01
wd =  0.0001
# Mini-batch size used by both the training and validation DataLoaders.
batch_size = 4096
# Number of passes over the training loader.
n_epochs = 100
# True: pairwise BPR loss; False: pointwise BCEWithLogitsLoss on positives and sampled negatives.
use_BPR = True
# Passed to the training RatingsDataset, which stores it (one negative is drawn per sample).
n_ng = 1

In [9]:
class RatingsDataset(Dataset):
    """Implicit-feedback dataset yielding (user, positive item, negative item) triples.

    Each sample takes one observed (user, item) pair and draws, uniformly at
    random, one item the user has no recorded interaction with.
    """

    # Rejection-sampling attempts before falling back to an exhaustive scan.
    _MAX_REJECTIONS = 100

    def __init__(self, ratings_list, ratings_set, n_users = 4454, n_items = 3260, n_ng = 1, is_training=None):
        """
        Args:
            ratings_list: indexable sequence of (user, item) pairs served by this dataset.
            ratings_set: set of every known (user, item) pair; sampled negatives are
                rejected against it.
            n_users: number of users; stored but not used by the sampling.
            n_items: negatives are drawn from ``range(n_items)``.
            n_ng: stored only; exactly one negative is drawn per sample.
            is_training: stored only; not consulted by this class.
        """
        self.ratings_list = ratings_list
        self.ratings_set = ratings_set
        self.n_users = n_users
        self.n_items = n_items
        self.n_ng = n_ng
        self.is_training = is_training

    def __len__(self):
        """Number of positive (user, item) pairs in this dataset."""
        return len(self.ratings_list)

    def __getitem__(self, idx):
        """Return ``(user, pos_item, neg_item)`` for the ``idx``-th positive pair.

        Raises:
            ValueError: if the user has interacted with every item, so no
                negative exists (an unbounded retry loop would never finish).
        """
        user, pos_item = self.ratings_list[idx]

        # Fast path: rejection sampling almost always succeeds on the first
        # draw when interactions are sparse.
        for _ in range(self._MAX_REJECTIONS):
            neg_item = np.random.randint(self.n_items)
            if (user, neg_item) not in self.ratings_set:
                return user, pos_item, neg_item

        # Slow path for very dense users: enumerate the remaining candidates so
        # the call is guaranteed to terminate.
        candidates = [item for item in range(self.n_items)
                      if (user, item) not in self.ratings_set]
        if not candidates:
            raise ValueError(
                "user {} has interacted with all {} items; no negative item can be sampled".format(
                    user, self.n_items))
        return user, pos_item, candidates[np.random.randint(len(candidates))]
        

In [10]:
class MF(torch.nn.Module):
    """Matrix-factorisation scorer (model for task 1).

    A (user, item) pair is scored by the dot product of the user's and the
    item's embedding vectors.
    """

    def __init__(self, n_factors = 16, n_users = 4454, n_items = 3260):
        """Create both embedding tables and re-initialise them with N(0, 0.01^2)."""
        super().__init__()
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)

        # Small Gaussian init, users first and then items.
        for table in (self.user_factors, self.item_factors):
            torch.nn.init.normal_(table.weight, std=0.01)

        print(n_factors, n_users, n_items)

    def forward(self, user, item):
        """Return one score per (user, item) index pair, reduced over dim 1."""
        user_vecs = self.user_factors(user)
        item_vecs = self.item_factors(item)
        return torch.sum(user_vecs * item_vecs, dim=1)

In [11]:
# Parse the training CSV into (user, item) interaction pairs and derive the
# sizes of the user / item id spaces from the largest ids seen.
ratings_list = []
n_users = 0
n_items = 0
with open(train_path) as f:
    next(f, None)  # skip the header row
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at EOF)
        fields = line.split(",")
        user_id = int(fields[0])
        # Track the user even when its item list is empty, so every user id in
        # the file gets an embedding row and a row in the predictions.
        n_users = max(n_users, user_id)
        items_field = fields[1] if len(fields) > 1 else ""
        for token in items_field.split():
            item_id = int(token)
            ratings_list.append((user_id, item_id))
            n_items = max(n_items, item_id)

# Ids are 0-based, so the counts are the largest id plus one.
n_users += 1
n_items += 1

# Set view of the same pairs for O(1) membership tests during negative sampling.
ratings_set = set(ratings_list)
            

In [12]:
def randomized_split(li, ratio):
    """Shuffle a copy of ``li`` and split it into two lists.

    Args:
        li: sequence of samples; it is not modified.
        ratio: fraction in [0, 1] of the samples that go into the first list.

    Returns:
        ``(train_data, test_data)``: the first ``int(ratio * len(li))`` shuffled
        samples, and the remaining ones.

    Raises:
        ValueError: if ``ratio`` is outside [0, 1].
    """
    if not 0 <= ratio <= 1:
        raise ValueError("ratio must be within [0, 1], got {}".format(ratio))
    data = list(li)  # copy so the caller's list keeps its original order
    random.shuffle(data)
    cut = int(ratio * len(data))  # slice bounds must be integers
    return data[:cut], data[cut:]

# Hold out 10% of the interactions for validation.
train_ratings_list, val_ratings_list =  randomized_split(ratings_list, 0.9)

print(len(train_ratings_list))
print(len(val_ratings_list))

# Both datasets reject negatives against the full interaction set, and both get
# the id-space sizes measured from the data rather than the class defaults.
train_dataset = RatingsDataset(train_ratings_list, ratings_set, n_users = n_users, n_items = n_items, n_ng = n_ng, is_training = True)
val_dataset = RatingsDataset(val_ratings_list, ratings_set, n_users = n_users, n_items = n_items, is_training = False)

print(len(train_dataset))
print(len(val_dataset))

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
# Validation only sums a loss over all samples, so the order does not matter.
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

278169
30908
278169
30908


In [13]:
# Sanity check: show the first ten parsed (user, item) pairs.
print(ratings_list[:10])

[(0, 1938), (0, 490), (0, 128), (0, 1197), (0, 2893), (0, 2983), (0, 1861), (0, 1307), (0, 2547), (0, 2312)]


In [15]:
model = MF(n_factors = n_factors, n_users = n_users, n_items = n_items)
# Pointwise mode trains against binary labels; BPR computes its pairwise loss inline below.
criterion = None if use_BPR else torch.nn.BCEWithLogitsLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay= wd)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=wd)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
print(optimizer)
model.to(device)


# Start from +inf so the first epoch always produces a "best" checkpoint,
# whatever the scale of the loss.
best_loss = float("inf")

t = trange(n_epochs, desc='{} / {}: {}'.format(0, n_epochs, 1), leave=True)
for e in t:
    model.train()

    if not use_BPR:
        # Pointwise mode: one optimizer step on the positives, then one on the
        # sampled negatives, for every batch.
        for batch_num, batch in enumerate(train_loader):
            user, pos_item, neg_item = batch
            user = user.to(device)
            pos_item = pos_item.to(device)
            neg_item = neg_item.to(device)

            optimizer.zero_grad()
            pos_pred = model(user, pos_item)
            # ones_like / zeros_like create the labels on the same device (and
            # with the same dtype) as the logits; CPU labels fail on CUDA.
            loss = criterion(pos_pred, torch.ones_like(pos_pred))
            loss.backward()
            optimizer.step()

            optimizer.zero_grad()
            neg_pred = model(user, neg_item)
            loss = criterion(neg_pred, torch.zeros_like(neg_pred))
            loss.backward()
            optimizer.step()
    else:
        # Pairwise BPR mode: push each positive score above its sampled negative.
        avg_loss = 0
        avg_loss_cnt = 0
        for batch_num, batch in enumerate(train_loader):
            user, pos_item, neg_item = batch
            user = user.to(device)
            pos_item = pos_item.to(device)
            neg_item = neg_item.to(device)

            optimizer.zero_grad()
            pos_pred = model(user, pos_item)
            neg_pred = model(user, neg_item)

            # logsigmoid is the numerically stable form of sigmoid().log(),
            # which underflows to -inf for large negative score differences.
            loss = -torch.nn.functional.logsigmoid(pos_pred - neg_pred).sum()
            loss.backward()
            optimizer.step()

            # Detach so the running total does not keep autograd history alive.
            avg_loss += loss.detach()
            avg_loss_cnt += user.shape[0]

        print("train loss: \t{:.4f}".format(avg_loss.item() / avg_loss_cnt), end = "\t")

    # Validation: per-sample BPR loss on held-out pairs with fresh negatives.
    val_acc_sum = 0
    val_acc_cnt = 0
    val_loss_sum = 0
    model.eval()
    with torch.no_grad():  # no gradients needed; avoids building a graph per batch
        for batch_num, batch in enumerate(val_loader):
            user, pos_item, neg_item = batch
            user = user.to(device)
            pos_item = pos_item.to(device)
            neg_item = neg_item.to(device)
            pos_pred = model(user, pos_item)
            neg_pred = model(user, neg_item)
            val_loss_sum += -torch.nn.functional.logsigmoid(pos_pred - neg_pred).sum()
            val_acc_cnt += user.shape[0]
    loss = val_loss_sum.item() / val_acc_cnt
    print("val loss: \t{:.4f}".format(loss))

    # Keep the best-on-validation weights and, separately, the latest ones.
    if loss < best_loss:
        best_loss = loss
        torch.save(model.state_dict(), "../model/model-best-{}.pt".format(n_factors))
    torch.save(model.state_dict(), "../model/model-last-{}.pt".format(n_factors))

512 4454 3260
cuda
SGD (
Parameter Group 0
    dampening: 0
    lr: 0.01
    momentum: 0
    nesterov: False
    weight_decay: 0.0001
)


HBox(children=(FloatProgress(value=0.0, description='0 / 100: 1', style=ProgressStyle(description_width='initi…

train loss: 	0.6931	val loss: 	0.6931
train loss: 	0.6928	val loss: 	0.6930
train loss: 	0.6921	val loss: 	0.6921
train loss: 	0.6899	val loss: 	0.6883
train loss: 	0.6808	val loss: 	0.6717
train loss: 	0.6477	val loss: 	0.6186
train loss: 	0.5738	val loss: 	0.5327


KeyboardInterrupt: 

In [None]:
print(best_loss)
print(optimizer)
# Load from the directory the training loop saves to ("../model/"), and map the
# weights onto the active device so a CUDA checkpoint also loads on CPU.
model.load_state_dict(torch.load("../model/model-last-{}.pt".format(n_factors), map_location=device))
model.eval()

answer = [[] for i in range(n_users)]

# Number of recommendations to keep per user.
top_k = 50

with torch.no_grad():  # inference only; skip autograd bookkeeping
    for user in tqdm(range(n_users)):
        # Only rank items the user has not interacted with yet.
        model_items = [item for item in range(n_items) if (user, item) not in ratings_set]
        if not model_items:
            continue  # nothing left to recommend; keep the empty list
        model_users = [user] * len(model_items)

        preds = model(torch.tensor(model_users).to(device), torch.tensor(model_items).to(device))
        # topk raises when k exceeds the number of candidates, so clamp it.
        k = min(top_k, len(model_items))
        indices = torch.topk(preds, k, sorted=True).indices.tolist()
        answer[user] = [model_items[i] for i in indices]

print(answer[user])

In [None]:
# Write one "user,item item ..." row per user under a fixed header line.
with open("submission.csv", 'w') as f:
    f.write('UserId,ItemId\n')
    for user, recommended in enumerate(answer):
        f.write('{},{}\n'.format(user, ' '.join(map(str, recommended))))

In [None]:
# Recap: per-sample losses as left by the most recent training-loop iteration,
# followed by the settings used for the run.
train_loss_per_sample = avg_loss.item() / avg_loss_cnt
print("train loss: {:.4f}".format(train_loss_per_sample))
val_loss_per_sample = val_loss_sum.item() / val_acc_cnt
print("val loss: {:.4f}".format(val_loss_per_sample))
for label, value in (
    ("optimizer: ", optimizer),
    ("factors: ", n_factors),
    ("batch size: ", batch_size),
    ("epochs: ", n_epochs),
):
    print(label, value)