In [295]:
import torch
import scipy
import numpy as np
from torch.utils.data import Dataset, DataLoader
import random
from tqdm.notebook import tqdm, trange
import torch.nn

# --- Configuration: everything a reader might want to tune lives here. ---
train_path = "../train.csv"  # CSV opened by the data-loading cell
n_users = 4454               # number of user indices (embedding rows / inference loop bound)
n_items = 3260               # number of item indices (embedding rows / negative-sampling range)
n_factors = 16               # embedding width; also part of the checkpoint file name
learning_rate = 0.01         # passed to Adam as `lr`
wd = 0.01                    # passed to Adam as `weight_decay`
batch_size = 128             # DataLoader batch size
n_epochs = 30                # number of passes of the training loop
use_BPR = True               # True: pairwise BPR loss; False: BCE-with-logits on pos/neg separately
n_ng = 4                     # stored on the dataset; not otherwise read in the visible code

# Negative sampling and embedding initialisation are random, so seed every
# generator up front to make a "Restart & Run All" reproducible.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

In [296]:
class RatingsDataset(Dataset):
    """Randomly samples (user, positive item, negative item) triples.

    Args:
        ratings: sequence indexed by user; ``ratings[u]`` is the list of item
            ids user ``u`` has interacted with (the "positives").
        n_users: users are sampled from ``range(n_users)``.
        n_items: negative items are sampled from ``range(n_items)``.
        n_ng: stored as ``self.n_ng``; not used by the sampling logic.
        transform: accepted for interface compatibility; unused.

    ``len(dataset)`` is the total number of positive interactions, which sets
    how many samples make up one epoch. ``dataset[idx]`` ignores ``idx`` and
    draws a fresh random triple on every call.

    A snapshot of each user's positives is taken at construction time, so
    later mutation of ``ratings`` is not reflected in negative rejection.
    """

    def __init__(self, ratings, n_users = 4454, n_items = 3260, n_ng = 4, transform=None):
        self.ratings = ratings
        self.length = sum(len(row) for row in ratings)
        self.n_users = n_users
        self.n_items = n_items
        self.n_ng = n_ng

        # Sets give O(1) membership tests while rejecting sampled negatives.
        self._positives = [set(row) for row in ratings]

        # Only users that can actually yield a triple are sampled: they need at
        # least one positive (otherwise random.choice fails) and at least one
        # in-range item that is NOT a positive (otherwise rejection sampling
        # would never terminate).
        self._eligible_users = [
            user
            for user in range(min(n_users, len(ratings)))
            if ratings[user]
            and sum(1 for item in self._positives[user] if 0 <= item < n_items) < n_items
        ]

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        """Return a random ``(user, pos_item, neg_item)`` triple; ``idx`` is ignored.

        Raises:
            IndexError: if no user has both a positive and an available negative.
        """
        if not self._eligible_users:
            raise IndexError("no user has both a positive item and an unseen item to sample")

        # Use the instance's own bounds, not module-level globals, so the
        # constructor arguments are honoured.
        user = random.choice(self._eligible_users)
        pos_item = random.choice(self.ratings[user])

        seen = self._positives[user]
        neg_item = random.randrange(self.n_items)
        while neg_item in seen:
            neg_item = random.randrange(self.n_items)

        return user, pos_item, neg_item

In [297]:
class MF(torch.nn.Module):
    """Matrix-factorisation model (task 1).

    Holds one embedding table for users and one for items, each of width
    ``n_factors``. The score of a (user, item) pair is the sum over dimension 1
    of the element-wise product of their embeddings.
    """

    def __init__(self, n_factors = 16, n_users = 4454, n_items = 3260):
        super().__init__()
        make_table = torch.nn.Embedding
        # Creation order is kept (users first, then items) so that random
        # initialisation under a fixed seed is unchanged.
        self.user_factors = make_table(n_users, n_factors)
        self.item_factors = make_table(n_items, n_factors)

    def forward(self, user, item):
        """Score each (user, item) index pair.

        ``user`` and ``item`` are index tensors fed to the embedding tables;
        presumably 1-D and of equal length, giving one score per pair.
        """
        user_vecs = self.user_factors(user)
        item_vecs = self.item_factors(item)
        return torch.sum(user_vecs * item_vecs, dim=1)
    

In [299]:
# Read the training CSV into `ratings`, a list with one entry per data row:
# the whitespace-separated item ids from the second comma-separated column.
ratings = []
with open(train_path) as f:
    next(f, None)  # skip the header row
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        fields = line.split(",")
        # Parsed only so a malformed id column fails loudly. The value is not
        # used for placement: rows are appended in file order, so ratings[k]
        # is the k-th data row. Named `user_id` to avoid shadowing builtin id().
        user_id = int(fields[0])
        ratings.append([int(i) for i in fields[1].split()])

# Build the sampler-style dataset and its loader. The sizes are passed
# explicitly from the config cell so they cannot drift from the class defaults.
ratingsDataset = RatingsDataset(ratings, n_users=n_users, n_items=n_items, n_ng=n_ng)
print(len(ratingsDataset))
train_loader = DataLoader(ratingsDataset, batch_size=batch_size,
                        shuffle=True, num_workers=0)

# The model dimensions also come from the config: the checkpoint is later
# saved under a name containing n_factors, so the two must agree.
model = MF(n_factors=n_factors, n_users=n_users, n_items=n_items)
if not use_BPR:
    # Pointwise mode: binary cross-entropy on raw scores (logits).
    criterion = torch.nn.BCEWithLogitsLoss()
else:
    # BPR mode computes its pairwise loss inline in the training loop.
    criterion = None
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay= wd)

# Per-batch loss histories, kept as plain Python floats so the lists do not
# hold on to autograd-tracked tensors and can be plotted directly.
# In BPR mode only `pos_losses` is filled (with the pairwise loss).
pos_losses = []
neg_losses = []

t = trange(n_epochs, desc='{} / {}: {}'.format(0, n_epochs, 1), leave=True)
for e in t:
    model.train()
    for batch_num, batch in enumerate(train_loader):
        user, pos_item, neg_item = batch

        if not use_BPR:
            # Pointwise training: one optimizer step on the positives
            # (target 1), then a second step on the negatives (target 0).
            pos_label = torch.ones(pos_item.shape)
            neg_label = torch.zeros(neg_item.shape)

            optimizer.zero_grad()
            pos_pred = model(user, pos_item)
            loss = criterion(pos_pred, pos_label)
            pos_losses.append(loss.item())
            loss.backward()
            optimizer.step()

            optimizer.zero_grad()
            neg_pred = model(user, neg_item)
            loss = criterion(neg_pred, neg_label)
            neg_losses.append(loss.item())
            loss.backward()
            optimizer.step()

        else:
            # Pairwise BPR loss: -sum(log(sigmoid(score_pos - score_neg))).
            optimizer.zero_grad()
            pos_pred = model(user, pos_item)
            neg_pred = model(user, neg_item)

            # logsigmoid is the numerically stable form of sigmoid().log(),
            # which underflows to log(0) = -inf for large negative differences.
            loss = -torch.nn.functional.logsigmoid(pos_pred - neg_pred).sum()
            pos_losses.append(loss.item())
            loss.backward()
            optimizer.step()

        if batch_num % 100 == 0:
            # Shows the most recent loss (in pointwise mode: the negative-step loss).
            t.set_description('{} / {}: {:.1f}'.format(e, n_epochs, loss.item()))

# Persist the trained weights; the file name encodes the embedding width.
torch.save(model.state_dict(), "model-{}.pt".format(n_factors))


309077


HBox(children=(FloatProgress(value=0.0, description='0 / 30: 1', max=30.0, style=ProgressStyle(description_wid…

KeyboardInterrupt: 

In [None]:
from matplotlib import pyplot as plt

# Plot the per-batch loss histories, negatives first and then positives.
# Each entry is converted with float() so the lists may hold either plain
# numbers or 0-dim tensors (including ones that still track gradients, which
# cannot be converted to NumPy implicitly).
# `neg_losses` is only filled when use_BPR is False, so its figure is empty
# in BPR mode.
for title, history in (
    ("Negative-sample loss per batch", neg_losses),
    ("Positive / BPR loss per batch", pos_losses),
):
    fig, ax = plt.subplots()
    ax.plot([float(value) for value in history])
    ax.set(title=title, xlabel="batch", ylabel="loss")
    plt.show()

In [None]:
# Reload the saved weights and build, for every user, a ranked list of up to
# 50 items the user has not already interacted with.
model.load_state_dict(torch.load("model-{}.pt".format(n_factors)))
model.eval()

answer = [[] for i in range(n_users)]

# Inference only: no_grad avoids building an autograd graph for every user.
with torch.no_grad():
    for user in tqdm(range(n_users)):
        # Set lookup keeps the candidate filter linear in n_items.
        seen_items = set(ratings[user])
        model_items = [item for item in range(n_items) if item not in seen_items]
        if not model_items:
            # Nothing left to recommend; answer[user] stays empty.
            continue
        model_users = [user] * len(model_items)

        preds = model(torch.tensor(model_users), torch.tensor(model_items))
        # topk raises if k exceeds the number of candidates, so clamp it.
        top_k = min(50, len(model_items))
        indices = torch.topk(preds, top_k, sorted=True).indices.tolist()
        # topk indexes into the candidate list; map back to real item ids.
        answer[user] = [model_items[i] for i in indices]

# Sanity check: show the recommendations for the last user processed.
print(answer[user])

In [None]:
# Write the submission file: a header row, then one row per user holding the
# user index and that user's recommended item ids separated by spaces.
with open("submission.csv", 'w') as f:
    f.write('UserId,ItemId\n')
    for user, recommended in enumerate(answer):
        item_field = ' '.join(map(str, recommended))
        f.write('{},{}\n'.format(user, item_field))
