In [8]:
import random
import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

item_dict = np.load('/data/private/Arena/prepro_results/item_dict.npy', allow_pickle=True).item()
item_list = np.load('/data/private/Arena/prepro_results/item_list.npy')
keyword_dict = np.load('/data/private/Arena/prepro_results/keyword_dict.npy', allow_pickle=True).item()
keyword_list = np.load('/data/private/Arena/prepro_results/keyword_list.npy')
id2reader = np.load('/data/private/Arena/prepro_results/id2reader.npy')
reader2id = np.load('/data/private/Arena/prepro_results/reader2id.npy', allow_pickle=True).item()
id2writer = np.load('/data/private/Arena/prepro_results/id2writer.npy')
writer2id = np.load('/data/private/Arena/prepro_results/writer2id.npy', allow_pickle=True).item()
item2keywd = np.load('/data/private/Arena/prepro_results/item2keywd.npy', allow_pickle=True).item()
keyword_dict['없음'] = len(keyword_list)
keyword_list = list(keyword_list)
keyword_list.append('없음')

num_keywords = len(keyword_dict)
num_readers = len(id2reader)
num_writers = len(id2writer)
print(num_keywords, num_readers, num_writers)

96892 310758 19065


In [9]:
import ml_metrics as metrics

class GMF(torch.nn.Module):
    def __init__(self, num_users, num_items, num_keywd, latent_dim):
        super(GMF, self).__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.num_keywd = num_keywd
        self.latent_dim = latent_dim

        self.embedding_user = torch.nn.Embedding(num_embeddings=self.num_users, embedding_dim=2 * self.latent_dim)
        self.embedding_item = torch.nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.latent_dim)
        self.embedding_keywd = torch.nn.Embedding(num_embeddings=self.num_keywd, embedding_dim=self.latent_dim)

        self.item4valid = None
        
        self.affine_output = torch.nn.Linear(in_features=self.latent_dim, out_features=1)
        self.logistic = torch.nn.Sigmoid()

    def forward(self, user_indices, item_indices, item_keywd, negs_indices, negs_keywd):
        user_embedding = self.embedding_user(user_indices)
        item_embedding = self.embedding_item(item_indices)
        negs_embedding = self.embedding_item(negs_indices)
        item_kw_embedding = self.embedding_keywd(item_keywd)
        negs_kw_embedding = self.embedding_keywd(negs_keywd)
        
        element_product = torch.mul(user_embedding, torch.cat([item_embedding, item_kw_embedding], 1))
        pos_logits = self.affine_output(element_product)
        
        element_product = torch.mul(user_embedding, torch.cat([negs_embedding, negs_kw_embedding], 1))
        neg_logits = self.affine_output(element_product)
        loss = - self.logistic(pos_logits) + self.logistic(neg_logits)
        
        return torch.sum(loss)
    
    def predict(self, user_indices, top=100):
        user_embedding = self.embedding_user(user_indices)
        item_embedding = self.embedding_item(self.item4valid[:,0])
        keywd_embedding = self.embedding_keywd(self.item4valid[:,1])
        
        element_product = torch.mul(user_embedding, torch.cat([item_embedding, keywd_embedding], 1))
        logits = self.affine_output(element_product)
        print(logits.size())
        _, sorted_indices = torch.sort(logits, 0, descending=True)
        sorted_indices = sorted_indices.squeeze()[:100].cpu().numpy().tolist()
        
        return sorted_indices

    def set_item4valid(self, item4valid):
        self.item4valid = item4valid
        
    def init_weight(self):
        pass

In [10]:
import os
read_path = '/data/private/Arena/datasets/read/'
read_files = os.listdir(read_path)
train_read_files = read_files[:int(len(read_files)*0.8)]
valid_read_files = read_files[int(len(read_files)*0.8):]

In [11]:
train_subs_keywd = []; train_subs = []; train_negs_keywd = []; train_negs = []; train_reader = []
for read_file in tqdm.tqdm_notebook(train_read_files):
    file = open(read_path+read_file, 'r')
    data = file.readlines()
    for line in data:
        line = line.split(' ')
        try:
            train_reader += [reader2id[line[0]]]*len(line[1:-1])
            trian_subs_keywd += [keyword_dict[item2keywd[x][0] if item2keywd[x][0] is True else '없음'] for x in line[1:-1]]
            train_subs += [writer2id[x.split('_')[0]] for x in line[1:-1]]
            train_negs_keywd += random.sample(list(set(range(num_keywords))-set(subs_keywd)), len(subs_keywd))
            train_negs += random.sample(list(set(range(num_writers))-set(subs)), len(subs))
        except:
            continue

train_subs_keywd = torch.from_numpy(np.array(train_subs_keywd)).cuda()
train_subs = torch.from_numpy(np.array(train_subs)).cuda()
train_negs_keywd = torch.from_numpy(np.array(train_negs_keywd)).cuda()
train_negs = torch.from_numpy(np.array(train_negs)).cuda()
train_reader = torch.from_numpy(np.array(train_reader)).cuda()

HBox(children=(IntProgress(value=0, max=2900), HTML(value='')))




In [None]:
valid_subs = []; valid_reader = []
for read_file in tqdm.tqdm_notebook(valid_read_files):
    file = open(read_path+read_file, 'r')
    data = file.readlines()
    for line in data:
        line = line.split(' ')
        try:
            valid_subs += [item_dict[x] for x in line[1:-1]]
            valid_reader += [reader2id[line[0]]]*len(valid_subs)
        except:
            continue

valid_subs = torch.from_numpy(np.array(valid_subs)).cuda()
valid_reader = torch.from_numpy(np.array(valid_reader)).cuda()

HBox(children=(IntProgress(value=0, max=726), HTML(value='')))

In [4]:
num_epochs = 10
learning_rate = 0.0001
hidden_dim = 128
val_step = 1
batch_size = 4096

In [5]:
model = GMF(num_readers, num_writers, num_keywords, hidden_dim).cuda()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.5)
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])

print(model)
print('# of params : ', params)

GMF(
  (embedding_user): Embedding(310758, 256)
  (embedding_item): Embedding(19065, 128)
  (embedding_keywd): Embedding(96892, 128)
  (affine_output): Linear(in_features=128, out_features=1, bias=True)
  (logistic): Sigmoid()
)
# of params :  94396673


In [6]:
item4valid = []
for i, item in enumerate(item_list):
    
    writer = writer2id[item.split('_')[0]]
    keyword = keyword_dict[item2keywd[item][0]]
    item4valid.append([writer, keyword])
item4valid = torch.from_numpy(np.array(item4valid)).cuda()
print(item4valid.size())
model.set_item4valid(item4valid)

torch.Size([643104, 2])


In [None]:

for epoch in range(num_epochs):
    model.train()
    for read_file in tqdm.tqdm_notebook(train_read_files):
        file = open(read_path+read_file, 'r')
        data = file.readlines()
        for line in data:
            line = line.split(' ')
            try:
                subs_keywd += [keyword_dict[item2keywd[x][0] if item2keywd[x][0] is True else '없음'] for x in line[1:-1]]
                subs += [writer2id[x.split('_')[0]] for x in line[1:-1]]
                negs_keywd += random.sample(list(set(range(num_keywords))-set(subs_keywd)), len(subs_keywd))
                negs += random.sample(list(set(range(num_writers))-set(subs)), len(subs))
                reader += [reader2id[line[0]]]*len(sub)
            except:
                continue
            
            if len(reader) < batch_size:
                continue
                
            reader = torch.from_numpy(np.array([reader]*len(subs))).cuda()
            subs_keywd = torch.from_numpy(np.array(subs_keywd)).cuda()
            subs = torch.from_numpy(np.array(subs)).cuda()
            negs_keywd = torch.from_numpy(np.array(negs_keywd)).cuda()
            negs = torch.from_numpy(np.array(negs)).cuda()
            try:
                loss = model(reader, subs, subs_keywd, negs, negs_keywd)
            except:
                continue
            
            model.zero_grad()
            loss.backward()
            optimizer.step()
            
            subs_keywd = []; subs = []; negs_keywd = []; negs = []; reader = []
            
    reader = torch.from_numpy(np.array([reader]*len(subs))).cuda()
    subs_keywd = torch.from_numpy(np.array(subs_keywd)).cuda()
    subs = torch.from_numpy(np.array(subs)).cuda()
    negs_keywd = torch.from_numpy(np.array(negs_keywd)).cuda()
    negs = torch.from_numpy(np.array(negs)).cuda()
    try:
        loss = model(reader, subs, subs_keywd, negs, negs_keywd)
    except:
        pass

    model.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch+1)%val_step == 0:
        with torch.no_grad():
            valid_loss = 0
            model.eval()
            i = 0
            for read_file in valid_read_files[:2]:
                file = open(read_path+read_file, 'r')
                data = file.readlines()
                for line in data:
                    line = line.split(' ')
                    try:
                        reader = torch.from_numpy(np.array([reader2id[line[0]]])).cuda()
                        subs = [item_dict[writer] for writer in line[1:-1]]
                    except:
                        continue
                        
                    preds = model.predict(reader)
                    loss = metrics.apk(subs, preds, 100)                    
                    valid_loss += loss
                    i += 1
                
            print('epoch: '+str(epoch+1)+' MAP: '+str(valid_loss/(i+1)))

HBox(children=(IntProgress(value=0, max=2900), HTML(value='')))

In [None]:
len(train_read_files), len(valid_read_files)