In [1]:
!ulimit -n 4096

In [4]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import tqdm
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
%matplotlib inline

## 行列因子分解

In [7]:
df = pd.read_csv("./data/ml-20m/ratings.csv")
X = df[["userId", "movieId"]].values
Y = df[["rating"]].values

train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.1)

train_dataset = TensorDataset(torch.tensor(train_X, dtype=torch.int64),
                              torch.tensor(train_Y, dtype=torch.float32))
test_dataset = TensorDataset(torch.tensor(test_X, dtype=torch.int64),
                             torch.tensor(test_Y, dtype=torch.float32))

train_loader = DataLoader(train_dataset, batch_size=1024, num_workers=8)
test_loader = DataLoader(test_dataset, batch_size=1024)

In [8]:
class MatrixFactorization(nn.Module):
    def __init__(self, max_user, max_item, k=20):
        super().__init__()
        self.max_user = max_user
        self.max_item = max_item
        self.user_emb = nn.Embedding(max_user, k, 0)
        self.item_emb = nn.Embedding(max_item, k, 0)
        
    def forward(self, x):
        user_idx = x[:, 0]
        item_idx = x[:, 1]
        user_feature = self.user_emb(user_idx)
        item_feature = self.item_emb(item_idx)
        
        out = torch.sum(user_feature * item_feature, 1)
        out = nn.functional.sigmoid(out) * 5
        return out

In [9]:
max_user, max_item = X.max(0)
max_user = int(max_user)
max_item = int(max_item)
net = MatrixFactorization(max_user+1, max_item+1)

In [10]:
def eval_net(net, loader, score_fn=nn.functional.l1_loss, device="cpu"):
    net.eval()
    ys = []
    ypreds = []
    for x, y in loader:
        x = x.to(device)
        y = y.to(device)
        with torch.no_grad():
            y_pred = net(x).to(device).view(-1)
        ys.append(y)
        ypreds.append(y_pred)
    score = score_fn(torch.cat(ys).squeeze(), torch.cat(ypreds))
    return score.item()

In [12]:
from statistics import mean

net.to("cuda:0")
opt = optim.Adam(net.parameters(), lr=0.01)
loss_f = nn.MSELoss()

for epoch in range(5):
    loss_log = []
    for x, y in tqdm.tqdm(train_loader):
        x = x.to("cuda:0")
        y = y.to("cuda:0")
        o = net(x)
        loss = loss_f(o, y.view(-1))
        net.zero_grad()
        loss.backward()
        opt.step()
        loss_log.append(loss.item())
    test_score = eval_net(net, test_loader, device="cuda:0")
    print("epoch:{}\tmean(loss_log):{:.3f}\ttest_score:{:.3f}".
         format(epoch, mean(loss_log), test_score), flush=True)

100%|██████████| 17579/17579 [01:07<00:00, 259.18it/s]


epoch:0	mean(loss_log):1.613	test_score:0.734


100%|██████████| 17579/17579 [01:07<00:00, 258.67it/s]


epoch:1	mean(loss_log):0.878	test_score:0.721


100%|██████████| 17579/17579 [01:08<00:00, 258.03it/s]


epoch:2	mean(loss_log):0.836	test_score:0.719


100%|██████████| 17579/17579 [01:08<00:00, 257.37it/s]


epoch:3	mean(loss_log):0.817	test_score:0.718


100%|██████████| 17579/17579 [01:07<00:00, 258.81it/s]


epoch:4	mean(loss_log):0.806	test_score:0.717


In [13]:
net.to("cpu")
query = (1, 10)
query = torch.tensor(query, dtype=torch.int64).view(1, -1)
net(query)

tensor([3.4899], grad_fn=<MulBackward>)

In [16]:
query = torch.stack([
    torch.zeros(max_item, dtype=torch.float32).fill_(1),
    torch.arange(1, max_item+1, dtype=torch.float32)
], 1).long()
scores, indices = torch.topk(net(query), 5)
print(scores, indices)

tensor([4.9998, 4.9997, 4.9997, 4.9996, 4.9995], grad_fn=<TopkBackward>) tensor([ 28640,  50299, 106272, 107305, 107184])


In [17]:
class NeuralMatrixFactorization(nn.Module):
    def __init__(self, max_user, max_item, user_k=10, item_k=10, hidden_dim=50):
        super().__init__()
        self.user_emb = nn.Embedding(max_user, user_k, 0)
        self.item_emb = nn.Embedding(max_item, item_k, 0)
        self.mlp = nn.Sequential(
            nn.Linear(user_k + item_k, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, 1)
        )
        
    def forward(self, x):
        user_idx = x[:, 0]
        item_idx = x[:, 1]
        user_feature = self.user_emb(user_idx)
        item_feature = self.item_emb(item_idx)
        out = torch.cat([user_feature, item_feature], 1)
        out = self.mlp(out)
        out = nn.functional.sigmoid(out) * 5
        return out.squeeze()

In [18]:
net = NeuralMatrixFactorization(max_user+1, max_item+1)
net.to("cuda:0")
opt = optim.Adam(net.parameters(), lr=0.01)
loss_f = nn.MSELoss()

for epoch in range(5):
    loss_log = []
    for x, y in tqdm.tqdm(train_loader):
        x = x.to("cuda:0")
        y = y.to("cuda:0")
        o = net(x)
        loss = loss_f(o, y.view(-1))
        net.zero_grad()
        loss.backward()
        opt.step()
        loss_log.append(loss.item())
    test_score = eval_net(net, test_loader, device="cuda:0")
    print("epoch:{}\tmean(loss_log):{:.3f}\ttest_score:{:.3f}".
         format(epoch, mean(loss_log), test_score), flush=True)

100%|██████████| 17579/17579 [01:03<00:00, 275.38it/s]


epoch:0	mean(loss_log):0.755	test_score:0.646


100%|██████████| 17579/17579 [01:20<00:00, 217.55it/s]


epoch:1	mean(loss_log):0.693	test_score:0.637


100%|██████████| 17579/17579 [01:20<00:00, 218.74it/s]


epoch:2	mean(loss_log):0.672	test_score:0.633


100%|██████████| 17579/17579 [01:20<00:00, 217.23it/s]


epoch:3	mean(loss_log):0.658	test_score:0.629


100%|██████████| 17579/17579 [01:20<00:00, 218.28it/s]


epoch:4	mean(loss_log):0.644	test_score:0.624


## ニューラル行列因子分解

In [27]:
import csv
from sklearn.feature_extraction.text import CountVectorizer

with open("./data/ml-20m/movies.csv") as fp:
    reader = csv.DictReader(fp)
    def parse(d):
        movieId = int(d["movieId"])
        genres = d["genres"]
        return movieId, genres
    data = [parse(d) for d in reader]
    
movieIds = [x[0] for x in data]
genres = [x[1] for x in data]

cv = CountVectorizer(dtype="f4").fit(genres)
num_genres = len(cv.get_feature_names())

it = cv.transform(genres).toarray()
print(it)
it = (torch.tensor(g, dtype=torch.float32) for g in it)
genre_dict = dict(zip(movieIds, it))

[[0. 1. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]


In [34]:
def first(xs):
    it = iter(xs)
    return next(it)

class MovieLensDataset(Dataset):
    def __init__(self, x, y, genres):
        assert len(x) == len(y)
        self.x = x
        self.y = y
        self.genres = genres
        self.null_genre = torch.zeros_like(first(genres.values()))
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        x = self.x[idx]
        y = self.y[idx]
        movieId = x[1]
        g = self.genres.get(movieId, self.null_genre)
        return x, y, g

In [36]:
train_dataset = MovieLensDataset(torch.tensor(train_X, dtype=torch.int64),
                                 torch.tensor(train_Y, dtype=torch.float32),
                                 genre_dict)
test_dataset = MovieLensDataset(torch.tensor(test_X, dtype=torch.int64),
                                 torch.tensor(test_Y, dtype=torch.float32),
                                 genre_dict)
train_loader = DataLoader(train_dataset, batch_size=1024,
                          shuffle=True, num_workers=8)
test_loader = DataLoader(test_dataset, batch_size=1024,
                          shuffle=True, num_workers=8)

In [46]:
class NeuralMatrixFactorization2(nn.Module):
    def __init__(self, max_user, max_item, num_genres,
                 user_k=10, item_k=10, hidden_dim=50):
        super().__init__()
        self.user_emb = nn.Embedding(max_user, user_k, 0)
        self.item_emb = nn.Embedding(max_item, item_k, 0)
        self.mlp = nn.Sequential(
            nn.Linear(user_k + item_k + num_genres, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, 1)
        )
        
    def forward(self, x, g):
        user_idx = x[:, 0]
        item_idx = x[:, 1]
        user_feature = self.user_emb(user_idx)
        item_feature = self.item_emb(item_idx)
        out = torch.cat([user_feature, item_feature, g], 1)
        out = self.mlp(out)
        out = torch.sigmoid(out) * 5
        return out.squeeze()

In [47]:
def eval_net(net, loader, score_fn=nn.functional.l1_loss, device="cpu"):
    ys = []
    ypreds = []
    net.eval()
    for x, y, g in loader:
        x = x.to(device)
        g = g.to(device)
        with torch.no_grad():
            ypred = net(x, g).to("cpu")
        ys.append(y)
        ypreds.append(ypred)
    score = score_fn(torch.cat(ys).squeeze(), torch.cat(ypreds))
    return score

In [49]:
net = NeuralMatrixFactorization2(max_user+1, max_item+1, num_genres)
opt = optim.Adam(net.parameters(), lr=0.01)
loss_f = nn.MSELoss()
net.to("cuda:0")

for epoch in range(5):
    loss_log = []
    net.train()
    for x, y, g in tqdm.tqdm(train_loader):
        x = x.cuda()
        y = y.cuda()
        g = g.cuda()
        o = net(x, g)
        loss = loss_f(o, y.view(-1))
        net.zero_grad()
        loss.backward()
        opt.step()
        loss_log.append(loss.item())
    test_score = eval_net(net, test_loader, device="cuda:0")
    print("epoch:{}\tmean(loss_log):{:.3f}\ttest_score:{:.3f}".format(
    epoch, mean(loss_log), test_score), flush=True)

100%|██████████| 17579/17579 [01:11<00:00, 246.08it/s]


epoch:0	mean(loss_log):0.751	test_score:0.643


100%|██████████| 17579/17579 [01:11<00:00, 245.79it/s]


epoch:1	mean(loss_log):0.688	test_score:0.629


100%|██████████| 17579/17579 [01:09<00:00, 251.31it/s]


epoch:2	mean(loss_log):0.666	test_score:0.623


100%|██████████| 17579/17579 [01:09<00:00, 253.18it/s]


epoch:3	mean(loss_log):0.650	test_score:0.621


100%|██████████| 17579/17579 [01:10<00:00, 250.94it/s]


epoch:4	mean(loss_log):0.632	test_score:0.615


In [56]:
def make_genre_vector(i, max_len):
    g = torch.zeros(max_len)
    g[i] = 1
    return g

query_genres = [make_genre_vector(i, num_genres) for i in range(num_genres)]
query_genres = torch.stack(query_genres, 1)

query = torch.stack([
    torch.empty(num_genres, dtype=torch.int64).fill_(100),
    torch.empty(num_genres, dtype=torch.int64).fill_(0)
], 1)
query_genres = query_genres.cuda()
query = query.cuda()
net(query, query_genres)

tensor([3.3540, 3.3512, 3.3538, 3.3534, 3.3515, 3.3520, 3.3532, 3.3533, 3.3529,
        3.3550, 3.3522, 3.3514, 3.3532, 3.3533, 3.3528, 3.3521, 3.3531, 3.3531,
        3.3502, 3.3525, 3.3519, 3.3545, 3.3542, 3.3525],
       device='cuda:0', grad_fn=<SqueezeBackward0>)