In [1]:

import torch
from torch import nn, optim
from torch.utils.data import (Dataset, 
                              DataLoader,
                              TensorDataset)
import tqdm

In [2]:
import pandas as pd
# Used to split the data into training and test sets
from sklearn import model_selection

df = pd.read_csv("ml-20m/ratings.csv")
# X holds the (userId, movieId) pairs; Y holds the matching rating column
X = df[["userId", "movieId"]].values
Y = df[["rating"]].values

# Split the data 9:1 into training and test sets. A fixed random_state makes
# the split identical after every kernel restart, so the scores printed by
# the training cell are reproducible and comparable between runs.
train_X, test_X, train_Y, test_Y\
    = model_selection.train_test_split(X, Y, test_size=0.1, random_state=0)

# X contains integer IDs, so it becomes an int64 tensor;
# Y contains real-valued ratings, so it becomes a float32 tensor
train_dataset = TensorDataset(
    torch.tensor(train_X, dtype=torch.int64), torch.tensor(train_Y, dtype=torch.float32))
test_dataset = TensorDataset(
    torch.tensor(test_X, dtype=torch.int64), torch.tensor(test_Y, dtype=torch.float32))
# Only the training loader shuffles; the test loader keeps a fixed order
train_loader = DataLoader(
    train_dataset, batch_size=1024, num_workers=4, shuffle=True)
test_loader = DataLoader(
    test_dataset, batch_size=1024, num_workers=4)

### 파이토치에서 행렬 인수분해

#### 행렬 인수분해

In [3]:
class MatrixFactorization(nn.Module):
    """Matrix-factorization rating model built from two embedding tables.

    Every user ID and item ID is looked up in its own ``k``-dimensional
    embedding table. The prediction for a (user, item) pair is the dot
    product of the two vectors, passed through a sigmoid and multiplied by
    ``max_rating`` so that it lies between 0 and ``max_rating``.

    Index 0 is the ``padding_idx`` of both tables: its vector is all zeros
    and is not updated by gradient steps.

    Args:
        max_user: Number of rows in the user embedding table; user IDs must
            be in ``[0, max_user)``.
        max_item: Number of rows in the item embedding table; item IDs must
            be in ``[0, max_item)``.
        k: Dimensionality of the latent vectors.
        max_rating: Upper end of the predicted rating range.
    """

    def __init__(self, max_user, max_item, k=20, max_rating=5.0):
        super().__init__()
        self.max_user = max_user
        self.max_item = max_item
        self.max_rating = max_rating
        # padding_idx is spelled out so the trailing 0 is not mistaken for
        # some other positional option of nn.Embedding.
        self.user_emb = nn.Embedding(max_user, k, padding_idx=0)
        self.item_emb = nn.Embedding(max_item, k, padding_idx=0)

    def forward(self, x):
        """Predict ratings for a batch of (user, item) index pairs.

        Args:
            x: Integer tensor whose column 0 holds user indices and whose
                column 1 holds item indices.

        Returns:
            A 1-D tensor with one predicted rating per row of ``x``.
        """
        user_idx = x[:, 0]
        item_idx = x[:, 1]
        user_feature = self.user_emb(user_idx)
        item_feature = self.item_emb(item_idx)

        # user_feature * item_feature has shape (batch_size, k); summing over
        # the k axis yields the per-sample dot product.
        out = torch.sum(user_feature * item_feature, 1)

        # Squash into the rating range. torch.sigmoid is used in place of the
        # long-deprecated nn.functional.sigmoid alias.
        return torch.sigmoid(out) * self.max_rating

In [4]:

# Column-wise maximum of X gives the largest user ID and the largest movie ID.
# The values come back as np.int64, so convert them to built-in Python ints.
max_user, max_item = (int(largest_id) for largest_id in X.max(0))
# The IDs are used directly as embedding indices, so each table needs
# (largest ID + 1) rows.
net = MatrixFactorization(max_user + 1, max_item + 1)

In [5]:
def eval_net(net, loader, score_fn=nn.functional.l1_loss, device="cpu"):
    """Score ``net`` over every batch produced by ``loader``.

    Args:
        net: Model, called as ``net(x)`` on each input batch. When it is an
            ``nn.Module`` it is switched to eval mode for the duration of the
            call and its previous train/eval mode is restored afterwards.
        loader: Iterable yielding ``(x, y)`` tensor pairs.
        score_fn: Called as ``score_fn(targets, predictions)`` on the
            flattened, concatenated tensors. Defaults to the L1 loss
            (mean absolute error).
        device: Device the inputs are moved to before the forward pass.

    Returns:
        The score converted to a Python scalar with ``.item()``.
    """
    is_module = isinstance(net, nn.Module)
    was_training = is_module and net.training
    if is_module:
        net.eval()

    ys = []
    ypreds = []
    try:
        # No gradients are needed for scoring; one context covers all batches.
        with torch.no_grad():
            for x, y in loader:
                x = x.to(device)
                # Flatten targets exactly like the predictions. squeeze() on
                # the concatenated targets would turn a single-sample
                # evaluation into a 0-dim tensor that no longer matches the
                # 1-D prediction tensor.
                ys.append(y.reshape(-1))
                ypreds.append(net(x).to("cpu").reshape(-1))
    finally:
        if is_module:
            net.train(was_training)

    score = score_fn(torch.cat(ys), torch.cat(ypreds))
    return score.item()

In [6]:
from statistics import mean

# Train on the first GPU when CUDA is available; otherwise fall back to the
# CPU so this cell still runs on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

net.to(device)
opt = optim.Adam(net.parameters(), lr=0.01)
loss_f = nn.MSELoss()

for epoch in range(5):
    loss_log = []
    for x, y in tqdm.tqdm(train_loader):
        x = x.to(device)
        y = y.to(device)
        o = net(x)
        # Flatten the target to 1-D so it matches the model's 1-D output
        loss = loss_f(o, y.view(-1))
        # Clear the gradients of the optimized parameters before the new
        # backward pass
        opt.zero_grad()
        loss.backward()
        opt.step()
        loss_log.append(loss.item())
    # eval_net uses the L1 loss by default, so test_score is the mean
    # absolute error on the test loader
    test_score = eval_net(net, test_loader, device=device)
    # Columns: epoch index, mean training MSE for the epoch, test score
    print(epoch, mean(loss_log), test_score, flush=True)

100%|██████████| 17579/17579 [02:52<00:00, 101.87it/s]


0 1.613379878153932 0.7354139685630798


100%|██████████| 17579/17579 [02:51<00:00, 102.26it/s]


1 0.8829837778409503 0.7098354697227478


100%|██████████| 17579/17579 [02:51<00:00, 102.28it/s]


2 0.8357044201240624 0.7001104354858398


100%|██████████| 17579/17579 [02:51<00:00, 102.36it/s]


3 0.8135738665892529 0.6960717439651489


100%|██████████| 17579/17579 [02:52<00:00, 102.17it/s]


4 0.8018317886929821 0.6947773694992065


In [15]:
# Write the model's state_dict (not the module object itself) to disk.
torch.save(obj=net.state_dict(), f='../model.pt')