In [18]:
import pandas as pd
from sklearn import model_selection
import torch
from torch import nn, optim
from torch.utils.data import (Dataset, 
                              DataLoader,
                              TensorDataset)
import tqdm
import csv
from sklearn.feature_extraction.text import CountVectorizer


# csv.DictReader를 사용해서 CSV 파일 읽기
# 필요한 부분만 추출
with open("ml-20m/movies.csv") as fp:
    reader = csv.DictReader(fp)
    def parse(d):
        movieId = int(d["movieId"])
        genres = d["genres"]
        return movieId, genres
    data = [parse(d) for d in reader]
  
movieIds = [x[0] for x in data]
genres = [x[1] for x in data]

# 데이터에 맞추어 CountVectorizer를 훈련
# Bag of Words 형태로 만들기
cv = CountVectorizer(dtype="f4").fit(genres)
num_genres = len(cv.get_feature_names())

# key가 movieId이고 value가 BoW인 Tensor의 dict 만들기
it = cv.transform(genres).toarray()
it = (torch.tensor(g, dtype=torch.float32) for g in it)
genre_dict = dict(zip(movieIds, it))



In [19]:
df = pd.read_csv("ml-20m/ratings.csv")
# X는 (userId, movieId) 쌍
X = df[["userId", "movieId"]].values
Y = df[["rating"]].values

# 훈련 데이터와 테스트 데이터를 9대 1로 분할
train_X, test_X, train_Y, test_Y\
    = model_selection.train_test_split(X, Y, test_size=0.1)

# X는 ID이고 정수이므로 int64, Y는 실수이므로 float32의 Tensor로 변환
train_dataset = TensorDataset(
    torch.tensor(train_X, dtype=torch.int64), torch.tensor(train_Y, dtype=torch.float32))
test_dataset = TensorDataset(
    torch.tensor(test_X, dtype=torch.int64), torch.tensor(test_Y, dtype=torch.float32))
train_loader = DataLoader(
    train_dataset, batch_size=1024, num_workers=4, shuffle=True)
test_loader = DataLoader(
    test_dataset, batch_size=1024, num_workers=4)

In [20]:
train_X

array([[ 88817,   4540],
       [ 69935,   2700],
       [103027,   3668],
       ...,
       [ 98231,   1278],
       [138348,    529],
       [  5209,    838]])

In [25]:
class MatrixFactorization(nn.Module):
    def __init__(self, max_user, max_item, k=20):
        super().__init__()
        self.max_user = max_user
        self.max_item = max_item
        self.user_emb = nn.Embedding(max_user, k, 0)
        self.item_emb = nn.Embedding(max_item, k, 0)
        
    def forward(self, x):
        user_idx = x[:, 0]
        item_idx = x[:, 1]
        user_feature = self.user_emb(user_idx)
        item_feature = self.item_emb(item_idx)
        
        # user_feature*item_feature는 (batch_size,k) 차원이므로
        # k의 sum을 구하면 각 샘플의 내적이 된다
        out = torch.sum(user_feature * item_feature, 1)
        
        # [0, 5] 범위 내로 조정
        out = nn.functional.sigmoid(out) * 5
        return out

In [26]:
max_user, max_item = X.max(0)
# np.int64형을 파이썬의 표준 int로 캐스트
max_user = int(max_user)
max_item = int(max_item)
net = MatrixFactorization(max_user+1, max_item+1)

In [None]:
def first(xs):
    it = iter(xs)
    return next(it)

class MovieLensDataset(Dataset):
    def __init__(self, x, y, genres):
        assert len(x) == len(y)
        self.x = x
        self.y = y
        self.genres = genres
        
        # 장르 사전에 없는 movieId를 위한 더미 데이터
        self.null_genre = torch.zeros_like(
            first(genres.values()))
        
    def __len__(self):
        return len(self.x)
        
    def __getitem__(self, idx):
        x = self.x[idx]
        y = self.y[idx]
        # x = (userId, movieId)
        movieId = x[1]
        g = self.genres.get(movieId, self.null_genre)
        return x, y, g

In [21]:
train_dataset = MovieLensDataset(
    torch.tensor(train_X, dtype=torch.int64),
    torch.tensor(train_Y, dtype=torch.float32), 
    genre_dict)
test_dataset = MovieLensDataset(
    torch.tensor(test_X, dtype=torch.int64),
    torch.tensor(test_Y, dtype=torch.float32),
    genre_dict)
train_loader = DataLoader(
    train_dataset, batch_size=1024, shuffle=True, 
num_workers=4)
test_loader = DataLoader(
    test_dataset, batch_size=1024, num_workers=4)

In [22]:
class NeuralMatrixFactorization2(nn.Module):
    def __init__(self, max_user, max_item, num_genres,
                 user_k=10, item_k=10, hidden_dim=50):
        super().__init__()
        self.user_emb = nn.Embedding(max_user, user_k, 0)
        self.item_emb = nn.Embedding(max_item, item_k, 0)
        self.mlp = nn.Sequential(
            # num_genres분만큼 차원이 늘어난다
            nn.Linear(user_k + item_k + num_genres, 
hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, 1)
        )
        
    def forward(self, x, g):
        user_idx = x[:, 0]
        item_idx = x[:, 1]
        user_feature = self.user_emb(user_idx)
        item_feature = self.item_emb(item_idx)
        # 장르 BoW를 cat로 특이 벡터에 결합한다
        out = torch.cat([user_feature, item_feature, g], 1)
        out = self.mlp(out)
        out = nn.functional.sigmoid(out) * 5
        return out.squeeze()

In [23]:
def eval_net(net, loader, score_fn=nn.functional.
l1_loss, device="cpu"):
    ys = []
    ypreds = []
    # loader는 장르 BoW도 반환
    for x, y, g in loader:
        x = x.to(device)
        g = g.to(device)
        ys.append(y)
        # userId, movieId 외에 장르 BoW
        # 도 신경망 함수에 전달
        with torch.no_grad():
         ypred = net(x, g).to("cpu")
        ypreds.append(ypred)
    score = score_fn(torch.cat(ys).squeeze(), 
torch.cat(ypreds))
    return score

In [27]:
from statistics import mean

net = NeuralMatrixFactorization2(
    max_user+1, max_item+1, num_genres)
opt = optim.Adam(net.parameters(), lr=0.01)
loss_f = nn.MSELoss()
net.to("cuda:0")
for epoch in range(5):
    loss_log = []
    net.train()
    for x, y, g in tqdm.tqdm(train_loader):
        x = x.to("cuda:0")
        y = y.to("cuda:0")
        g = g.to("cuda:0")
        o = net(x, g)
        loss = loss_f(o, y.view(-1))
        net.zero_grad()
        loss.backward()
        opt.step()
        loss_log.append(loss.item())
    net.eval()
    test_score = eval_net(net, test_loader, 
device="cuda:0")
    print(epoch, mean(loss_log), test_score.item(), 
flush=True)

100%|██████████| 17579/17579 [02:23<00:00, 122.13it/s]


0 0.7549036730169212 0.6439146399497986


100%|██████████| 17579/17579 [02:22<00:00, 123.72it/s]


1 0.6906584421212685 0.6309435367584229


100%|██████████| 17579/17579 [02:23<00:00, 122.55it/s]


2 0.6704502047155964 0.6268938779830933


100%|██████████| 17579/17579 [02:18<00:00, 126.61it/s]


3 0.659086318544726 0.6249074339866638


100%|██████████| 17579/17579 [02:21<00:00, 123.87it/s]


4 0.6447439897324189 0.6247057914733887


In [28]:
# 지정한 위치만 1이고 나머지가 0인 Tensor 반환
def make_genre_vector(i, max_len):
    g = torch.zeros(max_len)
    g[i] = 1
    return g
  
query_genres = [make_genre_vector(i, num_genres)
    for i in range(num_genres)]
query_genres = torch.stack(query_genres, 1)

# num_genres분만큼 userId=100과 movieId=0의 Tensor를 만들어 결합
query = torch.stack([
    torch.empty(num_genres, dtype=torch.int64).fill_(100),
    torch.empty(num_genres, dtype=torch.int64).fill_(0)
], 1)

# GPU로 전송
query_genres = query_genres.to("cuda:0")
query = query.to("cuda:0")

# 점수 계산
net(query, query_genres)

tensor([3.0603, 3.0593, 3.0590, 3.0606, 3.0591, 3.0592, 3.0601, 3.0583, 3.0588,
        3.0582, 3.0614, 3.0591, 3.0601, 3.0587, 3.0614, 3.0564, 3.0589, 3.0587,
        3.0602, 3.0600, 3.0601, 3.0582, 3.0606, 3.0609], device='cuda:0',
       grad_fn=<SqueezeBackward0>)