In [1]:
from google.colab import drive
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# 데이터셋 준비
drive.mount('/content/drive')
dataset = pd.read_csv('/content/drive/Othercomputers/내 노트북/학교/MILAB/파이토치 스터디/rating.csv')
# dataset = dataset.sort_values(by=['timestamp'], ascending=True) # 타임스탬프를 기준으로 정렬
dataset = dataset.drop('timestamp', axis=1) # timestamp 는 나중에 빼자

m = dataset['movieId'].max() # 유저 수: 138493
n = dataset['userId'].max() # 영화 수: 131262
batch_size = 1024 # 배치 사이즈가 2의 제곱수면 효율적이라는 말을 들었던 것 같기도 하고...

user_tensor = torch.tensor(dataset['userId'].values, dtype=torch.int64)
movie_tensor = torch.tensor(dataset['movieId'].values, dtype=torch.int64)
rating_tensor = torch.tensor(dataset['rating'].values, dtype=torch.float32)

# Tensor를 하나의 텐서로 결합
dataset = torch.stack([user_tensor, movie_tensor, rating_tensor], dim=1)

Mounted at /content/drive


In [2]:
# 데이터셋 분할
class CustomDataset(Dataset):
  def __init__(self, data):
    self.data = data

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    sample = self.data[idx][:2]
    label = self.data[idx][2]
    return sample, label


train_data = CustomDataset(dataset[:len(dataset)*3//5])
val_data = CustomDataset(dataset[len(dataset)*3//5:len(dataset)*4//5])
test_data = CustomDataset(dataset[len(dataset)*4//5:])

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True) # 셔플은 학습이 진행되는 train 에서만 해도 된다
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)
print("train data:", len(train_data)) # 12000157
print("validation data:", len(val_data)) # 4000053
print("test data:", len(test_data)) # 4000053

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print()
print("Using device", device)

train data: 12000157
validation data: 4000053
test data: 4000053

Using device cuda


In [3]:
# 모델 정의
class model(nn.Module):
  def __init__(self, m, n, k):# m: 유저 수, n: 영화 수, k: 행렬 사이즈
    super(model, self).__init__()

    self.user_M = nn.Parameter(torch.randn(n+1, k))
    self.movie_M = nn.Parameter(torch.randn(m+1, k))


  def forward(self, x: torch.Tensor) -> torch.Tensor:
        i, j = x[:, 0].long(), x[:, 1].long()
        user_vectors = self.user_M[i]
        movie_vectors = self.movie_M[j]
        dot_products = torch.sum(user_vectors * movie_vectors, dim=1)
        return dot_products

# 모델 객체 생성
model = model(m, n, 128).to(device) # k 값은 일단 5으로
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Train
print("Train start")
max_epoch = 10
for epoch in range(max_epoch):
  model.train()
  total_loss = 0

  for data, label in train_loader:
    data = data.to(device)
    label = label.to(device)

    # 옵티마이저 초기화
    optimizer.zero_grad()

    # foward
    output = model(data)
    loss = torch.sum(torch.abs(output - label))/batch_size
    total_loss += loss

    # backpropagation
    loss.backward()
    optimizer.step()

  # Validation
  model.eval()
  val_loss = 0
  with torch.no_grad():
    for data, label in val_loader:
      data = data.to(device)
      label = label.to(device)
      output = model(data)
      loss = torch.sum(torch.abs(output - label))/batch_size
      val_loss += loss

  print(f"Epoch {epoch+1}/{max_epoch}, Loss: {total_loss/len(train_loader):.5f}, Validation Loss: {val_loss/len(val_loader):.5f}")



Train start
Epoch 1/10, Loss: 9.46560, Validation Loss: 9.44415
Epoch 2/10, Loss: 9.41591, Validation Loss: 9.39946
Epoch 3/10, Loss: 9.36647, Validation Loss: 9.35491
Epoch 4/10, Loss: 9.31722, Validation Loss: 9.31053
Epoch 5/10, Loss: 9.26819, Validation Loss: 9.26631
Epoch 6/10, Loss: 9.21943, Validation Loss: 9.22228
Epoch 7/10, Loss: 9.17088, Validation Loss: 9.17841
Epoch 8/10, Loss: 9.12259, Validation Loss: 9.13472
Epoch 9/10, Loss: 9.07454, Validation Loss: 9.09123
Epoch 10/10, Loss: 9.02676, Validation Loss: 9.04795


In [4]:
# Test
print("Test")
model.eval()
test_loss = 0
with torch.no_grad():
  for data, label in test_loader:
    data = data.to(device)
    label = label.to(device)
    output = model(data)
    loss = torch.sum(torch.abs(output - label))/1024
    test_loss += loss

print(f"Test Loss: {test_loss/len(test_loader):.5f}")



Test
Test Loss: 9.05944
