In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader

# Load the user/movie interaction table and the movie metadata table.
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

# Re-index raw userId values to contiguous integers (in order of first
# appearance) and keep lookups in both directions.
user_ids = ratings["userId"].unique().tolist()
userencoded2user = dict(enumerate(user_ids))
user2user_encoded = {raw_id: idx for idx, raw_id in userencoded2user.items()}

# Same contiguous re-indexing for raw movieId values.
movie_ids = ratings["movieId"].unique().tolist()
movie_encoded2movie = dict(enumerate(movie_ids))
movie2movie_encoded = {raw_id: idx for idx, raw_id in movie_encoded2movie.items()}

# Attach the encoded indices to the frame and store ratings as float32.
ratings["user"] = ratings["userId"].map(user2user_encoded)
ratings["movie"] = ratings["movieId"].map(movie2movie_encoded)
ratings["rating"] = ratings["rating"].astype(np.float32)

num_users = len(user2user_encoded)
num_movies = len(movie_encoded2movie)

# Features are (user index, movie index) pairs; the target is the rating.
X = ratings[["user", "movie"]].to_numpy()
y = ratings["rating"].to_numpy()

# Hold out 20% of the interactions for validation (fixed split seed).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

class MovieDataset(Dataset):
    """Map-style dataset of ``(user, movie, rating)`` triples.

    The three inputs are parallel indexable sequences: position ``i`` of each
    one describes the same interaction.

    Args:
        users: User indices, one per interaction.
        movies: Movie indices, one per interaction.
        ratings: Target ratings, one per interaction.

    Raises:
        ValueError: If the three sequences differ in length.
    """

    def __init__(self, users, movies, ratings):
        # Fail fast: with mismatched lengths the error would otherwise only
        # surface as an IndexError in the middle of an epoch, or trailing
        # entries would be ignored without any warning.
        if not (len(users) == len(movies) == len(ratings)):
            raise ValueError(
                "users, movies and ratings must have the same length, got "
                f"{len(users)}, {len(movies)} and {len(ratings)}"
            )
        self.users = users
        self.movies = movies
        self.ratings = ratings

    def __len__(self):
        """Return the number of interactions."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the ``(user, movie, rating)`` triple stored at ``idx``."""
        return self.users[idx], self.movies[idx], self.ratings[idx]

# Column 0 of each feature matrix is passed as users, column 1 as movies.
train_dataset = MovieDataset(users=X_train[:, 0], movies=X_train[:, 1], ratings=y_train)
val_dataset = MovieDataset(users=X_val[:, 0], movies=X_val[:, 1], ratings=y_val)

# Training batches are reshuffled every epoch; validation order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)

class RecommenderNet(nn.Module):
    """Embedding + MLP model that scores a (user, movie) pair.

    The user and movie indices are looked up in separate embedding tables, the
    two vectors are concatenated and passed through a small fully connected
    network. The final sigmoid is scaled so predictions lie in
    ``(0, max_rating)``.

    Args:
        num_users: Number of rows in the user embedding table.
        num_movies: Number of rows in the movie embedding table.
        embedding_size: Dimensionality of each embedding vector.
        max_rating: Upper bound of the predicted rating (default ``5.0``).
    """

    def __init__(self, num_users, num_movies, embedding_size, max_rating=5.0):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=embedding_size)
        self.movie_embedding = nn.Embedding(num_embeddings=num_movies, embedding_dim=embedding_size)
        self.fc1 = nn.Linear(in_features=embedding_size * 2, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=32)
        self.dropout = nn.Dropout(0.2)
        self.fc4 = nn.Linear(in_features=32, out_features=1)
        self.max_rating = max_rating
        # A typical architecture; convolutional layers could be added as
        # well, but it already works reasonably well as it is.

    def forward(self, user, movie):
        """Predict ratings for a batch of (user, movie) index pairs.

        Args:
            user: Integer tensor of user indices.
            movie: Integer tensor of movie indices, same shape as ``user``.

        Returns:
            Tensor with a trailing dimension of size 1 holding the predicted
            ratings, each in ``(0, max_rating)``.
        """
        user_embedded = self.user_embedding(user)
        movie_embedded = self.movie_embedding(movie)
        vector = torch.cat([user_embedded, movie_embedded], dim=-1)
        # Functional activations: no new nn.Module is built on every call.
        vector = torch.relu(self.fc1(vector))
        vector = torch.relu(self.fc2(vector))
        vector = self.dropout(vector)
        vector = torch.relu(self.fc3(vector))
        pred = torch.sigmoid(self.fc4(vector))
        return pred * self.max_rating

# Seed PyTorch's global RNG so that weight initialisation and other draws from
# it (e.g. DataLoader shuffling) are repeatable from run to run. The value
# matches the random_state used for the train/validation split.
torch.manual_seed(42)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = RecommenderNet(num_users, num_movies, 50).to(device)
# MAE rather than MSE: outliers are not of much interest here, so use a loss
# that pays less attention to them.
criterion = nn.L1Loss()
optimizer = Adam(model.parameters(), lr=0.001)

def _run_epoch(model, criterion, loader, device, optimizer=None):
    """Run one pass over ``loader`` and return the sample-weighted mean loss.

    When ``optimizer`` is given, a backward pass and an optimizer step are
    performed for every batch; otherwise the pass is evaluation-only.
    Returns ``nan`` if the loader yields no samples.
    """
    total_loss = 0.0
    total_samples = 0
    for user, movie, rating in loader:
        user = user.to(device)
        movie = movie.to(device)
        rating = rating.to(device)

        if optimizer is not None:
            optimizer.zero_grad()

        predictions = model(user, movie)
        loss = criterion(predictions, rating.unsqueeze(1))

        if optimizer is not None:
            loss.backward()
            optimizer.step()

        # Weight each batch by its size so a smaller final batch does not
        # count as much as a full one. Assumes ``criterion`` returns the
        # per-batch mean (the default reduction) - TODO confirm for custom losses.
        batch_size = rating.size(0)
        total_loss += loss.item() * batch_size
        total_samples += batch_size

    return total_loss / total_samples if total_samples else float('nan')


def train(model, criterion, optimizer, train_loader, val_loader, epochs, device=None):
    """Train ``model`` and print the train/validation loss after every epoch.

    Args:
        model: Module called as ``model(user, movie)``.
        criterion: Loss called as ``criterion(predictions, targets)`` where
            targets are the ratings with a trailing dimension of size 1.
        optimizer: Optimizer over the parameters of ``model``.
        train_loader: Iterable of ``(user, movie, rating)`` tensor batches
            used for optimisation.
        val_loader: Iterable of ``(user, movie, rating)`` tensor batches used
            for evaluation only.
        epochs: Number of full passes over ``train_loader``.
        device: Device the batches are moved to. Defaults to the device of
            the model's first parameter (CPU if the model has no parameters),
            so the function no longer depends on a global ``device``.

    Returns:
        None. One summary line per epoch is printed.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    for epoch in range(epochs):
        model.train()
        train_loss = _run_epoch(model, criterion, train_loader, device, optimizer)

        # Evaluation: dropout disabled via eval(), no gradients tracked.
        model.eval()
        with torch.no_grad():
            val_loss = _run_epoch(model, criterion, val_loader, device)

        print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

# NOTE(review): the saved output under this cell lists 10 epochs, so it came
# from a different run than this 5-epoch call; re-run the cell to refresh it.
train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=5,
)

Epoch 1, Train Loss: 0.7554, Val Loss: 0.7237
Epoch 2, Train Loss: 0.6978, Val Loss: 0.7053
Epoch 3, Train Loss: 0.6644, Val Loss: 0.6924
Epoch 4, Train Loss: 0.6410, Val Loss: 0.6917
Epoch 5, Train Loss: 0.6198, Val Loss: 0.6840
Epoch 6, Train Loss: 0.6032, Val Loss: 0.6855
Epoch 7, Train Loss: 0.5883, Val Loss: 0.6869
Epoch 8, Train Loss: 0.5738, Val Loss: 0.6910
Epoch 9, Train Loss: 0.5609, Val Loss: 0.6908
Epoch 10, Train Loss: 0.5487, Val Loss: 0.6940


In [10]:
# Export both learned embedding tables as NumPy arrays (detached from the
# autograd graph and moved to host memory first).
user_embeddings, movie_embeddings = (
    layer.weight.detach().cpu().numpy()
    for layer in (model.user_embedding, model.movie_embedding)
)

In [None]:
# One option is the approach above; another is to pass an object through the
# network and take its output as the movie/user embedding (BERT-style).