In [1]:
from collections import defaultdict

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# Read data
# Both files are '::'-delimited and have no header row. A multi-character
# separator is handled by pandas' python parsing engine, hence engine='python'.
ratings_df = pd.read_csv(
    'ratings.dat',
    sep='::',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    engine='python',
)
movies_df = pd.read_csv(
    'movies.dat',
    sep='::',
    header=None,
    names=['movie_id', 'title', 'genres'],
    engine='python',
    encoding='latin1',
)
# Turn the pipe-delimited genre string of each movie into a list of genre names.
movies_df['genres'] = movies_df['genres'].map(lambda genre_field: genre_field.split('|'))

# Preprocessing
# Build a bipartite movie <-> genre graph stored as an adjacency list.
# Movie nodes are identified by their raw movie_id; genre nodes get ids placed
# strictly above the largest movie_id.
unique_genres = set()
for genres in movies_df['genres']:
    unique_genres.update(genres)

# Sort before enumerating: iteration order of a set of strings is not stable
# between interpreter runs, which would make the genre ids irreproducible.
genre2id = {genre: idx for idx, genre in enumerate(sorted(unique_genres))}

# movie_id values are read from the file and are not guaranteed to be the
# contiguous range 0..len(movies_df)-1, so offsetting genre nodes by the row
# count could hand a genre the same node id as a real movie. Offsetting by
# max(movie_id) + 1 keeps the two id ranges disjoint.
genre_offset = int(movies_df['movie_id'].max()) + 1

adj_list = defaultdict(set)
for movie_id, genres in zip(movies_df['movie_id'], movies_df['genres']):
    movie_id = int(movie_id)
    for genre in genres:
        genre_node = genre_offset + genre2id[genre]
        # Undirected edge: movie -> genre and genre -> movie.
        adj_list[movie_id].add(genre_node)
        adj_list[genre_node].add(movie_id)

# Freeze every neighbour set into a sorted list of plain ints so it can be
# turned into an index tensor directly and deterministically.
for key in adj_list:
    adj_list[key] = sorted(int(node) for node in adj_list[key])

# Define model
class SimpleKGAT(nn.Module):
    """Score (user, item) pairs using user-conditioned attention over item neighbours.

    Items and their graph neighbours share one embedding table
    (``item_embeddings``); neighbour ids passed through ``adj_list`` must
    therefore be valid indices into that table.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item/neighbour embedding table.
        num_genres: Accepted for interface compatibility; not used.
        embedding_dim: Size of every embedding vector.
    """

    def __init__(self, num_users, num_items, num_genres, embedding_dim):
        super().__init__()
        self.user_embeddings = nn.Embedding(num_users, embedding_dim)
        self.item_embeddings = nn.Embedding(num_items, embedding_dim)

        # Not used by forward(); kept so the module's parameter names and
        # state_dict layout stay unchanged for existing checkpoints.
        self.user_attention = nn.Linear(embedding_dim, 1)
        self.item_attention = nn.Linear(embedding_dim, 1)

    def forward(self, users, items, adj_list):
        """Return one score in (0, 1) per (user, item) pair.

        Args:
            users: 1-D LongTensor of user indices.
            items: 1-D LongTensor of item indices, same length as ``users``.
            adj_list: Mapping from item index to an iterable of neighbour
                indices. Items without an entry are treated as having no
                neighbours; the mapping is never modified.

        Returns:
            1-D float tensor with ``len(users)`` entries (empty for an empty
            batch).
        """
        user_embeds = self.user_embeddings(users)
        item_embeds = self.item_embeddings(items)
        device = user_embeds.device

        item_reprs = []
        # tolist() converts all ids in one go instead of one .item() per sample.
        for user_embed, item_embed, item_id in zip(user_embeds, item_embeds, items.tolist()):
            # .get() instead of [] so a defaultdict adjacency is not silently
            # grown with empty entries for unknown items.
            neighbor_ids = list(adj_list.get(item_id, ()))
            if not neighbor_ids:
                # Nothing to aggregate: fall back to the item's own embedding.
                item_reprs.append(item_embed)
                continue

            neighbor_index = torch.tensor(neighbor_ids, dtype=torch.long, device=device)
            neighbor_embeds = self.item_embeddings(neighbor_index)
            # Attention weights: softmax of user . neighbour dot products.
            weights = torch.softmax(neighbor_embeds @ user_embed, dim=-1)
            # Add the item's own embedding to the attended neighbourhood so
            # that items sharing the same neighbours remain distinguishable.
            item_reprs.append(item_embed + weights @ neighbor_embeds)

        if not item_reprs:
            # Empty batch: torch.stack/cat would raise on an empty list.
            return user_embeds.new_zeros(0)

        item_reprs = torch.stack(item_reprs)
        preds = torch.sigmoid(torch.sum(user_embeds * item_reprs, dim=-1))
        return preds

# Model / embedding-table sizes.
embedding_dim = 64
# user_id values are used directly as embedding indices, so the table needs
# max(user_id) + 1 rows.
num_users = int(ratings_df['user_id'].max()) + 1
# The item table is indexed by raw movie ids (from the ratings) and by every
# node id that appears in the graph, so size it from the largest such id
# rather than from row counts, which are unrelated to the id values.
num_items = max(
    int(ratings_df['movie_id'].max()),
    int(movies_df['movie_id'].max()),
    max(adj_list, default=-1),
) + 1
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# K-Fold cross-validation
# For each of the 5 folds: train a fresh SimpleKGAT on the training split,
# then record the mean squared error of its predictions on the held-out split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mse_scores = []

# These settings are the same for every fold, so define them once.
num_epochs = 10
batch_size = 256
criterion = nn.MSELoss()

for train_index, test_index in kf.split(ratings_df):
    train_df = ratings_df.iloc[train_index]
    test_df = ratings_df.iloc[test_index]

    user_ids = torch.tensor(train_df['user_id'].values, dtype=torch.long)
    item_ids = torch.tensor(train_df['movie_id'].values, dtype=torch.long)
    ratings = torch.tensor(train_df['rating'].values, dtype=torch.float)

    test_user_ids = torch.tensor(test_df['user_id'].values, dtype=torch.long)
    test_item_ids = torch.tensor(test_df['movie_id'].values, dtype=torch.long)
    test_ratings = torch.tensor(test_df['rating'].values, dtype=torch.float)

    # A new model and optimizer per fold so no parameters leak between folds.
    model = SimpleKGAT(num_users, num_items, len(genre2id), embedding_dim)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    num_train = len(user_ids)
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        # Reshuffle every epoch so mini-batches are not tied to the row order
        # of the training split.
        permutation = torch.randperm(num_train)
        for start in range(0, num_train, batch_size):
            batch_index = permutation[start:start + batch_size]
            batch_user_ids = user_ids[batch_index].to(device)
            batch_item_ids = item_ids[batch_index].to(device)
            batch_ratings = ratings[batch_index].to(device)

            optimizer.zero_grad()
            # The model outputs values in (0, 1); scale them to the rating range.
            preds = model(batch_user_ids, batch_item_ids, adj_list) * 5
            loss = criterion(preds, batch_ratings)
            loss.backward()
            optimizer.step()

            # criterion returns the batch mean; weight it by the batch size so
            # the division below yields a true per-sample average.
            total_loss += loss.item() * len(batch_index)

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/num_train}")

    # Evaluate the model
    model.eval()
    fold_preds = []
    with torch.no_grad():
        # Score the held-out split in mini-batches instead of one huge call.
        for start in range(0, len(test_user_ids), batch_size):
            batch_user_ids = test_user_ids[start:start + batch_size].to(device)
            batch_item_ids = test_item_ids[start:start + batch_size].to(device)
            batch_preds = model(batch_user_ids, batch_item_ids, adj_list) * 5
            fold_preds.append(batch_preds.cpu())

    test_preds = torch.cat(fold_preds)
    # mean_squared_error takes (y_true, y_pred).
    mse = mean_squared_error(test_ratings.numpy(), test_preds.numpy())
    mse_scores.append(mse)
    print(f'MSE for this fold: {mse}')

print(f'Average MSE for {kf.get_n_splits()} folds: {np.mean(mse_scores)}')


FileNotFoundError: [Errno 2] No such file or directory: 'ratings.dat'