<a href="https://colab.research.google.com/github/warrengmartin/Which-Movie-For-Me/blob/main/Movie_Review_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import zipfile
import io
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.impute import KNNImputer

# URL of the MovieLens "latest-small" archive (contains ratings.csv and movies.csv)
ratings_url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"

# Download the archive. A timeout keeps the cell from hanging forever, and
# raise_for_status() stops here on an HTTP error instead of handing an error
# page to ZipFile further down.
response = requests.get(ratings_url, timeout=60)
response.raise_for_status()

# Extract the two CSV files and move them next to the notebook.
# os.replace overwrites an existing file on every platform, so the cell can be
# re-run safely (os.rename raises on Windows when the target already exists).
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
    for member in ('ml-latest-small/ratings.csv', 'ml-latest-small/movies.csv'):
        zip_file.extract(member, '.')
        os.replace(member, os.path.basename(member))

print("Files downloaded and extracted successfully.")

Files downloaded and extracted successfully.


In [None]:


# Read the two MovieLens tables from the working directory
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

# Attach the movie columns to every rating row, joined on movieId
data = ratings.merge(movies, on='movieId')

# Users as rows, movies as columns, rating as the cell value
user_item_matrix = pd.pivot_table(
    data,
    values='rating',
    index='userId',
    columns='movieId',
)

# Fill the missing cells with KNN imputation (2 nearest neighbours)
imputer = KNNImputer(n_neighbors=2)
user_item_matrix_imputed = imputer.fit_transform(user_item_matrix)

# Put the imputed values back into a DataFrame that carries the original
# userId index and movieId columns
user_item_matrix = pd.DataFrame(
    user_item_matrix_imputed,
    index=user_item_matrix.index,
    columns=user_item_matrix.columns,
)

In [None]:


# Min-max scale the whole matrix with its global extremes, so the smallest
# value maps to 0 and the largest to 1
global_min = user_item_matrix.min().min()
global_max = user_item_matrix.max().max()
user_item_matrix_norm = user_item_matrix.sub(global_min).div(global_max - global_min)

# Hand the scaled values to PyTorch as a float32 tensor
user_item_tensor = torch.tensor(user_item_matrix_norm.to_numpy(), dtype=torch.float32)

# Define the dataset class
class MovieLensDataset(Dataset):
    """Dataset wrapper that serves one slice of the wrapped tensor per index.

    The length is the size of the tensor's first dimension, and item ``idx``
    is ``user_item_tensor[idx]``.
    """

    def __init__(self, user_item_tensor):
        # Keep a reference to the tensor; no copy is made.
        self.user_item_tensor = user_item_tensor

    def __len__(self):
        n_rows = self.user_item_tensor.shape[0]
        return n_rows

    def __getitem__(self, idx):
        row = self.user_item_tensor[idx]
        return row

# Wrap the ratings tensor and serve it in shuffled batches of 32 rows
dataset = MovieLensDataset(user_item_tensor=user_item_tensor)
dataloader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

# Define the model
class CollaborativeFiltering(nn.Module):
    """Embedding dot-product model with a sigmoid output.

    A score is ``sigmoid(sum(user_vec * item_vec))``, so every prediction lies
    strictly between 0 and 1.
    """

    def __init__(self, num_users, num_items, embedding_dim):
        """Create the embedding tables.

        Args:
            num_users: number of known users; one extra row is allocated.
            num_items: number of items.
            embedding_dim: width of every embedding vector.
        """
        super().__init__()
        # +1 row reserved "for new users"; nothing in this class reads it explicitly.
        self.user_embedding = nn.Embedding(num_embeddings=num_users + 1, embedding_dim=embedding_dim)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=embedding_dim)
        # Not used by forward() or predict_new_user(), but it is a registered
        # submodule and therefore part of the saved state dict.
        self.fc = nn.Linear(in_features=embedding_dim, out_features=1)

        # Small Gaussian initialisation for both embedding tables
        for table in (self.user_embedding, self.item_embedding):
            nn.init.normal_(table.weight, mean=0, std=0.01)

    def forward(self, user_indices, item_indices):
        """Return sigmoid dot-product scores for paired user/item indices.

        The output has the shape of the index tensors; the embedding
        dimension is summed away.
        """
        user_embed = self.user_embedding(user_indices)
        item_embed = self.item_embedding(item_indices)
        logits = (user_embed * item_embed).sum(dim=-1)
        return torch.sigmoid(logits)

    def predict_new_user(self, new_user_vector, item_indices):
        """Score items for a user that has no row in the user embedding table.

        The user's embedding is built as ``new_user_vector @ item_embedding.weight``,
        i.e. a sum of item embeddings weighted by the entries of ``new_user_vector``.

        NOTE(review): that sum is not divided by the number of rated items, so
        the logit magnitude grows with how many entries are non-zero. Confirm
        this is intended.
        """
        new_user_embed = new_user_vector @ self.item_embedding.weight
        item_embed = self.item_embedding(item_indices)
        logits = (new_user_embed * item_embed).sum(dim=-1)
        return torch.sigmoid(logits)

# Seed PyTorch before any weights are created so that embedding initialisation
# and batch shuffling are the same on every Restart & Run All.
torch.manual_seed(42)

# Initialize the model, loss function, and optimizer.
# One user per matrix row, one item per matrix column.
num_users, num_items = user_item_matrix.shape
embedding_dim = 50
model = CollaborativeFiltering(num_users, num_items, embedding_dim)
criterion = nn.MSELoss()  # targets are min-max scaled ratings, predictions are sigmoid outputs
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
#
# The model looks users up by their row position in `user_item_tensor`, so each
# batch is drawn as a shuffled set of row positions. That way the embedding
# being updated is always the one that belongs to the ratings row it is fitted
# against (indexing every batch with 0..batch_size-1 would train only the first
# few embeddings, and on the wrong users).
num_epochs = 3
train_batch_size = dataloader.batch_size  # batch size configured on the DataLoader
all_item_indices = torch.arange(user_item_tensor.shape[1])
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    # Fresh random order of users every epoch, cut into batches
    user_batches = torch.randperm(user_item_tensor.shape[0]).split(train_batch_size)
    for batch_user_ids in user_batches:
        target_ratings = user_item_tensor[batch_user_ids]

        # Pair every user in the batch with every item: both index grids
        # have shape (len(batch_user_ids), number of items)
        user_indices = batch_user_ids.unsqueeze(1).expand(-1, all_item_indices.shape[0])
        item_indices = all_item_indices.unsqueeze(0).expand(batch_user_ids.shape[0], -1)

        optimizer.zero_grad()
        predictions = model(user_indices, item_indices)
        loss = criterion(predictions, target_ratings)
        loss.backward()

        # Gradient clipping must come after backward() and before step():
        # it rescales the gradients that were just computed.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(user_batches)}')

# Evaluation: score every (user, item) cell at once and compare against
# `user_item_tensor` with the same loss used during training
model.eval()
with torch.no_grad():
    n_rows, n_cols = user_item_matrix.shape
    # Row-index grid and column-index grid, both of shape (n_rows, n_cols)
    user_indices = torch.arange(n_rows)[:, None].expand(n_rows, n_cols)
    item_indices = torch.arange(n_cols)[None, :].expand(n_rows, n_cols)
    predictions = model(user_indices, item_indices)
    mse = criterion(predictions, user_item_tensor)
    print(f'Validation MSE: {mse.item()}')

# Persist only the learned parameters (state dict), not the model object
torch.save(model.state_dict(), 'collaborative_filtering_model.pth')
print("Model saved successfully.")

Epoch 1/3, Loss: 0.03833123194053769
Epoch 2/3, Loss: 0.009790681838057935
Epoch 3/3, Loss: 0.007615978247486055
Validation MSE: 0.05733146145939827
Model saved successfully.


In [None]:
def load_model_and_predict(new_user_ratings, user_item_matrix, model_path):
    """Score every movie for a user who was not part of training.

    Args:
        new_user_ratings: DataFrame indexed by movieId with a 'rating' column.
            Every movieId must be one of ``user_item_matrix.columns``.
        user_item_matrix: the ratings matrix the model was trained on. Its
            shape sizes the model, its columns fix the item order, and its
            global min/max are used to scale the new ratings.
        model_path: path of a state dict saved from ``CollaborativeFiltering``.

    Returns:
        pandas Series of sigmoid scores indexed by ``user_item_matrix.columns``.
    """
    # Load the weights first so the embedding width is read from the checkpoint
    # itself rather than from a notebook-level global.
    # Note: torch.load unpickles the file, so only load checkpoints you trust.
    state_dict = torch.load(model_path)
    checkpoint_dim = state_dict['item_embedding.weight'].shape[1]
    model = CollaborativeFiltering(user_item_matrix.shape[0], user_item_matrix.shape[1], checkpoint_dim)
    model.load_state_dict(state_dict)
    model.eval()

    # Lay the new user's ratings out in the item order of the matrix;
    # movies the user has not rated stay NaN for now
    new_user_vector = pd.Series(index=user_item_matrix.columns, dtype=float)
    new_user_vector.loc[new_user_ratings.index] = new_user_ratings['rating']

    # Scale with the same global min/max as the training matrix (computed once)
    rating_min = user_item_matrix.min().min()
    rating_max = user_item_matrix.max().max()
    new_user_vector_norm = (new_user_vector - rating_min) / (rating_max - rating_min)

    # Fill the gaps only after scaling: an unrated movie then has weight 0 and
    # contributes nothing to the user's embedding. Filling with 0 before
    # scaling would turn every unrated movie into a non-zero (negative, when
    # the minimum rating is above 0) weight.
    new_user_vector_norm = new_user_vector_norm.fillna(0)

    # Convert to a (1, num_items) tensor
    new_user_tensor = torch.tensor(new_user_vector_norm.values, dtype=torch.float).unsqueeze(0)

    # Make predictions for every item
    with torch.no_grad():
        item_indices = torch.arange(user_item_matrix.shape[1])
        predictions = model.predict_new_user(new_user_tensor, item_indices)

    # Label the scores with their movieIds
    predicted_ratings = pd.Series(predictions.squeeze().numpy(), index=user_item_matrix.columns)

    return predicted_ratings

In [None]:
# Generate synthetic ratings for a new user (at least 5 ratings)
np.random.seed(123)  # for reproducibility
num_ratings = 5000  # You can change this to any number >= 5
random_movies = np.random.choice(user_item_matrix.columns, num_ratings, replace=False)
new_user_ratings = pd.DataFrame({
    'movieId': random_movies,
    # randint's upper bound is exclusive: (1, 6) yields whole-star ratings 1..5
    'rating': np.random.randint(1, 6, size=num_ratings)
}).set_index('movieId')


print("New user ratings:")
print(new_user_ratings.head())

# Load the model and make predictions
predicted_ratings = load_model_and_predict(new_user_ratings, user_item_matrix, 'collaborative_filtering_model.pth')

# Keep the five highest-scoring movies (variable name kept for compatibility)
top_10_recommendations = predicted_ratings.sort_values(ascending=False).head(5)

# Look the movies up by movieId in score order; an isin() filter would return
# them in movies.csv order and lose the ranking
recommended_movies = movies.set_index('movieId').loc[top_10_recommendations.index].reset_index()

print("\nTop 5 movie recommendations for the new user:")
print(recommended_movies[['title', 'genres']])

New user ratings:
         rating
movieId        
31692         1
4814          0
88272         2
47721         0
6347          3

Top 5 movie recommendations for the new user:
                                      title  \
0                          Toy Story (1995)   
6020              Darwin's Nightmare (2004)   
6022    No Direction Home: Bob Dylan (2005)   
6023  Goal! The Dream Begins (Goal!) (2005)   
6026        Squid and the Whale, The (2005)   

                                           genres  
0     Adventure|Animation|Children|Comedy|Fantasy  
6020                                  Documentary  
6022                                  Documentary  
6023                                        Drama  
6026                                 Comedy|Drama  
