In [5]:
!pip install surprise numpy pandas

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357275 sha256=58f707c14970cd77f8bbbccc04e9a23a95f4143c07b8e249bd02fc0dc931f60b
  Stored in directory: /root/.cach

## Simple Collaborative Filtering

In [34]:
import pandas as pd
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split

# Load MovieLens 100K dataset
url = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.data'
col_names = ['userId', 'movieId', 'rating', 'timestamp']
data = pd.read_csv(url, sep='\t', names=col_names, header=None)

# Load MovieLens movie titles
movies_url = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.item'
movies_columns = [
    'movieId', 'movieTitle', 'releaseDate', 'videoReleaseDate', 'IMDbURL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'FilmNoir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'SciFi', 'Thriller', 'War', 'Western'
]
movies = pd.read_csv(movies_url, sep='|', names=movies_columns, encoding='latin-1')

# Define the rating scale
reader = Reader(rating_scale=(1, 5))

# Load the data into Surprise's Dataset format
dataset = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader)

# Split the data into training and testing sets
trainset, testset = train_test_split(dataset, test_size=0.2)

# Initialize the SVD algorithm
svd = SVD()

# Train the model
svd.fit(trainset)

# Make predictions on the test set
predictions = svd.test(testset)

# Evaluate the model using RMSE
rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse:.4f}")

# Get top N recommendations for each user
def get_top_n_recommendations(predictions, n=10):
    top_n = {}
    for uid, iid, true_r, est, _ in predictions:
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))

    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

top_n = get_top_n_recommendations(predictions, n=10)

# Map movie ID to movie titles
movie_titles = {row['movieId']: row['movieTitle'] for _, row in movies.iterrows()}

# Display top 10 recommendations for user 1 with movie titles
print("Top 10 recommendations for user 1 with titles:")
for movie_id, rating in top_n[1]:
    movie_title = movie_titles.get(movie_id, "Unknown Movie")
    print(f"{movie_title}: Predicted Rating = {rating:.2f}")


RMSE: 0.9297
RMSE: 0.9297
Top 10 recommendations for user 1 with titles:
Usual Suspects, The (1995): Predicted Rating = 4.78
Pulp Fiction (1994): Predicted Rating = 4.68
Wrong Trousers, The (1993): Predicted Rating = 4.61
Mr. Smith Goes to Washington (1939): Predicted Rating = 4.35
Postino, Il (1994): Predicted Rating = 4.29
Raising Arizona (1987): Predicted Rating = 4.27
Graduate, The (1967): Predicted Rating = 4.26
Bound (1996): Predicted Rating = 4.25
Three Colors: Red (1994): Predicted Rating = 4.22
Jean de Florette (1986): Predicted Rating = 4.18


## Simple Content Based Filtering

In [33]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load movie data (movieId, movieTitle, genres)
url = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.item'
col_names = ['movieId', 'movieTitle', 'releaseDate', 'videoReleaseDate', 'IMDbURL',
             'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
             'Crime', 'Documentary', 'Drama', 'Fantasy', 'FilmNoir', 'Horror',
             'Musical', 'Mystery', 'Romance', 'SciFi', 'Thriller', 'War', 'Western']

# Load data
movie_data = pd.read_csv(url, sep='|', encoding='latin-1', names=col_names, header=None)

# Keep only relevant columns (movieId, movieTitle, and genres)
movie_data = movie_data[['movieId', 'movieTitle'] + col_names[5:]]  # Columns from 'unknown' to 'Western'

# Display the first few rows
print(movie_data.head())

# Now, create a movie profile using the genre columns
# We will use the genres as features (binary values indicating genre presence)

# We already have binary genre columns, so we can use these as is for similarity calculation

# Create a genre matrix (Movies x Genres)
genre_matrix = movie_data.drop(columns=['movieId', 'movieTitle'])

# 3. Compute Cosine Similarity between movies
cosine_sim = cosine_similarity(genre_matrix)

# We now have a matrix where each element (i, j) represents the similarity between movie i and movie j

# 4. Recommend similar movies

def get_similar_movies(movie_title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = movie_data.index[movie_data['movieTitle'] == movie_title].tolist()[0]

    # Get pairwise similarity scores for the given movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on similarity scores (highest to lowest)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the most similar movies (excluding the movie itself)
    sim_scores = sim_scores[1:11]  # Top 10 recommendations, excluding itself
    movie_indices = [i[0] for i in sim_scores]

    # Get the movie titles of the most similar movies
    recommended_movies = movie_data['movieTitle'].iloc[movie_indices]

    return recommended_movies

# Example: Get movies similar to "Toy Story (1995)"
recommended_movies = get_similar_movies('Toy Story (1995)')
print("Movies similar to Toy Story (1995):")
for movie in recommended_movies:
    print(movie)


   movieId         movieTitle  unknown  Action  Adventure  Animation  \
0        1   Toy Story (1995)        0       0          0          1   
1        2   GoldenEye (1995)        0       1          1          0   
2        3  Four Rooms (1995)        0       0          0          0   
3        4  Get Shorty (1995)        0       1          0          0   
4        5     Copycat (1995)        0       0          0          0   

   Children  Comedy  Crime  Documentary  ...  Fantasy  FilmNoir  Horror  \
0         1       1      0            0  ...        0         0       0   
1         0       0      0            0  ...        0         0       0   
2         0       0      0            0  ...        0         0       0   
3         0       1      0            0  ...        0         0       0   
4         0       0      1            0  ...        0         0       0   

   Musical  Mystery  Romance  SciFi  Thriller  War  Western  
0        0        0        0      0         0    0    