In [None]:
!pip uninstall -y numpy scikit-surprise
!pip install numpy==1.23.5
!pip install scikit-surprise --no-binary :all:

In [2]:
!pip install scikit-surprise --quiet surprise

Traceback (most recent call last):
  File "/usr/local/bin/pip3", line 4, in <module>
    from pip._internal.cli.main import main
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/main.py", line 11, in <module>
    from pip._internal.cli.autocompletion import autocomplete
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/autocompletion.py", line 10, in <module>
    from pip._internal.cli.main_parser import create_main_parser
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/main_parser.py", line 9, in <module>
    from pip._internal.build_env import get_runnable_pip
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/build_env.py", line 19, in <module>
^C


In [1]:
# ✅ Re-run these after restarting runtime
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

In [2]:
!wget -q --show-progress https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip -o ml-latest-small.zip

Archive:  ml-latest-small.zip
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [3]:
movies = pd.read_csv("ml-latest-small/movies.csv")
ratings = pd.read_csv("ml-latest-small/ratings.csv")

In [4]:
print(f" Movies Shape: {movies.shape}")
print(f" Ratings Shape: {ratings.shape}")

 Movies Shape: (9742, 3)
 Ratings Shape: (100836, 4)


In [5]:
# Content-Based Filtering
# Merge genres into overview
movies['overview'] = movies['title'] + " " + movies['genres'].fillna('')

In [6]:
# TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['overview'])

In [7]:
# Cosine Similarity Matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [8]:
# Reverse mapping
indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()

In [9]:
# Content-Based Recommendation Function
def content_recommend(title, n=5):
    if title not in indices:
        return f"❌ Movie '{title}' not found in dataset."
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:n+1]
    movie_indices = [i[0] for i in sim_scores]
    return movies[['title', 'genres']].iloc[movie_indices]

In [10]:
# Example
print("Content-Based Recommendations for 'Toy Story (1995)':")
print(content_recommend("Toy Story (1995)"))

Content-Based Recommendations for 'Toy Story (1995)':
                                      title  \
2355                     Toy Story 2 (1999)   
7355                     Toy Story 3 (2010)   
3595                        Toy, The (1982)   
2539  We're Back! A Dinosaur's Story (1993)   
26                      Now and Then (1995)   

                                                genres  
2355       Adventure|Animation|Children|Comedy|Fantasy  
7355  Adventure|Animation|Children|Comedy|Fantasy|IMAX  
3595                                            Comedy  
2539              Adventure|Animation|Children|Fantasy  
26                                      Children|Drama  


In [11]:
# Collaborative Filtering (SVD)
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

model = SVD()
cross_validate(model, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8824  0.8790  0.8835  0.8816  0.0019  
MAE (testset)     0.6792  0.6763  0.6794  0.6783  0.0014  
Fit time          1.19    1.23    1.23    1.22    0.02    
Test time         0.26    0.18    0.32    0.26    0.06    


{'test_rmse': array([0.88235438, 0.87896238, 0.88350459]),
 'test_mae': array([0.6792032 , 0.67625394, 0.67937434]),
 'fit_time': (1.1942799091339111, 1.227741003036499, 1.2337591648101807),
 'test_time': (0.2633678913116455, 0.18350553512573242, 0.3222627639770508)}

In [12]:
# Predict function
def predict_rating(userId, movieId):
    return model.predict(userId, movieId).est

In [13]:
# Recommend top N movies for a user
def collaborative_recommend(user_id, n=5):
    rated = ratings[ratings['userId'] == user_id]['movieId'].tolist()
    all_movies = ratings['movieId'].unique()
    not_rated = [m for m in all_movies if m not in rated]
    predictions = [(mid, predict_rating(user_id, mid)) for mid in not_rated]
    predictions.sort(key=lambda x: x[1], reverse=True)
    top_ids = [pred[0] for pred in predictions[:n]]
    return movies[movies['movieId'].isin(top_ids)][['title', 'genres']]

In [14]:
# Example
print("\n Collaborative Recommendations for User 1:")
print(collaborative_recommend(1))


 Collaborative Recommendations for User 1:
                                 title                      genres
277   Shawshank Redemption, The (1994)                 Crime|Drama
686                 Rear Window (1954)            Mystery|Thriller
692            Some Like It Hot (1959)                Comedy|Crime
711                   Notorious (1946)  Film-Noir|Romance|Thriller
8466                   Whiplash (2014)                       Drama
