In [4]:
import pandas as pd
import numpy as np


movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")


print("Movies Dataset:")
print(movies.head())

print("\nRatings Dataset:")
print(ratings.head())


Movies Dataset:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings Dataset:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [5]:


def top_popular_by_count(n=5):
    """Top n movies by number of ratings (popularity)."""
    counts = ratings.groupby('movieId').size().reset_index(name='rating_count')
    merged = counts.merge(movies, on='movieId')
    top = merged.sort_values('rating_count', ascending=False).head(n)
    return top[['movieId','title','genres','rating_count']]

def top_by_avg_rating(n=5, min_ratings=50):
    """Top n movies by average rating, but only movies with >= min_ratings."""
    agg = ratings.groupby('movieId').agg({'rating':['mean','count']})
    agg.columns = ['avg_rating','rating_count']
    agg = agg.reset_index()
    agg = agg[agg['rating_count'] >= min_ratings]
    merged = agg.merge(movies, on='movieId')
    top = merged.sort_values(['avg_rating','rating_count'], ascending=[False, False]).head(n)
    return top[['movieId','title','genres','avg_rating','rating_count']]

print("Top 5 by rating count (popularity):")
print(top_popular_by_count(5).to_string(index=False))

print("\nTop 5 by average rating (min 50 ratings):")
print(top_by_avg_rating(5, min_ratings=50).to_string(index=False))


Top 5 by rating count (popularity):
 movieId                            title                      genres  rating_count
     356              Forrest Gump (1994)    Comedy|Drama|Romance|War           329
     318 Shawshank Redemption, The (1994)                 Crime|Drama           317
     296              Pulp Fiction (1994) Comedy|Crime|Drama|Thriller           307
     593 Silence of the Lambs, The (1991)       Crime|Horror|Thriller           279
    2571               Matrix, The (1999)      Action|Sci-Fi|Thriller           278

Top 5 by average rating (min 50 ratings):
 movieId                                                                       title                      genres  avg_rating  rating_count
     318                                            Shawshank Redemption, The (1994)                 Crime|Drama    4.429022           317
     858                                                       Godfather, The (1972)                 Crime|Drama    4.289062           192


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


movies_cb = movies.copy()
movies_cb['genres'] = movies_cb['genres'].fillna('')
movies_cb['genres_literal'] = movies_cb['genres'].str.replace('|', ' ', regex=False)


vectorizer = CountVectorizer()
genre_matrix = vectorizer.fit_transform(movies_cb['genres_literal'])
genre_cosine_sim = cosine_similarity(genre_matrix, genre_matrix)


title_to_index = pd.Series(movies_cb.index, index=movies_cb['title']).drop_duplicates()

def get_content_recommendations(title, top_n=5):
    """Return top_n movies similar to the given title (by genres)."""
    if title not in title_to_index:
        return f"Title '{title}' not found in dataset."
    idx = int(title_to_index[title])
    sim_scores = list(enumerate(genre_cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]  # skip the movie itself
    movie_indices = [i[0] for i in sim_scores]
    return movies_cb.iloc[movie_indices][['movieId','title','genres']]


seed = "Avengers, The (1998)"
   
print(f"Content-based recommendations for: {seed}")
print(get_content_recommendations(seed, top_n=5).to_string(index=False))


Content-based recommendations for: Avengers, The (1998)
 movieId                                                                          title           genres
     761                                                            Phantom, The (1996) Action|Adventure
     809                                                                    Fled (1996) Action|Adventure
    1049                                             Ghost and the Darkness, The (1996) Action|Adventure
    1198 Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) Action|Adventure
    1291                                      Indiana Jones and the Last Crusade (1989) Action|Adventure


In [7]:
seed = "Urban Legend (1998)"
   
print(f"Content-based recommendations for: {seed}")
print(get_content_recommendations(seed, top_n=5).to_string(index=False))

Content-based recommendations for: Urban Legend (1998)
 movieId                                                                                  title          genres
     328                                     Tales from the Crypt Presents: Demon Knight (1995) Horror|Thriller
     407                                                         In the Mouth of Madness (1995) Horror|Thriller
     742                                                                         Thinner (1996) Horror|Thriller
     879                                                                      Relic, The (1997) Horror|Thriller
     891 Halloween: The Curse of Michael Myers (Halloween 6: The Curse of Michael Myers) (1995) Horror|Thriller


In [9]:
from sklearn.metrics.pairwise import cosine_similarity


ratings_pivot = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)
print("Ratings pivot shape (users x movies):", ratings_pivot.shape)


item_similarity = cosine_similarity(ratings_pivot.T)   
item_ids = list(ratings_pivot.columns)                


pos_to_movieid = {i: mid for i, mid in enumerate(item_ids)}
movieid_to_pos = {mid: i for i, mid in enumerate(item_ids)}

def get_cf_recommendations(user_id, top_n=5):
   
    if user_id not in ratings_pivot.index:
        return f"User {user_id} not found in ratings."
    user_ratings = ratings_pivot.loc[user_id].values  

    raw_scores = item_similarity.dot(user_ratings)     
   
    sim_sums = np.abs(item_similarity).sum(axis=1)
    with np.errstate(divide='ignore', invalid='ignore'):
        scores = raw_scores / sim_sums

    preds = pd.DataFrame({
        'movieId': [pos_to_movieid[i] for i in range(len(scores))],
        'pred_score': scores,
        'user_has_rated': user_ratings
    })
   
    unseen = preds[preds['user_has_rated'] == 0]
    top_unseen = unseen.sort_values('pred_score', ascending=False).head(top_n)
    
    recommendations = top_unseen.merge(movies, on='movieId')[['movieId','title','genres','pred_score']]
    return recommendations





Ratings pivot shape (users x movies): (610, 9724)


In [10]:
user_counts = ratings['userId'].value_counts()
sample_user = user_counts[user_counts >= 10].index[0] if any(user_counts >= 10) else ratings['userId'].iloc[0]
print(f"Collaborative (item-based) recommendations for user {sample_user}:")
print(get_cf_recommendations(sample_user, top_n=5).to_string(index=False))

Collaborative (item-based) recommendations for user 414:
 movieId                                      title         genres  pred_score
   27373                                 61* (2001)          Drama    3.378024
  193609        Andrew Dice Clay: Dice Rules (1991)         Comedy    3.354932
  141846  Steve Jobs: The Man in the Machine (2015)    Documentary    3.335774
    6480            Thoroughly Modern Millie (1967) Comedy|Musical    3.279013
   68536 Stanley Kubrick: A Life in Pictures (2001)    Documentary    3.273283


In [12]:
def print_rec_df(df, header="Recommendations"):
    if isinstance(df, str):
        print(df
        return
    print(f"\n--- {header} ---")
    for i, row in df.reset_index(drop=True).iterrows():
        title = row.get('title') or row.get('movieId')
        extra = []
        if 'pred_score' in row: extra.append(f"score={row['pred_score']:.3f}")
        if 'avg_rating' in row: extra.append(f"avg={row['avg_rating']:.2f}")
        if 'rating_count' in row: extra.append(f"votes={int(row['rating_count'])}")
        print(f"{i+1}. {title}  {' | '.join(extra)}")


In [23]:
print("----------------------------------------------------------")
print(top_popular_by_count(3).to_string(index=False))

print("\n----------------------------------------------------------")
print(top_by_avg_rating(3, min_ratings=50).to_string(index=False))


----------------------------------------------------------
 movieId                            title                      genres  rating_count
     356              Forrest Gump (1994)    Comedy|Drama|Romance|War           329
     318 Shawshank Redemption, The (1994)                 Crime|Drama           317
     296              Pulp Fiction (1994) Comedy|Crime|Drama|Thriller           307

----------------------------------------------------------
 movieId                            title                      genres  avg_rating  rating_count
     318 Shawshank Redemption, The (1994)                 Crime|Drama    4.429022           317
     858            Godfather, The (1972)                 Crime|Drama    4.289062           192
    2959                Fight Club (1999) Action|Crime|Drama|Thriller    4.272936           218


In [21]:
print("----------------------------------------------------------")
print(get_content_recommendations("Toy Story (1995)", 3).to_string(index=False))

print("\n----------------------------------------------------------")
print(get_content_recommendations("Pulp Fiction (1994)", 3).to_string(index=False))

print("\n----------------------------------------------------------")
print(get_content_recommendations("Lion King, The (1994)", 3).to_string(index=False))


----------------------------------------------------------
 movieId                                          title                                      genres
    2294                                    Antz (1998) Adventure|Animation|Children|Comedy|Fantasy
    3114                             Toy Story 2 (1999) Adventure|Animation|Children|Comedy|Fantasy
    3754 Adventures of Rocky and Bullwinkle, The (2000) Adventure|Animation|Children|Comedy|Fantasy

----------------------------------------------------------
 movieId                                                 title                      genres
     608                                          Fargo (1996) Comedy|Crime|Drama|Thriller
    1034                                        Freeway (1996) Comedy|Crime|Drama|Thriller
    3266 Man Bites Dog (C'est arrivé près de chez vous) (1992) Comedy|Crime|Drama|Thriller

----------------------------------------------------------
 movieId                                                 

In [22]:
print("-----------------------------")
print(get_cf_recommendations(1, top_n=3).to_string(index=False))

print("\n-------------------------------")
print(get_cf_recommendations(50, top_n=3).to_string(index=False))

print("\n-----------------------------")
print(get_cf_recommendations(200, top_n=3).to_string(index=False))


-----------------------------
 movieId                                          title                       genres  pred_score
     876 Supercop 2 (Project S) (Chao ji ji hua) (1993) Action|Comedy|Crime|Thriller    1.356194
    6145                                   Venom (1982)              Horror|Thriller    1.323607
    2812                             In Too Deep (1999)              Action|Thriller    1.280674

-------------------------------
 movieId                                     title                     genres  pred_score
  131739                   Batman vs. Robin (2015) Action|Adventure|Animation    0.865116
  141846 Steve Jobs: The Man in the Machine (2015)                Documentary    0.839585
    6145                              Venom (1982)            Horror|Thriller    0.785248

-----------------------------
 movieId                                title        genres  pred_score
   55854                 Fugitive, The (1947)         Drama    1.922002
    8626 Dr. T