In [5]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error,mean_absolute_error
from scipy.sparse import csr_matrix
from math import sqrt

from sympy.codegen.ast import continue_

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth',None)



In [6]:

movies=pd.read_csv('/Users/yugjain/Documents/Machine_learning/Movies/datasets/processed/movies_cleaned.csv')
ratings=pd.read_csv('/Users/yugjain/Documents/Machine_learning/Movies/datasets/processed/ratings_cleaned.csv')

train_ratings=pd.read_csv('/Users/yugjain/Documents/Machine_learning/Movies/datasets/splits/train_ratings.csv')
test_ratings=pd.read_csv('/Users/yugjain/Documents/Machine_learning/Movies/datasets/splits/test_ratings.csv')

print(f"movies : {len(movies) : }")
print(f"total ratings : {len(ratings)}")
print(f"Train ratings: {len(train_ratings):,}")
print(f"Test ratings: {len(test_ratings):,}")
print(f"Users: {ratings['userId'].nunique():,}")
print(f"Movies rated: {ratings['movieId'].nunique():,}")

# Preview
print("\nSample ratings:")
display(ratings.head(10))


movies :  9391
total ratings : 2487172
Train ratings: 1,989,737
Test ratings: 497,435
Users: 17,011
Movies rated: 9,391

Sample ratings:


Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,timestamp
0,0,8050,70,1.0,1996-02-12 14:05:35
1,1,8050,21,3.0,1996-02-12 14:05:36
2,2,8050,10,3.0,1996-02-12 14:05:37
3,3,8050,1,5.0,1996-02-12 14:05:37
4,4,8050,32,5.0,1996-02-12 14:05:38
5,5,8050,50,4.0,1996-02-12 14:05:38
6,6,8050,76,3.0,1996-02-12 14:05:39
7,7,8050,62,4.0,1996-02-12 14:05:39
8,8,8050,16,4.0,1996-02-12 14:05:40
9,9,8050,14,3.0,1996-02-12 14:05:41


In [7]:
print("COLLABORATIVE FILTERING CONCEPT:")
print("=" * 70)
print("Content-Based: 'If you liked Action movies, here are more Action movies'")
print("Collaborative: 'Users like you also enjoyed these movies'")
print()
print("Two approaches:")
print("  1. User-Based: Find similar users, recommend what they liked")
print("  2. Item-Based: Find similar movies based on rating patterns")
print()
print("Key difference from content-based:")
print("  - Doesn't use movie features (genres, actors, etc.)")
print("  - Uses only user rating patterns")
print("  - Can recommend across different genres!")
print("=" * 70)

COLLABORATIVE FILTERING CONCEPT:
Content-Based: 'If you liked Action movies, here are more Action movies'
Collaborative: 'Users like you also enjoyed these movies'

Two approaches:
  1. User-Based: Find similar users, recommend what they liked
  2. Item-Based: Find similar movies based on rating patterns

Key difference from content-based:
  - Doesn't use movie features (genres, actors, etc.)
  - Uses only user rating patterns
  - Can recommend across different genres!


In [8]:
print("Creating user item matrix")
user_item_matrix=train_ratings.pivot_table(index='userId',columns='movieId',values='rating')
print(f"\nUser item matrix shape : {user_item_matrix.shape}")
print(f"user : {user_item_matrix.shape[0]}")
print(f"movies : {user_item_matrix.shape[1]}")

total_cells=user_item_matrix.shape[0]*user_item_matrix.shape[1]
filled_cells=user_item_matrix.count().sum()
sparsity=(1-filled_cells/total_cells)*100

print("\n Matrics statistics")
print(f" total possible rating : {total_cells}")
print(f" Actual rating {filled_cells}")
print(f"sparsity : {sparsity:.2f}")
print(f"density : {100-sparsity:.2f}%")

print("\n first 5 user and movies : ")
display(user_item_matrix.iloc[:10,:10])


Creating user item matrix

User item matrix shape : (17011, 9391)
user : 17011
movies : 9391

 Matrics statistics
 total possible rating : 159750301
 Actual rating 1989737
sparsity : 98.75
density : 1.25%

 first 5 user and movies : 


movieId,1,2,3,4,5,6,7,8,9,10
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,,3.5,,,,,,,,
2,,,,,,,,,,
3,4.0,,,,,,,,,
4,,,,,,3.0,,,,4.0
5,,,,,,,,,,
6,5.0,,,,,,5.0,,,
7,,,3.0,,,,,,,
8,4.0,,5.0,,,3.0,,,,4.0
9,,,,,,,,,,
10,4.0,,,,,,,,,


In [9]:
#calculate the item based similarity

print("calculate the item based similarity")
print("this compares movies based on how user rated them")
print("please wait...")

user_item_filled=user_item_matrix.fillna(0)

#calculate the similarity between the movies
movie_similarity = cosine_similarity(user_item_filled.T)

#create DataFrame
movie_similarity_df= pd.DataFrame(
    movie_similarity,
    index=user_item_matrix.columns,
    columns=user_item_matrix.columns
)

print(f"movies similarity matrix shape : {movie_similarity_df.shape}")
print(f"Memory usage : {movie_similarity_df.memory_usage(deep=True).sum()/1024**2:.3f} MB")

print("\n sample movies similarities: ")
print("(1.0= identical rating pattens ,  0.0 = completely different)")
display(movie_similarity_df.iloc[:5, :5])



calculate the item based similarity
this compares movies based on how user rated them
please wait...
movies similarity matrix shape : (9391, 9391)
Memory usage : 673.167 MB

 sample movies similarities: 
(1.0= identical rating pattens ,  0.0 = completely different)


movieId,1,2,3,4,5
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.0,0.328112,0.244218,0.101679,0.257931
2,0.328112,1.0,0.185232,0.128354,0.220068
3,0.244218,0.185232,1.0,0.114978,0.358623
4,0.101679,0.128354,0.114978,1.0,0.140425
5,0.257931,0.220068,0.358623,0.140425,1.0


In [10]:
#item based recommendattion function

def get_item_based_recommendation(movie_title,n_recommendations=10,min_ratings =20):
    movie_watch=movies[movies['title']==movie_title]
    if len(movie_watch)==0:
        print(f"Movie '{movie_title}' not found")
        return None
    movie_id=movie_watch['movieId'].values[0]

    if movie_id not in movie_similarity_df.index:
        print(f"movie  ID {movie_id} not collaborative filtering matrix")
        print("this movie not might have enough ratings")
        return None
    movie_genres=movie_watch['genres'].values[0]
    print(f"finding recommendations for : {movie_title}")
    print(f"genres :{movie_genres}")
    print(f"method item based collaborative filtering\n")

    similar_scores = movie_similarity_df[movie_id].sort_values(ascending=False)

    similar_scores=similar_scores[similar_scores.index != movie_id]
    top_candidates = similar_scores.head(n_recommendations*3)

    movie_stats = ratings.groupby('movieId').agg({'rating' :['mean','count']}).reset_index()
    movie_stats.columns =['movieId','avg_rating','num_ratings']

    recommendations= []
    for candidate_id, similarity_score in top_candidates.items():
        movie_info_df=movies[movies['movieId']==candidate_id]

        if len(movie_info_df)==0:
            continue

        movie_info=movie_info_df.iloc[0]

        stats = movie_stats[movie_stats['movieId'] == candidate_id]

        if len(stats)>0:
            avg_rating =stats['avg_rating'].values[0]
            num_ratings = stats['num_ratings'].values[0]

            if num_ratings >= min_ratings:
                recommendations.append({
                    'movieId' : candidate_id,
                    'title' : movie_info['title'],
                    'genres' : movie_info['genres'],
                    'similarity_score' : similarity_score,
                    'avg_rating' : round(avg_rating, 2),
                    'num_ratings' : int(num_ratings)
                })
    recommendations_df = pd.DataFrame(recommendations)
    recommendations_df = recommendations_df.sort_values(
        ['similarity_score','avg_rating'],ascending=[False,False]).head(n_recommendations)
    return recommendations_df

print("*"*50)
item_recs = get_item_based_recommendation("Toy Story (1995)",n_recommendations=10,min_ratings =50)

if item_recs is not None:
    display(item_recs)





**************************************************
finding recommendations for : Toy Story (1995)
genres :Adventure|Animation|Children|Comedy|Fantasy
method item based collaborative filtering



Unnamed: 0,movieId,title,genres,similarity_score,avg_rating,num_ratings
0,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,0.461608,4.19,6841
1,780,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller,0.460272,3.36,5863
2,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,0.437713,3.99,5845
3,356,Forrest Gump (1994),Comedy|Drama|Romance|War,0.431062,4.03,8267
4,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,0.424967,3.68,5175
5,648,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller,0.42431,3.37,4656
6,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,0.422839,3.71,3534
7,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,0.422494,3.84,2795
8,1270,Back to the Future (1985),Adventure|Comedy|Sci-Fi,0.42219,3.92,5163
9,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,0.417415,3.78,4813


In [11]:
print("Calculating user based similarity ")
print("this compares users based on their rating patterns  ")
print("wait few seconds")

user_item_filled=user_item_matrix.fillna(0)
user_similarity=cosine_similarity(user_item_filled)

user_similarity_df = pd.DataFrame(user_similarity ,index=user_item_matrix.index, columns= user_item_matrix.index)

print(f"User similarity matrix shape : {user_similarity_df.shape}")
print(f"Memory usage : {movie_similarity_df.memory_usage(deep=True).sum()/1024**2:.3f} MB")

print("\n sample movies similarities: ")
display(movie_similarity_df.iloc[:5, :5])

Calculating user based similarity 
this compares users based on their rating patterns  
wait few seconds
User similarity matrix shape : (17011, 17011)
Memory usage : 673.167 MB

 sample movies similarities: 


movieId,1,2,3,4,5
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.0,0.328112,0.244218,0.101679,0.257931
2,0.328112,1.0,0.185232,0.128354,0.220068
3,0.244218,0.185232,1.0,0.114978,0.358623
4,0.101679,0.128354,0.114978,1.0,0.140425
5,0.257931,0.220068,0.358623,0.140425,1.0


In [12]:

def get_user_based_recommendation(user_id, n_recommendations=10, n_similar_user=10):
    if user_id not in user_similarity_df.index:
        print(f"user {user_id} not found in dataset")
        return None
    print(f"Finding recommendations for user {user_id}")
    print(f"method user-based collaborative filtering\n")

    similar_users = user_similarity_df[user_id].sort_values(ascending=False)
    similar_users = similar_users[similar_users.index != user_id]
    top_similar_users = similar_users.head(n_similar_user)

    print(f"Found {len(top_similar_users)} similar users")
    print(f"similarity scores range : {top_similar_users.min():.3f} to {top_similar_users.max():.3f}\n")

    # Get movies that the user has already rated (non-null ratings)
    user_rated_movies = set(user_item_matrix.loc[user_id].dropna().index)

    recommendations_dict = {}
    for similar_user_id, similarity_score in top_similar_users.items():
        similar_user_ratings = user_item_matrix.loc[similar_user_id].dropna()
        for movie_id, rating in similar_user_ratings.items():
            if movie_id in user_rated_movies:
                continue

            if movie_id not in recommendations_dict:
                recommendations_dict[movie_id] = {
                    'weighted_sum': 0,
                    'similarity_sum': 0
                }
            recommendations_dict[movie_id]['weighted_sum'] += rating * similarity_score
            recommendations_dict[movie_id]['similarity_sum'] += similarity_score

    # Calculate predicted ratings - FIXED: moved outside the loop and fixed variable naming
    predicted_ratings_list = []
    for movie_id, data in recommendations_dict.items():
        if data['similarity_sum'] > 0:
            predicted_rating_value = data['weighted_sum'] / data['similarity_sum']
            predicted_ratings_list.append({
                'movieId': movie_id,
                'predicted_rating': predicted_rating_value
            })

    if not predicted_ratings_list:
        print("No recommendations found")
        return None

    predictions_df = pd.DataFrame(predicted_ratings_list)
    predictions_df = predictions_df.sort_values('predicted_rating', ascending=False)
    top_predictions = predictions_df.head(n_recommendations)

    recommendations = []
    for _, row in top_predictions.iterrows():
        movie_id = row['movieId']
        # Fixed column name from 'moviesId' to 'movieId'
        movie_info_df = movies[movies['movieId'] == movie_id]

        if len(movie_info_df) == 0:
            continue

        movie_info = movie_info_df.iloc[0]
        movie_ratings = ratings[ratings['movieId'] == movie_id]

        recommendations.append({
            'movieId': movie_id,
            'title': movie_info['title'],
            'genres': movie_info['genres'],
            'predicted_rating': round(row['predicted_rating'], 2),
            'avg_rating': round(movie_ratings['rating'].mean(), 2),
            'num_ratings': len(movie_ratings)
        })

    if recommendations:
        recommendations_df = pd.DataFrame(recommendations)
        return recommendations_df
    else:
        print("No valid recommendations found")
        return None

# Test the function
test_user_id = train_ratings['userId'].iloc[0]

print("*"*50)
user_recs = get_user_based_recommendation(test_user_id, n_recommendations=10, n_similar_user=20)

if user_recs is not None:
    display(user_recs)

    print(f"\nMovies user {test_user_id} has already rated:")
    user_movies = ratings[ratings['userId'] == test_user_id].merge(movies, on='movieId').sort_values('rating', ascending=False).head(10)
    display(user_movies[['title', 'genres', 'rating']])


**************************************************
Finding recommendations for user 3911
method user-based collaborative filtering

Found 20 similar users
similarity scores range : 0.188 to 0.279



Unnamed: 0,movieId,title,genres,predicted_rating,avg_rating,num_ratings
0,93326.0,This Means War (2012),Action|Comedy|Romance,5.0,3.18,39
1,2193.0,Willow (1988),Action|Adventure|Fantasy,5.0,3.47,1179
2,27251.0,"10th Kingdom, The (2000)",Adventure|Comedy|Fantasy,5.0,4.12,55
3,48780.0,"Prestige, The (2006)",Drama|Mystery|Sci-Fi|Thriller,5.0,4.03,1354
4,17.0,Sense and Sensibility (1995),Drama|Romance,5.0,3.98,2622
5,1206.0,"Clockwork Orange, A (1971)",Crime|Drama|Sci-Fi|Thriller,5.0,3.97,3112
6,1225.0,Amadeus (1984),Drama,5.0,4.08,2547
7,103341.0,"World's End, The (2013)",Action|Comedy|Sci-Fi,5.0,3.46,68
8,103335.0,Despicable Me 2 (2013),Animation|Children|Comedy|IMAX,5.0,3.77,102
9,103141.0,Monsters University (2013),Adventure|Animation|Comedy,5.0,3.54,93



Movies user 3911 has already rated:


Unnamed: 0,title,genres,rating
70,Remember the Titans (2000),Drama,5.0
17,Sherlock Holmes (2009),Action|Crime|Mystery|Thriller,5.0
36,Phoebe in Wonderland (2008),Drama|Fantasy,5.0
76,Sweeney Todd: The Demon Barber of Fleet Street (2007),Drama|Horror|Musical|Thriller,4.5
22,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy,4.5
72,50 First Dates (2004),Comedy|Romance,4.5
18,Up in the Air (2009),Drama|Romance,4.5
16,Avatar (2009),Action|Adventure|Sci-Fi|IMAX,4.5
15,Alice in Wonderland (2010),Adventure|Fantasy|IMAX,4.5
75,Marie Antoinette (2006),Drama|Romance,4.5


In [13]:
test_movie= "Toy Story (1995)"
print("*"*50)
print(f"COMPARING RECOMMENDATIONS FOR : {test_movie}")
print("*"*50)

try:
    with open('/Users/yugjain/Documents/Machine_learning/Movies/models/content_similarity_matrix.pkl', 'rb') as f:
        content_similarity_df = pickle.load(f)

    movie_id = movies[movies['title']==test_movie]['movieId'].values[0]

    if movie_id in  content_similarity_df.index:
        print("\n1. content based recommendations :")
        print(" (based on genre similarity)")
        print("*"*50)
        content_similar = content_similarity_df.loc[movie_id].sort_values(ascending=False)
        content_similar = content_similar[content_similar.index != movie_id].head(5)

        for i, (mid, score) in enumerate(content_similar.items(), 1):
            movie_info = movies[movies['movieId'] == mid].iloc[0]
            print(f"   {i}. {movie_info['title']}")
            print(f"      Genres: {movie_info['genres']}")
            print(f"      Similarity: {score:.3f}\n")
except:
    print("\nContent-based similarity matrix not found")

print("\n2. COLLABORATIVE FILTERING RECOMMENDATIONS:")
print("   (Based on user rating patterns)")
print("-" * 70)

if movie_id in movie_similarity_df.index:
    collab_similar = movie_similarity_df[movie_id].sort_values(ascending=False)
    collab_similar = collab_similar[collab_similar.index != movie_id].head(5)

    for i, (mid, score) in enumerate(collab_similar.items(), 1):
        movie_info_df = movies[movies['movieId'] == mid]
        if len(movie_info_df) > 0:
            movie_info = movie_info_df.iloc[0]
            print(f"   {i}. {movie_info['title']}")
            print(f"      Genres: {movie_info['genres']}")
            print(f"      Similarity: {score:.3f}\n")

print("\nKEY DIFFERENCES:")
print("  • Content-based: Recommends similar genres")
print("  • Collaborative: May recommend different genres based on user behavior")
print("  • Collaborative can discover cross-genre preferences!")




**************************************************
COMPARING RECOMMENDATIONS FOR : Toy Story (1995)
**************************************************

1. content based recommendations :
 (based on genre similarity)
**************************************************
   1. Tale of Despereaux, The (2008)
      Genres: Adventure|Animation|Children|Comedy|Fantasy
      Similarity: 1.000

   2. DuckTales: The Movie - Treasure of the Lost Lamp (1990)
      Genres: Adventure|Animation|Children|Comedy|Fantasy
      Similarity: 1.000

   3. Turbo (2013)
      Genres: Adventure|Animation|Children|Comedy|Fantasy
      Similarity: 1.000

   4. Monsters, Inc. (2001)
      Genres: Adventure|Animation|Children|Comedy|Fantasy
      Similarity: 1.000

   5. Antz (1998)
      Genres: Adventure|Animation|Children|Comedy|Fantasy
      Similarity: 1.000


2. COLLABORATIVE FILTERING RECOMMENDATIONS:
   (Based on user rating patterns)
----------------------------------------------------------------------
   

In [14]:
#evaluate the collaborative filtering

print("Evaluate Collaboarative filtering")
print("*"*50)
def predict_ratings(user_id, movie_id,k=10):
    if user_id == test_user_id:
        return None
    if movie_id not in movie_similarity_df.columns:
        return None

    user_ratings = user_item_matrix.loc[user_id].dropna()

    similar_movies = movie_similarity_df[movie_id]

    common_movies = user_ratings.index.intersection(similar_movies.index)
    if len(common_movies)==0:
        return None

    similar_scores = similar_movies[common_movies].sort_values(ascending = False).head(k)

    numerator = sum(user_ratings[m] * similar_scores[m] for m in similar_scores.index)
    denominator = sum(abs(similar_scores[m]) for m in similar_scores.index)

    if denominator == 0:
        return None

    predicted = numerator/denominator

    return predicted


print("Evaluating on the sample of test set ..\n")

predictions = []
actuals = []

test_sample = test_ratings.sample(min(1000,len(test_ratings)), random_state = 42)
for idx , row in test_sample.iterrows():
    pred= predict_ratings(row['userId'],row['movieId'],k=10)

    if pred is not None:
        predictions . append(pred)
        actuals.append(row['rating'])

if len(predictions)>0:

    rmse = sqrt(mean_squared_error(actuals,predictions))
    mae = mean_absolute_error(actuals,predictions)

    print(f"Evaluation result : ")
    print(f" Predictions made : {len(predictions):,} out of {len(test_sample):,}")

    print(f" RMSE : {rmse:.4f}")
    print(f" MAE : {mae:.4f}")
    print(f"\n Interpretation : ")
    print(f"ON average, predictions are OFF by {mae:.2f} stars")
else:
    print("Could not make prediction - not enough overlap in data")



Evaluate Collaboarative filtering
**************************************************
Evaluating on the sample of test set ..

Evaluation result : 
 Predictions made : 1,000 out of 1,000
 RMSE : 0.8743
 MAE : 0.6573

 Interpretation : 
ON average, predictions are OFF by 0.66 stars


In [15]:
def get_hybrid_recommendations(movie_title = None, user_id = None,n_recommendation=10,content_weight = 0.5):

    if movie_title is None and user_id is None:
        print("Provide either movie_title or user_id")
        return None

    recommendations={}

    if movie_title is not None:
        try:
            with open('/Users/yugjain/Documents/Machine_learning/Movies/models/content_similarity_matrix.pkl','rb') as f:
                content_sim = pickle.load(f)

            movie_id = movies[movies['title']==movie_title]['movieId'].values[0]

            if movie_id in content_sim.index:
                content_scores = content_sim.loc[movie_id]
                content_scores = content_scores[content_scores.index != movie_id]

                for mid,score in content_scores.items():
                    recommendations[mid] = recommendations.get(mid,0)+ score * content_weight

        except:
            pass



       # collaborative components

        if movie_title is not  None:
             movie_id = movies[movies['title'] ==movie_title]['movieId'].values[0]
             if movie_id in movie_similarity_df.columns:
                   collab_scores = movie_similarity_df[movie_id]
                   collab_scores = collab_scores[collab_scores.index != movie_id]

                   for mid, score in collab_scores.items():
                          recommendations[mid] = recommendations.get(mid, 0) + score * (1 - content_weight)

        #convert to list and sort

        rec_list = [(mid,score) for mid,score in recommendations.items()]
        rec_list.sort(key=lambda x: x[1], reverse=True)

        #make dataframe of recommendations
        final_recs = []
        for movie_id, hybrid_score in rec_list[:n_recommendation]:
                    movie_info_df = movies[movies['movieId'] == movie_id]
                    if len(movie_info_df) == 0:
                         continue
                    movie_info = movie_info_df.iloc[0]
                    movie_ratings = ratings[ratings['movieId'] == movie_id]
                    final_recs.append({
                                   'movieId': movie_id,
                                   'title': movie_info['title'],
                                   'genres': movie_info['genres'],
                                   'hybrid_score': round(hybrid_score, 3),
                                   'avg_rating': round(movie_ratings['rating'].mean(), 2) if len(movie_ratings) > 0 else 0,
                                   'num_ratings': len(movie_ratings)
                    })
        return pd.DataFrame(final_recs)

print("*"*50)
print("Hybrid recommendations 50% content + 50% collaborative)")
print("*"*50)

hyd_recs=get_hybrid_recommendations("Toy Story (1995)", content_weight=0.5,n_recommendation=10)

if hyd_recs is not None:
    display(hyd_recs)







**************************************************
Hybrid recommendations 50% content + 50% collaborative)
**************************************************


Unnamed: 0,movieId,title,genres,hybrid_score,avg_rating,num_ratings
0,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,0.711,3.84,2795
1,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,0.687,3.88,2908
2,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Romance,0.647,3.83,3994
3,2294,Antz (1998),Adventure|Animation|Children|Comedy|Fantasy,0.636,3.34,1275
4,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,0.633,3.87,2923
5,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,0.63,3.6,2531
6,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,0.612,3.68,5175
7,4016,"Emperor's New Groove, The (2000)",Adventure|Animation|Children|Comedy|Fantasy,0.595,3.62,724
8,2987,Who Framed Roger Rabbit? (1988),Adventure|Animation|Children|Comedy|Crime|Fantasy|Mystery,0.586,3.53,2742
9,5218,Ice Age (2002),Adventure|Animation|Children|Comedy,0.579,3.62,1422


In [16]:
print("Saving collaborative filtering models...\n")

with open('/Users/yugjain/Documents/Machine_learning/Movies/models/item_similarity_matrix.pkl', 'wb') as f:
    pickle.dump(movie_similarity_df, f)
print("✓ Saved: models/item_similarity_matrix.pkl")

with open('/Users/yugjain/Documents/Machine_learning/Movies/models/user_similarity_matrix.pkl', 'wb') as f:
    pickle.dump(user_similarity_df, f)
print("✓ Saved: models/user_similarity_matrix.pkl")

with open('/Users/yugjain/Documents/Machine_learning/Movies/models/user_item_matrix.pkl', 'wb') as f:
    pickle.dump(user_item_matrix, f)
print("✓ Saved: models/user_item_matrix.pkl")

print(f"\nModel sizes:")
print(f"  Item similarity: {movie_similarity_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"  User similarity: {user_similarity_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

Saving collaborative filtering models...

✓ Saved: models/item_similarity_matrix.pkl
✓ Saved: models/user_similarity_matrix.pkl
✓ Saved: models/user_item_matrix.pkl

Model sizes:
  Item similarity: 673.17 MB
  User similarity: 2208.38 MB


In [None]:
print("*" * 50)
print("COLLABORATIVE FILTERING - SUMMARY")
print("*" * 50)

print("\nMethods Implemented:")
print("  1. Item-Based Collaborative Filtering")
print("      Finds movies with similar rating patterns")
print("      'Users who liked this also liked...'")
print()
print("  2. User-Based Collaborative Filtering")
print("     Finds similar users")
print("     'Users like you enjoyed...'")
print()
print("  3. Hybrid Approach")
print("     Combines content-based + collaborative")
print("     Best of both worlds!")

print(f"\nDataset:")
print(f"  Users: {user_item_matrix.shape[0]:,}")
print(f"  Movies: {user_item_matrix.shape[1]:,}")
print(f"  Sparsity: {sparsity:.2f}%")

print(f"\nStrengths:")
print("  Can recommend across different genres")
print("  Discovers unexpected preferences")
print("  improves over time as more ratings come in")
print("  No need for movie metadata")

print(f"\nLimitations:")
print("  Cold start problem (new users/movies)")
print("  Sparsity issues (most cells empty)")
print("  Scalability challenges with many users/items")
print("  Popularity bias")

print(f"\nFunctions available:")
print("  get_item_based_recommendations(movie_title, n)")
print("  get_user_based_recommendations(user_id, n)")
print("  get_hybrid_recommendations(movie_title, user_id, weight)")

print("\n" + "*" * 50)
print("PROJECT COMPLETE! You now have a full recommendation system!")
print("*" * 50)