# Movie Recommendation System (User-Based Collaborative Filtering)



## Steps Covered:
1. Load & preprocess dataset
2. Create a user-item matrix
3. Compute user-user similarity (cosine similarity)
4. Generate top-N recommendations for a user
5. Bonus: Implement item-based collaborative filtering (optional)

---


In [19]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [20]:
# Load the MovieLens 100K ratings (downloaded through kagglehub)


import kagglehub

# Download latest version
dataset_path = kagglehub.dataset_download("prajitdatta/movielens-100k-dataset")

print("Path to dataset files:", dataset_path)

# u.data is read as a headerless, tab-separated file with four columns.
# The timestamp column is not needed for this task, so it is skipped at
# read time instead of being loaded and dropped afterwards.
ratings_df = pd.read_csv(
    f"{dataset_path}/ml-100k/u.data",
    sep="\t",
    names=["user_id", "movie_id", "rating", "timestamp"],
    usecols=["user_id", "movie_id", "rating"],
)

print("First 5 rows of the ratings DataFrame:")
display(ratings_df.head())


# Optional offline fallback: a small synthetic ratings table with the same
# three columns. Uncomment it to replace the download above.
# np.random.seed(42)
# num_users = 15
# num_movies = 12

# users = [f"User_{i}" for i in range(1, num_users+1)]
# movies = [f"Movie_{j}" for j in range(1, num_movies+1)]

# ratings_data = []
# for user in users:
#     for movie in movies:
#         if np.random.rand() < 0.6:
#             ratings_data.append([user, movie, np.random.randint(1, 6)])

# ratings_df = pd.DataFrame(ratings_data, columns=["user_id", "movie_id", "rating"])
# ratings_df.head()

Path to dataset files: /kaggle/input/movielens-100k-dataset
First 5 rows of the ratings DataFrame:


Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [21]:
# Reshape the long ratings table into a wide grid: one row per user,
# one column per movie, with the rating as the cell value.
user_item_matrix = pd.pivot_table(
    ratings_df,
    values="rating",
    index="user_id",
    columns="movie_id",
)
print("First 5 rows of the user-item matrix:")
display(user_item_matrix.head())

First 5 rows of the user-item matrix:


movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [22]:
# Cosine similarity cannot work with NaN, so unrated entries are replaced by 0
user_item_filled = user_item_matrix.fillna(value=0)

# Pairwise cosine similarity between user rows, labelled by user_id on both axes
user_similarity = pd.DataFrame(
    data=cosine_similarity(user_item_filled),
    columns=user_item_filled.index,
    index=user_item_filled.index,
)

user_similarity.head()


user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.166931,0.04746,0.064358,0.378475,0.430239,0.440367,0.319072,0.078138,0.376544,...,0.369527,0.119482,0.274876,0.189705,0.197326,0.118095,0.314072,0.148617,0.179508,0.398175
2,0.166931,1.0,0.110591,0.178121,0.072979,0.245843,0.107328,0.103344,0.161048,0.159862,...,0.156986,0.307942,0.358789,0.424046,0.319889,0.228583,0.22679,0.161485,0.172268,0.105798
3,0.04746,0.110591,1.0,0.344151,0.021245,0.072415,0.066137,0.08306,0.06104,0.065151,...,0.031875,0.042753,0.163829,0.069038,0.124245,0.026271,0.16189,0.101243,0.133416,0.026556
4,0.064358,0.178121,0.344151,1.0,0.031804,0.068044,0.09123,0.18806,0.101284,0.060859,...,0.052107,0.036784,0.133115,0.193471,0.146058,0.030138,0.196858,0.152041,0.170086,0.058752
5,0.378475,0.072979,0.021245,0.031804,1.0,0.237286,0.3736,0.24893,0.056847,0.201427,...,0.338794,0.08058,0.094924,0.079779,0.148607,0.071459,0.239955,0.139595,0.152497,0.313941


In [23]:
def recommend_movies(user_id, top_n=5, ratings=None, similarity=None):
    """Recommend movies the user has not rated, ranked by similarity-weighted ratings.

    Each unrated movie is scored as the sum, over all other users who rated it,
    of ``similarity(user, other) * other's rating``. The score is a plain sum
    (it is not divided by the total similarity), so movies rated by many
    users tend to rank higher.

    Parameters
    ----------
    user_id : label
        User to recommend for; must be present in both matrices.
    top_n : int, default 5
        Maximum number of movie ids to return.
    ratings : pandas.DataFrame, optional
        User x movie rating matrix with NaN for "not rated". Defaults to the
        notebook-level ``user_item_matrix``.
    similarity : pandas.DataFrame, optional
        User x user similarity matrix. Defaults to the notebook-level
        ``user_similarity``.

    Returns
    -------
    list
        Up to ``top_n`` movie ids, highest score first. Ties keep the column
        order of ``ratings``. Movies nobody else has rated are never returned.

    Raises
    ------
    KeyError
        If ``user_id`` is not present in the matrices.
    """
    # Fall back to the notebook-level matrices when none are passed explicitly.
    if ratings is None:
        ratings = user_item_matrix
    if similarity is None:
        similarity = user_similarity

    if user_id not in ratings.index or user_id not in similarity.columns:
        raise KeyError(f"user_id {user_id!r} not found in the ratings/similarity matrices")

    own_ratings = ratings.loc[user_id]
    unrated_movies = own_ratings[own_ratings.isna()].index

    # Similarity of every other user to the target user (self excluded).
    sim_scores = similarity[user_id].drop(user_id)

    # Ratings given by the other users to the movies the target has not rated.
    neighbour_ratings = ratings.loc[sim_scores.index, unrated_movies]
    if neighbour_ratings.empty:
        return []

    # Vectorised replacement for the per-(user, movie) loop: scale each
    # neighbour's row by their similarity, then sum per movie. NaN entries
    # (movie not rated by that neighbour) are skipped by sum().
    weighted_scores = neighbour_ratings.mul(sim_scores, axis=0).sum(axis=0)

    # Only movies that at least one neighbour actually rated are candidates.
    weighted_scores = weighted_scores[neighbour_ratings.notna().any(axis=0)]

    recommended = sorted(weighted_scores.items(), key=lambda item: item[1], reverse=True)[:top_n]
    return [movie for movie, _ in recommended]


In [25]:
# Try the recommender on the user who appears in the first row of the ratings table
user_id_to_recommend = ratings_df["user_id"].iat[0]
recommendations = recommend_movies(
    user_id_to_recommend,
    top_n=5,
)
print(f"Recommended movies for user {user_id_to_recommend}:", recommendations)

Recommended movies for user 196: [50, 100, 181, 174, 127]


In [26]:
# Same user (first row of the ratings table), but ask for a longer list of 10 movies
user_id_to_recommend = ratings_df["user_id"].iat[0]
recommendations = recommend_movies(
    user_id_to_recommend,
    top_n=10,
)
print(f"Recommended movies for user {user_id_to_recommend}:", recommendations)

Recommended movies for user 196: [50, 100, 181, 174, 127, 1, 258, 98, 56, 172]


In [27]:
# NOTE(review): this cell repeats the similarity computation from the earlier
# "Fill missing ratings" cell; it rebuilds both objects from user_item_matrix.

# Unrated entries become 0 so cosine similarity can be computed
user_item_filled = user_item_matrix.fillna(value=0)

# User-by-user cosine similarity, labelled by user_id on rows and columns
user_similarity = pd.DataFrame(
    data=cosine_similarity(user_item_filled),
    columns=user_item_filled.index,
    index=user_item_filled.index,
)

display(user_similarity.head())

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.166931,0.04746,0.064358,0.378475,0.430239,0.440367,0.319072,0.078138,0.376544,...,0.369527,0.119482,0.274876,0.189705,0.197326,0.118095,0.314072,0.148617,0.179508,0.398175
2,0.166931,1.0,0.110591,0.178121,0.072979,0.245843,0.107328,0.103344,0.161048,0.159862,...,0.156986,0.307942,0.358789,0.424046,0.319889,0.228583,0.22679,0.161485,0.172268,0.105798
3,0.04746,0.110591,1.0,0.344151,0.021245,0.072415,0.066137,0.08306,0.06104,0.065151,...,0.031875,0.042753,0.163829,0.069038,0.124245,0.026271,0.16189,0.101243,0.133416,0.026556
4,0.064358,0.178121,0.344151,1.0,0.031804,0.068044,0.09123,0.18806,0.101284,0.060859,...,0.052107,0.036784,0.133115,0.193471,0.146058,0.030138,0.196858,0.152041,0.170086,0.058752
5,0.378475,0.072979,0.021245,0.031804,1.0,0.237286,0.3736,0.24893,0.056847,0.201427,...,0.338794,0.08058,0.094924,0.079779,0.148607,0.071459,0.239955,0.139595,0.152497,0.313941


In [28]:
def recommend_movies(user_id, top_n=5, ratings=None, similarity=None):
    """Recommend unrated movies using only positively similar users.

    Each movie the user has not rated is scored as the sum, over the other
    users whose similarity to ``user_id`` is strictly positive and who rated
    that movie, of ``similarity * rating``. The score is a plain sum (it is
    not divided by the total similarity), so movies rated by many users tend
    to rank higher.

    Parameters
    ----------
    user_id : label
        User to recommend for; must be present in both matrices.
    top_n : int, default 5
        Maximum number of movie ids to return.
    ratings : pandas.DataFrame, optional
        User x movie rating matrix with NaN for "not rated". Defaults to the
        notebook-level ``user_item_matrix``.
    similarity : pandas.DataFrame, optional
        User x user similarity matrix. Defaults to the notebook-level
        ``user_similarity``.

    Returns
    -------
    list
        Up to ``top_n`` movie ids, highest score first. Ties keep the column
        order of ``ratings``. Movies that no positively similar user has
        rated are never returned.

    Raises
    ------
    KeyError
        If ``user_id`` is not present in the matrices.
    """
    # Fall back to the notebook-level matrices when none are passed explicitly.
    if ratings is None:
        ratings = user_item_matrix
    if similarity is None:
        similarity = user_similarity

    if user_id not in ratings.index or user_id not in similarity.columns:
        raise KeyError(f"user_id {user_id!r} not found in the ratings/similarity matrices")

    own_ratings = ratings.loc[user_id]
    unrated_movies = own_ratings[own_ratings.isna()].index

    # Similarity of every other user to the target user (self excluded).
    sim_scores = similarity[user_id].drop(user_id)
    # Consider users with positive similarity only.
    sim_scores = sim_scores[sim_scores > 0]

    # Ratings given by those neighbours to the movies the target has not rated.
    neighbour_ratings = ratings.loc[sim_scores.index, unrated_movies]
    if neighbour_ratings.empty:
        return []

    # Vectorised replacement for the per-(user, movie) loop: scale each
    # neighbour's row by their similarity, then sum per movie. NaN entries
    # (movie not rated by that neighbour) are skipped by sum().
    weighted_scores = neighbour_ratings.mul(sim_scores, axis=0).sum(axis=0)

    # Only movies that at least one retained neighbour actually rated are candidates.
    weighted_scores = weighted_scores[neighbour_ratings.notna().any(axis=0)]

    recommended = sorted(weighted_scores.items(), key=lambda item: item[1], reverse=True)[:top_n]
    return [movie for movie, _ in recommended]

In [29]:
# Re-run the 10-movie recommendation for the first user in the ratings table
user_id_to_recommend = ratings_df["user_id"].iat[0]
recommendations = recommend_movies(
    user_id_to_recommend,
    top_n=10,
)
print(f"Recommended movies for user {user_id_to_recommend}:", recommendations)

Recommended movies for user 196: [50, 100, 181, 174, 127, 1, 258, 98, 56, 172]
