# Movie Recommendation System
## Dataset Link: https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset?select=movie.csv


This notebook implements a complete movie recommendation system using:
1. Collaborative Filtering (Item-based)
2. Matrix Factorization (SVD)
3. Evaluation Metrics (Precision@K, Recall@K, NDCG)

## Importing all the Libraries

In [59]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import ndcg_score
from sklearn.model_selection import train_test_split
import pickle
import warnings
warnings.filterwarnings('ignore')

## Load and Explore the Dataset

In [23]:
# Read the two MovieLens CSV files from the working directory.
# For the ratings file only the three columns used below are loaded, with
# 32-bit dtypes to keep the 20M-row table small in memory.
movies = pd.read_csv('movie.csv')
ratings = pd.read_csv(
    'rating.csv',
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
    usecols=['userId', 'movieId', 'rating'],
)

In [24]:
# (rows, columns) of the movies table
movies.shape

(27278, 3)

In [25]:
# (rows, columns) of the ratings table
ratings.shape

(20000263, 3)

In [26]:
# Preview the first rows of the movies table
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [27]:
# Preview the first rows of the ratings table
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


### Some basic Statistics

In [28]:
# Summary statistics for the ratings table.
# The distinct-user and distinct-movie counts are computed once and reused
# for the sparsity figure instead of re-scanning the columns a second time.
n_unique_users = ratings['userId'].nunique()
n_unique_movies = ratings['movieId'].nunique()

# Sparsity = share of (user, movie) pairs that have no rating.
sparsity = 1 - len(ratings) / (n_unique_users * n_unique_movies)

print(f"Number of unique users: {n_unique_users}")
print(f"Number of unique movies: {n_unique_movies}")
print(f"Rating range: {ratings['rating'].min()} to {ratings['rating'].max()}")
print(f"Average rating: {ratings['rating'].mean():.2f}")
print(f"Sparsity: {sparsity * 100:.2f}%")

Number of unique users: 138493
Number of unique movies: 26744
Rating range: 0.5 to 5.0
Average rating: 3.53
Sparsity: 99.46%


In [29]:
# Count missing values per column in the movies table
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [30]:
# Count missing values per column in the ratings table
ratings.isnull().sum()

userId     0
movieId    0
rating     0
dtype: int64

## Data Preprocessing

In [32]:
# Work on a sample of the data for memory efficiency.
# The fraction is applied to the number of distinct users in the next step.

SAMPLE_FRACTION = 0.1  # Use 10% of data

In [33]:
# Sample whole users (not individual ratings) so each sampled user keeps
# every rating they made.
# A locally seeded generator makes the sample -- and therefore every number
# computed from it further down -- reproducible across kernel restarts.
unique_user_ids = ratings['userId'].unique()
n_users_sample = int(len(unique_user_ids) * SAMPLE_FRACTION)
sampled_users = np.random.default_rng(42).choice(
    unique_user_ids,
    size=n_users_sample,
    replace=False,
)

In [34]:
# Keep only the ratings made by the sampled users.
# This rebinds `ratings`; reload the CSV before re-running the sampling step,
# otherwise the sample is drawn from the already-reduced table.
ratings = ratings[ratings['userId'].isin(sampled_users)]

In [35]:
# Drop movies with fewer than 5 ratings in the sampled data (too sparse to
# be useful); `ratings` is rebound to the filtered table.
movie_counts = ratings['movieId'].value_counts()
popular_movies = movie_counts.loc[lambda counts: counts >= 5].index
keep_rating = ratings['movieId'].isin(popular_movies)
ratings = ratings[keep_rating]
print(f"After filtering: {len(ratings)} ratings")

After filtering: 1991225 ratings


In [36]:
# Drop users with fewer than 5 remaining ratings; `ratings` is rebound to
# the filtered table.
user_counts = ratings['userId'].value_counts()
active_users = user_counts.loc[lambda counts: counts >= 5].index
keep_rating = ratings['userId'].isin(active_users)
ratings = ratings[keep_rating]
print(f"Final dataset: {len(ratings)} ratings from {len(active_users)} users")

Final dataset: 1991225 ratings from 13849 users


## TRAIN-TEST SPLIT

In [37]:
# Hold out 20% of the individual ratings for evaluation. The fixed
# random_state keeps the split itself reproducible.
# (train_test_split is already imported in the imports cell at the top.)
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)
print(f"Train set: {len(train_data)} ratings")
print(f"Test set: {len(test_data)} ratings")

Train set: 1592980 ratings
est set: 398245 ratings


## CREATE USER-ITEM MATRIX

In [38]:
# Distinct user and movie IDs present in the training split. Their positions
# in these arrays become the row / column indices of the user-item matrix.
user_ids = train_data['userId'].unique()
movie_ids = train_data['movieId'].unique()

In [39]:
# Forward maps: raw ID -> position in user_ids / movie_ids.
user_to_idx = dict(zip(user_ids, range(len(user_ids))))
movie_to_idx = dict(zip(movie_ids, range(len(movie_ids))))

# Reverse maps: position -> raw ID.
idx_to_user = dict(enumerate(user_ids))
idx_to_movie = dict(enumerate(movie_ids))

In [40]:
# Build the (row, column, value) triplets for the sparse user-item matrix:
# each training rating is placed at (index of its user, index of its movie).
row_indices = train_data['userId'].map(user_to_idx).values
col_indices = train_data['movieId'].map(movie_to_idx).values
ratings_values = train_data['rating'].values

In [41]:
# Assemble the triplets into a CSR matrix with one row per training user and
# one column per training movie; pairs without a rating are implicit zeros.
user_item_matrix = csr_matrix(
    (ratings_values, (row_indices, col_indices)),
    shape=(len(user_ids), len(movie_ids))
)

In [42]:
# (number of users, number of movies) in the training matrix
user_item_matrix.shape

(13849, 11048)

## ITEM-BASED COLLABORATIVE FILTERING

In [43]:
# Transpose matrix to get item-item relationships: after the transpose each
# row is one movie's vector of user ratings, so the cosine similarity is
# computed between movies. dense_output=False keeps the result sparse.
item_similarity = cosine_similarity(user_item_matrix.T, dense_output=False)

## MATRIX FACTORIZATION (SVD)

In [46]:
# Use TruncatedSVD for dimensionality reduction
# The model is fit directly on the sparse user-item matrix, so unrated pairs
# enter the factorization as zeros.
n_components = 50  # Latent factors
svd_model = TruncatedSVD(n_components=n_components, random_state=42)
# user_factors: output of fit_transform on the user-item matrix (one row per user)
user_factors = svd_model.fit_transform(user_item_matrix)
# movie_factors: components_ transposed, so one row per movie
movie_factors = svd_model.components_.T

print(f"SVD model trained with {n_components} latent factors")
print(f"Explained variance: {svd_model.explained_variance_ratio_.sum():.4f}")

SVD model trained with 50 latent factors
Explained variance: 0.3110


## RECOMMENDATION FUNCTIONS

In [47]:
def recommend_movies_collaborative(user_id, N=10):
    """
    Item-Based Collaborative Filtering Recommendations

    Every movie the user has not rated is scored as the similarity-weighted
    average of the user's own ratings:

        score(m) = sum_j sim(m, j) * r_j / sum_j |sim(m, j)|

    where j runs over the movies the user rated. Movies whose denominator is
    0 (no similarity to anything the user rated) are not scored.

    All candidates are scored with a single sparse product rather than a
    Python loop that converted one similarity row to a dense array per
    candidate movie.

    Reads the notebook globals ``user_to_idx``, ``user_item_matrix``,
    ``item_similarity``, ``idx_to_movie`` and ``movies``.

    Args:
        user_id: User ID to generate recommendations for
        N: Number of recommendations to return

    Returns:
        List of tuples (movie_id, movie_title, predicted_score), best first.
        Movies with no row in ``movies`` are skipped, so fewer than N tuples
        may come back. For a user that is not in the training data an error
        message string is returned instead of a list.
    """
    if user_id not in user_to_idx:
        return f"User {user_id} not found in training data"

    user_idx = user_to_idx[user_id]
    user_ratings = user_item_matrix[user_idx].toarray().ravel()
    rated_movies = np.flatnonzero(user_ratings > 0)

    # Similarity of every movie (rows) to each movie the user rated (columns).
    sim_to_rated = item_similarity[:, rated_movies]
    numerator = np.asarray(sim_to_rated @ user_ratings[rated_movies]).ravel()
    denominator = np.asarray(abs(sim_to_rated).sum(axis=1)).ravel()

    # Candidates: unrated movies with a non-zero amount of similarity mass.
    candidates = np.flatnonzero((user_ratings == 0) & (denominator > 0))
    scores = numerator[candidates] / denominator[candidates]

    # Stable sort on the negated scores: highest score first, ties keep
    # ascending matrix-index order.
    ranking = np.argsort(-scores, kind='stable')[:N]

    # Convert matrix indices to movie IDs and titles.
    recommendations = []
    for movie_idx, score in zip(candidates[ranking], scores[ranking]):
        movie_id = idx_to_movie[movie_idx]
        movie_title = movies.loc[movies['movieId'] == movie_id, 'title'].values
        if len(movie_title) > 0:
            recommendations.append((movie_id, movie_title[0], score))

    return recommendations

In [48]:
def recommend_movies_svd(user_id, N=10):
    """
    SVD-Based Matrix Factorization Recommendations

    Scores each movie as the dot product of the user's latent factors with
    the movie's latent factors, then ranks the movies the user has not rated.

    Reads the notebook globals ``user_to_idx``, ``user_item_matrix``,
    ``user_factors``, ``movie_factors``, ``idx_to_movie`` and ``movies``.

    Args:
        user_id: User ID to generate recommendations for
        N: Number of recommendations to return

    Returns:
        List of tuples (movie_id, movie_title, predicted_score), best first.
        Movies with no row in ``movies`` are skipped. For a user that is not
        in the training data an error message string is returned instead.
    """
    if user_id not in user_to_idx:
        return f"User {user_id} not found in training data"

    row = user_to_idx[user_id]
    seen = user_item_matrix[row].toarray().flatten()

    # One score per movie column of the user-item matrix.
    scores = user_factors[row] @ movie_factors.T

    # Rank only the movies the user has not rated. A stable sort on the
    # negated scores gives descending order with ties in index order.
    candidates = np.where(seen == 0)[0]
    order = np.argsort(-scores[candidates], kind='stable')[:N]

    # Translate matrix columns to movie IDs and titles.
    recommendations = []
    for movie_idx in candidates[order]:
        movie_id = idx_to_movie[movie_idx]
        title_matches = movies[movies['movieId'] == movie_id]['title'].values
        if len(title_matches) > 0:
            recommendations.append((movie_id, title_matches[0], scores[movie_idx]))

    return recommendations

In [49]:
def recommend_movies(user_id, N=10, method='svd'):
    """
    Entry point that forwards to one of the two recommenders.

    Args:
        user_id: User ID to generate recommendations for
        N: Number of recommendations to return
        method: 'svd' or 'collaborative'

    Returns:
        Whatever the selected recommender returns: a list of tuples
        (movie_id, movie_title, predicted_score), or its error string.
        For any other ``method`` value an error message string is returned.
    """
    # Reject unknown methods first, then dispatch.
    if method != 'svd' and method != 'collaborative':
        return "Invalid method. Choose 'svd' or 'collaborative'"

    if method == 'svd':
        return recommend_movies_svd(user_id, N)
    return recommend_movies_collaborative(user_id, N)

## EVALUATION METRICS

In [51]:
## Precision

def precision_at_k(recommended, relevant, k):
    """Precision@K: relevant items among the first k recommendations, divided by k.

    Returns 0 when k is not positive.
    """
    top_k = set(recommended[:k])
    hits = top_k.intersection(set(relevant))
    if not k > 0:
        return 0
    return len(hits) / k

In [52]:
## Recall

def recall_at_k(recommended, relevant, k):
    """Recall@K: share of the distinct relevant items found in the first k recommendations.

    Returns 0 when there are no relevant items.
    """
    top_k = set(recommended[:k])
    wanted = set(relevant)
    if len(wanted) == 0:
        return 0
    return len(top_k.intersection(wanted)) / len(wanted)

In [53]:
def ndcg_at_k(recommended, relevant, k):
    """Calculate NDCG@K with binary relevance.

    A recommended item at 0-based rank i contributes 1 / log2(i + 2) to the
    DCG when it is relevant. The ideal DCG assumes the top
    min(number of distinct relevant items, k) ranks are all hits.

    ``relevant`` is deduplicated first, as precision_at_k and recall_at_k do,
    so repeated IDs cannot inflate the ideal DCG.

    Args:
        recommended: Ranked sequence of recommended item IDs, best first
        relevant: Iterable of relevant item IDs
        k: Cut-off rank

    Returns:
        DCG / IDCG, or 0 when the ideal DCG is 0 (no relevant items or k <= 0).
    """
    relevant_set = set(relevant)

    dcg = sum(
        1 / np.log2(rank + 2)
        for rank, item in enumerate(recommended[:k])
        if item in relevant_set
    )

    ideal_hits = min(len(relevant_set), k)
    idcg = sum(1 / np.log2(rank + 2) for rank in range(ideal_hits))

    return dcg / idcg if idcg > 0 else 0

In [55]:
def evaluate_model(method='svd', k=10, n_users=100, random_state=None):
    """Evaluate a recommender on the held-out test ratings.

    Samples up to ``n_users`` users from ``test_data``, asks
    ``recommend_movies`` for each user's top ``k``, and compares the
    recommended movie IDs with the movies that user rated 4.0 or higher in
    the test split.

    Users are skipped when they are absent from ``user_to_idx``, have no
    test rating of 4.0+, or when the recommender returns an error string.

    Args:
        method: Passed through to ``recommend_movies`` ('svd' or 'collaborative')
        k: Number of recommendations per user and the metric cut-off
        n_users: Maximum number of test users to evaluate
        random_state: Optional seed for the user sample. With the default
            ``None`` the sample is drawn from NumPy's global random state,
            so ``np.random.seed`` still controls it.

    Returns:
        Dict with the mean 'Precision@K', 'Recall@K' and 'NDCG@K' over the
        evaluated users (NaN when no user could be evaluated).
    """
    precisions, recalls, ndcgs = [], [], []

    # Only ratings of 4.0+ count as relevant. Filter once here instead of
    # re-testing the rating threshold on the whole test set for every user.
    liked = test_data[test_data['rating'] >= 4.0]

    # Sample users for evaluation
    test_users = test_data['userId'].unique()
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    eval_users = rng.choice(test_users, min(n_users, len(test_users)), replace=False)

    for user_id in eval_users:
        if user_id not in user_to_idx:
            continue

        # Relevant items: movies this user rated highly in the test split.
        relevant_movies = liked.loc[liked['userId'] == user_id, 'movieId'].values
        if len(relevant_movies) == 0:
            continue

        # The recommenders report problems as a string rather than raising.
        recommendations = recommend_movies(user_id, N=k, method=method)
        if isinstance(recommendations, str):
            continue

        recommended_movies = [rec[0] for rec in recommendations]

        precisions.append(precision_at_k(recommended_movies, relevant_movies, k))
        recalls.append(recall_at_k(recommended_movies, relevant_movies, k))
        ndcgs.append(ndcg_at_k(recommended_movies, relevant_movies, k))

    return {
        'Precision@K': np.mean(precisions),
        'Recall@K': np.mean(recalls),
        'NDCG@K': np.mean(ndcgs)
    }


# Evaluate both models
# evaluate_model draws its evaluation users from NumPy's global random state,
# so seed it here to make the printed metrics reproducible.
np.random.seed(42)

print("\nEvaluating SVD Model")
svd_metrics = evaluate_model(method='svd', k=10, n_users=100)
print(f"SVD Results: Precision@10={svd_metrics['Precision@K']:.4f}, "
      f"Recall@10={svd_metrics['Recall@K']:.4f}, NDCG@10={svd_metrics['NDCG@K']:.4f}")

print("\nEvaluating Collaborative Filtering Model")
cf_metrics = evaluate_model(method='collaborative', k=10, n_users=50)  # Slower, fewer users
print(f"Collaborative Filtering Results: Precision@10={cf_metrics['Precision@K']:.4f}, "
      f"Recall@10={cf_metrics['Recall@K']:.4f}, NDCG@10={cf_metrics['NDCG@K']:.4f}")


Evaluating SVD Model
VD Results: Precision@10=0.2227, Recall@10=0.2348, NDCG@10=0.3009

Evaluating Collaborative Filtering Model
Collaborative Filtering Results: Precision@10=0.0000, Recall@10=0.0000, NDCG@10=0.0000


## GENERATING RECOMMENDATIONS

In [56]:
# Pick a random user from the training data so both recommenders can serve them.
demo_user = np.random.choice(list(user_to_idx.keys()))
print(f"\nGenerating recommendations for User ID: {demo_user}")

# SVD Recommendations
# The score printed here is the raw value returned by recommend_movies.
print("\n--- SVD Method ---")
svd_recs = recommend_movies(demo_user, N=10, method='svd')
for i, (movie_id, title, score) in enumerate(svd_recs, 1):
    print(f"{i}. {title} (Score: {score:.3f})")



Generating recommendations for User ID: 114618

--- SVD Method ---
1. Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) (Score: 1.551)
2. Crouching Tiger, Hidden Dragon (Wo hu cang long) (2000) (Score: 1.219)
3. Almost Famous (2000) (Score: 1.200)
4. Star Wars: Episode IV - A New Hope (1977) (Score: 1.195)
5. Requiem for a Dream (2000) (Score: 0.997)
6. Royal Tenenbaums, The (2001) (Score: 0.959)
7. Gladiator (2000) (Score: 0.942)
8. Star Wars: Episode VI - Return of the Jedi (1983) (Score: 0.931)
9. Cast Away (2000) (Score: 0.872)
10. Run Lola Run (Lola rennt) (1998) (Score: 0.808)


In [57]:
# Collaborative Filtering Recommendations
# Uses the same `demo_user` chosen in the SVD demo cell, so that cell must run first.
print("\n--- Collaborative Filtering Method ---")
cf_recs = recommend_movies(demo_user, N=10, method='collaborative')
for i, (movie_id, title, score) in enumerate(cf_recs, 1):
    print(f"{i}. {title} (Score: {score:.3f})")



--- Collaborative Filtering Method ---
1. Spanish Fly (1998) (Score: 4.398)
2. Taste of Tea, The (Cha no aji) (2004) (Score: 4.103)
3. a/k/a Tommy Chong (2005) (Score: 4.000)
4. Adjuster, The (1991) (Score: 3.984)
5. Boys, Les (1997) (Score: 3.981)
6. Krakatoa, East of Java (1969) (Score: 3.949)
7. Delta of Venus (1995) (Score: 3.931)
8. Last Command, The (1928) (Score: 3.931)
9. I Was Born, But... (a.k.a. Children of Tokyo) (Otona no miru ehon - Umarete wa mita keredo) (1932) (Score: 3.883)
10. Archangel (1990) (Score: 3.866)


## SAVE MODEL ARTIFACTS

In [60]:
# Bundle everything the recommendation functions read (fitted model, factor
# matrices, similarity matrix, ID <-> index maps, training matrix, movie
# table) together with the evaluation results.
model_artifacts = {
    'svd_model': svd_model,
    'user_factors': user_factors,
    'movie_factors': movie_factors,
    'item_similarity': item_similarity,
    'user_to_idx': user_to_idx,
    'movie_to_idx': movie_to_idx,
    'idx_to_user': idx_to_user,
    'idx_to_movie': idx_to_movie,
    'user_item_matrix': user_item_matrix,
    'movies': movies,
    'svd_metrics': svd_metrics,
    'cf_metrics': cf_metrics
}

# Write the bundle to the working directory as a single pickle file.
# NOTE: unpickling can execute arbitrary code, so only load this file back
# from a source you trust.
with open('movie_recommender_model.pkl', 'wb') as f:
    pickle.dump(model_artifacts, f)