In [76]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from scipy.sparse.linalg import svds
import warnings
warnings.filterwarnings('ignore')

In [77]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')
links = pd.read_csv('links.csv')

In [78]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [79]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [80]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [81]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [52]:
print("Shape of movies dataframe:", movies.shape)
print("Shape of ratings dataframe:", ratings.shape)
print("Shape of tags dataframe:", tags.shape)
print("Shape of links dataframe:", links.shape)

Shape of movies dataframe: (9742, 3)
Shape of ratings dataframe: (100836, 3)
Shape of tags dataframe: (3683, 3)
Shape of links dataframe: (9742, 3)


In [53]:
print("\n movies info:")
movies.info()
print("\n links info:")
links.info()
print("\n tags info:")
tags.info()
print("\n ratings info:")
ratings.info()


 movies info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB

 links info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB

 tags info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   userId   3683 non-null   int64 
 1   movieId  3683 non-null   int64 
 2   tag     

In [54]:
# Extract year from title
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)')
movies['year'] = movies['year'].fillna(0).astype(int)

In [55]:
# Create user-item matrix
user_item_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating')

# Fill NaN values with 0 (indicating no rating)
user_item_matrix = user_item_matrix.fillna(0)

print(f"\nUser-item matrix shape: {user_item_matrix.shape}")


User-item matrix shape: (610, 9724)


## use collaborative system

In [62]:
def calculate_user_similarity(user_item_matrix):
    """Calculate cosine similarity between users"""
    # Normalize the ratings by subtracting each user's mean rating
    user_mean_ratings = user_item_matrix.mean(axis=1)
    user_item_normalized = user_item_matrix.sub(user_mean_ratings, axis=0)
    user_item_normalized = user_item_normalized.fillna(0)

    # Calculate cosine similarity
    user_similarity = cosine_similarity(user_item_normalized)
    user_similarity_df = pd.DataFrame(user_similarity,
                                     index=user_item_matrix.index,
                                     columns=user_item_matrix.index)
    return user_similarity_df, user_item_normalized

def predict_ratings_user_based(user_id, user_item_matrix, user_similarity_df, n_similar_users=10):
    """Predict ratings for all movies for a given user using user-based collaborative filtering"""
    # Get the user's ratings
    user_ratings = user_item_matrix.loc[user_id]

    # Get similar users (excluding the user themselves)
    similar_users = user_similarity_df[user_id].sort_values(ascending=False).index[1:n_similar_users+1]

    # Calculate weighted average of similar users' ratings
    numerator = 0
    denominator = 0

    for other_user in similar_users:
        similarity = user_similarity_df.loc[user_id, other_user]
        other_user_ratings = user_item_matrix.loc[other_user]

        numerator += similarity * other_user_ratings
        denominator += abs(similarity)

    # Avoid division by zero
    predicted_ratings = numerator / denominator if denominator != 0 else 0

    return predicted_ratings

def recommend_movies_user_based(user_id, user_item_matrix, user_similarity_df, movies_df, n_recommendations=10):
    """Generate movie recommendations for a user using user-based collaborative filtering"""
    # Get predicted ratings
    predicted_ratings = predict_ratings_user_based(user_id, user_item_matrix, user_similarity_df)

    # Get movies the user hasn't rated
    user_ratings = user_item_matrix.loc[user_id]
    unseen_movies = user_ratings[user_ratings == 0].index

    # Get predicted ratings for unseen movies
    predicted_unseen = predicted_ratings[unseen_movies]

    # Get top N recommendations
    top_recommendations = predicted_unseen.sort_values(ascending=False).head(n_recommendations)

    # Get movie titles and genres
    recommendations_with_details = []
    for movie_id, predicted_rating in top_recommendations.items():
        movie_info = movies_df[movies_df['movieId'] == movie_id]
        if not movie_info.empty:
            movie_title = movie_info['title'].values[0]
            movie_genres = movie_info['genres'].values[0]
            recommendations_with_details.append((movie_title, movie_genres, predicted_rating))

    return recommendations_with_details

# Calculate user similarity
user_similarity_df, user_item_normalized = calculate_user_similarity(user_item_matrix)

# Test with a sample user
sample_user_id = 1
user_based_recommendations = recommend_movies_user_based(sample_user_id, user_item_matrix, user_similarity_df, movies)
print(f"\nUser-based recommendations for user {sample_user_id}:")
for i, (title, genres, rating) in enumerate(user_based_recommendations, 1):
    print(f"{i}. {title} | Genres: {genres} | Predicted rating: {rating:.2f}")


User-based recommendations for user 1:
1. Aliens (1986) | Genres: Action|Adventure|Horror|Sci-Fi | Predicted rating: 4.32
2. Godfather, The (1972) | Genres: Crime|Drama | Predicted rating: 4.26
3. Terminator 2: Judgment Day (1991) | Genres: Action|Sci-Fi | Predicted rating: 4.20
4. Die Hard (1988) | Genres: Action|Crime|Thriller | Predicted rating: 4.00
5. Sixth Sense, The (1999) | Genres: Drama|Horror|Mystery | Predicted rating: 3.96
6. Hunt for Red October, The (1990) | Genres: Action|Adventure|Thriller | Predicted rating: 3.96
7. Breakfast Club, The (1985) | Genres: Comedy|Drama | Predicted rating: 3.67
8. Godfather: Part II, The (1974) | Genres: Crime|Drama | Predicted rating: 3.58
9. Star Trek II: The Wrath of Khan (1982) | Genres: Action|Adventure|Sci-Fi|Thriller | Predicted rating: 3.54
10. Gattaca (1997) | Genres: Drama|Sci-Fi|Thriller | Predicted rating: 3.51


## Item-Based Collaborative Filtering

In [63]:
def calculate_item_similarity(user_item_matrix):
    """Calculate cosine similarity between items"""
    # Transpose the matrix to get item-user matrix
    item_user_matrix = user_item_matrix.T

    # Calculate cosine similarity
    item_similarity = cosine_similarity(item_user_matrix)
    item_similarity_df = pd.DataFrame(item_similarity,
                                     index=item_user_matrix.index,
                                     columns=item_user_matrix.index)
    return item_similarity_df

def predict_ratings_item_based(user_id, user_item_matrix, item_similarity_df, n_similar_items=5):
    """Predict ratings for all movies for a given user using item-based collaborative filtering"""
    # Get the user's ratings
    user_ratings = user_item_matrix.loc[user_id]

    # Get movies the user hasn't rated
    unseen_movies = user_ratings[user_ratings == 0].index

    # Predict ratings for unseen movies
    predicted_ratings = {}

    for movie_id in unseen_movies:
        # Get similar items that the user has rated
        similar_items = item_similarity_df[movie_id].sort_values(ascending=False).index[1:n_similar_items+1]
        rated_similar_items = [item for item in similar_items if user_ratings[item] > 0]

        if not rated_similar_items:
            continue

        # Calculate weighted average
        numerator = 0
        denominator = 0

        for similar_item in rated_similar_items:
            similarity = item_similarity_df.loc[movie_id, similar_item]
            rating = user_ratings[similar_item]

            numerator += similarity * rating
            denominator += abs(similarity)

        if denominator != 0:
            predicted_ratings[movie_id] = numerator / denominator

    return predicted_ratings

def recommend_movies_item_based(user_id, user_item_matrix, item_similarity_df, movies_df, n_recommendations=10):
    """Generate movie recommendations for a user using item-based collaborative filtering"""
    # Get predicted ratings
    predicted_ratings = predict_ratings_item_based(user_id, user_item_matrix, item_similarity_df)

    # Get top N recommendations
    top_recommendations = sorted(predicted_ratings.items(), key=lambda x: x[1], reverse=True)[:n_recommendations]

    # Get movie titles and genres
    recommendations_with_details = []
    for movie_id, predicted_rating in top_recommendations:
        movie_info = movies_df[movies_df['movieId'] == movie_id]
        if not movie_info.empty:
            movie_title = movie_info['title'].values[0]
            movie_genres = movie_info['genres'].values[0]
            recommendations_with_details.append((movie_title, movie_genres, predicted_rating))

    return recommendations_with_details

# Calculate item similarity
item_similarity_df = calculate_item_similarity(user_item_normalized)

# Test with a sample user
item_based_recommendations = recommend_movies_item_based(sample_user_id, user_item_matrix, item_similarity_df, movies)
print(f"\nItem-based recommendations for user {sample_user_id}:")
for i, (title, genres, rating) in enumerate(item_based_recommendations, 1):
    print(f"{i}. {title} | Genres: {genres} | Predicted rating: {rating:.2f}")


Item-based recommendations for user 1:
1. Cinderella (1950) | Genres: Animation|Children|Fantasy|Musical|Romance | Predicted rating: 5.00
2. Fox and the Hound, The (1981) | Genres: Animation|Children|Drama | Predicted rating: 5.00
3. Sound of Music, The (1965) | Genres: Musical|Romance | Predicted rating: 5.00
4. Lady and the Tramp (1955) | Genres: Animation|Children|Comedy|Romance | Predicted rating: 5.00
5. 101 Dalmatians (One Hundred and One Dalmatians) (1961) | Genres: Adventure|Animation|Children | Predicted rating: 5.00
6. Rescuers Down Under, The (1990) | Genres: Adventure|Animation|Children | Predicted rating: 5.00
7. Little Shop of Horrors (1986) | Genres: Comedy|Horror|Musical | Predicted rating: 5.00
8. Sleepy Hollow (1999) | Genres: Fantasy|Horror|Mystery|Romance | Predicted rating: 5.00
9. Animal House (1978) | Genres: Comedy | Predicted rating: 5.00
10. Naked Gun: From the Files of Police Squad!, The (1988) | Genres: Action|Comedy|Crime|Romance | Predicted rating: 5.00


## Matrix Factorization with SVD

In [66]:
def svd_recommendations(user_item_matrix, movies_df, user_id, n_recommendations=10, n_factors=50):
    """Generate recommendations using SVD matrix factorization"""
    # Convert to numpy matrix
    R = user_item_matrix.values

    # Normalize by subtracting user means
    user_ratings_mean = np.mean(R, axis=1)
    R_demeaned = R - user_ratings_mean.reshape(-1, 1)

    # Perform SVD
    U, sigma, Vt = svds(R_demeaned, k=n_factors)

    # Convert sigma to diagonal matrix
    sigma = np.diag(sigma)

    # Make predictions
    all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
    preds_df = pd.DataFrame(all_user_predicted_ratings,
                           index=user_item_matrix.index,
                           columns=user_item_matrix.columns)

    # Get predictions for the user
    user_predicted_ratings = preds_df.loc[user_id]

    # Get movies the user hasn't rated
    user_ratings = user_item_matrix.loc[user_id]
    unseen_movies = user_ratings[user_ratings == 0].index

    # Get top recommendations
    recommendations = user_predicted_ratings[unseen_movies].sort_values(ascending=False).head(n_recommendations)

    # Get movie titles and genres
    recommendations_with_details = []
    for movie_id, predicted_rating in recommendations.items():
        movie_info = movies_df[movies_df['movieId'] == movie_id]
        if not movie_info.empty:
            movie_title = movie_info['title'].values[0]
            movie_genres = movie_info['genres'].values[0]
            recommendations_with_details.append((movie_title, movie_genres, predicted_rating))

    return recommendations_with_details

    # Test SVD recommendations
svd_recommendations_list = svd_recommendations(user_item_matrix, movies, sample_user_id)
print(f"\nSVD-based recommendations for user {sample_user_id}:")
for i, (title, genres, rating) in enumerate(svd_recommendations_list, 1):
    print(f"{i}. {title} | Genres: {genres} | Predicted rating: {rating:.2f}")



SVD-based recommendations for user 1:
1. Die Hard (1988) | Genres: Action|Crime|Thriller | Predicted rating: 4.02
2. Godfather: Part II, The (1974) | Genres: Crime|Drama | Predicted rating: 3.32
3. Jaws (1975) | Genres: Action|Horror | Predicted rating: 3.30
4. Godfather, The (1972) | Genres: Crime|Drama | Predicted rating: 2.89
5. Breakfast Club, The (1985) | Genres: Comedy|Drama | Predicted rating: 2.87
6. Stand by Me (1986) | Genres: Adventure|Drama | Predicted rating: 2.79
7. Christmas Story, A (1983) | Genres: Children|Comedy | Predicted rating: 2.59
8. Lady and the Tramp (1955) | Genres: Animation|Children|Comedy|Romance | Predicted rating: 2.44
9. Snatch (2000) | Genres: Comedy|Crime|Thriller | Predicted rating: 2.40
10. Little Mermaid, The (1989) | Genres: Animation|Children|Comedy|Musical|Romance | Predicted rating: 2.38


## Evaluation with Precision at K

In [68]:
def precision_at_k(user_id, user_item_matrix, similarity_df, movies_df, method='user_based', k=10, threshold=3.5):
    """Calculate precision at K for a user"""
    # Split the user's ratings into train and test
    user_ratings = user_item_matrix.loc[user_id]
    rated_movies = user_ratings[user_ratings > 0].index

    if len(rated_movies) < 20:  # Skip users with too few ratings
        return None

    # Split into train and test
    train_movies, test_movies = train_test_split(rated_movies, test_size=0.2, random_state=42)

    # Create a temporary user-item matrix with test ratings hidden
    temp_user_item = user_item_matrix.copy()
    temp_user_item.loc[user_id, test_movies] = 0

    # Generate recommendations based on the specified method
    if method == 'user_based':
        recommendations = recommend_movies_user_based(user_id, temp_user_item, similarity_df, movies_df, n_recommendations=k)
        recommended_movie_ids = [movies[movies['title'] == title]['movieId'].values[0] for title, _, _ in recommendations]
    elif method == 'item_based':
        item_similarity = calculate_item_similarity(temp_user_item)
        recommendations = recommend_movies_item_based(user_id, temp_user_item, item_similarity, movies_df, n_recommendations=k)
        recommended_movie_ids = [movies[movies['title'] == title]['movieId'].values[0] for title, _, _ in recommendations]
    elif method == 'svd':
        recommendations = svd_recommendations(temp_user_item, movies, user_id, n_recommendations=k)
        recommended_movie_ids = [movies[movies['title'] == title]['movieId'].values[0] for title, _, _ in recommendations]
    else:
        raise ValueError("Method must be 'user_based', 'item_based', or 'svd'")

    # Get actual high-rated movies from test set
    actual_high_rated = user_ratings[test_movies]
    actual_high_rated = actual_high_rated[actual_high_rated >= threshold].index

    # Calculate precision
    relevant_recommended = set(recommended_movie_ids) & set(actual_high_rated)
    precision = len(relevant_recommended) / k if k > 0 else 0

    return precision

def evaluate_precision_at_k(user_item_matrix, similarity_df, movies_df, method='user_based', k=10, n_users=30):
    """Evaluate precision at K for multiple users"""
    precisions = []
    user_ids = user_item_matrix.index.tolist()

    for user_id in user_ids[:n_users]:
        precision = precision_at_k(user_id, user_item_matrix, similarity_df, movies_df, method, k)
        if precision is not None:
            precisions.append(precision)

    avg_precision = np.mean(precisions) if precisions else 0
    return avg_precision

# Evaluate all methods
print("\nEvaluating recommendation methods...")
user_based_precision = evaluate_precision_at_k(user_item_matrix, user_similarity_df, movies, method='user_based')
item_based_precision = evaluate_precision_at_k(user_item_matrix, item_similarity_df, movies, method='item_based')
svd_precision = evaluate_precision_at_k(user_item_matrix, user_similarity_df, movies, method='svd')

print(f"\nAverage Precision at 10:")
print(f"User-based collaborative filtering: {user_based_precision:.4f}")
print(f"Item-based collaborative filtering: {item_based_precision:.4f}")
print(f"SVD matrix factorization: {svd_precision:.4f}")


Evaluating recommendation methods...

Average Precision at 10:
User-based collaborative filtering: 0.2800
Item-based collaborative filtering: 0.0633
SVD matrix factorization: 0.2333


## Hybrid Recommendation System

In [71]:
def hybrid_recommendations(user_id, user_item_matrix, user_similarity_df, item_similarity_df, movies_df, n_recommendations=10):
    """Generate hybrid recommendations combining user-based, item-based, and SVD approaches"""
    # Get recommendations from all three methods
    user_based_recs = recommend_movies_user_based(user_id, user_item_matrix, user_similarity_df, movies_df, n_recommendations*2)
    item_based_recs = recommend_movies_item_based(user_id, user_item_matrix, item_similarity_df, movies_df, n_recommendations*2)
    svd_recs = svd_recommendations(user_item_matrix, movies, user_id, n_recommendations*2)

    # Create a dictionary to store weighted scores
    movie_scores = {}

    # Weight the recommendations from different methods
    for i, (title, genres, rating) in enumerate(user_based_recs):
        movie_id = movies[movies['title'] == title]['movieId'].values[0]
        movie_scores[movie_id] = movie_scores.get(movie_id, 0) + rating * 0.4

    for i, (title, genres, rating) in enumerate(item_based_recs):
        movie_id = movies[movies['title'] == title]['movieId'].values[0]
        movie_scores[movie_id] = movie_scores.get(movie_id, 0) + rating * 0.3

    for i, (title, genres, rating) in enumerate(svd_recs):
        movie_id = movies[movies['title'] == title]['movieId'].values[0]
        movie_scores[movie_id] = movie_scores.get(movie_id, 0) + rating * 0.3

    # Get top N recommendations
    top_recommendations = sorted(movie_scores.items(), key=lambda x: x[1], reverse=True)[:n_recommendations]

    # Get movie details
    recommendations_with_details = []
    for movie_id, score in top_recommendations:
        movie_info = movies_df[movies_df['movieId'] == movie_id]
        if not movie_info.empty:
            movie_title = movie_info['title'].values[0]
            movie_genres = movie_info['genres'].values[0]
            recommendations_with_details.append((movie_title, movie_genres, score))

    return recommendations_with_details

# Test hybrid recommendations
hybrid_recommendations_list = hybrid_recommendations(sample_user_id, user_item_matrix, user_similarity_df, item_similarity_df, movies)
print(f"\nHybrid recommendations for user {sample_user_id}:")
for i, (title, genres, rating) in enumerate(hybrid_recommendations_list, 1):
    print(f"{i}. {title} | Genres: {genres} | Combined score: {rating:.2f}")


Hybrid recommendations for user 1:
1. Die Hard (1988) | Genres: Action|Crime|Thriller | Combined score: 2.81
2. Godfather, The (1972) | Genres: Crime|Drama | Combined score: 2.57
3. Godfather: Part II, The (1974) | Genres: Crime|Drama | Combined score: 2.43
4. Aliens (1986) | Genres: Action|Adventure|Horror|Sci-Fi | Combined score: 2.39
5. Breakfast Club, The (1985) | Genres: Comedy|Drama | Combined score: 2.33
6. Terminator 2: Judgment Day (1991) | Genres: Action|Sci-Fi | Combined score: 2.33
7. Jaws (1975) | Genres: Action|Horror | Combined score: 2.30
8. Hunt for Red October, The (1990) | Genres: Action|Adventure|Thriller | Combined score: 2.24
9. Lady and the Tramp (1955) | Genres: Animation|Children|Comedy|Romance | Combined score: 2.23
10. 101 Dalmatians (One Hundred and One Dalmatians) (1961) | Genres: Adventure|Animation|Children | Combined score: 2.19


## Complete Recommendation Function

In [87]:
def get_recommendations(user_id, method='hybrid', n_recommendations=10):
    """Get recommendations for a user using the specified method"""
    if method == 'user_based':
        return recommend_movies_user_based(user_id, user_item_matrix, user_similarity_df, movies, n_recommendations)
    elif method == 'item_based':
        return recommend_movies_item_based(user_id, user_item_matrix, item_similarity_df, movies, n_recommendations)
    elif method == 'svd':
        return svd_recommendations(user_item_matrix, movies, user_id, n_recommendations)
    elif method == 'hybrid':
        return hybrid_recommendations(user_id, user_item_matrix, user_similarity_df, item_similarity_df, movies, n_recommendations)
    else:
        raise ValueError("Method must be 'user_based', 'item_based', 'svd', or 'hybrid'")

# Example usage
user_id = 368
recommendations = get_recommendations(user_id, method='hybrid')
print(f"\nFinal recommendations for user {user_id}:")
for i, (title, genres, rating) in enumerate(recommendations, 1):
    print(f"{i}. {title} | Genres: {genres} | Score: {rating:.2f}")

# Test with another user
user_id = 369
recommendations = get_recommendations(user_id, method='hybrid')
print(f"\nFinal recommendations for user {user_id}:")
for i, (title, genres, rating) in enumerate(recommendations, 1):
    print(f"{i}. {title} | Genres: {genres} | Score: {rating:.2f}")


Final recommendations for user 368:
1. Monty Python and the Holy Grail (1975) | Genres: Adventure|Comedy|Fantasy | Score: 2.65
2. Blade Runner (1982) | Genres: Action|Sci-Fi|Thriller | Score: 2.13
3. Casablanca (1942) | Genres: Drama|Romance | Score: 2.05
4. Dances with Wolves (1990) | Genres: Adventure|Drama|Western | Score: 1.84
5. Clockwork Orange, A (1971) | Genres: Crime|Drama|Sci-Fi|Thriller | Score: 1.78
6. Star Wars: Episode I - The Phantom Menace (1999) | Genres: Action|Adventure|Sci-Fi | Score: 1.77
7. Contact (1997) | Genres: Drama|Sci-Fi | Score: 1.62
8. E.T. the Extra-Terrestrial (1982) | Genres: Children|Drama|Sci-Fi | Score: 1.58
9. Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964) | Genres: Comedy|War | Score: 1.52
10. Maltese Falcon, The (1941) | Genres: Film-Noir|Mystery | Score: 1.50

Final recommendations for user 369:
1. Pulp Fiction (1994) | Genres: Comedy|Crime|Drama|Thriller | Score: 2.14
2. American Beauty (1999) | Genres: Drama|Roman