In [46]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Read the raw tables; both paths are relative to the notebook's working directory.
ratings_df = pd.read_csv('../data/ratings.csv')
movies_df = pd.read_csv('../data/movies.csv')

# Item-by-user rating matrix: one row per movieId, one column per userId.
# pivot_table averages duplicate (movie, user) pairs (its default aggregation);
# pairs without a rating are filled with 0.
movie_user_matrix = (
    ratings_df
    .pivot_table(values='rating', index='movieId', columns='userId')
    .fillna(0)
)

# Pairwise cosine similarity between the movie rows (square, movie x movie).
cosine_sim = cosine_similarity(movie_user_matrix, movie_user_matrix)

# Label both axes with movieId so similarities can be looked up by id.
cosine_sim_df = pd.DataFrame(
    data=cosine_sim,
    index=movie_user_matrix.index,
    columns=movie_user_matrix.index,
)

# Function to make recommendations
def make_expanded_recommendations(movie_id, num_recommendations=20):
    """Return the ids of the movies most similar to ``movie_id``.

    Similarities are read from the module-level ``cosine_sim_df``.

    Args:
        movie_id: Id of the seed movie; must be a column label of
            ``cosine_sim_df``.
        num_recommendations: Maximum number of ids to return.

    Returns:
        list: Up to ``num_recommendations`` movie ids ordered from most to
        least similar. The seed movie itself is never included.

    Raises:
        KeyError: If ``movie_id`` is not a column of ``cosine_sim_df``.
    """
    if num_recommendations <= 0:
        return []

    # Similarity of the seed movie to every movie in the table.
    sim_scores = cosine_sim_df[movie_id]

    # Remove the seed by label rather than by skipping the first sorted row:
    # any other movie with similarity 1.0 ties with the seed, so the seed is
    # not guaranteed to sort into position 0.
    candidates = sim_scores[sim_scores.index != movie_id]

    ranked = candidates.sort_values(ascending=False)
    return ranked.index[:num_recommendations].tolist()

# Function to evaluate recommendations
def evaluate_recommendations(recommended_movies, actual_likes):
    """Score a recommendation list against the movies a user actually liked.

    Args:
        recommended_movies: Iterable of recommended movie ids; duplicates are
            counted once.
        actual_likes: Iterable of liked movie ids. Any iterable is accepted
            (set, list, ...), not only a ``set``.

    Returns:
        dict: ``'precision'``, ``'recall'`` and ``'f1_score'`` as floats. A
        metric whose denominator is zero is reported as ``0.0``.
    """
    recommended_set = set(recommended_movies)
    # The set operators below raise TypeError for a list operand, so coerce.
    liked_set = set(actual_likes)

    true_positives = len(recommended_set & liked_set)
    false_positives = len(recommended_set - liked_set)
    false_negatives = len(liked_set - recommended_set)

    predicted_positives = true_positives + false_positives
    relevant_items = true_positives + false_negatives

    precision = true_positives / predicted_positives if predicted_positives else 0.0
    recall = true_positives / relevant_items if relevant_items else 0.0
    f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0.0

    return {'precision': precision, 'recall': recall, 'f1_score': f1_score}

# Define user's actual likes based on a threshold (e.g., rating >= 4)
def get_actual_likes(user_id, threshold=4):
    """Collect the movies a user rated at or above ``threshold``.

    Reads the module-level ``ratings_df`` (columns ``userId``, ``movieId``,
    ``rating``).

    Args:
        user_id: Value matched against the ``userId`` column.
        threshold: Minimum rating (inclusive) for a movie to count as liked.

    Returns:
        set: The ``movieId`` values of the matching rows; empty when no row
        matches.
    """
    by_user = ratings_df['userId'] == user_id
    rated_high = ratings_df['rating'] >= threshold
    liked_movie_ids = ratings_df[by_user & rated_high]['movieId']
    return set(liked_movie_ids.tolist())

# Example usage
user_id = 1  # Example user ID

# Seed the recommender with the movie this user rated highest.
# The 'movieId' column is selected *before* taking the first row so the id
# keeps its integer type. Reading it out of a whole row
# (.iloc[0]['movieId']) goes through a mixed-dtype row Series, which turns
# the id into a float (it printed as 5060.0) that is then used as a lookup key.
top_rated_movie_id = int(
    ratings_df[ratings_df['userId'] == user_id]
    .sort_values(by='rating', ascending=False)['movieId']
    .iloc[0]
)
recommended_movies = make_expanded_recommendations(top_rated_movie_id, 10)
actual_likes = get_actual_likes(user_id)
evaluation_metrics = evaluate_recommendations(recommended_movies, actual_likes)

print("Top Rated Movie ID by User:", top_rated_movie_id)
print("Recommended Movie IDs:", recommended_movies)
print("Actual Likes:", list(actual_likes))
print("Evaluation Metrics:", evaluation_metrics)


Top Rated Movie ID by User: 5060.0
Recommended Movie IDs: [1304, 1179, 1299, 2110, 2944, 919, 1078, 1278, 1079, 1387]
Actual Likes: [1024, 1, 1025, 3, 2048, 1029, 6, 1031, 1032, 2054, 2058, 2571, 527, 1552, 1042, 2580, 1049, 2078, 543, 3617, 1060, 1573, 2596, 552, 553, 2090, 2094, 47, 2096, 1073, 50, 1587, 2099, 3639, 1080, 2105, 2616, 1089, 1090, 2115, 1092, 2116, 2628, 1097, 3147, 590, 592, 593, 1617, 2640, 596, 1620, 2641, 2644, 2648, 1625, 2137, 2139, 3671, 2141, 2654, 2143, 608, 3168, 101, 1127, 110, 1136, 2161, 3702, 3703, 2174, 2692, 2700, 2193, 3729, 661, 151, 2716, 157, 3740, 3744, 163, 1196, 1197, 1198, 3253, 1206, 1208, 1210, 1213, 1214, 1220, 1732, 1222, 1224, 2761, 1226, 3273, 3793, 216, 1240, 2268, 733, 2273, 3809, 231, 1256, 235, 2797, 1265, 1777, 2291, 1270, 1275, 1278, 1793, 1282, 260, 2826, 1291, 1804, 1805, 1298, 2329, 804, 2858, 2353, 2872, 3386, 2366, 1348, 333, 2387, 2899, 2395, 349, 356, 2916, 2406, 362, 367, 3439, 3440, 3441, 3448, 3450, 2427, 1920, 2944, 2947, 

Typical Accuracy Levels:

- General Accuracy: There is no single "average accuracy" for recommender systems, because results depend heavily on the dataset, the evaluation protocol, and the specific system configuration. As rough reference points for movie recommendation systems:
    - RMSE: Values around 0.8 to 1.2 are typical for rating predictions on a 1-5 star scale, with lower values indicating better accuracy.
    - Precision/Recall: Precision and recall vary with the length of the recommendation list and with how a "liked" item is defined, but good systems might achieve over 20-30% precision in top-N recommendations in practical settings.
    - High-Performance Systems: As an upper reference point, the winning entries of the Netflix Prize competition achieved RMSEs around 0.85, which is considered very high accuracy for a real-world system.

In [78]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Read the raw tables; both paths are relative to the notebook's working directory.
ratings_df = pd.read_csv('../data/ratings.csv')
movies_df = pd.read_csv('../data/movies.csv')

# Item-by-user rating matrix: one row per movieId, one column per userId,
# with 0 wherever a user has not rated a movie.
movie_user_matrix = (
    ratings_df
    .pivot_table(values='rating', index='movieId', columns='userId')
    .fillna(0)
)

# Same matrix in CSR form so the similarity computation can stay sparse.
sparse_movie_user_matrix = csr_matrix(movie_user_matrix.to_numpy())

# Movie x movie cosine similarity, returned as a sparse matrix.
cosine_sim_sparse = cosine_similarity(
    sparse_movie_user_matrix,
    dense_output=False,
)

# Sparse-backed DataFrame labelled by movieId on both axes.
cosine_sim_df = pd.DataFrame.sparse.from_spmatrix(
    cosine_sim_sparse,
    index=movie_user_matrix.index,
    columns=movie_user_matrix.index,
)

# Function to make recommendations
def make_expanded_recommendations(movie_id, num_recommendations=20):
    """Return the ids of the movies most similar to ``movie_id``.

    Similarities are read from the module-level ``cosine_sim_df``.

    Args:
        movie_id: Id of the seed movie; must be a row label of
            ``cosine_sim_df``.
        num_recommendations: Maximum number of ids to return.

    Returns:
        list: Up to ``num_recommendations`` movie ids ordered from most to
        least similar. The seed movie itself is never included.

    Raises:
        KeyError: If ``movie_id`` is not a row label of ``cosine_sim_df``.
    """
    if num_recommendations <= 0:
        return []

    # Similarity of the seed movie to every movie in the table.
    sim_scores = cosine_sim_df.loc[movie_id]

    # Remove the seed by label rather than by skipping the first sorted row:
    # any other movie with similarity 1.0 ties with the seed, so the seed is
    # not guaranteed to sort into position 0.
    candidates = sim_scores[sim_scores.index != movie_id]

    ranked = candidates.sort_values(ascending=False)
    return ranked.index[:num_recommendations].tolist()

# Function to evaluate recommendations
def evaluate_recommendations(recommended_movies, actual_likes):
    """Score a recommendation list against the movies a user actually liked.

    Args:
        recommended_movies: Iterable of recommended movie ids; duplicates are
            counted once.
        actual_likes: Iterable of liked movie ids. Any iterable is accepted
            (set, list, ...), not only a ``set``.

    Returns:
        dict: ``'precision'``, ``'recall'`` and ``'f1_score'`` as floats. A
        metric whose denominator is zero is reported as ``0.0``.
    """
    recommended_set = set(recommended_movies)
    # The set operators below raise TypeError for a list operand, so coerce.
    liked_set = set(actual_likes)

    true_positives = len(recommended_set & liked_set)
    false_positives = len(recommended_set - liked_set)
    false_negatives = len(liked_set - recommended_set)

    predicted_positives = true_positives + false_positives
    relevant_items = true_positives + false_negatives

    precision = true_positives / predicted_positives if predicted_positives else 0.0
    recall = true_positives / relevant_items if relevant_items else 0.0
    f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0.0

    return {'precision': precision, 'recall': recall, 'f1_score': f1_score}

# Define user's actual likes based on a threshold (e.g., rating >= 4)
def get_actual_likes(user_id, threshold=4):
    """Return the set of movie ids that ``user_id`` rated ``>= threshold``.

    The ratings come from the module-level ``ratings_df`` (columns
    ``userId``, ``movieId``, ``rating``).

    Args:
        user_id: Value matched against the ``userId`` column.
        threshold: Inclusive lower bound on ``rating``.

    Returns:
        set: Matching ``movieId`` values; empty when nothing matches.
    """
    is_user = ratings_df['userId'] == user_id
    is_liked = ratings_df['rating'] >= threshold
    matching_rows = ratings_df[is_user & is_liked]
    return set(matching_rows['movieId'].tolist())

# Example usage
user_id = 1  # Example user ID

# Seed the recommender with the movie this user rated highest.
# The 'movieId' column is selected *before* taking the first row so the id
# keeps its integer type. Reading it out of a whole row
# (.iloc[0]['movieId']) goes through a mixed-dtype row Series, which turns
# the id into a float (it printed as 5060.0) that is then used as a lookup key.
top_rated_movie_id = int(
    ratings_df[ratings_df['userId'] == user_id]
    .sort_values(by='rating', ascending=False)['movieId']
    .iloc[0]
)
recommended_movies = make_expanded_recommendations(top_rated_movie_id, 10)
actual_likes = get_actual_likes(user_id)
evaluation_metrics = evaluate_recommendations(recommended_movies, actual_likes)

print("Top Rated Movie ID by User:", top_rated_movie_id)
print("Recommended Movie IDs:", recommended_movies)
print("Actual Likes:", list(actual_likes))
print("Evaluation Metrics:", evaluation_metrics)


Top Rated Movie ID by User: 5060.0
Recommended Movie IDs: [1304, 1179, 1299, 2110, 2944, 919, 1078, 1278, 1079, 1387]
Actual Likes: [1024, 1, 1025, 3, 2048, 1029, 6, 1031, 1032, 2054, 2058, 2571, 527, 1552, 1042, 2580, 1049, 2078, 543, 3617, 1060, 1573, 2596, 552, 553, 2090, 2094, 47, 2096, 1073, 50, 1587, 2099, 3639, 1080, 2105, 2616, 1089, 1090, 2115, 1092, 2116, 2628, 1097, 3147, 590, 592, 593, 1617, 2640, 596, 1620, 2641, 2644, 2648, 1625, 2137, 2139, 3671, 2141, 2654, 2143, 608, 3168, 101, 1127, 110, 1136, 2161, 3702, 3703, 2174, 2692, 2700, 2193, 3729, 661, 151, 2716, 157, 3740, 3744, 163, 1196, 1197, 1198, 3253, 1206, 1208, 1210, 1213, 1214, 1220, 1732, 1222, 1224, 2761, 1226, 3273, 3793, 216, 1240, 2268, 733, 2273, 3809, 231, 1256, 235, 2797, 1265, 1777, 2291, 1270, 1275, 1278, 1793, 1282, 260, 2826, 1291, 1804, 1805, 1298, 2329, 804, 2858, 2353, 2872, 3386, 2366, 1348, 333, 2387, 2899, 2395, 349, 356, 2916, 2406, 362, 367, 3439, 3440, 3441, 3448, 3450, 2427, 1920, 2944, 2947, 

In [82]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Read the raw tables; both paths are relative to the notebook's working directory.
ratings_df = pd.read_csv('../data/ratings.csv')
movies_df = pd.read_csv('../data/movies.csv')

# Item-by-user rating matrix: one row per movieId, one column per userId,
# with 0 wherever a user has not rated a movie.
movie_user_matrix = (
    ratings_df
    .pivot_table(values='rating', index='movieId', columns='userId')
    .fillna(0)
)

# Same matrix in CSR form so the similarity computation can stay sparse.
sparse_movie_user_matrix = csr_matrix(movie_user_matrix.to_numpy())

# Movie x movie cosine similarity, returned as a sparse matrix.
cosine_sim_sparse = cosine_similarity(
    sparse_movie_user_matrix,
    dense_output=False,
)

# Sparse-backed DataFrame labelled by movieId on both axes.
cosine_sim_df = pd.DataFrame.sparse.from_spmatrix(
    cosine_sim_sparse,
    index=movie_user_matrix.index,
    columns=movie_user_matrix.index,
)

# Calculate genre-based similarity matrix
def get_genre_matrix():
    """Build a sparse one-hot genre matrix from the module-level ``movies_df``.

    The ``genres`` column is split on ``'|'``; every distinct token becomes one
    indicator column.

    Returns:
        scipy.sparse.csr_matrix: One row per row of ``movies_df`` and one
        column per distinct genre token, holding 0/1 flags.
    """
    genre_flags = movies_df['genres'].str.get_dummies(sep='|')
    return csr_matrix(genre_flags.to_numpy())

# Pairwise genre similarity between all rows of movies_df, kept sparse and
# labelled on both axes by movieId.
genre_matrix = get_genre_matrix()
genre_similarity = cosine_similarity(genre_matrix, dense_output=False)
genre_sim_df = pd.DataFrame.sparse.from_spmatrix(
    genre_similarity,
    index=movies_df['movieId'].to_numpy(),
    columns=movies_df['movieId'].to_numpy(),
)

def hybrid_recommendations(movie_id, num_recommendations=20, weight_factor=0.7):
    """Recommend movies by blending rating-based and genre-based similarity.

    Reads the module-level ``cosine_sim_df`` (rating similarity),
    ``genre_sim_df`` (genre similarity) and ``movie_user_matrix``.

    Args:
        movie_id: Id of the seed movie; must be a row label of all three
            tables above.
        num_recommendations: Maximum number of ids to return.
        weight_factor: Weight given to rating similarity when the seed has
            more than 10 ratings. With 10 or fewer ratings the weights are
            swapped, so genre similarity gets ``weight_factor`` instead.

    Returns:
        list: Up to ``num_recommendations`` movie ids ordered by descending
        hybrid score. The seed movie itself is never included.

    Raises:
        KeyError: If ``movie_id`` is missing from any of the three tables.
    """
    rating_sim_scores = cosine_sim_df.loc[movie_id]
    genre_sim_scores = genre_sim_df.loc[movie_id]

    # movie_user_matrix stores "not rated" as 0 (it is built with fillna(0)),
    # so .count() -- which counts non-NaN cells -- returns the number of user
    # columns for every movie and the low-rating branch below would never
    # run. Count the non-zero cells instead.
    num_ratings = int((movie_user_matrix.loc[movie_id] != 0).sum())
    weight = weight_factor if num_ratings > 10 else 1 - weight_factor  # More weight on genre if fewer ratings

    # The addition aligns the two Series on movie id; an id present in only
    # one of them gets NaN and therefore sorts to the end.
    hybrid_scores = weight * rating_sim_scores + (1 - weight) * genre_sim_scores

    # Remove the seed by label instead of skipping the first sorted row: other
    # movies can tie with the seed's score, so it need not sort into position 0.
    hybrid_scores = hybrid_scores[hybrid_scores.index != movie_id]
    hybrid_scores = hybrid_scores.sort_values(ascending=False)

    return hybrid_scores.index[:max(num_recommendations, 0)].tolist()

# Function to evaluate recommendations
def evaluate_recommendations(recommended_movies, actual_likes):
    """Score a recommendation list against the movies a user actually liked.

    Args:
        recommended_movies: Iterable of recommended movie ids; duplicates are
            counted once.
        actual_likes: Iterable of liked movie ids. Any iterable is accepted
            (set, list, ...), not only a ``set``.

    Returns:
        dict: ``'precision'``, ``'recall'`` and ``'f1_score'`` as floats. A
        metric whose denominator is zero is reported as ``0.0``.
    """
    recommended_set = set(recommended_movies)
    # The set operators below raise TypeError for a list operand, so coerce.
    liked_set = set(actual_likes)

    true_positives = len(recommended_set & liked_set)
    false_positives = len(recommended_set - liked_set)
    false_negatives = len(liked_set - recommended_set)

    predicted_positives = true_positives + false_positives
    relevant_items = true_positives + false_negatives

    precision = true_positives / predicted_positives if predicted_positives else 0.0
    recall = true_positives / relevant_items if relevant_items else 0.0
    f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0.0

    return {'precision': precision, 'recall': recall, 'f1_score': f1_score}

# Define user's actual likes based on a threshold (e.g., rating >= 4)
def get_actual_likes(user_id, threshold=4):
    """Look up which movies a user liked, i.e. rated ``>= threshold``.

    Uses the module-level ``ratings_df`` (columns ``userId``, ``movieId``,
    ``rating``).

    Args:
        user_id: Value matched against the ``userId`` column.
        threshold: Lowest rating (inclusive) that counts as a like.

    Returns:
        set: ``movieId`` values of the matching rows; empty if there are none.
    """
    row_filter = (ratings_df['userId'] == user_id) & (ratings_df['rating'] >= threshold)
    liked_ids = ratings_df[row_filter]['movieId'].tolist()
    return set(liked_ids)

# Example usage
user_id = 1  # Example user ID

# Seed the recommender with the movie this user rated highest.
# The 'movieId' column is selected *before* taking the first row so the id
# keeps its integer type. Reading it out of a whole row
# (.iloc[0]['movieId']) goes through a mixed-dtype row Series, which turns
# the id into a float (it printed as 5060.0) that is then used as a lookup key.
top_rated_movie_id = int(
    ratings_df[ratings_df['userId'] == user_id]
    .sort_values(by='rating', ascending=False)['movieId']
    .iloc[0]
)
recommended_movies = hybrid_recommendations(top_rated_movie_id, 10)
actual_likes = get_actual_likes(user_id)
evaluation_metrics = evaluate_recommendations(recommended_movies, actual_likes)

print("Top Rated Movie ID by User:", top_rated_movie_id)
print("Recommended Movie IDs:", recommended_movies)
print("Actual Likes:", list(actual_likes))
print("Evaluation Metrics:", evaluation_metrics)


Top Rated Movie ID by User: 5060.0
Recommended Movie IDs: [1299, 1078, 3448, 1090, 1272, 2944, 1263, 1292, 2973, 2289]
Actual Likes: [1024, 1, 1025, 3, 2048, 1029, 6, 1031, 1032, 2054, 2058, 2571, 527, 1552, 1042, 2580, 1049, 2078, 543, 3617, 1060, 1573, 2596, 552, 553, 2090, 2094, 47, 2096, 1073, 50, 1587, 2099, 3639, 1080, 2105, 2616, 1089, 1090, 2115, 1092, 2116, 2628, 1097, 3147, 590, 592, 593, 1617, 2640, 596, 1620, 2641, 2644, 2648, 1625, 2137, 2139, 3671, 2141, 2654, 2143, 608, 3168, 101, 1127, 110, 1136, 2161, 3702, 3703, 2174, 2692, 2700, 2193, 3729, 661, 151, 2716, 157, 3740, 3744, 163, 1196, 1197, 1198, 3253, 1206, 1208, 1210, 1213, 1214, 1220, 1732, 1222, 1224, 2761, 1226, 3273, 3793, 216, 1240, 2268, 733, 2273, 3809, 231, 1256, 235, 2797, 1265, 1777, 2291, 1270, 1275, 1278, 1793, 1282, 260, 2826, 1291, 1804, 1805, 1298, 2329, 804, 2858, 2353, 2872, 3386, 2366, 1348, 333, 2387, 2899, 2395, 349, 356, 2916, 2406, 362, 367, 3439, 3440, 3441, 3448, 3450, 2427, 1920, 2944, 2947,

NameError: name 'python' is not defined

In [84]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate

# Load data into surprise library format.
# The rating scale is taken from the data instead of being hard-coded to
# (1, 5), so ratings outside that range (e.g. half-star values below 1) are
# still covered by the scale.
# NOTE(review): confirm the actual rating range of ratings.csv.
reader = Reader(rating_scale=(ratings_df['rating'].min(), ratings_df['rating'].max()))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# Define SVD algorithm. A fixed random_state makes the factor initialisation
# repeatable; the cross-validation fold split below is still drawn randomly.
algo = SVD(random_state=42)

# Perform 5-fold cross-validation and print RMSE / MAE per fold
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Train on full dataset
trainset = data.build_full_trainset()
algo.fit(trainset)

# Predict a rating for a user and item (raw ids, as they appear in ratings_df)
user_id = 1
movie_id = 2
predicted_rating = algo.predict(user_id, movie_id).est
print("Predicted Rating for user:", user_id, "item:", movie_id, "is", predicted_rating)


ModuleNotFoundError: No module named 'surprise'