In [3]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
# Load ratings and movies data
ratings = pd.read_csv('ml-latest-small/ratings.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')
# Load movie links data
movie_links = pd.read_csv('ml-latest-small/links_with_url.csv')
# Load tags data
tags = pd.read_csv('ml-latest-small/tags.csv')

# Collaborative Filtering Pre processing logic

# Group ratings by movieId and calculate count and mean
movie_stats = ratings.groupby('movieId')['rating'].agg(['count', 'mean'])

# Calculate Bayesian average
C = movie_stats['count'].mean()
m = movie_stats['mean'].mean()
movie_stats['bayesian_avg'] = (C * m + movie_stats['count'] * movie_stats['mean']) / (C + movie_stats['count'])

# Define minimum count and Bayesian average threshold
min_C = 30
min_m = 3.5

# Filter movies based on thresholds
filtered_movies = movie_stats[(movie_stats['count'] >= min_C) & (movie_stats['bayesian_avg'] >= min_m)]
filtered_movielist = filtered_movies.index

# Get active users with at least 150 ratings
active_users = ratings.groupby('userId')['rating'].count()
active_users = active_users[active_users >= 150]
active_userlist = active_users.index

# Filter ratings for active users and filtered movies
ratings = ratings[ratings['userId'].isin(active_userlist) & ratings['movieId'].isin(filtered_movielist)]

# Merge ratings with movies data
final_ratings = ratings.merge(movies[['movieId', 'title']])

# Pivot table to get a user-movie matrix
pivot_ratings = final_ratings.pivot_table(index='userId', columns='title', values='rating').fillna(0)

# Compute cosine similarity between users
similarity_scores = cosine_similarity(pivot_ratings)


# Content Based Filtering Pre processing logic

# Splitting a single genre column into multiple columns
movies['genres'] = movies['genres'].apply(lambda x: x.split('|'))

movie_tags = tags.groupby('movieId')['tag'].agg(list).reset_index()
movies_final = movies.merge(movie_tags, on='movieId', how='left')

# Replace NaN with blank
movies_final['tag'] = movies_final['tag'].fillna('')
movies_final['genres'] = movies_final['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies_final['tag'] = movies_final['tag'].apply(lambda x: [i.replace(" ", "") for i in x])

# Extract the release year from the movie title
pattern = r'\((\d{4})\)$'
movies_final['release-year'] = movies_final['title'].str.extract(pattern)

# Convert release year to array format
movies_final['release-year'] = movies_final['release-year'].apply(lambda year: [year])

# Create final tag combining genres, tags, and release year
movies_final['final_tag'] = movies_final['genres'] + movies_final['tag'] + movies_final['release-year']

# Create pre link and post link dfs to merge the url links
movies_prelink_df = movies_final[['movieId', 'title', 'final_tag']]
movies_postlink_df = movies_prelink_df.merge(movie_links, on='movieId')

# Final data frame to be used in the recommendations
movies_df = movies_postlink_df[['movieId', 'title', 'final_tag', 'cover_url']]

# Convert the 'final_tag' column from list format to string format
movies_df['final_tag'] = movies_df['final_tag'].apply(lambda x: ' '.join(map(str, x)))

# Stemming function
ps = PorterStemmer()
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

# Stemming final tags
movies_df['final_tag'] = movies_df['final_tag'].apply(stem)

# CountVectorizer
cv = CountVectorizer(max_features=1600, stop_words='english')
vectors = cv.fit_transform(movies_df['final_tag']).toarray()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['final_tag'] = movies_df['final_tag'].apply(lambda x: ' '.join(map(str, x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['final_tag'] = movies_df['final_tag'].apply(stem)


In [5]:
def recommend_movies_collab(user_id, num_recommendations=5):
    # Find similar users
    similar_users = np.argsort(similarity_scores[user_id])[::-1][1:]  # Exclude the user itself
    
    # Get movies rated by the user
    user_movies = pivot_ratings.iloc[user_id][pivot_ratings.iloc[user_id] != 0].index
    
    # Initialize dictionary to store movie recommendations
    recommended_movies = {}
    
    # Iterate over similar users
    for similar_user in similar_users:
        # Get movies rated by the similar user
        similar_user_movies = pivot_ratings.iloc[similar_user][pivot_ratings.iloc[similar_user] != 0].index
        
        # Exclude movies already rated by the user
        new_movies = np.setdiff1d(similar_user_movies, user_movies)
        
        # Add new movies to recommendations
        for movie in new_movies:
            if movie in recommended_movies:
                recommended_movies[movie] += similarity_scores[user_id][similar_user]  # Use similarity score here
            else:
                recommended_movies[movie] = similarity_scores[user_id][similar_user]  # Use similarity score here
    
    
    # Sort recommended movies by similarity score
    sorted_recommendations = sorted(recommended_movies.items(), key=lambda x: x[1], reverse=True)
    
    # Get movie titles from sorted recommendations
    movie_titles = [title for title, _ in sorted_recommendations[:num_recommendations]]

    # Filter movies from movies_df based on titles
    recommended_movies = movies_df[movies_df['title'].isin(movie_titles)]
    return recommended_movies

In [6]:
def recommend_movies_content(user_id, num_recommendations=5):
    # Find movies rated by the user
    user_movies = ratings[ratings['userId'] == user_id].merge(movies_df, on='movieId', how='left')
    
    user_vectors = cv.transform(user_movies['final_tag']).toarray()
    
    # Compute cosine similarity between user's movies and all movies
    user_similarity  = cosine_similarity(user_vectors, vectors)

    # Get average similarity score for each movie rated by the user
    avg_similarity = np.mean(user_similarity, axis=0)
    
    # Get indices of movies sorted by similarity score
    sorted_indices = np.argsort(avg_similarity)[::-1]

    # Get movie titles from sorted indices
    movie_titles = movies_df.iloc[sorted_indices[:num_recommendations]]['title']

    # Use movie_titles to create a boolean mask
    mask = movies_df['title'].isin(movie_titles)

    # Apply the mask to movies_df to get recommended movies
    recommended_movies = movies_df[mask]

    return recommended_movies

In [7]:
user_id = 1
content_slider = 0.4
user_similarity_slider = 0.6
exclude_watched = True
num_recommendations = 10

num_recommendations_collab = int(content_slider * num_recommendations)
num_recommendations_content = int(user_similarity_slider * num_recommendations)

In [8]:
recommended_movies_collab = recommend_movies_collab(user_id, num_recommendations_collab)

In [9]:
recommended_movies_collab

Unnamed: 0,movieId,title,final_tag,cover_url
277,318,"Shawshank Redemption, The (1994)",crime drama prison stephenk wrongfulimprison m...,https://m.media-amazon.com/images/M/MV5BNDE3OD...
314,356,Forrest Gump (1994),comedi drama romanc war shrimp vietnam bubbagu...,https://m.media-amazon.com/images/M/MV5BNWIwOD...
911,1210,Star Wars: Episode VI - Return of the Jedi (1983),action adventur sci-fi darthvad lukeskywalk sp...,https://m.media-amazon.com/images/M/MV5BOWZlMj...
969,1270,Back to the Future (1985),adventur comedi sci-fi timetravel 1985,https://m.media-amazon.com/images/M/MV5BZmU0M2...


In [10]:
recommended_movies_content = recommend_movies_content(user_id, num_recommendations_content)

In [17]:
recommended_movies_content

Unnamed: 0,movieId,title,final_tag,cover_url
19,20,Money Train (1995),action comedi crime drama thriller 1995,https://m.media-amazon.com/images/M/MV5BNjc3Yj...
118,145,Bad Boys (1995),action comedi crime drama thriller 1995,https://m.media-amazon.com/images/M/MV5BMGE1ZT...
4005,5657,Flashback (1990),action adventur comedi crime drama 1990,https://m.media-amazon.com/images/M/MV5BMTk0OT...
4681,6990,The Great Train Robbery (1978),action adventur comedi crime drama 1978,https://m.media-amazon.com/images/M/MV5BOWE4M2...
6570,55116,"Hunting Party, The (2007)",action adventur comedi drama thriller 2007,https://m.media-amazon.com/images/M/MV5BNTQ0OT...
8597,117646,Dragonheart 2: A New Beginning (2000),action adventur comedi drama fantasi thriller ...,https://m.media-amazon.com/images/M/MV5BMjAxND...


In [18]:
recommended_movies = pd.concat([recommended_movies_collab, recommended_movies_content], ignore_index=True)

In [19]:
recommended_movies

Unnamed: 0,movieId,title,final_tag,cover_url
0,318,"Shawshank Redemption, The (1994)",crime drama prison stephenk wrongfulimprison m...,https://m.media-amazon.com/images/M/MV5BNDE3OD...
1,356,Forrest Gump (1994),comedi drama romanc war shrimp vietnam bubbagu...,https://m.media-amazon.com/images/M/MV5BNWIwOD...
2,1210,Star Wars: Episode VI - Return of the Jedi (1983),action adventur sci-fi darthvad lukeskywalk sp...,https://m.media-amazon.com/images/M/MV5BOWZlMj...
3,1270,Back to the Future (1985),adventur comedi sci-fi timetravel 1985,https://m.media-amazon.com/images/M/MV5BZmU0M2...
4,20,Money Train (1995),action comedi crime drama thriller 1995,https://m.media-amazon.com/images/M/MV5BNjc3Yj...
5,145,Bad Boys (1995),action comedi crime drama thriller 1995,https://m.media-amazon.com/images/M/MV5BMGE1ZT...
6,5657,Flashback (1990),action adventur comedi crime drama 1990,https://m.media-amazon.com/images/M/MV5BMTk0OT...
7,6990,The Great Train Robbery (1978),action adventur comedi crime drama 1978,https://m.media-amazon.com/images/M/MV5BOWE4M2...
8,55116,"Hunting Party, The (2007)",action adventur comedi drama thriller 2007,https://m.media-amazon.com/images/M/MV5BNTQ0OT...
9,117646,Dragonheart 2: A New Beginning (2000),action adventur comedi drama fantasi thriller ...,https://m.media-amazon.com/images/M/MV5BMjAxND...


In [16]:
watched_movies = ratings[ratings['userId'] == user_id][['movieId', 'title']].to_dict(orient='records')

KeyError: "['title'] not in index"

In [25]:
user_ratings =  ratings[ratings['userId'] == user_id]['movieId']
# Use movie_titles to create a boolean mask
watch_mask = movies_df['movieId'].isin(user_ratings)

# Apply the mask to movies_df to get recommended movies
watched_movies = movies_df[watch_mask]
watched_movies

Unnamed: 0,movieId,title,final_tag,cover_url
0,1,Toy Story (1995),adventur anim children comedi fantasi pixar pi...,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
5,6,Heat (1995),action crime thriller 1995,https://m.media-amazon.com/images/M/MV5BYjZjNT...
43,47,Seven (a.k.a. Se7en) (1995),mysteri thriller mysteri twistend serialkil 1995,https://m.media-amazon.com/images/M/MV5BOTUwOD...
46,50,"Usual Suspects, The (1995)",crime mysteri thriller mindfuck suspens thrill...,https://m.media-amazon.com/images/M/MV5BYTViNj...
97,110,Braveheart (1995),action drama war beautifulsceneri epic histor ...,https://m.media-amazon.com/images/M/MV5BMzkzMm...
...,...,...,...,...
2674,3578,Gladiator (2000),action adventur drama ancientrom epic histori ...,https://m.media-amazon.com/images/M/MV5BMDliMm...
2733,3671,Blazing Saddles (1974),comedi western darkhumor easygo silli 1974,https://m.media-amazon.com/images/M/MV5BZGZmMW...
2765,3703,"Road Warrior, The (Mad Max 2) (1981)",action adventur sci-fi thriller 1981,https://m.media-amazon.com/images/M/MV5BN2VlNj...
2836,3793,X-Men (2000),action adventur sci-fi action comicbook hughja...,https://m.media-amazon.com/images/M/MV5BZmIyMD...
