In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


In [2]:
ratings = pd.read_csv('ratings.csv')  # userId,movieId,rating,timestamp
movies = pd.read_csv('movies.csv')    # movieId,title,genres
tags = pd.read_csv('tags.csv')   

In [3]:
# Split genres into lists
movies['genres'] = movies['genres'].apply(lambda x: x.split('|') if isinstance(x, str) else [])

# Merge ratings and movies
data = pd.merge(ratings, movies, on='movieId')


In [4]:
# 4. Collaborative Filtering (Matrix Factorization)
user_movie_matrix = data.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)
svd = TruncatedSVD(n_components=20, random_state=42)
latent_matrix = svd.fit_transform(user_movie_matrix)
reconstructed_ratings = np.dot(latent_matrix, svd.components_)
reconstructed_df = pd.DataFrame(reconstructed_ratings, index=user_movie_matrix.index, columns=user_movie_matrix.columns)

def collaborative_recommend(user_id, n=10):
    user_row = reconstructed_df.loc[user_id]
    user_rated = user_movie_matrix.loc[user_id]
    unseen = user_rated[user_rated == 0].index
    recs = user_row[unseen].sort_values(ascending=False).head(n)
    return recs.index.tolist()


In [5]:
# 5. Content-Based Filtering (Genres)
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies['genres'])
genre_df = pd.DataFrame(genre_matrix, columns=mlb.classes_, index=movies['movieId'])

def content_recommend(user_id, n=10):
    user_ratings = data[data['userId'] == user_id][['movieId', 'rating']]
    top_movies = user_ratings.sort_values(by='rating', ascending=False).head(5)['movieId']
    user_profile = genre_df.loc[top_movies].mean(axis=0).values.reshape(1, -1)
    similarities = cosine_similarity(user_profile, genre_df)[0]
    sim_scores = pd.Series(similarities, index=genre_df.index)
    already_rated = user_ratings['movieId'].tolist()
    recs = sim_scores.drop(already_rated).sort_values(ascending=False).head(n)
    return recs.index.tolist()


In [6]:
# 6. Sentiment-Based Filtering (using tags as pseudo-reviews)
# Prepare pseudo-reviews by grouping tags per movie
movie_tags = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()
movies_with_tags = pd.merge(movies, movie_tags, on='movieId', how='left').fillna('')

# Dummy sentiment labels: positive if tag contains 'good', negative if 'bad' (for demo)
movies_with_tags['sentiment'] = movies_with_tags['tag'].apply(
    lambda x: 1 if 'good' in x.lower() else (0 if 'bad' in x.lower() else 1)
)

vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(movies_with_tags['tag'])
y = movies_with_tags['sentiment']
clf = MultinomialNB()
clf.fit(X, y)

def sentiment_score(movie_ids):
    subset = movies_with_tags[movies_with_tags['movieId'].isin(movie_ids)]
    if subset.empty:
        return pd.Series(1, index=movie_ids)
    X_test = vectorizer.transform(subset['tag'])
    preds = clf.predict_proba(X_test)[:, 1]
    sentiment_scores = pd.Series(preds, index=subset['movieId'])
    return sentiment_scores.reindex(movie_ids, fill_value=sentiment_scores.mean())


In [7]:
# 7. Hybrid Recommendation Function
def hybrid_recommend(user_id, n=5):
    collab_recs = collaborative_recommend(user_id, n=20)
    content_recs = content_recommend(user_id, n=20)
    combined = list(set(collab_recs) & set(content_recs))
    combined += [m for m in collab_recs + content_recs if m not in combined]
    combined = combined[:20]
    sentiments = sentiment_score(combined)
    top = sentiments.sort_values(ascending=False).head(n).index
    return movies[movies['movieId'].isin(top)][['movieId', 'title', 'genres']]


In [8]:
# 8. Example Usage: Get Top 5 Recommendations for a User
user_id = 1
recommendations = hybrid_recommend(user_id, n=5)
print("Top 5 Recommendations for User", user_id)
print(recommendations)


Top 5 Recommendations for User 1
      movieId                                      title  \
31         32  Twelve Monkeys (a.k.a. 12 Monkeys) (1995)   
474       541                        Blade Runner (1982)   
706       924               2001: A Space Odyssey (1968)   
902      1200                              Aliens (1986)   
2078     2762                    Sixth Sense, The (1999)   

                                   genres  
31            [Mystery, Sci-Fi, Thriller]  
474            [Action, Sci-Fi, Thriller]  
706            [Adventure, Drama, Sci-Fi]  
902   [Action, Adventure, Horror, Sci-Fi]  
2078             [Drama, Horror, Mystery]  


In [9]:

# 9. (Optional) Display as a Table
import IPython.display as display
display.display(recommendations)

Unnamed: 0,movieId,title,genres
31,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),"[Mystery, Sci-Fi, Thriller]"
474,541,Blade Runner (1982),"[Action, Sci-Fi, Thriller]"
706,924,2001: A Space Odyssey (1968),"[Adventure, Drama, Sci-Fi]"
902,1200,Aliens (1986),"[Action, Adventure, Horror, Sci-Fi]"
2078,2762,"Sixth Sense, The (1999)","[Drama, Horror, Mystery]"
