In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Loading ratings
ratings = pd.read_csv(
    "u.data",
    sep="\t",
    names=["user_id", "movie_id", "rating", "timestamp"]
)

# Loading movie metadata with genres
movie_columns = [
    "movie_id", "title", "release_date", "video_release_date",
    "IMDb_URL", "unknown", "Action", "Adventure", "Animation",
    "Children", "Comedy", "Crime", "Documentary", "Drama",
    "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery",
    "Romance", "Sci-Fi", "Thriller", "War", "Western"
]

movies = pd.read_csv(
    "u.item",
    sep="|",
    encoding="latin-1",
    names=movie_columns
)

print("Ratings shape:", ratings.shape)
print("Movies shape:", movies.shape)


Ratings shape: (100000, 4)
Movies shape: (1682, 24)


In [3]:
# Creating user-item matrix
user_movie_matrix = ratings.pivot_table(
    index="user_id",
    columns="movie_id",
    values="rating"
)

print(user_movie_matrix.shape)
user_movie_matrix.head()


(943, 1682)


movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [4]:
user_movie_matrix = user_movie_matrix.fillna(0)


In [5]:
user_similarity = cosine_similarity(user_movie_matrix)

user_similarity_df = pd.DataFrame(
    user_similarity,
    index=user_movie_matrix.index,
    columns=user_movie_matrix.index
)

print(user_similarity_df.shape)


(943, 943)


In [6]:
def recommend_movies(user_id, num_recommendations=5):
    
    # Get similarity scores
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)
    
    # Remove self
    similar_users = similar_users.drop(user_id)
    
    # Get top similar users
    top_users = similar_users.head(10).index
    
    # Get movies rated by similar users
    recommended_movies = user_movie_matrix.loc[top_users].mean().sort_values(ascending=False)
    
    # Remove movies already rated by the user
    user_rated = user_movie_matrix.loc[user_id]
    recommended_movies = recommended_movies[user_rated == 0]
    
    # Get top recommendations
    top_movie_ids = recommended_movies.head(num_recommendations).index
    
    return movies[movies["movie_id"].isin(top_movie_ids)][["title"]]


In [7]:
recommend_movies(user_id=1, num_recommendations=5)


Unnamed: 0,title
317,Schindler's List (1993)
356,One Flew Over the Cuckoo's Nest (1975)
422,E.T. the Extra-Terrestrial (1982)
473,Dr. Strangelove or: How I Learned to Stop Worr...
654,Stand by Me (1986)


In [8]:
# Select only genre columns
genre_columns = [
    "unknown", "Action", "Adventure", "Animation",
    "Children", "Comedy", "Crime", "Documentary",
    "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi",
    "Thriller", "War", "Western"
]

genre_matrix = movies[genre_columns]

print(genre_matrix.shape)


(1682, 19)


In [9]:
movie_similarity = cosine_similarity(genre_matrix)

movie_similarity_df = pd.DataFrame(
    movie_similarity,
    index=movies["movie_id"],
    columns=movies["movie_id"]
)

print(movie_similarity_df.shape)


(1682, 1682)


In [10]:
def recommend_by_genre(movie_id, num_recommendations=5):
    
    similar_movies = movie_similarity_df[movie_id].sort_values(ascending=False)
    
    # Remove itself
    similar_movies = similar_movies.drop(movie_id)
    
    top_movie_ids = similar_movies.head(num_recommendations).index
    
    return movies[movies["movie_id"].isin(top_movie_ids)][["title"]]


In [11]:
recommend_by_genre(movie_id=1)


Unnamed: 0,title
94,Aladdin (1992)
242,Jungle2Jungle (1997)
421,Aladdin and the King of Thieves (1996)
622,Angels in the Outfield (1994)
1218,"Goofy Movie, A (1995)"


In [12]:
imdb = pd.read_csv("imdb.csv")

print(imdb.shape)
imdb.head()


(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [13]:
imdb["sentiment_score"] = imdb["sentiment"].map({
    "positive": 1,
    "negative": 0
})

imdb.head()


Unnamed: 0,review,sentiment,sentiment_score
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [14]:
average_sentiment = imdb["sentiment_score"].mean()

print("Average Sentiment Score:", average_sentiment)


Average Sentiment Score: 0.5


In [15]:
def hybrid_recommend(user_id, num_recommendations=5):
    
    # Step 1: Collaborative Filtering
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)
    similar_users = similar_users.drop(user_id)
    top_users = similar_users.head(10).index
    
    recommended_movies = user_movie_matrix.loc[top_users].mean()
    
    user_rated = user_movie_matrix.loc[user_id]
    recommended_movies = recommended_movies[user_rated == 0]
    
    # Step 2: Add Sentiment Boost
    sentiment_weight = average_sentiment
    recommended_movies = recommended_movies * (1 + sentiment_weight)
    
    top_movie_ids = recommended_movies.sort_values(ascending=False).head(num_recommendations).index
    
    return movies[movies["movie_id"].isin(top_movie_ids)][["title"]]


In [16]:
hybrid_recommend(user_id=1)


Unnamed: 0,title
317,Schindler's List (1993)
356,One Flew Over the Cuckoo's Nest (1975)
422,E.T. the Extra-Terrestrial (1982)
473,Dr. Strangelove or: How I Learned to Stop Worr...
654,Stand by Me (1986)
