In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
movie = pd.read_csv("/Users/tony/Documents/research_projects/uneeq_interns/ml-25m/movies.csv")
rating = pd.read_csv("/Users/tony/Documents/research_projects/uneeq_interns/ml-25m/ratings.csv")

# Introduction

In this project, I built a simple movie recommendation system using two methods: popularity-based filtering and collaborative filtering.

**Collaborative filtering** makes personalized recommendations by looking at users with similar tastes. It finds patterns in user behavior to suggest movies that a person might like based on what others with similar interests have watched.

**Popularity-based filtering** recommends movies that are highly rated or watched by many users. This method doesn’t consider individual preferences, but it’s a good starting point, especially for new users.


In [5]:
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [6]:
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [7]:
movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
rating.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [9]:
movie.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [10]:
#remove year from the title
import re

def change_title(title):
    if (len(title)>7):
        title= title[:-7]
    return title
movie["title"] = movie["title"].apply(change_title)
movie.title


0                          Toy Story
1                            Jumanji
2                   Grumpier Old Men
3                  Waiting to Exhale
4        Father of the Bride Part II
                    ...             
62418                             We
62419             Window of the Soul
62420                      Bad Poems
62421                   A Girl Thing
62422        Women of Devil's Island
Name: title, Length: 62423, dtype: object

In [11]:
def new_title(title):
    title = re.sub("[^a-zA-Z0-9 ]","",title)
    return title

movie["new_title"] = movie["title"].apply(new_title)
movie["new_title"] = movie["new_title"].str.lower()

In [12]:
movie.head()

Unnamed: 0,movieId,title,genres,new_title
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,toy story
1,2,Jumanji,Adventure|Children|Fantasy,jumanji
2,3,Grumpier Old Men,Comedy|Romance,grumpier old men
3,4,Waiting to Exhale,Comedy|Drama|Romance,waiting to exhale
4,5,Father of the Bride Part II,Comedy,father of the bride part ii


In [13]:
#merging of dataset
movie_merge = movie.merge(rating,on='movieId')

In [14]:
movie_merge.head()

Unnamed: 0,movieId,title,genres,new_title,userId,rating,timestamp
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,toy story,2,3.5,1141415820
1,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,toy story,3,4.0,1439472215
2,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,toy story,4,3.0,1573944252
3,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,toy story,5,4.0,858625949
4,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,toy story,8,4.0,890492517


In [15]:
#grouping based on tha rating number
nm_rating = movie_merge.groupby('title').count()['rating'].reset_index()
nm_rating.head()

Unnamed: 0,title,rating
0,"""BLOW THE NIGHT!"" Let's Spend the Night Together",1
1,"""Great Performances"" Cats",179
2,#1 Cheerleader Camp,9
3,#Captured,2
4,#Female Pleasure,3


In [16]:
nm_rating.rename(columns = {'rating':'nm_of_rating'},inplace=True)
nm_rating.head()

Unnamed: 0,title,nm_of_rating
0,"""BLOW THE NIGHT!"" Let's Spend the Night Together",1
1,"""Great Performances"" Cats",179
2,#1 Cheerleader Camp,9
3,#Captured,2
4,#Female Pleasure,3


# collaborative based filtering

In [17]:
#group by userids who has given ratings more than 1500 movies
x = movie_merge.groupby("userId").count()['rating']>1500
x.head()

userId
1    False
2    False
3    False
4    False
5    False
Name: rating, dtype: bool

In [18]:
#index for users who has given ratings more than 1k
sim_users = x[x].index
sim_users

Index([   548,    626,    847,    997,   1401,   1652,   1748,   1920,   1977,
         2165,
       ...
       160922, 160951, 161184, 161383, 161544, 161586, 161928, 162047, 162271,
       162516],
      dtype='int64', name='userId', length=910)

In [19]:
#movies viewed by sim_users
sim_movies = movie_merge[movie_merge['userId'].isin(sim_users)]
sim_movies

Unnamed: 0,movieId,title,genres,new_title,userId,rating,timestamp
172,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,toy story,548,4.5,1431644949
199,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,toy story,626,4.5,1136304145
283,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,toy story,847,4.0,1048092664
339,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,toy story,997,4.5,1529250285
477,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,toy story,1401,4.5,1544250123
...,...,...,...,...,...,...,...
25000061,209067,Sousse: Marché aux charbons (avec chameaux),(no genres listed),sousse march aux charbons avec chameaux,154484,2.5,1574026518
25000068,209089,An Impossible Balancing Feat,(no genres listed),an impossible balancing feat,154484,3.0,1574075308
25000070,209103,Tsar Ivan the Terrible,(no genres listed),tsar ivan the terrible,13737,4.0,1574112239
25000079,209135,Jane B. by Agnès V.,Documentary|Fantasy,jane b by agns v,154484,3.5,1574187267


In [20]:
#title for movies which have ratings more than 500
y = sim_movies.groupby('title').count()['rating']>500 

fam_movies = y[y].index
fam_movies

Index(['(500) Days of Summer', '10 Things I Hate About You',
       '101 Dalmatians (One Hundred and One Dalmatians)', '12 Angry Men',
       '2001: A Space Odyssey', '21 Grams', '28 Days Later', '300',
       '3:10 to Yuma', '40-Year-Old Virgin, The',
       ...
       'X-Men', 'X-Men: First Class', 'X-Men: The Last Stand',
       'X2: X-Men United', 'You've Got Mail', 'Young Frankenstein', 'Zodiac',
       'Zombieland', 'Zoolander', 'xXx'],
      dtype='object', name='title', length=791)

In [21]:
#details of the famous movie
fin_ratings=sim_movies[movie_merge['title'].isin(fam_movies)]
fin_ratings

  fin_ratings=sim_movies[movie_merge['title'].isin(fam_movies)]


Unnamed: 0,movieId,title,genres,new_title,userId,rating,timestamp
172,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,toy story,548,4.5,1431644949
199,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,toy story,626,4.5,1136304145
283,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,toy story,847,4.0,1048092664
339,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,toy story,997,4.5,1529250285
477,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,toy story,1401,4.5,1544250123
...,...,...,...,...,...,...,...
24997994,205873,Cinderella,Animation|Children|Fantasy,cinderella,72315,3.0,1567666207
24998980,206899,Charlie's Angels,Action|Adventure|Comedy,charlies angels,132358,2.0,1573872003
24998982,206899,Charlie's Angels,Action|Adventure|Comedy,charlies angels,143568,3.5,1574300863
24998984,206899,Charlie's Angels,Action|Adventure|Comedy,charlies angels,146708,3.0,1574282669


In [22]:
fin_ratings.drop_duplicates()

Unnamed: 0,movieId,title,genres,new_title,userId,rating,timestamp
172,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,toy story,548,4.5,1431644949
199,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,toy story,626,4.5,1136304145
283,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,toy story,847,4.0,1048092664
339,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,toy story,997,4.5,1529250285
477,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,toy story,1401,4.5,1544250123
...,...,...,...,...,...,...,...
24997994,205873,Cinderella,Animation|Children|Fantasy,cinderella,72315,3.0,1567666207
24998980,206899,Charlie's Angels,Action|Adventure|Comedy,charlies angels,132358,2.0,1573872003
24998982,206899,Charlie's Angels,Action|Adventure|Comedy,charlies angels,143568,3.5,1574300863
24998984,206899,Charlie's Angels,Action|Adventure|Comedy,charlies angels,146708,3.0,1574282669


In [23]:
df = fin_ratings.pivot_table(index='new_title',columns='userId',values='rating')
df1 = fin_ratings.pivot_table(index=['new_title','title'],columns='userId',values='rating')


In [24]:
df.fillna(0,inplace=True)
df1.fillna(0,inplace=True)

In [25]:
#cosine simularity
from sklearn.metrics.pairwise import cosine_similarity
similarity_scores = cosine_similarity(df)
similarity_scores.shape

(791, 791)

In [26]:
similarity_scores

array([[1.        , 0.59671396, 0.58775479, ..., 0.62510637, 0.61193687,
        0.67061426],
       [0.59671396, 1.        , 0.63828935, ..., 0.52034784, 0.52022317,
        0.60651434],
       [0.58775479, 0.63828935, 1.        , ..., 0.69453301, 0.61961764,
        0.64625129],
       ...,
       [0.62510637, 0.52034784, 0.69453301, ..., 1.        , 0.74656204,
        0.65302658],
       [0.61193687, 0.52022317, 0.61961764, ..., 0.74656204, 1.        ,
        0.68600489],
       [0.67061426, 0.60651434, 0.64625129, ..., 0.65302658, 0.68600489,
        1.        ]])

In [27]:
#create a function

def recommend_movie(title):
    title = title.lower()
    title = new_title(title)
 
    index = np.where(df.index==title)[0][0]

    s_scores = enumerate(similarity_scores[index])
    s_scores = sorted(s_scores,key=lambda x: x[1],reverse=True)

    s_scores = s_scores[1:11]

    s_index = [i[0] for i in s_scores]
    

    for i in s_index:
        print(df1.index[i][1])
recommend_movie("Toy story")

Toy Story 2
Jurassic Park
Back to the Future
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark)
Star Wars: Episode IV - A New Hope
Sixth Sense, The
Monsters, Inc.
Matrix, The
Silence of the Lambs, The
Star Wars: Episode V - The Empire Strikes Back


In [28]:
#Popularity based techniques
user_item_matrix = df.T
def recommend_popular(user_item_matrix, N=10):
    avg_ratings = user_item_matrix.mean(axis=0).sort_values(ascending=False)
    return list(avg_ratings.index[:N])
recommend_popular(user_item_matrix,N=10)

['shawshank redemption the',
 'pulp fiction',
 'silence of the lambs the',
 'raiders of the lost ark indiana jones and the raiders of the lost ark',
 'matrix the',
 'star wars episode iv  a new hope',
 'star wars episode v  the empire strikes back',
 'godfather the',
 'back to the future',
 'fight club']

In [29]:
def recommend_content_based(title, df, similarity_scores, top_n=5):
    title = title.lower()
    if title not in df.index:
        return []

    index = np.where(df.index == title)[0][0]
    s_scores = list(enumerate(similarity_scores[index]))
    s_scores = sorted(s_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]

    recommended_titles = [df.index[i[0]] for i in s_scores]
    return recommended_titles


def hybrid_recommend(title, df, similarity_scores, user_item_matrix=None, N=10):
    """
    Hybrid recommendation using only a movie title (no user ID).
    """
    # 1. Content-based recommendations from title
    cb_recs = recommend_content_based(title, df, similarity_scores, top_n=N//2)

    # 2. Popularity-based fallback
    pop_recs = recommend_popular(user_item_matrix, N=N//2) if user_item_matrix is not None else []

    # Combine and deduplicate
    def extract_title(movie_entry):
        return movie_entry[0] if isinstance(movie_entry, tuple) else movie_entry

    combined = cb_recs + [extract_title(m) for m in pop_recs]
    seen = set()
    unique_recs = []
    for rec in combined:
        title_clean = extract_title(rec)
        if title_clean not in seen:
            unique_recs.append(title_clean)
            seen.add(title_clean)
        if len(unique_recs) >= N:
            break

    return unique_recs


In [36]:
title = input("Enter a movie title: ").strip().lower()

recommendations = hybrid_recommend(
    title,
    df,
    similarity_scores,
    user_item_matrix,  # optional: remove if not using popularity
    N=10
)

print("\nRecommended Movies:")
for movie in recommendations:
    print(movie.title())


Enter a movie title:  cinderela



Recommended Movies:
Shawshank Redemption The
Pulp Fiction
Silence Of The Lambs The
Raiders Of The Lost Ark Indiana Jones And The Raiders Of The Lost Ark
Matrix The
