In [1]:
!pip install -q pandas scikit-learn numpy

In [2]:
from google.colab import files
uploaded = files.upload()
import io, pandas as pd
movies = pd.read_csv(io.BytesIO(uploaded['movies.csv']))
movies.head()

Saving movies.csv to movies.csv


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
print("shape:",movies.shape)
print("column:",movies.columns.tolist())
movies.info()
movies.isnull().sum()

shape: (9742, 3)
column: ['movieId', 'title', 'genres']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


Unnamed: 0,0
movieId,0
title,0
genres,0


In [4]:
movies['genres']=movies['genres'].fillna('')
movies['content']=movies['title']+" "+movies['genres']
movies[['title','genres','content']]

Unnamed: 0,title,genres,content
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995) Adventure|Animation|Children|...
1,Jumanji (1995),Adventure|Children|Fantasy,Jumanji (1995) Adventure|Children|Fantasy
2,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men (1995) Comedy|Romance
3,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale (1995) Comedy|Drama|Romance
4,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II (1995) Comedy
...,...,...,...
9737,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,Black Butler: Book of the Atlantic (2017) Acti...
9738,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,No Game No Life: Zero (2017) Animation|Comedy|...
9739,Flint (2017),Drama,Flint (2017) Drama
9740,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,Bungo Stray Dogs: Dead Apple (2018) Action|Ani...


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer= CountVectorizer(stop_words='english')
count_matrix= vectorizer.fit_transform(movies['content'])
print('count_matrix shape: ', count_matrix.shape)

count_matrix shape:  (9742, 9060)


In [6]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim= cosine_similarity(count_matrix, count_matrix)
print('shape of cosine matrix:',cosine_sim)

shape of cosine matrix: [[1.         0.63245553 0.28867513 ... 0.         0.125      0.11785113]
 [0.63245553 1.         0.18257419 ... 0.         0.         0.        ]
 [0.28867513 0.18257419 1.         ... 0.         0.         0.13608276]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.125      0.         0.         ... 0.         1.         0.        ]
 [0.11785113 0.         0.13608276 ... 0.         0.         1.        ]]


In [7]:
import re
def normalize_title(title):
    t = title.lower().strip()

    # Case: "The Godfather (1972)" → "godfather, the (1972)"
    m = re.match(r"(the|a|an)\s+(.*)\s\((\d{4})\)$", t)
    if m:
        article, name, year = m.groups()
        return f"{name}, {article} ({year})"

    # Case: normal: "Toy Story (1995)"
    m2 = re.match(r"(.*)\((\d{4})\)$", t)
    if m2:
        name, year = m2.groups()
        return f"{name.strip()} ({year})"

    return t


movies['title_normalized'] = movies['title'].apply(lambda x: normalize_title(x.lower()))

def recommend(movie_title, top_k=5):
    movie_title_clean = normalize_title(movie_title.lower())

    # exact match first
    matches = movies[movies['title_normalized'] == movie_title_clean]

    # fallback: contains
    if matches.empty:
        matches = movies[movies['title_normalized'].str.contains(movie_title_clean, regex=False)]

    if matches.empty:
        print(f"movie not found. Try a different movie: {movie_title}")
        return None

    idx = matches.index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_indices = [i for i, score in sim_scores[1:top_k+1]]

    return movies.loc[top_indices, ['title', 'genres']].reset_index(drop=True)


In [8]:
#test
for q in ["The Godfather (1972)", "Toy Story (1995)", "Jumanji (1995)", "Pulp Fiction (1994)"]:
  print("Because you watched ",q," :")
  print(recommend(q, top_k=5))
  print("-"*50)

Because you watched  The Godfather (1972)  :
                             title                        genres
0   Godfather: Part II, The (1974)                   Crime|Drama
1              Getaway, The (1972)   Action|Crime|Drama|Thriller
2            Candidate, The (1972)                         Drama
3                      Fuzz (1972)                         Drama
4  Godfather: Part III, The (1990)  Crime|Drama|Mystery|Thriller
--------------------------------------------------
Because you watched  Toy Story (1995)  :
                                   title  \
0                     Toy Story 2 (1999)   
1                     Toy Story 3 (2010)   
2                            Antz (1998)   
3  We're Back! A Dinosaur's Story (1993)   
4                  Monsters, Inc. (2001)   

                                             genres  
0       Adventure|Animation|Children|Comedy|Fantasy  
1  Adventure|Animation|Children|Comedy|Fantasy|IMAX  
2       Adventure|Animation|Children|Comedy|Fa