<a href="https://colab.research.google.com/github/valievav/ML-projects/blob/main/Movies_recommendation_system_using_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [76]:
import pandas as pd
import numpy as np
from typing import List
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# do NOT use this link https://github.com/ChitranjanUpadhayay/ML_Projects/tree/main/Datasets/Movies%20Recommendation%20System/dataset.csv
# it returns HTML page instead of file data

# get data
url = 'https://raw.githubusercontent.com/ChitranjanUpadhayay/ML_Projects/refs/heads/main/Datasets/Movies%20Recommendation%20System/dataset.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [16]:
# add new column with keywords that can be used by model
df['tags'] = df['genre'] + ' ' + df['overview']

# create new df with only relevant infotmation for the model
new_df = df[['id', 'title', 'tags']]
new_df.head()

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,"Drama,Crime Framed in the 1940s for the double..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance Raj is a rich, carefree, ..."
2,238,The Godfather,"Drama,Crime Spanning the years 1945 to 1955, a..."
3,424,Schindler's List,"Drama,History,War The true story of how busine..."
4,240,The Godfather: Part II,"Drama,Crime In the continuing saga of the Corl..."


In [21]:
# convert textual to numerical data
vectorizer = CountVectorizer(max_features=10000, stop_words='english')
matrix = vectorizer.fit_transform(new_df['tags'].values.astype('U')).toarray()
matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [24]:
# check features in vectorizer
vectorizer.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zones', 'zoo'], dtype=object)

In [31]:
# preview feature-count pairs
list(vectorizer.vocabulary_.items())[:10]

[('drama', 2777),
 ('crime', 2140),
 ('framed', 3622),
 ('1940s', 35),
 ('double', 2751),
 ('murder', 5965),
 ('wife', 9797),
 ('lover', 5363),
 ('banker', 836),
 ('andy', 470)]

In [47]:
# check top N feature-count pairs
words_count = sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1], reverse=True)
words_count[:10]

[('zoo', 9999),
 ('zones', 9998),
 ('zone', 9997),
 ('zombies', 9996),
 ('zombie', 9995),
 ('zoe', 9994),
 ('zion', 9993),
 ('zimmerman', 9992),
 ('zhen', 9991),
 ('zeus', 9990)]

In [39]:
# find similarity between vectors (movies)
sim = cosine_similarity(matrix)
sim

array([[1.        , 0.05634362, 0.13041013, ..., 0.07559289, 0.11065667,
        0.06900656],
       [0.05634362, 1.        , 0.07715167, ..., 0.        , 0.03636965,
        0.        ],
       [0.13041013, 0.07715167, 1.        , ..., 0.02300219, 0.0673435 ,
        0.09449112],
       ...,
       [0.07559289, 0.        , 0.02300219, ..., 1.        , 0.03253   ,
        0.03042903],
       [0.11065667, 0.03636965, 0.0673435 , ..., 0.03253   , 1.        ,
        0.04454354],
       [0.06900656, 0.        , 0.09449112, ..., 0.03042903, 0.04454354,
        1.        ]])

In [40]:
# get all genre to explore data
all_genre = []
for x in df['genre']:
  if isinstance(x, str):
    all_genre.extend(x.split(','))

all_genre = set(all_genre)
all_genre

{'Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western'}

In [48]:
# check for sci-fy movies to get movie example
df[df['genre'].str.contains('Science Fiction', na=False)]['title'][:10]


Unnamed: 0,title
34,Evangelion: 3.0+1.0 Thrice Upon a Time
35,Spider-Man: Into the Spider-Verse
41,Neon Genesis Evangelion: The End of Evangelion
42,The Empire Strikes Back
49,Interstellar
54,Inception
74,Justice League Dark: Apokolips War
75,Back to the Future
86,Avengers: Endgame
91,Steven Universe: The Movie


In [65]:
# check similarity on 1 movie example
movie_title = 'Neon Genesis Evangelion: The End of Evangelion'
index = new_df[new_df['title'] == movie_title].index[0]
distance = sorted(enumerate(sim[index]), key=lambda item:item[1], reverse=True)

print(index)
print(distance)
print(new_df.iloc[index])

41
[(41, 1.0), (1431, 0.35921060405354976), (863, 0.34425764186604446), (4508, 0.28112676511587464), (5026, 0.25923792368260634), (6557, 0.24096579867074966), (8972, 0.24096579867074966), (3540, 0.24000768036865966), (433, 0.22975187432024505), (34, 0.22439708538128578), (2295, 0.21780342093451605), (7295, 0.21780342093451605), (7421, 0.21780342093451605), (2348, 0.21166687833365086), (4475, 0.21166687833365086), (1495, 0.20953951903123738), (485, 0.20739033894608508), (3596, 0.20739033894608508), (8113, 0.20739033894608508), (4551, 0.2060214108575823), (5213, 0.2060214108575823), (696, 0.2036532699906392), (1424, 0.20080483222562473), (3587, 0.20080483222562473), (2411, 0.19925419255468718), (2730, 0.19925419255468718), (4424, 0.19925419255468718), (5983, 0.19925419255468718), (6824, 0.19925419255468718), (2264, 0.19596545041740515), (3347, 0.19596545041740515), (8331, 0.19596545041740515), (9503, 0.19596545041740515), (500, 0.19354838709677422), (8914, 0.19200614429492774), (4488, 0.

In [77]:
def recommend(movie: str, count: int = 5) -> List[List]:
  """
  Recommend movies based on passed mivie_name and number of recommendations.
  Return list of list with movie names and similarity scores.
  """
  index = new_df[new_df['title'] == movie].index[0]
  distance = sorted(enumerate(sim[index]), key=lambda item:item[1], reverse=True)

  recommendations = []
  for data in distance[:count]:
    movie_index, similarity_score = data
    name = new_df.iloc[movie_index]['title']
    recommendations.append([name, similarity_score])

  return recommendations

# get recommendations with similarity score
recommend(movie_title, 10)

[['Neon Genesis Evangelion: The End of Evangelion', 1.0],
 ['Neon Genesis Evangelion: Death and Rebirth', 0.35921060405354976],
 ['Evangelion: 1.0 You Are (Not) Alone', 0.34425764186604446],
 ['Batman: Gotham by Gaslight', 0.28112676511587464],
 ['Dragon Ball Z: The Return of Cooler', 0.25923792368260634],
 ['Synchronic', 0.24096579867074966],
 ['See You Yesterday', 0.24096579867074966],
 ['Superman vs. The Elite', 0.24000768036865966],
 ['Justice League: The Flashpoint Paradox', 0.22975187432024505],
 ['Evangelion: 3.0+1.0 Thrice Upon a Time', 0.22439708538128578]]