In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

In [2]:
movies = pd.read_csv('movies.csv')

In [3]:
movies.isna().sum()

id                       0
original_language        0
original_title           1
overview             45809
popularity               0
poster_path          64975
release_date            27
title                    1
vote_average             0
vote_count               0
budget                   0
origin_country           0
revenue                  0
keywords                 0
actors                   0
director                 0
genre_names              0
dtype: int64

In [4]:
# Drop movies without overview, because this feature is the most important in the model
movies = movies.dropna(subset=['overview', 'release_date', 'title'])

In [5]:
# Drop records with an empty list
movies = movies[movies['keywords'] != '[]']
movies = movies[movies['genre_names'] != '[]']

In [6]:
movies.shape

(119616, 17)

In [7]:
# Keep movies with ratings larger or equal to 6 for better recommendations
movies = movies[movies['vote_average'] >= 6]
movies = movies.reset_index()
movies.shape

(52036, 18)

In [8]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem.porter import PorterStemmer
import re

# Function to preprocess text (lowercase, remove special characters, remove stopwords, stemming)
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize and remove stopwords
    tokens = text.split()
    tokens = [word for word in tokens if word not in ENGLISH_STOP_WORDS]
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    # Re-join tokens to form the cleaned text
    return ' '.join(tokens)

In [9]:
tqdm.pandas()
# Apply preprocessing to 'overview'
movies['overview_'] = movies['overview'].fillna('').progress_apply(preprocess_text)

# Since 'keywords' are in list format, we need to convert them to string first and then apply preprocessing
movies['keywords_'] = movies['keywords'].progress_apply(lambda x: re.sub(r'[^\w\s]', ' ', x))
movies['keywords_'] = movies['keywords'].progress_apply(preprocess_text)
movies['genre_names_'] = movies['genre_names'].progress_apply(lambda x: re.sub(r'[^\w\s]', ' ', x))


  0%|          | 0/52036 [00:00<?, ?it/s]

  0%|          | 0/52036 [00:00<?, ?it/s]

  0%|          | 0/52036 [00:00<?, ?it/s]

  0%|          | 0/52036 [00:00<?, ?it/s]

In [10]:
movies.head()

Unnamed: 0,index,id,original_language,original_title,overview,popularity,poster_path,release_date,title,vote_average,...,budget,origin_country,revenue,keywords,actors,director,genre_names,overview_,keywords_,genre_names_
0,0,823464,en,Godzilla x Kong: The New Empire,"Following their explosive showdown, Godzilla a...",4619.309,/z1p34vh7dEOnLDmyCrlUVLuoDzd.jpg,2024-03-27,Godzilla x Kong: The New Empire,7.281,...,150000000,['US'],558503759,"['giant monster', 'sequel', 'dinosaur', 'kaiju...","[(15556, 'Rebecca Hall'), (226366, 'Brian Tyre...","(98631, 'Adam Wingard')","['Science Fiction', 'Action', 'Adventure']",follow explos showdown godzilla kong reunit co...,giant monster sequel dinosaur kaiju fantasi wo...,Science Fiction Action Adventure
1,1,653346,en,Kingdom of the Planet of the Apes,Several generations in the future following Ca...,1627.925,/gKkl37BQuKTanygYQG1pyYgLVgf.jpg,2024-05-08,Kingdom of the Planet of the Apes,7.159,...,160000000,['US'],237000000,"['empire', 'kingdom', 'gorilla', 'dystopia', '...","[(1586047, 'Owen Teague'), (2146942, 'Freya Al...","(1179066, 'Wes Ball')","['Science Fiction', 'Adventure', 'Action']",gener futur follow caesar reign ape domin spec...,empir kingdom gorilla dystopia eagl sequel ant...,Science Fiction Adventure Action
2,2,786892,en,Furiosa: A Mad Max Saga,"As the world fell, young Furiosa is snatched f...",1443.985,/iADOJ8Zymht2JPMoy3R7xceZprc.jpg,2024-05-22,Furiosa: A Mad Max Saga,7.732,...,150000000,"['AU', 'US']",10200000,"['post-apocalyptic future', 'prequel', 'spin o...","[(1397778, 'Anya Taylor-Joy'), (74568, 'Chris ...","(20629, 'George Miller')","['Action', 'Adventure', 'Science Fiction']",world fell young furiosa snatch green place mo...,postapocalypt futur prequel spin psychot angri...,Action Adventure Science Fiction
3,4,929590,en,Civil War,"In the near future, a group of war journalists...",1008.722,/sh7Rg8Er3tFcN9BpKIPOMvALgZd.jpg,2024-04-10,Civil War,7.34,...,50000000,['GB'],108969206,"['sniper', 'new york city', 'race against time...","[(205, 'Kirsten Dunst'), (52583, 'Wagner Moura...","(2036, 'Alex Garland')","['War', 'Action', 'Drama']",near futur group war journalist attempt surviv...,sniper new york citi race time washington dc u...,War Action Drama
4,5,940721,ja,ゴジラ-1.0,Postwar Japan is at its lowest point when a ne...,1042.402,/hkxxMIGaiCTmrEArK7J56JTKUlB.jpg,2023-11-03,Godzilla Minus One,7.667,...,15000000,['JP'],115857413,"['monster', 'loss of loved one', 'giant monste...","[(225730, 'Ryunosuke Kamiki'), (1516266, 'Mina...","(43652, 'Takashi Yamazaki')","['Science Fiction', 'Horror', 'Action']",postwar japan lowest point new crisi emerg for...,monster loss love giant monster kamikaz duti a...,Science Fiction Horror Action


In [11]:
# Combine the textual features into one
movies['combined_features'] = (movies['overview_'] + ' ' 
                                    + movies['keywords_'] + ' '
                                       + movies['keywords_'] + ' '
                                    + movies['genre_names_'] + ' '
                                      + movies['genre_names_'] + ' '
                                      + movies['genre_names_'] + ' ')


In [13]:
movies.loc[0]['combined_features']

'follow explos showdown godzilla kong reunit coloss undiscov threat hidden world challeng exist giant monster sequel dinosaur kaiju fantasi world giant ape godzilla king kong mongkey giant monster sequel dinosaur kaiju fantasi world giant ape godzilla king kong mongkey   Science Fiction    Action    Adventure     Science Fiction    Action    Adventure     Science Fiction    Action    Adventure   '

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the combined textual features to a TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['combined_features'])

# Display the shape of the TF-IDF matrix to confirm its size
tfidf_matrix.shape


(52036, 73374)

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix)

# Display a portion of the cosine similarity matrix to verify
cosine_sim[:5, :5]


array([[1.        , 0.23397631, 0.10145834, 0.02760195, 0.26820234],
       [0.23397631, 1.        , 0.14139442, 0.06876236, 0.04612065],
       [0.10145834, 0.14139442, 1.        , 0.08364336, 0.05495209],
       [0.02760195, 0.06876236, 0.08364336, 1.        , 0.126082  ],
       [0.26820234, 0.04612065, 0.05495209, 0.126082  , 1.        ]])

In [16]:
cosine_sim.shape

(52036, 52036)

In [17]:
# define a function that takes a movie id and determines the top 5 similar movies to recommend 
def recommend_movies(movie_id, top_n=6):
    # Get the index of the movie that matches the title
    movie_idx = movies[movies['id'] == movie_id].index[0]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[movie_idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top-n most similar movies
    sim_scores = sim_scores[0:top_n+1]  # including self to check

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top n most similar movies
    return list(movies.iloc[movie_indices]['id'])

# Use the function to find movies similar to "Memories of Murder" (id = 11423)
recommended_movies = recommend_movies(11423)
recommended_movies


[11423, 544627, 28979, 449196, 345924, 280290, 11707]

In [18]:
movies['Ranks'] = movies['id'].progress_apply(recommend_movies)

  0%|          | 0/52036 [00:00<?, ?it/s]

In [19]:
movies.head()

Unnamed: 0,index,id,original_language,original_title,overview,popularity,poster_path,release_date,title,vote_average,...,revenue,keywords,actors,director,genre_names,overview_,keywords_,genre_names_,combined_features,Ranks
0,0,823464,en,Godzilla x Kong: The New Empire,"Following their explosive showdown, Godzilla a...",4619.309,/z1p34vh7dEOnLDmyCrlUVLuoDzd.jpg,2024-03-27,Godzilla x Kong: The New Empire,7.281,...,558503759,"['giant monster', 'sequel', 'dinosaur', 'kaiju...","[(15556, 'Rebecca Hall'), (226366, 'Brian Tyre...","(98631, 'Adam Wingard')","['Science Fiction', 'Action', 'Adventure']",follow explos showdown godzilla kong reunit co...,giant monster sequel dinosaur kaiju fantasi wo...,Science Fiction Action Adventure,follow explos showdown godzilla kong reunit co...,"[823464, 399566, 686487, 1680, 39466, 1035326,..."
1,1,653346,en,Kingdom of the Planet of the Apes,Several generations in the future following Ca...,1627.925,/gKkl37BQuKTanygYQG1pyYgLVgf.jpg,2024-05-08,Kingdom of the Planet of the Apes,7.159,...,237000000,"['empire', 'kingdom', 'gorilla', 'dystopia', '...","[(1586047, 'Owen Teague'), (2146942, 'Freya Al...","(1179066, 'Wes Ball')","['Science Fiction', 'Adventure', 'Action']",gener futur follow caesar reign ape domin spec...,empir kingdom gorilla dystopia eagl sequel ant...,Science Fiction Adventure Action,gener futur follow caesar reign ape domin spec...,"[653346, 281338, 61791, 119450, 1688, 39314, 871]"
2,2,786892,en,Furiosa: A Mad Max Saga,"As the world fell, young Furiosa is snatched f...",1443.985,/iADOJ8Zymht2JPMoy3R7xceZprc.jpg,2024-05-22,Furiosa: A Mad Max Saga,7.732,...,10200000,"['post-apocalyptic future', 'prequel', 'spin o...","[(1397778, 'Anya Taylor-Joy'), (74568, 'Chris ...","(20629, 'George Miller')","['Action', 'Adventure', 'Science Fiction']",world fell young furiosa snatch green place mo...,postapocalypt futur prequel spin psychot angri...,Action Adventure Science Fiction,world fell young furiosa snatch green place mo...,"[786892, 9355, 76341, 9659, 8810, 629014, 310135]"
3,4,929590,en,Civil War,"In the near future, a group of war journalists...",1008.722,/sh7Rg8Er3tFcN9BpKIPOMvALgZd.jpg,2024-04-10,Civil War,7.34,...,108969206,"['sniper', 'new york city', 'race against time...","[(205, 'Kirsten Dunst'), (52583, 'Wagner Moura...","(2036, 'Alex Garland')","['War', 'Action', 'Drama']",near futur group war journalist attempt surviv...,sniper new york citi race time washington dc u...,War Action Drama,near futur group war journalist attempt surviv...,"[929590, 356617, 388875, 636835, 517148, 34762..."
4,5,940721,ja,ゴジラ-1.0,Postwar Japan is at its lowest point when a ne...,1042.402,/hkxxMIGaiCTmrEArK7J56JTKUlB.jpg,2023-11-03,Godzilla Minus One,7.667,...,115857413,"['monster', 'loss of loved one', 'giant monste...","[(225730, 'Ryunosuke Kamiki'), (1516266, 'Mina...","(43652, 'Takashi Yamazaki')","['Science Fiction', 'Horror', 'Action']",postwar japan lowest point new crisi emerg for...,monster loss love giant monster kamikaz duti a...,Science Fiction Horror Action,postwar japan lowest point new crisi emerg for...,"[940721, 1678, 39256, 39468, 36243, 466939, 11..."


In [20]:
del movies['index']

In [21]:
movies.head()

Unnamed: 0,id,original_language,original_title,overview,popularity,poster_path,release_date,title,vote_average,vote_count,...,revenue,keywords,actors,director,genre_names,overview_,keywords_,genre_names_,combined_features,Ranks
0,823464,en,Godzilla x Kong: The New Empire,"Following their explosive showdown, Godzilla a...",4619.309,/z1p34vh7dEOnLDmyCrlUVLuoDzd.jpg,2024-03-27,Godzilla x Kong: The New Empire,7.281,2120.0,...,558503759,"['giant monster', 'sequel', 'dinosaur', 'kaiju...","[(15556, 'Rebecca Hall'), (226366, 'Brian Tyre...","(98631, 'Adam Wingard')","['Science Fiction', 'Action', 'Adventure']",follow explos showdown godzilla kong reunit co...,giant monster sequel dinosaur kaiju fantasi wo...,Science Fiction Action Adventure,follow explos showdown godzilla kong reunit co...,"[823464, 399566, 686487, 1680, 39466, 1035326,..."
1,653346,en,Kingdom of the Planet of the Apes,Several generations in the future following Ca...,1627.925,/gKkl37BQuKTanygYQG1pyYgLVgf.jpg,2024-05-08,Kingdom of the Planet of the Apes,7.159,571.0,...,237000000,"['empire', 'kingdom', 'gorilla', 'dystopia', '...","[(1586047, 'Owen Teague'), (2146942, 'Freya Al...","(1179066, 'Wes Ball')","['Science Fiction', 'Adventure', 'Action']",gener futur follow caesar reign ape domin spec...,empir kingdom gorilla dystopia eagl sequel ant...,Science Fiction Adventure Action,gener futur follow caesar reign ape domin spec...,"[653346, 281338, 61791, 119450, 1688, 39314, 871]"
2,786892,en,Furiosa: A Mad Max Saga,"As the world fell, young Furiosa is snatched f...",1443.985,/iADOJ8Zymht2JPMoy3R7xceZprc.jpg,2024-05-22,Furiosa: A Mad Max Saga,7.732,218.0,...,10200000,"['post-apocalyptic future', 'prequel', 'spin o...","[(1397778, 'Anya Taylor-Joy'), (74568, 'Chris ...","(20629, 'George Miller')","['Action', 'Adventure', 'Science Fiction']",world fell young furiosa snatch green place mo...,postapocalypt futur prequel spin psychot angri...,Action Adventure Science Fiction,world fell young furiosa snatch green place mo...,"[786892, 9355, 76341, 9659, 8810, 629014, 310135]"
3,929590,en,Civil War,"In the near future, a group of war journalists...",1008.722,/sh7Rg8Er3tFcN9BpKIPOMvALgZd.jpg,2024-04-10,Civil War,7.34,818.0,...,108969206,"['sniper', 'new york city', 'race against time...","[(205, 'Kirsten Dunst'), (52583, 'Wagner Moura...","(2036, 'Alex Garland')","['War', 'Action', 'Drama']",near futur group war journalist attempt surviv...,sniper new york citi race time washington dc u...,War Action Drama,near futur group war journalist attempt surviv...,"[929590, 356617, 388875, 636835, 517148, 34762..."
4,940721,ja,ゴジラ-1.0,Postwar Japan is at its lowest point when a ne...,1042.402,/hkxxMIGaiCTmrEArK7J56JTKUlB.jpg,2023-11-03,Godzilla Minus One,7.667,1150.0,...,115857413,"['monster', 'loss of loved one', 'giant monste...","[(225730, 'Ryunosuke Kamiki'), (1516266, 'Mina...","(43652, 'Takashi Yamazaki')","['Science Fiction', 'Horror', 'Action']",postwar japan lowest point new crisi emerg for...,monster loss love giant monster kamikaz duti a...,Science Fiction Horror Action,postwar japan lowest point new crisi emerg for...,"[940721, 1678, 39256, 39468, 36243, 466939, 11..."


In [22]:
movies.head()

Unnamed: 0,id,original_language,original_title,overview,popularity,poster_path,release_date,title,vote_average,vote_count,...,revenue,keywords,actors,director,genre_names,overview_,keywords_,genre_names_,combined_features,Ranks
0,823464,en,Godzilla x Kong: The New Empire,"Following their explosive showdown, Godzilla a...",4619.309,/z1p34vh7dEOnLDmyCrlUVLuoDzd.jpg,2024-03-27,Godzilla x Kong: The New Empire,7.281,2120.0,...,558503759,"['giant monster', 'sequel', 'dinosaur', 'kaiju...","[(15556, 'Rebecca Hall'), (226366, 'Brian Tyre...","(98631, 'Adam Wingard')","['Science Fiction', 'Action', 'Adventure']",follow explos showdown godzilla kong reunit co...,giant monster sequel dinosaur kaiju fantasi wo...,Science Fiction Action Adventure,follow explos showdown godzilla kong reunit co...,"[823464, 399566, 686487, 1680, 39466, 1035326,..."
1,653346,en,Kingdom of the Planet of the Apes,Several generations in the future following Ca...,1627.925,/gKkl37BQuKTanygYQG1pyYgLVgf.jpg,2024-05-08,Kingdom of the Planet of the Apes,7.159,571.0,...,237000000,"['empire', 'kingdom', 'gorilla', 'dystopia', '...","[(1586047, 'Owen Teague'), (2146942, 'Freya Al...","(1179066, 'Wes Ball')","['Science Fiction', 'Adventure', 'Action']",gener futur follow caesar reign ape domin spec...,empir kingdom gorilla dystopia eagl sequel ant...,Science Fiction Adventure Action,gener futur follow caesar reign ape domin spec...,"[653346, 281338, 61791, 119450, 1688, 39314, 871]"
2,786892,en,Furiosa: A Mad Max Saga,"As the world fell, young Furiosa is snatched f...",1443.985,/iADOJ8Zymht2JPMoy3R7xceZprc.jpg,2024-05-22,Furiosa: A Mad Max Saga,7.732,218.0,...,10200000,"['post-apocalyptic future', 'prequel', 'spin o...","[(1397778, 'Anya Taylor-Joy'), (74568, 'Chris ...","(20629, 'George Miller')","['Action', 'Adventure', 'Science Fiction']",world fell young furiosa snatch green place mo...,postapocalypt futur prequel spin psychot angri...,Action Adventure Science Fiction,world fell young furiosa snatch green place mo...,"[786892, 9355, 76341, 9659, 8810, 629014, 310135]"
3,929590,en,Civil War,"In the near future, a group of war journalists...",1008.722,/sh7Rg8Er3tFcN9BpKIPOMvALgZd.jpg,2024-04-10,Civil War,7.34,818.0,...,108969206,"['sniper', 'new york city', 'race against time...","[(205, 'Kirsten Dunst'), (52583, 'Wagner Moura...","(2036, 'Alex Garland')","['War', 'Action', 'Drama']",near futur group war journalist attempt surviv...,sniper new york citi race time washington dc u...,War Action Drama,near futur group war journalist attempt surviv...,"[929590, 356617, 388875, 636835, 517148, 34762..."
4,940721,ja,ゴジラ-1.0,Postwar Japan is at its lowest point when a ne...,1042.402,/hkxxMIGaiCTmrEArK7J56JTKUlB.jpg,2023-11-03,Godzilla Minus One,7.667,1150.0,...,115857413,"['monster', 'loss of loved one', 'giant monste...","[(225730, 'Ryunosuke Kamiki'), (1516266, 'Mina...","(43652, 'Takashi Yamazaki')","['Science Fiction', 'Horror', 'Action']",postwar japan lowest point new crisi emerg for...,monster loss love giant monster kamikaz duti a...,Science Fiction Horror Action,postwar japan lowest point new crisi emerg for...,"[940721, 1678, 39256, 39468, 36243, 466939, 11..."


In [23]:
# Convert 'release_date' to datetime if it's not already
movies['release_date'] = pd.to_datetime(movies['release_date'])

# Extract the year
movies['year'] = movies['release_date'].dt.year

In [24]:
movies['formatted_title'] = movies['title'] + ' (' + movies['year'].astype(str) + ')'

In [25]:
movies.head()

Unnamed: 0,id,original_language,original_title,overview,popularity,poster_path,release_date,title,vote_average,vote_count,...,actors,director,genre_names,overview_,keywords_,genre_names_,combined_features,Ranks,year,formatted_title
0,823464,en,Godzilla x Kong: The New Empire,"Following their explosive showdown, Godzilla a...",4619.309,/z1p34vh7dEOnLDmyCrlUVLuoDzd.jpg,2024-03-27,Godzilla x Kong: The New Empire,7.281,2120.0,...,"[(15556, 'Rebecca Hall'), (226366, 'Brian Tyre...","(98631, 'Adam Wingard')","['Science Fiction', 'Action', 'Adventure']",follow explos showdown godzilla kong reunit co...,giant monster sequel dinosaur kaiju fantasi wo...,Science Fiction Action Adventure,follow explos showdown godzilla kong reunit co...,"[823464, 399566, 686487, 1680, 39466, 1035326,...",2024,Godzilla x Kong: The New Empire (2024)
1,653346,en,Kingdom of the Planet of the Apes,Several generations in the future following Ca...,1627.925,/gKkl37BQuKTanygYQG1pyYgLVgf.jpg,2024-05-08,Kingdom of the Planet of the Apes,7.159,571.0,...,"[(1586047, 'Owen Teague'), (2146942, 'Freya Al...","(1179066, 'Wes Ball')","['Science Fiction', 'Adventure', 'Action']",gener futur follow caesar reign ape domin spec...,empir kingdom gorilla dystopia eagl sequel ant...,Science Fiction Adventure Action,gener futur follow caesar reign ape domin spec...,"[653346, 281338, 61791, 119450, 1688, 39314, 871]",2024,Kingdom of the Planet of the Apes (2024)
2,786892,en,Furiosa: A Mad Max Saga,"As the world fell, young Furiosa is snatched f...",1443.985,/iADOJ8Zymht2JPMoy3R7xceZprc.jpg,2024-05-22,Furiosa: A Mad Max Saga,7.732,218.0,...,"[(1397778, 'Anya Taylor-Joy'), (74568, 'Chris ...","(20629, 'George Miller')","['Action', 'Adventure', 'Science Fiction']",world fell young furiosa snatch green place mo...,postapocalypt futur prequel spin psychot angri...,Action Adventure Science Fiction,world fell young furiosa snatch green place mo...,"[786892, 9355, 76341, 9659, 8810, 629014, 310135]",2024,Furiosa: A Mad Max Saga (2024)
3,929590,en,Civil War,"In the near future, a group of war journalists...",1008.722,/sh7Rg8Er3tFcN9BpKIPOMvALgZd.jpg,2024-04-10,Civil War,7.34,818.0,...,"[(205, 'Kirsten Dunst'), (52583, 'Wagner Moura...","(2036, 'Alex Garland')","['War', 'Action', 'Drama']",near futur group war journalist attempt surviv...,sniper new york citi race time washington dc u...,War Action Drama,near futur group war journalist attempt surviv...,"[929590, 356617, 388875, 636835, 517148, 34762...",2024,Civil War (2024)
4,940721,ja,ゴジラ-1.0,Postwar Japan is at its lowest point when a ne...,1042.402,/hkxxMIGaiCTmrEArK7J56JTKUlB.jpg,2023-11-03,Godzilla Minus One,7.667,1150.0,...,"[(225730, 'Ryunosuke Kamiki'), (1516266, 'Mina...","(43652, 'Takashi Yamazaki')","['Science Fiction', 'Horror', 'Action']",postwar japan lowest point new crisi emerg for...,monster loss love giant monster kamikaz duti a...,Science Fiction Horror Action,postwar japan lowest point new crisi emerg for...,"[940721, 1678, 39256, 39468, 36243, 466939, 11...",2023,Godzilla Minus One (2023)


In [26]:
movies.to_csv('data.csv', index = False)