In [48]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import contractions
import os
from IPython.display import Image, display, HTML

stop_words = nltk.corpus.stopwords.words('english')

## DATA INGESTION

In [2]:
def load_data(data):
    # Load data in dataframe
    df = pd.read_csv(data)
    return df

In [3]:
os.chdir('../')

In [4]:
%pwd

'c:\\Personal AI Projects\\PROJECTS\\NLP PROJECTS\\MOVIE STREAMING RECOMMENDATION PROJECT'

In [5]:
df = load_data('tmdb_5000_movies.csv')

In [6]:
df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

## EDA

#### Check for Null values

In [8]:
df.isnull().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
dtype: int64

In [9]:
# Create new dataframe with relevant data

df = df[['id', 'title', 'tagline', 'overview', 'popularity']]

df.tagline.fillna('', inplace=True)

df.dropna(inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.tagline.fillna('', inplace=True)


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4800 entries, 0 to 4802
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          4800 non-null   int64  
 1   title       4800 non-null   object 
 2   tagline     4800 non-null   object 
 3   overview    4800 non-null   object 
 4   popularity  4800 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 225.0+ KB


## FEATURE ENGINEERING

In [16]:
# Create a new feature
df['description'] = df['tagline'].map(str) + ' ' + df['overview']

In [17]:
df.head()

Unnamed: 0,id,title,tagline,overview,popularity,description
0,19995,Avatar,Enter the World of Pandora.,"In the 22nd century, a paraplegic Marine is di...",150.437577,Enter the World of Pandora. In the 22nd centur...
1,285,Pirates of the Caribbean: At World's End,"At the end of the world, the adventure begins.","Captain Barbossa, long believed to be dead, ha...",139.082615,"At the end of the world, the adventure begins...."
2,206647,Spectre,A Plan No One Escapes,A cryptic message from Bond’s past sends him o...,107.376788,A Plan No One Escapes A cryptic message from B...
3,49026,The Dark Knight Rises,The Legend Ends,Following the death of District Attorney Harve...,112.31295,The Legend Ends Following the death of Distric...
4,49529,John Carter,"Lost in our world, found in another.","John Carter is a war-weary, former military ca...",43.926995,"Lost in our world, found in another. John Cart..."


## DATA PREPROCESIING

In [18]:
def preprocess_text(text):
    # Remove non-alphanumeric characters, strip whitespace, and convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()
    text = text.strip()
    text = contractions.fix(text)
    # tokenize document
    tokens = nltk.word_tokenize(text)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    text = ' '.join(filtered_tokens)
    return text

In [19]:
# Apply text preprocessing to new feature which is the corpus

df['description'] = df['description'].apply(preprocess_text)

In [20]:
df['description']

0       enter world pandora 22nd century paraplegic ma...
1       end world adventure begins captain barbossa lo...
2       plan one escapes cryptic message bonds past se...
3       legend ends following death district attorney ...
4       lost world found another john carter warweary ...
                              ...                        
4798    come looking trouble trouble came looking el m...
4799    newlywed couples honeymoon upended arrivals re...
4800    signed sealed delivered introduces dedicated q...
4801    new yorker shanghai ambitious new york attorne...
4802    ever since second grade first saw et extraterr...
Name: description, Length: 4800, dtype: object

In [21]:
type(df['description'])

pandas.core.series.Series

In [22]:
'''
# Apply text preprocessing to new feature which is the corpus

normalize_corpus = np.vectorize(preprocess_text)

norm_corpus = normalize_corpus(list(df['description']))
'''

"\n# Apply text preprocessing to new feature which is the corpus\n\nnormalize_corpus = np.vectorize(preprocess_text)\n\nnorm_corpus = normalize_corpus(list(df['description']))\n"

## FEATURE EXTRACTION

In [23]:
def calculate_tfidf(documents):
    # Calculate TF-IDF vectors for the documents
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    return tfidf_matrix

In [24]:
tfidf_matrix = calculate_tfidf(df['description'])

## DOCUMENT SIMILARITY COMPUTATION

In [25]:
similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
similarity_matrix_df = pd.DataFrame(similarity_matrix)
similarity_matrix_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4790,4791,4792,4793,4794,4795,4796,4797,4798,4799
0,1.0,0.011839,0.0,0.021334,0.028296,0.025837,0.0,0.029902,0.0,0.00746,...,0.011681,0.0,0.024087,0.032401,0.0,0.0,0.0,0.006993,0.0,0.0
1,0.011839,1.0,0.015243,0.0,0.048035,0.0,0.01534,0.035783,0.046025,0.008957,...,0.014024,0.0,0.005818,0.0,0.0,0.013797,0.0,0.026581,0.015985,0.0
2,0.0,0.015243,1.0,0.0,0.0,0.0,0.0,0.029063,0.020833,0.005698,...,0.059455,0.0,0.0,0.0,0.017935,0.0,0.0,0.013734,0.0,0.00519
3,0.021334,0.0,0.0,1.0,0.010282,0.0,0.017052,0.03098,0.036909,0.087744,...,0.0,0.0,0.011829,0.0,0.0,0.0,0.0,0.034109,0.025712,0.021691
4,0.028296,0.048035,0.0,0.010282,1.0,0.0,0.009313,0.033732,0.0,0.024687,...,0.018588,0.0,0.0,0.0,0.0,0.010511,0.0,0.011128,0.0,0.0


In [26]:
type(similarity_matrix_df)

pandas.core.frame.DataFrame

## Get List of Movie Titles

In [27]:
## Get List of Movie Titles

def get_movie_titles(data):
    # Retrieve entire movies list from data
    movies_list = data['title'].values
    return movies_list

In [28]:
movie_titles = get_movie_titles(df)
movie_titles

array(['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre',
       ..., 'Signed, Sealed, Delivered', 'Shanghai Calling',
       'My Date with Drew'], dtype=object)

## MODELING

In [35]:
import requests

In [36]:
def fetch_poster(movie_id):
    url = "https://api.themoviedb.org/3/movie/{}?api_key=8265bd1679663a7ea12ac168da84d2e8&language=en-US".format(movie_id)
    data = requests.get(url)
    data = data.json()
    poster_path = data['poster_path']
    full_path = "https://image.tmdb.org/t/p/w500/" + poster_path
    return full_path

In [43]:
def recommend_movies(movie_title: str, movies = movie_titles, similarity = similarity_matrix_df, get_movie_poster: bool = False):
    # find movie id
    movie_index = np.where(movies == movie_title)[0][0]
    # get movie similarities
    movie_similarities = similarity.iloc[movie_index].values
    # get top 5 similar movie IDs
    similar_movie_idxs = np.argsort(-movie_similarities)[1:6]
    # get top 5 movies
    similar_movies = movies[similar_movie_idxs]
    # Fetch corresponding IDs from the dataframe
    similar_movies_ids = [df[df['title'] == title].iloc[0].id for title in similar_movies]

    if get_movie_poster:
        similar_movie_posters = []
        # Fetch corresponding poster
        for id in similar_movies_ids:
            similar_movie_posters.append(fetch_poster(id))

        return similar_movies, similar_movie_posters

    else:
        # Combine movie titles and IDs
        similar_movies_with_ids = [f"{title} (ID: {movie_id})" for title, movie_id in zip(similar_movies, similar_movies_ids)]
        similar_movies_str = "\n".join(similar_movies_with_ids)

        # return the top 5 movies
        return similar_movies_str

In [45]:
if __name__ == "__main__":
    try:
        movie_title = input("Enter Movie Title: ")
        recommendations = recommend_movies(movie_title=movie_title)
        print(f'Top 5 recommended Movies for {movie_title}:\n{recommendations}')
    except ValueError:
        print("Invalid input. Please enter an integer value")

Top 5 recommended Movies for The Matrix:
Hackers (ID: 10428)
Pulse (ID: 9682)
Commando (ID: 10999)
The Inhabited Island (ID: 16911)
Transcendence (ID: 157353)


In [50]:
if __name__ == "__main__":
    try:
        movie_title = input("Enter Movie Title: ")
        recommended_movies, recommended_posters = recommend_movies(movie_title=movie_title, get_movie_poster=True)
        print(f'Top 5 recommended Movies for {movie_title}:\n')
        '''
        for title, poster in zip(recommended_movies, recommended_posters):
            print(title)
            display(Image(url=poster))
        '''
        # Create HTML content to display images in a row
        html_content = '<div style="display: flex; flex-direction: row;">'
        for title, poster in zip(recommended_movies, recommended_posters):
            html_content += f'''
                <div style="margin: 10px; text-align: center;">
                    <img src="{poster}" style="width: 150px; height: auto;">
                    <p>{title}</p>
                </div>
            '''
        html_content += '</div>'

        display(HTML(html_content))
    except ValueError:
        print("Invalid input. Please enter an integer value")


Top 5 recommended Movies for Batman:

