In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
import sys
sys.path.append("../")

In [6]:
credits = pd.read_csv("../data/tmdb_5000_credits.csv")
movies = pd.read_csv("../data/tmdb_5000_movies.csv")

In [71]:
# standardize the joining columns
movies['title'] = movies['title'].apply(lambda x: str(x).strip().lower())
credits['title'] = credits['title'].apply(lambda x: str(x).strip().lower())

# merge both the dataset on title
movies_comb = movies.merge(credits, on='title', how='left')

## There are movies with same title but released in different years (they are probably remakes)

In [72]:
x = movies_comb.groupby('title')['crew'].count().reset_index()
print(x.query('crew > 1'))
print(movies_comb.query('title == "batman"')[['id', 'movie_id', 'title', 'original_title', 'release_date']])
print(movies_comb.query('title == "out of the blue"')[['id', 'movie_id', 'title', 'original_title', 'release_date']])
print(movies_comb.query('title == "the host"')[['id', 'movie_id', 'title', 'original_title', 'release_date']])

                title  crew
437            batman     4
2569  out of the blue     4
3796         the host     4
        id  movie_id   title original_title release_date
1360   268       268  batman         Batman   1989-06-23
1361   268      2661  batman         Batman   1989-06-23
4272  2661       268  batman         Batman   1966-07-30
4273  2661      2661  batman         Batman   1966-07-30
         id  movie_id            title   original_title release_date
3650  39269     39269  out of the blue  Out of the Blue   1980-05-01
3651  39269     10844  out of the blue  Out of the Blue   1980-05-01
3697  10844     39269  out of the blue  Out of the Blue   2006-10-12
3698  10844     10844  out of the blue  Out of the Blue   2006-10-12
         id  movie_id     title original_title release_date
972   72710     72710  the host       The Host   2013-03-22
973   72710      1255  the host       The Host   2013-03-22
2879   1255     72710  the host             괴물   2006-07-27
2880   1255      1

## Feature ideas
We are building a content based filtering recommender system i.e., recommendations based on user's past interest

Based on what an user can search a movie?
* title - title, original_title
* genre - genres
* cast - cast
* crew - crew

Based on user's past movies?
* content of the movie - keywords, overview, tagline
* language (user preferes few specific languages) - spoken_languages
* genre (user preferes few specific genres) - genres
* cast (user prefers few specific casts) - cast
* crew (user preferes christopher nolan movies) - crew
* runtime (may be user watches only shorter movies) - runtime
* popularity - vote_count, vote_average, popularity (may be user likes only popular movies)
* production companies (Marvels, DC, etc)
+ production countries (may/may not)
* Release year - release date (may be user watches most recent movies)

Useless features:
* revenue
* budget
* homepage
* status (Most of the movies are released anyway)
* original_language (Most of the languages are english (93%))

## Feature engineering to-do's
* extract release year from release date column
* Remove revenue, budget, homepage, status, original_language
* Explore information in dictionary columns - genres, keywords, production_companies, production_countries, spokern_languages, cast, crew
* Text preprocessing:
    * tokenization
    * remove space between words in a token (to remove confusion between similar names)
    * removing Punctuation & Special Characters
    * remove stop words
    * convert everything to lower case
    * concatenate all the information about the movie into one column that will be converted to embeddings
    * Note: there are repetitions in cast, character and crew names

In [73]:
# extract release year and drop columns
dates = pd.to_datetime(movies_comb['release_date'])
movies_comb['release_year'] = dates.dt.year
movies_comb.drop(['revenue', 'budget', 'homepage', 'status', 'original_language', 'release_date'], axis=1, inplace=True)
print(movies_comb.shape)

(4809, 18)


In [74]:
# function to extract elements of json file
def get_elements(x, dic_element_name='name'):
    """
    x is a list dictionary   
    """
    x = eval(x)
    keywords = []
    for dic in x:
        assert(isinstance(dic, dict))
        assert(dic_element_name in dic.keys())
        keywords.append(dic[dic_element_name])
    return keywords

# extract information from the columns
movies_comb['keywords2'] = movies_comb['keywords'].apply(lambda x: get_elements(x, 'name'))
movies_comb['production_companies2'] = movies_comb['production_companies'].apply(lambda x: get_elements(x, 'name'))
movies_comb['production_countries2'] = movies_comb['production_countries'].apply(lambda x: get_elements(x, 'name'))
movies_comb['spoken_languages2'] = movies_comb['spoken_languages'].apply(lambda x: get_elements(x, 'iso_639_1'))
movies_comb['cast2'] = movies_comb['cast'].apply(lambda x: get_elements(x, 'name'))
movies_comb['character2'] = movies_comb['cast'].apply(lambda x: get_elements(x, 'character'))
movies_comb['crew2'] = movies_comb['crew'].apply(lambda x: get_elements(x, 'name'))
movies_comb['genres2'] = movies_comb['genres'].apply(lambda x: get_elements(x, 'name'))

# drop unncessary columns
movies_comb.drop(['keywords', 'production_companies', 'production_countries', 'spoken_languages', 
                  'cast', 'crew', 'genres'], axis=1, inplace=True)

### Text preprocessing

In [75]:
import copy
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [76]:
class Preprocess:
    def __init__(self, df):
        self.df = df

    def transform(self):
        # tokenize overview & tagline
        self.df['overview2'] = self.df['overview'].apply(lambda x: str(x).split())
        self.df['tagline2'] = self.df['tagline'].apply(lambda x: str(x).split())
        self.df['title2'] = self.df['title'].apply(lambda x: str(x).split())

        # concatenate overview, keywords2, production_companies2, production_countries2, spoken_languages2, cast2, crew2, character2, genres2, tagline
        self.df['combined_tag'] = self.df['title2'] + self.df['overview2'] + self.df['keywords2'] + self.df['production_companies2'] + self.df['production_countries2'] + \
                                self.df['spoken_languages2'] + self.df['cast2'] + self.df['crew2'] + self.df['character2'] + self.df['genres2'] + self.df['tagline2']
        
        # preprocess
        self.df['combined_tag'] = self.df['combined_tag'].apply(lambda x: self.inlist_removespace(x))
        self.df['combined_tag'] = self.df['combined_tag'].apply(lambda x: self.inlist_strip_lower(x))
        self.df['combined_tag'] = self.df['combined_tag'].apply(lambda x: self.inlist_remove_punctuation_marks(x))
        stop_words = set(stopwords.words('english'))
        self.df['combined_tag'] = self.df['combined_tag'].apply(lambda x: self.inlist_remove_stop_words(x, stop_words))
        
        # convert to text from tokens
        self.df['text'] = self.df['combined_tag'].apply(lambda x: ' '.join(word for word in x))

        # drop unncessary columns
        self.df.drop(['overview', 'overview2', 'title2', 'keywords2', 'production_companies2', 'production_countries2', 'spoken_languages2', 'cast2', 'crew2', 'character2', 
                  'genres2', 'tagline', 'tagline2', 'original_title'], axis=1, inplace=True)

        return self.df    

    # remove space between words in a token
    def inlist_removespace(self, x):
        return [str(t).replace(" ", "") for t in x]
    
    # strip and lower everything
    def inlist_strip_lower(self, x):
        return [str(t).strip().lower() for t in x]
    
    # get rid of punctuation marks
    def inlist_remove_punctuation_marks(self, x):
        return [re.sub(r'[^\w\s]', '', t) for t in x]
    
    # remove stop words
    def inlist_remove_stop_words(self, x, stop_words):
        return [t for t in x if t not in stop_words]

In [77]:
movies_comb = Preprocess(movies_comb).transform()

In [79]:
print(movies_comb.shape)
movies_comb.head(10)

(4809, 10)


Unnamed: 0,id,popularity,runtime,title,vote_average,vote_count,movie_id,release_year,combined_tag,text
0,19995,150.437577,162.0,avatar,7.2,11800,19995,2009.0,"[avatar, 22nd, century, paraplegic, marine, di...",avatar 22nd century paraplegic marine dispatch...
1,285,139.082615,169.0,pirates of the caribbean: at world's end,6.9,4500,285,2007.0,"[pirates, caribbean, worlds, end, captain, bar...",pirates caribbean worlds end captain barbossa ...
2,206647,107.376788,148.0,spectre,6.3,4466,206647,2015.0,"[spectre, cryptic, message, bonds, past, sends...",spectre cryptic message bonds past sends trail...
3,49026,112.31295,165.0,the dark knight rises,7.6,9106,49026,2012.0,"[dark, knight, rises, following, death, distri...",dark knight rises following death district att...
4,49529,43.926995,132.0,john carter,6.1,2124,49529,2012.0,"[john, carter, john, carter, warweary, former,...",john carter john carter warweary former milita...
5,559,115.699814,139.0,spider-man 3,5.9,3576,559,2007.0,"[spiderman, 3, seemingly, invincible, spiderma...",spiderman 3 seemingly invincible spiderman goe...
6,38757,48.681969,100.0,tangled,7.4,3330,38757,2010.0,"[tangled, kingdoms, wantedand, charmingbandit,...",tangled kingdoms wantedand charmingbandit flyn...
7,99861,134.279229,141.0,avengers: age of ultron,7.3,6767,99861,2015.0,"[avengers, age, ultron, tony, stark, tries, ju...",avengers age ultron tony stark tries jumpstart...
8,767,98.885637,153.0,harry potter and the half-blood prince,7.4,5293,767,2009.0,"[harry, potter, halfblood, prince, harry, begi...",harry potter halfblood prince harry begins six...
9,209112,155.790452,151.0,batman v superman: dawn of justice,5.7,7004,209112,2016.0,"[batman, v, superman, dawn, justice, fearing, ...",batman v superman dawn justice fearing actions...


## TF-IDF Vectorization

In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

In [81]:
# generate embeddings
vectorizer = TfidfVectorizer(min_df=5, max_df=0.8) # Ignore words appearing in <5 movies and those appearing in >80% of movies
tfidf_matrix = vectorizer.fit_transform(movies_comb['text'])

# Dimensionality Reduction using SVD
svd = TruncatedSVD(n_components=500)
tfidf_reduced = svd.fit_transform(tfidf_matrix)
print(tfidf_matrix.shape)
print(tfidf_reduced.shape)

# Compute Cosine Similarity
cosine_sim = cosine_similarity(tfidf_reduced)
print(cosine_sim.shape)

(4809, 18570)
(4809, 500)
(4809, 4809)


## Recommender System

In [82]:
# Movie Recommendation Function
def recommend_movies(df, movie_title, num_recommendations=5):
    # Find movies that match the given title (partial match support)
    matching_movies = df[df['title'].str.contains(movie_title, case=False, na=False)]
    
    if matching_movies.empty:
        return "Movie not found!"
    
    recommendations = []
    for idx in matching_movies.index:
        similarity_scores = list(enumerate(cosine_sim[idx]))
        similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
        top_movies = [df.iloc[i[0]]['title'] for i in similarity_scores[1:num_recommendations+1]]
        recommendations.extend(top_movies)
    
    return list(set(recommendations))  # Remove duplicates

# Example usage
print(recommend_movies(movies_comb, 'spider-man', 5))
print(recommend_movies(movies_comb, 'breaking upwards', 5))
print(recommend_movies(movies_comb, 'yeh jawaani hai deewani', 5))
print(recommend_movies(movies_comb, 'men in black', 5))
print(recommend_movies(movies_comb, 'narnia', 5))
print(recommend_movies(movies_comb, 'avengers', 5))

['iron man 2', 'the amazing spider-man 2', 'the amazing spider-man', 'spider-man 3', 'spider-man', 'hulk', 'ant-man', 'spider-man 2']
['love jones', 'the sleepwalker', 'revolutionary road', 'the out-of-towners', 'boynton beach club']
['the lunchbox', 'american desi', 'earth', 'veer-zaara', 'lage raho munna bhai']
['men in black', 'the day the earth stood still', 'timecrimes', 'the black hole', 'what planet are you from?', 'men in black ii', 'the tuxedo', 'e.t. the extra-terrestrial', 'impostor', 'the time machine', 'men in black 3']
['the chronicles of narnia: the voyage of the dawn treader', 'the wizard of oz', 'return to never land', 'cirque du soleil: worlds away', 'shrek 2', 'shrek', 'the chronicles of narnia: the lion, the witch and the wardrobe', 'the chronicles of narnia: prince caspian', 'return to the blue lagoon']
['the avengers', 'captain america: the first avenger', 'guardians of the galaxy', 'iron man 2', 'iron man 3', 'avengers: age of ultron', 'thor: the dark world', 'th

In [85]:
# store the dataset and the embeddings
import pickle

pickle.dump(movies_comb, open('../artifacts/movies_list.pkl', 'wb'))
pickle.dump(cosine_sim, open('../artifacts/movies_similarity.pkl', 'wb'))

In [86]:
import pandas as pd
print(pd.__version__)

2.2.3


In [87]:
!pip freeze | grep pandas

pandas==2.2.3


In [88]:
import joblib
import pickle
import streamlit as st
import requests
import os
import pandas as pd
import numpy as np

In [95]:
def fetch_poster(movie_id):
    base_url = "https://image.tmdb.org/t/p/original"
    api_key = "94b22dc37c19aa244c8e511128fce6cf"
    response = requests.get(f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}")
    data = response.json()
    print(data)
    poster_path = data['poster_path']
    poster_url = base_url + poster_path
    return poster_url

In [98]:
fetch_poster(movie_id=49026)

{'adult': False, 'backdrop_path': '/y2DB71C4nyIdMrANijz8mzvQtk6.jpg', 'belongs_to_collection': {'id': 263, 'name': 'The Dark Knight Collection', 'poster_path': '/ogyw5LTmL53dVxsppcy8Dlm30Fu.jpg', 'backdrop_path': '/xyhrCEdB4XRkelfVsqXeUZ6rLHi.jpg'}, 'budget': 250000000, 'genres': [{'id': 28, 'name': 'Action'}, {'id': 80, 'name': 'Crime'}, {'id': 18, 'name': 'Drama'}, {'id': 53, 'name': 'Thriller'}], 'homepage': 'http://www.thedarkknightrises.com/', 'id': 49026, 'imdb_id': 'tt1345836', 'origin_country': ['GB', 'US'], 'original_language': 'en', 'original_title': 'The Dark Knight Rises', 'overview': "Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's crimes to protect the late attorney's reputation and is subsequently hunted by the Gotham City Police Department. Eight years later, Batman encounters the mysterious Selina Kyle and the villainous Bane, a new terrorist leader who overwhelms Gotham's finest. The Dark Knight resurfaces to protect a ci

'https://image.tmdb.org/t/p/original/hr0L2aueqlP2BYUblTTjmtn0hw4.jpg'