In [1]:
import numpy as np
import pandas as pd
import json

# Read the two TMDB 5000 CSV exports (movie metadata and per-movie credits)
# from the current working directory.
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

In [2]:
# Preview the first five rows of the raw movie metadata.
movies.head(n=5)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [3]:
# Preview the first five rows of the raw credits table.
credits.head(n=5)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [4]:
# Merge datasets on the TMDB movie id (movies.id == credits.movie_id).
# Titles are not guaranteed to be unique, so joining on "title" can pair a film
# with another film's credits and duplicate rows. The credits "title" column is
# dropped first so the merged frame keeps a single, unsuffixed "title" column.
movies = movies.merge(
    credits.drop(columns="title"),
    left_on="id",
    right_on="movie_id",
    how="inner",
)

In [5]:
# Select relevant columns.
# .copy() makes the result an independent frame, so the later in-place clean-up
# and column assignments do not operate on a slice of the merged frame
# (which raises SettingWithCopyWarning).
movies = movies[["title", "genres", "keywords", "id", "overview", "cast", "crew"]].copy()

In [6]:
# Drop rows with missing values and renumber the index 0..n-1.
# Without the reset, index labels have gaps after the drop and no longer match
# row positions; the similarity matrix built later is positional, so looking a
# movie up by label would select the wrong row (or run out of bounds).
movies = movies.dropna().reset_index(drop=True)

In [7]:
# Count rows that are exact repeats of an earlier row (first occurrence is kept).
movies.duplicated(keep="first").sum()

0

In [8]:
# Function to extract lower-cased "name" values (used for genres and keywords)
def convert(obj):
    """Parse a JSON-encoded list of objects and return their lower-cased names.

    Parameters
    ----------
    obj : str
        JSON text such as '[{"id": 28, "name": "Action"}]'.

    Returns
    -------
    list of str
        The "name" of every entry, lower-cased, in input order. An empty list
        is returned when the value cannot be parsed: invalid JSON, a non-string
        value such as None/NaN, or an entry without a usable "name".
    """
    try:
        return [entry["name"].lower() for entry in json.loads(obj)]
    except (json.JSONDecodeError, TypeError, KeyError, AttributeError):
        # Best-effort parsing: treat any malformed cell as "no names" instead of
        # aborting the whole column transformation.
        return []

In [9]:
# Replace the JSON text in 'genres' and 'keywords' with lists of lower-cased names
for json_column in ("genres", "keywords"):
    movies[json_column] = movies[json_column].apply(convert)

In [10]:
# Function to extract the first `limit` (default 3) cast members
def convert_cast(obj, limit=3):
    """Return the lower-cased names of the first ``limit`` cast entries.

    Parameters
    ----------
    obj : str
        JSON text holding a list of cast objects, each with a "name" key.
    limit : int, optional
        Maximum number of entries to keep, taken from the start of the list.
        Defaults to 3.

    Returns
    -------
    list of str
        Up to ``limit`` lower-cased names in input order. An empty list is
        returned when the value cannot be parsed: invalid JSON, a non-string
        value such as None/NaN, or an entry without a usable "name".
    """
    try:
        return [member["name"].lower() for member in json.loads(obj)[:limit]]
    except (json.JSONDecodeError, TypeError, KeyError, AttributeError):
        # Best-effort parsing: a malformed cell yields no cast rather than an error.
        return []

In [11]:
# Replace the JSON cast text with the list of leading cast-member names
cast_json = movies["cast"]
movies["cast"] = cast_json.apply(convert_cast)

In [12]:
# Function to extract the director
def fetch_director(obj):
    """Return the first crew member whose job is "director" as a one-item list.

    Parameters
    ----------
    obj : str
        JSON text holding a list of crew objects with "job" and "name" keys.

    Returns
    -------
    list of str
        ``[name.lower()]`` for the first entry whose "job" equals "director"
        (case-insensitive), or an empty list when there is no such entry or
        the value cannot be parsed. The result is always a list, so it
        concatenates with the other list-valued feature columns.
    """
    try:
        for member in json.loads(obj):
            # `or ""` guards against an explicit JSON null in "job".
            if (member.get("job") or "").lower() == "director":
                return [member["name"].lower()]
    except (json.JSONDecodeError, TypeError, KeyError, AttributeError):
        # Best-effort parsing: a malformed cell yields no director rather than an error.
        return []
    return []

In [13]:
# Replace the JSON crew text with the director extracted by fetch_director
crew_json = movies["crew"]
movies["crew"] = crew_json.apply(fetch_director)

In [14]:
# Turn an overview string into lower-cased, whitespace-separated tokens
def split_overview(text):
    """Lower-case ``text`` and split it on whitespace.

    Anything that is not a ``str`` (for example NaN or None) yields an empty list.
    """
    if not isinstance(text, str):
        return []
    return text.lower().split()

# Tokenise every overview so it can be concatenated with the other list columns
overview_text = movies["overview"]
movies["overview"] = overview_text.apply(split_overview)

In [15]:
# Inspect the first five rows now that the feature columns hold lists of tokens.
movies.head(n=5)

Unnamed: 0,title,genres,keywords,id,overview,cast,crew
0,Avatar,"[action, adventure, fantasy, science fiction]","[culture clash, future, space war, space colon...",19995,"[in, the, 22nd, century,, a, paraplegic, marin...","[sam worthington, zoe saldana, sigourney weaver]",[james cameron]
1,Pirates of the Caribbean: At World's End,"[adventure, fantasy, action]","[ocean, drug abuse, exotic island, east india ...",285,"[captain, barbossa,, long, believed, to, be, d...","[johnny depp, orlando bloom, keira knightley]",[gore verbinski]
2,Spectre,"[action, adventure, crime]","[spy, based on novel, secret agent, sequel, mi...",206647,"[a, cryptic, message, from, bond’s, past, send...","[daniel craig, christoph waltz, léa seydoux]",[sam mendes]
3,The Dark Knight Rises,"[action, crime, drama, thriller]","[dc comics, crime fighter, terrorist, secret i...",49026,"[following, the, death, of, district, attorney...","[christian bale, michael caine, gary oldman]",[christopher nolan]
4,John Carter,"[action, adventure, science fiction]","[based on novel, mars, medallion, space travel...",49529,"[john, carter, is, a, war-weary,, former, mili...","[taylor kitsch, lynn collins, samantha morton]",[andrew stanton]


In [16]:
# Strip space characters from every token of the given list-valued columns
def remove_whitespace(arr):
    """Remove " " from each token in the listed columns of the global ``movies``.

    ``arr`` is an iterable of column names. Each named column is replaced by a
    column in which every cell is a new list whose items have all space
    characters removed (e.g. "science fiction" -> "sciencefiction").
    """
    def _strip_spaces(tokens):
        return [token.replace(" ", "") for token in tokens]

    for column_name in arr:
        movies[column_name] = movies[column_name].apply(_strip_spaces)

# Collapse multi-word tokens ("sam worthington" -> "samworthington") in every
# feature column so each name or phrase becomes a single term.
remove_whitespace(
    [
        "overview",
        "cast",
        "genres",
        "keywords",
        "crew",
    ]
)

In [17]:
# Check the first five rows to confirm spaces were removed inside tokens.
movies.head(n=5)

Unnamed: 0,title,genres,keywords,id,overview,cast,crew
0,Avatar,"[action, adventure, fantasy, sciencefiction]","[cultureclash, future, spacewar, spacecolony, ...",19995,"[in, the, 22nd, century,, a, paraplegic, marin...","[samworthington, zoesaldana, sigourneyweaver]",[jamescameron]
1,Pirates of the Caribbean: At World's End,"[adventure, fantasy, action]","[ocean, drugabuse, exoticisland, eastindiatrad...",285,"[captain, barbossa,, long, believed, to, be, d...","[johnnydepp, orlandobloom, keiraknightley]",[goreverbinski]
2,Spectre,"[action, adventure, crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...",206647,"[a, cryptic, message, from, bond’s, past, send...","[danielcraig, christophwaltz, léaseydoux]",[sammendes]
3,The Dark Knight Rises,"[action, crime, drama, thriller]","[dccomics, crimefighter, terrorist, secretiden...",49026,"[following, the, death, of, district, attorney...","[christianbale, michaelcaine, garyoldman]",[christophernolan]
4,John Carter,"[action, adventure, sciencefiction]","[basedonnovel, mars, medallion, spacetravel, p...",49529,"[john, carter, is, a, war-weary,, former, mili...","[taylorkitsch, lynncollins, samanthamorton]",[andrewstanton]


In [18]:
# Helper used to flatten a token list into one string
def collapse(features):
    """Return the string tokens in ``features`` joined by single spaces."""
    separator = " "
    return separator.join(features)

# Concatenate the per-row token lists (cast, director, genres, keywords, overview)
# and flatten each combined list into one space-separated "tags" string.
combined_tokens = (
    movies["cast"]
    + movies["crew"]
    + movies["genres"]
    + movies["keywords"]
    + movies["overview"]
)
movies["tags"] = combined_tokens.apply(collapse)

In [19]:
# Keep only the columns the recommender needs from here on
output_columns = ["id", "title", "tags"]
movies = movies[output_columns]

In [20]:
# Preview the first five rows of the final id / title / tags table.
movies.head(n=5)

Unnamed: 0,id,title,tags
0,19995,Avatar,samworthington zoesaldana sigourneyweaver jame...
1,285,Pirates of the Caribbean: At World's End,johnnydepp orlandobloom keiraknightley gorever...
2,206647,Spectre,danielcraig christophwaltz léaseydoux sammende...
3,49026,The Dark Knight Rises,christianbale michaelcaine garyoldman christop...
4,49529,John Carter,taylorkitsch lynncollins samanthamorton andrew...


In [21]:
# Create the vectorization model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF over the tag strings: English stop words removed, max_features=5000.
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
tfidf_matrix = vectorizer.fit_transform(movies["tags"])
# Densify the fitted matrix: one row per movie, in the same order as `movies`.
vectorized_features = tfidf_matrix.toarray()

In [22]:
# Pairwise cosine similarity between all movie vectors;
# similarity[i] is read later as movie i's scores against every movie.
similarity = cosine_similarity(X=vectorized_features)

In [23]:
# Recommendation function
def recommend(movie, top_n=5):
    """Print the titles of the ``top_n`` movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in the global ``movies`` frame. When several
        rows share the title, the first one is used.
    top_n : int, optional
        Number of recommendations to print. Defaults to 5.

    Returns
    -------
    None
        Titles are printed one per line, most similar first. If the title is
        not present a message is printed instead.

    Notes
    -----
    ``similarity`` is indexed by row *position* because it is built from
    ``movies["tags"]`` in row order. The lookup therefore uses positions, not
    index labels: rows were dropped from ``movies`` earlier, so labels may have
    gaps and using them here would select the wrong row or go out of bounds.
    """
    positions = np.flatnonzero(movies["title"].to_numpy() == movie)
    if positions.size == 0:
        print(f"Movie '{movie}' not found in the dataset.")
        return

    movie_index = int(positions[0])
    distances = similarity[movie_index]

    # Exclude the query row explicitly instead of assuming it sorts first:
    # another row with an identical score could otherwise push the query
    # itself into the recommendations.
    ranked = sorted(
        (pair for pair in enumerate(distances) if pair[0] != movie_index),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_n]

    for position, _score in ranked:
        print(movies["title"].iloc[position])


In [24]:
# Example recommendation: prints the titles ranked most similar to this film,
# one per line.
recommend("The Dark Knight Rises")

The Dark Knight
Batman Returns
Batman Begins
Batman Forever
Batman


In [25]:
import pickle
import os

# Dump the pickled model as numbered part files (default 50 MB per part)
def save_model_in_chunks(model, filename, chunk_size_mb=50, output_dir="models"):
    """Pickle ``model`` and write the bytes as ``<filename>.part1``, ``.part2``, ...

    Parameters
    ----------
    model : object
        Any picklable object.
    filename : str
        Base name of the part files.
    chunk_size_mb : int or float, optional
        Maximum size of each part in MiB (1024 * 1024 bytes). Must be positive.
        Defaults to 50.
    output_dir : str, optional
        Directory that receives the part files; created if missing.
        Defaults to "models".

    Raises
    ------
    ValueError
        If ``chunk_size_mb`` does not give a chunk of at least one byte.

    Notes
    -----
    Part files with a higher number than the last one written by this call are
    deleted. Otherwise a previous, larger dump would leave stale parts behind,
    and joining every ``.partN`` file would produce a corrupt pickle.
    """
    chunk_size = int(chunk_size_mb * 1024 * 1024)
    if chunk_size <= 0:
        raise ValueError("chunk_size_mb must be positive")

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    bytes_out = pickle.dumps(model)
    total_chunks = (len(bytes_out) + chunk_size - 1) // chunk_size  # ceiling division

    # Slicing a memoryview avoids copying each chunk before it is written.
    view = memoryview(bytes_out)
    for i in range(total_chunks):
        chunk_filename = os.path.join(output_dir, f"{filename}.part{i+1}")
        with open(chunk_filename, "wb") as f:
            f.write(view[i * chunk_size : (i + 1) * chunk_size])

    # Remove leftover parts from an earlier dump that needed more chunks.
    prefix = f"{filename}.part"
    for entry in os.listdir(output_dir):
        if not entry.startswith(prefix):
            continue
        suffix = entry[len(prefix):]
        if suffix.isdecimal() and int(suffix) > total_chunks:
            os.remove(os.path.join(output_dir, entry))

In [26]:
# Persist the lookup table and the similarity matrix as 20 MB part files
for artifact, artifact_name in ((movies, "movies"), (similarity, "similarity")):
    save_model_in_chunks(artifact, artifact_name, chunk_size_mb=20)