In [1]:
# Import necessary libraries
import sklearn
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast
import pickle

In [2]:
# Function to print version of each required package
def print_package_versions():
    print("Package Versions:")
    print(f"Pandas: {pd.__version__}")
    print(f"Numpy: {np.__version__}")
    print(f"Scikit-Learn: {sklearn.__version__}")
    print("AST: Built-in")
    print("Pickle: Built-in")

print_package_versions()

Package Versions:
Pandas: 2.2.3
Numpy: 2.1.3
Scikit-Learn: 1.5.2
AST: Built-in
Pickle: Built-in


In [3]:
# Function to load and merge datasets
def load_data():
    movies_data = pd.read_csv(r"../data/tmdb_5000_movies.csv")
    credits_data = pd.read_csv(r"../data/tmdb_5000_credits.csv")
    return movies_data.merge(credits_data, on='title')
# Load the data and inspect
raw_data = load_data()
raw_data.head()  # Display the first few rows to check

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,49529,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [4]:
# Function to extract names from genres and keywords
def extract_names_from_list_column(text):
    names = [item['name'] for item in ast.literal_eval(text)]
    return names

# Apply extraction to 'genres' and 'keywords' columns
raw_data["genres"] = raw_data["genres"].apply(extract_names_from_list_column)
raw_data["keywords"] = raw_data["keywords"].apply(extract_names_from_list_column)
raw_data[['title', 'genres', 'keywords']].head()  # Check the result

Unnamed: 0,title,genres,keywords
0,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon..."
1,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ..."
2,Spectre,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi..."
3,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i..."
4,John Carter,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel..."


In [5]:
# Function to extract up to 10 cast names
def extract_cast_names(text, max_cast=10):
    names = [item['name'] for item in ast.literal_eval(text)[:max_cast]]
    return names

# Apply extraction to 'cast' column
raw_data['cast'] = raw_data['cast'].apply(extract_cast_names)
raw_data[['title', 'cast']].head()  # Check the result

Unnamed: 0,title,cast
0,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weave..."
1,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley, ..."
2,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R..."
3,The Dark Knight Rises,"[Christian Bale, Michael Caine, Gary Oldman, A..."
4,John Carter,"[Taylor Kitsch, Lynn Collins, Samantha Morton,..."


In [6]:
# Function to extract the director's name
def extract_director_name(text):
    for item in ast.literal_eval(text):
        if item['job'] == 'Director':
            return [item['name']]
    return []

# Apply extraction to 'crew' column
raw_data['crew'] = raw_data['crew'].apply(extract_director_name)
raw_data[['title', 'crew']].head()  # Check the result

Unnamed: 0,title,crew
0,Avatar,[James Cameron]
1,Pirates of the Caribbean: At World's End,[Gore Verbinski]
2,Spectre,[Sam Mendes]
3,The Dark Knight Rises,[Christopher Nolan]
4,John Carter,[Andrew Stanton]


In [7]:
# Function to remove spaces from each word in a list
def remove_spaces(words):
    return [word.replace(" ", "") for word in words]

# Apply to relevant columns for uniformity
raw_data['genres'] = raw_data['genres'].apply(remove_spaces)
raw_data['keywords'] = raw_data['keywords'].apply(remove_spaces)
raw_data['cast'] = raw_data['cast'].apply(remove_spaces)
raw_data['crew'] = raw_data['crew'].apply(remove_spaces)
raw_data[['title', 'genres', 'keywords', 'cast', 'crew']].head()  # Check the result

Unnamed: 0,title,genres,keywords,cast,crew
0,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron]
1,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",[GoreVerbinski]
2,Spectre,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...",[SamMendes]
3,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman, Anne...",[ChristopherNolan]
4,John Carter,"[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton, Wi...",[AndrewStanton]


In [8]:
# Split overview column text into lists for merging
raw_data['overview'] = raw_data['overview'].apply(lambda x: x.split() if isinstance(x, str) else [])

# Combine relevant columns into a single 'tags' column
raw_data['tags'] = raw_data['overview'] + raw_data['genres'] + raw_data['keywords'] + raw_data['cast'] + raw_data['crew']

# Check the combined 'tags' column
raw_data[['title', 'tags']].head()

Unnamed: 0,title,tags
0,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


In [9]:
# Function to convert list of names into a comma-separated string
def list_to_string(names_list):
    return ", ".join(names_list)
# Apply the function to 'genres' column to convert the list to string
raw_data["genresString"] = raw_data["genres"].apply(list_to_string).head()

In [10]:
# Select relevant columns and prepare final dataset
final_data = raw_data[['movie_id', 'vote_average', 'genresString', 'title', 'tags']]
final_data['tags'] = final_data['tags'].apply(lambda x: " ".join(x))  # Convert list to single string
final_data['tags'] = final_data['tags'].apply(lambda x: x.lower()) # Lower the string
final_data.head()  # Check the final dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['tags'] = final_data['tags'].apply(lambda x: " ".join(x))  # Convert list to single string
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['tags'] = final_data['tags'].apply(lambda x: x.lower()) # Lower the string


Unnamed: 0,movie_id,vote_average,genresString,title,tags
0,19995,7.2,"Action, Adventure, Fantasy, ScienceFiction",Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,6.9,"Adventure, Fantasy, Action",Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,6.3,"Action, Adventure, Crime",Spectre,a cryptic message from bond’s past sends him o...
3,49026,7.6,"Action, Crime, Drama, Thriller",The Dark Knight Rises,following the death of district attorney harve...
4,49529,6.1,"Action, Adventure, ScienceFiction",John Carter,"john carter is a war-weary, former military ca..."


In [11]:
# Initialize CountVectorizer and compute similarity matrix
vectorizer = CountVectorizer(max_features=6000, token_pattern=r'(?u)\b[a-zA-Z]+\b', stop_words="english")
movie_vectors = vectorizer.fit_transform(final_data['tags']).toarray()
similarity_matrix = cosine_similarity(movie_vectors)

# Check the similarity matrix shape and a sample
similarity_matrix.shape, similarity_matrix[:5, :5]  # Shape and a small sample

((4809, 4809),
 array([[1.        , 0.07702442, 0.05134962, 0.03310877, 0.13251783],
        [0.07702442, 1.        , 0.05405405, 0.01742626, 0.06974858],
        [0.05134962, 0.05405405, 1.        , 0.08713129, 0.11624764],
        [0.03310877, 0.01742626, 0.08713129, 1.        , 0.11992507],
        [0.13251783, 0.06974858, 0.11624764, 0.11992507, 1.        ]]))

In [12]:
# Define paths for saving in the 'model' folder
final_data_path = "../models/movie_list.pkl"
similarity_matrix_path = "../models/movie_similarity.pkl"

# Save processed data and similarity matrix to the specified paths
pickle.dump(final_data, open(final_data_path, 'wb'))
pickle.dump(similarity_matrix, open(similarity_matrix_path, 'wb'))

# Confirm data saved
print("Data saved in the model folder as movie_list.pkl and similarity.pkl")

Data saved in the model folder as movie_list.pkl and similarity.pkl
