In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast
import ipywidgets as widgets
from IPython.display import display, clear_output
import matplotlib.pyplot as plt
import pickle
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Load the datasets
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Display first few rows of each dataset
print("Movies dataset shape:", movies.shape)
print("Credits dataset shape:", credits.shape)

print("\nMovies dataset preview:")
movies.head(2)


Movies dataset shape: (4803, 20)
Credits dataset shape: (4803, 4)

Movies dataset preview:


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [3]:
print("Credits dataset preview:")
credits.head(2)


Credits dataset preview:


Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [4]:
# Check column names in credits dataset
credits.columns


Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')

In [5]:
# Merge datasets on 'title'
movies = movies.merge(credits, on='title')

# Select relevant columns
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Check for missing values
print("Missing values in each column:")
movies.isnull().sum()


Missing values in each column:


movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [6]:
# Drop rows with missing values
movies = movies.dropna()
print(f"Dataset shape after dropping missing values: {movies.shape}")


Dataset shape after dropping missing values: (4806, 7)


In [7]:
# Helper function to convert string representation of list to Python list
def convert(text):
    try:
        return ast.literal_eval(text)
    except:
        return []  # Return empty list if parsing fails

# Parse stringified lists/dictionaries
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(convert)
movies['crew'] = movies['crew'].apply(convert)


In [8]:
# Extract genre names
def extract_genres(genres_list):
    return [genre['name'].lower().replace(' ', '') for genre in genres_list]

# Extract keywords
def extract_keywords(keywords_list):
    return [keyword['name'].lower().replace(' ', '') for keyword in keywords_list]

# Extract top 3 cast members
def extract_cast(cast_list):
    top_cast = []
    for i, cast in enumerate(cast_list):
        if i < 3:  # Only include top 3 cast members
            top_cast.append(cast['name'].lower().replace(' ', ''))
        else:
            break
    return top_cast

# Extract director
def extract_director(crew_list):
    directors = []
    for crew_member in crew_list:
        if crew_member['job'] == 'Director':
            directors.append(crew_member['name'].lower().replace(' ', ''))
    return directors

# Apply extraction functions
movies['genres'] = movies['genres'].apply(extract_genres)
movies['keywords'] = movies['keywords'].apply(extract_keywords)
movies['cast'] = movies['cast'].apply(extract_cast)
movies['crew'] = movies['crew'].apply(extract_director)


In [9]:
# Preview the extracted features
movies[['title', 'genres', 'keywords', 'cast', 'crew']].head(2)


Unnamed: 0,title,genres,keywords,cast,crew
0,Avatar,"[action, adventure, fantasy, sciencefiction]","[cultureclash, future, spacewar, spacecolony, ...","[samworthington, zoesaldana, sigourneyweaver]",[jamescameron]
1,Pirates of the Caribbean: At World's End,"[adventure, fantasy, action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[johnnydepp, orlandobloom, keiraknightley]",[goreverbinski]


In [10]:
# Create tags by combining all features
def create_tags(row):
    # Process overview text
    overview = row['overview'].lower() if isinstance(row['overview'], str) else ''
    overview_words = overview.split()
    # Keep only longer words that might be more meaningful
    overview_words = [word for word in overview_words if len(word) > 4]
    overview = ' '.join(overview_words)
    
    # Combine all features with appropriate weights (repeating important features)
    return ' '.join(row['genres']) + ' ' + \
           ' '.join(row['keywords']) + ' ' + \
           ' '.join(row['cast']) + ' ' + \
           ' '.join(row['cast']) + ' ' + \
           ' '.join(row['crew']) + ' ' + \
           ' '.join(row['crew']) + ' ' + \
           overview

movies['tags'] = movies.apply(create_tags, axis=1)

# Display the tags for a few movies
movies[['title', 'tags']].head(2)


Unnamed: 0,title,tags
0,Avatar,action adventure fantasy sciencefiction cultur...
1,Pirates of the Caribbean: At World's End,adventure fantasy action ocean drugabuse exoti...


In [11]:
# Keep only required columns for recommendation
movies_final = movies[['movie_id', 'title', 'tags']]

# Vectorize the tags
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(movies_final['tags']).toarray()

# Check the shape of vectors
print(f"Shape of vectors: {vectors.shape}")


Shape of vectors: (4806, 5000)


In [12]:
# Calculate cosine similarity matrix
similarity = cosine_similarity(vectors)

# Print the shape of similarity matrix
print(f"Shape of similarity matrix: {similarity.shape}")


Shape of similarity matrix: (4806, 4806)


In [13]:
# Create a Series of movie titles for easy lookup
movie_titles = pd.Series(movies_final['title'].values, index=movies_final.index)


In [14]:
def get_recommendations(title, num_recommendations=5):
    try:
        # Find the index of the movie in our dataset
        idx = movies_final[movies_final['title'] == title].index[0]
        
        # Get similarity scores for all movies with this one
        similarity_scores = list(enumerate(similarity[idx]))
        
        # Sort by similarity score (highest first)
        similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
        
        # Get top N similar movies (excluding the movie itself)
        similarity_scores = similarity_scores[1:num_recommendations+1]
        
        # Get movie indices
        movie_indices = [i[0] for i in similarity_scores]
        
        # Return movie titles
        return movies_final['title'].iloc[movie_indices].values
    except IndexError:
        return ["Movie not found in database. Please check the title and try again."]

# Test the recommendation function
print(get_recommendations('The Dark Knight'))


['The Dark Knight Rises' 'Batman Begins' 'Batman Returns' 'Batman Forever'
 'Batman']


In [None]:
# Create a dropdown widget with all movie titles
try:
    # Try using ipywidgets for interactive UI
    movie_dropdown = widgets.Dropdown(
        options=sorted(movies_final['title'].tolist()),
        description='Select Movie:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='80%')
    )

    # Slider for number of recommendations
    num_recommendations = widgets.IntSlider(
        min=1,
        max=10,
        step=1,
        value=5,
        description='Number of recommendations:',
        style={'description_width': 'initial'}
    )

    # Create a button to trigger recommendations
    recommend_button = widgets.Button(
        description='Get Recommendations',
        button_style='success',
        tooltip='Click to get movie recommendations'
    )

    # Create an output widget to display results
    output = widgets.Output()

    # Style for the result boxes
    box_style = """
    <style>
    .movie-box {
        padding: 10px;
        margin: 5px;
        background-color: #f0f0f0;
        border-radius: 5px;
        border: 1px solid #ddd;
    }
    .movie-title {
        font-weight: bold;
        font-size: 16px;
    }
    .recommendation-header {
        font-size: 18px;
        font-weight: bold;
        margin-bottom: 10px;
    }
    </style>
    """

    # Function to handle button click
    def on_button_click(b):
        with output:
            clear_output()
            display(widgets.HTML(box_style))
            
            selected_movie = movie_dropdown.value
            num_recs = num_recommendations.value
            
            if not selected_movie:
                print("Please select a movie from the dropdown.")
                return
            
            recommendations = get_recommendations(selected_movie, num_recs)
            
            display(widgets.HTML(f"<div class='recommendation-header'>Top {num_recs} Movies Similar to '{selected_movie}':</div>"))
            
            if "Movie not found" in recommendations[0]:
                display(widgets.HTML(f"<div class='movie-box'>{recommendations[0]}</div>"))
            else:
                for i, title in enumerate(recommendations, 1):
                    display(widgets.HTML(f"<div class='movie-box'><span class='movie-title'>{i}. {title}</span></div>"))

    # Attach the click handler to the button
    recommend_button.on_click(on_button_click)

    # Display the UI
    display(widgets.VBox([
        widgets.HTML("<h2>Movie Recommendation System</h2>"),
        widgets.HTML("<p>Select a movie from the dropdown and click the button to get similar movie recommendations.</p>"),
        movie_dropdown,
        num_recommendations,
        recommend_button,
        output
    ]))
    
except Exception as e:
    print(f"ipywidgets error: {e}")
    print("\nFalling back to basic input method...\n")
    
    # Alternative: Use basic Python input for movie selection
    print("Movie Recommendation System")
    print("==========================\n")
    
    # Display first 5 movies as examples
    print("Example movies in the database:")
    for i, title in enumerate(sorted(movies_final['title'].tolist())[:5], 1):
        print(f"{i}. {title}")
    print("... and many more!\n")
    
    # Get user input
    movie_title = input("Enter a movie title: ")
    num_recs = 5
    
    try:
        num_recs_input = input("Enter number of recommendations (1-10) [default=5]: ")
        if num_recs_input.strip():
            num_recs = int(num_recs_input)
            num_recs = max(1, min(num_recs, 10))  # Keep between 1-10
    except:
        num_recs = 5
        
    # Get and display recommendations
    recommendations = get_recommendations(movie_title, num_recs)
    
    print(f"\nTop {num_recs} Movies Similar to '{movie_title}':")
    print("="*(20 + len(movie_title)))
    
    if "Movie not found" in recommendations[0]:
        print(recommendations[0])
    else:
        for i, title in enumerate(recommendations, 1):
            print(f"{i}. {title}")


VBox(children=(HTML(value='<h2>Movie Recommendation System</h2>'), HTML(value='<p>Select a movie from the drop…

In [16]:
# Save model components
pickle.dump(movies_final, open('movies_data.pkl', 'wb'))
pickle.dump(similarity, open('similarity_matrix.pkl', 'wb'))

print("Model components saved successfully!")


Model components saved successfully!


In [17]:
# Example of loading saved model components
# Uncomment the code below to load the saved model components

# loaded_movies = pickle.load(open('movies_data.pkl', 'rb'))
# loaded_similarity = pickle.load(open('similarity_matrix.pkl', 'rb'))
# print("Model components loaded successfully!")
