In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer

# Step 1: Load the dataset
df = pd.read_csv(r"C:\Users\prade\Downloads\tmdb_movies_with_cleaned_text.csv")

In [2]:
# Step 2: Create the 'description_tagline' column by combining 'overview' and 'tagline'
df['description_tagline'] = df['overview'].fillna('') + ' ' + df['tagline'].fillna('')

# Step 3: TF-IDF for Description & Tagline
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['description_tagline'])

# Step 4: Binary Encoding for Keywords
mlb_keywords = MultiLabelBinarizer()
binary_keywords = mlb_keywords.fit_transform(df['keywords'].apply(lambda x: x.split(', ') if pd.notna(x) else []))

# Step 5: Binary Encoding for Genres
mlb_genres = MultiLabelBinarizer()
binary_genres = mlb_genres.fit_transform(df['genres'].apply(lambda x: x.split(', ') if pd.notna(x) else []))

# Step 6: Binary Encoding for Director
mlb_director = MultiLabelBinarizer()
binary_director = mlb_director.fit_transform(df['director'].apply(lambda x: [x] if pd.notna(x) else []))

# Step 7: Binary Encoding for Cast
mlb_cast = MultiLabelBinarizer()
binary_cast = mlb_cast.fit_transform(df['cast'].apply(lambda x: x.split(', ') if pd.notna(x) else []))

In [3]:
# Cosine Similarity Function for Binary Vectors
def cosine_sim_matrix(binary_matrix):
    return cosine_similarity(binary_matrix)

# Step 8: Calculate pairwise cosine similarities
tfidf_similarity = cosine_similarity(tfidf_matrix)
keywords_similarity = cosine_sim_matrix(binary_keywords)
genres_similarity = cosine_sim_matrix(binary_genres)
director_similarity = cosine_sim_matrix(binary_director)
cast_similarity = cosine_sim_matrix(binary_cast)

# Step 9: Combine similarities for all features
combined_similarity = (tfidf_similarity + keywords_similarity + genres_similarity +
                       director_similarity + cast_similarity) / 5 

In [19]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer

# Function to create a "query movie" based on input features
def create_query_movie(genres, cast, director, keywords, overview, tagline):
    """Creates a temporary 'movie' from the input features."""
    description_tagline = overview + ' ' + tagline

    # Transform the textual input (overview + tagline) into the TF-IDF space
    tfidf_query_vector = tfidf_vectorizer.transform([description_tagline])

    # Binary encode categorical inputs (genres, cast, director, keywords)
    binary_genres_query = mlb_genres.transform([genres.split(', ')])
    binary_cast_query = mlb_cast.transform([cast.split(', ')])
    binary_director_query = mlb_director.transform([[director]])  # Wrap in a list for consistency
    binary_keywords_query = mlb_keywords.transform([keywords.split(', ')])

    return tfidf_query_vector, binary_genres_query, binary_cast_query, binary_director_query, binary_keywords_query

# Function to calculate combined similarity of the query movie with all movies in the dataset
def calculate_combined_similarity_query(tfidf_query_vector, binary_genres_query, binary_cast_query, 
                                        binary_director_query, binary_keywords_query):
    """Calculates combined similarity between query movie and the dataset."""
    # Calculate similarities for each feature
    tfidf_similarity_query = cosine_similarity(tfidf_query_vector, tfidf_matrix).flatten()
    genres_similarity_query = cosine_sim_matrix(binary_genres_query).flatten()
    cast_similarity_query = cosine_sim_matrix(binary_cast_query).flatten()
    director_similarity_query = cosine_sim_matrix(binary_director_query).flatten()
    keywords_similarity_query = cosine_sim_matrix(binary_keywords_query).flatten()

    # Combine similarities (you can adjust weights for each feature)
    combined_similarity_query = (tfidf_similarity_query + genres_similarity_query +
                                 cast_similarity_query + director_similarity_query +
                                 keywords_similarity_query) / 5  # Equal weight for all features
    
    return combined_similarity_query

# Function to get top N similar movies based on input features
def get_recommendations_by_features(genres, cast, director, keywords, overview, tagline, N=10):
    # Create a query movie based on input features
    tfidf_query_vector, binary_genres_query, binary_cast_query, binary_director_query, binary_keywords_query = \
        create_query_movie(genres, cast, director, keywords, overview, tagline)

    # Calculate combined similarity of the query movie with all movies in the dataset
    combined_similarity_query = calculate_combined_similarity_query(tfidf_query_vector, binary_genres_query,
                                                                    binary_cast_query, binary_director_query,
                                                                    binary_keywords_query)
    
    # Get the indices of the top N most similar movies (excluding the query movie itself)
    sim_scores = list(enumerate(combined_similarity_query))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    movie_indices = [i[0] for i in sim_scores[:N]]  # Get top N indices
    
    # Ensure movie_ids are returned as integers, not floats
    return df['movie_id'].iloc[movie_indices].astype(int).tolist()

# Example usage: Get top 10 similar movies based on user input
genres_input = input("Enter Genre : \n")
cast_input = input("Enter Cast : \n")
director_input = input("Enter Director : \n")
keywords_input = input("Enter Keyword : \n")
overview_input = input("Enter Overview: \n")
tagline_input = input("Enter Tagline: \n")

# Get recommendations
top_N_similar_movies = get_recommendations_by_features(genres_input, cast_input, director_input, 
                                                       keywords_input, overview_input, tagline_input, N=10)

# Display recommendations
if top_N_similar_movies:
    print("Recommended movie IDs:", top_N_similar_movies)
else:
    print("No recommendations found.")


Enter Genre : 
 Action, Adventure
Enter Cast : 
 Robert Downey Jr., Chris Hemsworth
Enter Director : 
 Joss Whedon
Enter Keyword : 
 superhero, alien invasion
Enter Overview: 
 Earth's mightiest heroes must come together.
Enter Tagline: 
 Some assembly required.


Recommended movie IDs: [24428, 99861, 173, 118340, 9824, 607, 285, 9341, 9678, 10481]


In [20]:
import pandas as pd

# Load the credits dataset
credits_df = pd.read_csv(r"C:\Proj\ML\Recommendation system\Dataset\tmdb_5000_credits.csv")

# Convert 'id' column to numeric if necessary
credits_df['movie_id'] = pd.to_numeric(credits_df['movie_id'], errors='coerce')

def get_movie_titles(movie_ids):
    titles = {}
    for movie_id in movie_ids:
        # Find the row where 'id' matches the provided movie_id
        movie_row = credits_df[credits_df['movie_id'] == movie_id]
        
        # Check if the movie was found
        if not movie_row.empty:
            titles[movie_id] = movie_row['title'].values[0]  # Store the title
        else:
            titles[movie_id] = "Movie ID not found."
    return titles

# Example: Get titles for a list of movie_ids
top_N_similar_movies

titles = get_movie_titles(top_N_similar_movies)

# Print the results
for movie_id, title in titles.items():
    print(f"Movie ID: {movie_id}, Title: {title}")


Movie ID: 24428, Title: The Avengers
Movie ID: 99861, Title: Avengers: Age of Ultron
Movie ID: 173, Title: 20,000 Leagues Under the Sea
Movie ID: 118340, Title: Guardians of the Galaxy
Movie ID: 9824, Title: Mystery Men
Movie ID: 607, Title: Men in Black
Movie ID: 285, Title: Pirates of the Caribbean: At World's End
Movie ID: 9341, Title: The Core
Movie ID: 9678, Title: Little Nicky
Movie ID: 10481, Title: 102 Dalmatians


In [8]:
import pandas as pd

# Load the dataset
data = pd.read_csv(r"C:\Users\prade\Downloads\merged_movies_with_ratings.csv")  # Update with the correct path

# Define WR calculation function
def calculate_weighted_rating(v, R, m, C):
    return (v / (v + m) * R) + (m / (v + m) * C)

def get_movie_features(movie_ids):
    # Filter movies by the given IDs
    selected_movies = data[data['movie_id'].isin(movie_ids)]
    
    # Ensure we have all 10 movies
    if len(selected_movies) < 10:
        raise ValueError("Some movie IDs were not found in the dataset or fewer than 10 were provided.")
    
    # Sort movies to match the input order
    selected_movies = selected_movies.set_index('movie_id').loc[movie_ids]
    
    # Collect votes and ratings for the selected movies
    votes = selected_movies['imdb_rating'].values.tolist()
    ratings = selected_movies['imdb_votes'].values.tolist()
    
    # Calculate the mean rating (C) for the entire dataset
    C = data['imdb_rating'].mean()  # Use the correct column name for average rating
    
    # Define minimum votes required (m)
    m = 100  # Adjust as per your requirements
    
    # Compute WR for this group
    v = sum(votes) / len(votes)  # Average vote count
    R = sum(ratings) / len(ratings)  # Average rating
    WR = calculate_weighted_rating(v, R, m, C)
    
    # Return the final 21-element list
    return votes + ratings + [WR]

# Example usage
movie_ids = [24428, 99861, 173, 118340, 9824, 607, 285, 9341, 9678, 10481]  # Replace with actual movie IDs
output_features = get_movie_features(movie_ids)
print("Output Features:", output_features)


Output Features: [8.0, 7.3, 7.2, 8.0, 6.1, 7.3, 7.1, 5.5, 5.3, 4.8, 1476021, 935534, 37794, 1288082, 71215, 622306, 702557, 108420, 113645, 39983, 33696.62958303375]
