In [None]:
import pandas as pd
file =pd.read_csv("netflix_data.csv")
df = pd.DataFrame(file)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6403 entries, 0 to 6402
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   N_id             6403 non-null   int64  
 1   Title            6403 non-null   object 
 2   Main Genre       6403 non-null   object 
 3   Sub Genres       6403 non-null   object 
 4   Release Year     6402 non-null   float64
 5   Maturity Rating  6403 non-null   object 
 6   Original Audio   3767 non-null   object 
 7   Recommendations  6392 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 400.3+ KB
None


In [23]:
print(df.head())

     N_id                           Title Main Genre  \
0  215309      Ace Ventura: Pet Detective     Comedy   
1  215318  Ace Ventura: When Nature Calls     Comedy   
2  217258               The Addams Family     Comedy   
3  217303            Addams Family Values     Comedy   
4  235527                       Agneepath      Drama   

                                Sub Genres  Release Year Maturity Rating  \
0                      Comedy, Mystery, US        1994.0               A   
1           Comedy, Action & Adventure, US        1995.0         U/A 16+   
2                               Comedy, US        1991.0         U/A 13+   
3                               Comedy, US        1993.0         U/A 13+   
4  Hindi-Language, Bollywood, Crime, Drama        1990.0         U/A 16+   

                                      Original Audio  \
0                          Hindi, English [Original]   
1                          Hindi, English [Original]   
2  English [Original], Hindi, English 

In [24]:
print(df.isnull().sum())

N_id                  0
Title                 0
Main Genre            0
Sub Genres            0
Release Year          1
Maturity Rating       0
Original Audio     2636
Recommendations      11
dtype: int64


In [25]:
print(df.describe())

               N_id  Release Year
count  6.403000e+03   6402.000000
mean   7.971379e+07   2019.077320
std    7.159805e+06      6.094716
min    2.153090e+05   1962.000000
25%    8.022148e+07   2018.000000
50%    8.122164e+07   2021.000000
75%    8.151819e+07   2022.000000
max    8.199405e+07   2025.000000


In [26]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Drop missing values in "Recommendations"
df = df.dropna(subset=["Recommendations"])

# Fill missing values properly
df["Release Year"] = df["Release Year"].fillna(df["Release Year"].median())
df["Original Audio"] = df["Original Audio"].fillna("Unknown")

# Combine "Main Genre" and "Sub Genres"
df["genres"] = df["Main Genre"] + ", " + df["Sub Genres"]
df["genres"] = df["genres"].str.replace(", $", "", regex=True)  # Remove trailing commas
df["genres"] = df["genres"].str.split(", ")  # Convert to list

# Explode genres
df_exploded = df.explode("genres")

# Apply One-Hot Encoding to genres
df_encoded = pd.get_dummies(df_exploded, columns=["genres"])

# Drop duplicates before grouping
df_encoded = df_encoded.drop_duplicates(subset=["N_id"])

# Ensure genre features are included
genre_columns = [col for col in df_encoded.columns if col.startswith("genres_")]
columns_to_keep = ["N_id", "Release Year"] + genre_columns

# Filter df_encoded to keep only required columns
df_final = df_encoded[columns_to_keep].copy()

# Normalize "Release Year"
scaler = MinMaxScaler()
df_final[["Release Year"]] = scaler.fit_transform(df_final[["Release Year"]])

# Display dataset
print(df_final.info())

<class 'pandas.core.frame.DataFrame'>
Index: 6392 entries, 0 to 6402
Columns: 186 entries, N_id to genres_based on a Play
dtypes: bool(184), float64(1), int64(1)
memory usage: 1.3 MB
None


In [27]:
print(df_final.head())

     N_id  Release Year  genres_Action  genres_Action & Adventure  \
0  215309      0.507937          False                      False   
1  215318      0.523810          False                      False   
2  217258      0.460317          False                      False   
3  217303      0.492063          False                      False   
4  235527      0.444444          False                      False   

   genres_Action Anime  genres_African  genres_Anime  genres_Anime Features  \
0                False           False         False                  False   
1                False           False         False                  False   
2                False           False         False                  False   
3                False           False         False                  False   
4                False           False         False                  False   

   genres_Anime Series  genres_Anime based on Books  ...  genres_US  \
0                False                 

In [28]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Drop N_id before similarity calculation
features = df_final.drop(columns=["N_id"])

# Compute cosine similarity
similarity_matrix = cosine_similarity(features)

# Store similarity matrix in a DataFrame
similarity_df = pd.DataFrame(similarity_matrix, index=df_final["N_id"], columns=df_final["N_id"])

# Function to recommend movies
def recommend_movies(movie_id, num_recommendations=5):
    if movie_id not in similarity_df.index:
        return "Movie ID not found in dataset."

    # Get similarity scores for the given movie
    similar_movies = similarity_df[movie_id].sort_values(ascending=False).iloc[1:num_recommendations+1]

    return similar_movies

# Test recommendation for a random movie ID
test_movie_id = df_final["N_id"].iloc[0]
recommendations = recommend_movies(test_movie_id)
print("Recommended Movies:", recommendations)


Recommended Movies: N_id
215309      1.0
80157083    1.0
70026792    1.0
70027007    1.0
70153404    1.0
Name: 215309, dtype: float64


In [29]:
# Merge N_id with movie titles
id_to_title = df.set_index("N_id")["Title"]  # Create a mapping from N_id to Title

def recommend_movies(movie_id, num_recommendations=5):
    if movie_id not in similarity_df.index:
        return "Movie ID not found in dataset."

    # Get similarity scores for the given movie
    similar_movies = similarity_df[movie_id].sort_values(ascending=False).iloc[1:num_recommendations+1]

    # Map movie IDs to titles
    recommended_titles = id_to_title.loc[similar_movies.index].values

    return list(recommended_titles)

# Test recommendation for a random movie ID
test_movie_id = df_final["N_id"].iloc[0]
recommendations = recommend_movies(test_movie_id)
print("Recommended Movies:", recommendations)


Recommended Movies: ['Ace Ventura: Pet Detective', 'Kabhi Haa Kabhi Naa', 'Love on Delivery', 'The Mask', 'Friends']


In [30]:
def recommend_movies_weighted(movie_id, num_recommendations=5, weight_release_year=0.3):
    if movie_id not in similarity_df.index:
        return "Movie ID not found in dataset."

    # Get similarity scores
    similar_movies = similarity_df[movie_id].sort_values(ascending=False).iloc[1:]

    # Convert N_id to title
    similar_movies = similar_movies.to_frame(name="Similarity Score")
    similar_movies["Title"] = id_to_title.loc[similar_movies.index]

    # Add release year and normalize it
    similar_movies["Release Year"] = df.set_index("N_id")["Release Year"].loc[similar_movies.index]
    similar_movies["Release Year"] = (similar_movies["Release Year"] - similar_movies["Release Year"].min()) / \
                                     (similar_movies["Release Year"].max() - similar_movies["Release Year"].min())

    # Compute final score by combining similarity and recency
    similar_movies["Final Score"] = (1 - weight_release_year) * similar_movies["Similarity Score"] + \
                                    weight_release_year * similar_movies["Release Year"]

    # Get top N recommendations
    top_recommendations = similar_movies.sort_values(by="Final Score", ascending=False).head(num_recommendations)

    return top_recommendations[["Title", "Final Score"]]

# Test weighted recommendations
test_movie_id = df_final["N_id"].iloc[0]
weighted_recommendations = recommend_movies_weighted(test_movie_id)
print("Recommended Movies with Weighted Scores:\n", weighted_recommendations)


Recommended Movies with Weighted Scores:
                                        Title  Final Score
N_id                                                     
81928828   Ari Shaffir: America’s Sweetheart     0.965466
81630894                       The Love Scam     0.965466
81697662  Gabriel Iglesias: Legend of Fluffy     0.965466
81937110                       Knight Flower     0.962420
81500431                         Inheritance     0.962420


In [31]:
import ipywidgets as widgets
from IPython.display import display

# Create a dropdown of movie titles
movie_dropdown = widgets.Dropdown(
    options=df["Title"].unique(),
    description="Movie:",
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='60%')
)

# Define an interactive function
def display_recommendations(movie_title):
    # Get the corresponding movie ID
    movie_id = df[df["Title"] == movie_title]["N_id"].values[0]

    # Get recommendations
    recommended_df = recommend_movies_weighted(movie_id)

    # Display recommendations
    print(f"\nTop Recommendations for '{movie_title}':\n")
    print(recommended_df.to_string(index=False))

# Create an interactive output
widgets.interactive(display_recommendations, movie_title=movie_dropdown)

interactive(children=(Dropdown(description='Movie:', layout=Layout(width='60%'), options=('Ace Ventura: Pet De…

In [32]:
# Select a movie title for testing
test_movie_title = "Ace Ventura: Pet Detective"

# Get movie ID
test_movie_id = df[df["Title"] == test_movie_title]["N_id"].values[0]

# Run standard recommendations
print("\n🔹 Standard Recommendations:")
print(recommend_movies(test_movie_id))

# Run weighted recommendations
print("\n🔹 Weighted Recommendations:")
print(recommend_movies_weighted(test_movie_id))


🔹 Standard Recommendations:
['Ace Ventura: Pet Detective', 'Kabhi Haa Kabhi Naa', 'Love on Delivery', 'The Mask', 'Friends']

🔹 Weighted Recommendations:
                                       Title  Final Score
N_id                                                     
81928828   Ari Shaffir: America’s Sweetheart     0.965466
81630894                       The Love Scam     0.965466
81697662  Gabriel Iglesias: Legend of Fluffy     0.965466
81937110                       Knight Flower     0.962420
81500431                         Inheritance     0.962420


In [33]:
test_movies = ["The Mask", "Kabhi Haa Kabhi Naa", "Love on Delivery", "The Addams Family", "Ace Ventura 2"]

for movie in test_movies:
    print(f"\n🎬 Testing recommendations for: {movie}")

    # Check if the movie exists in the dataset
    if movie not in df["Title"].values:
        print(f"⚠️ Movie '{movie}' not found in the dataset. Skipping...")
        continue  # Skip to the next movie

    test_id = df[df["Title"] == movie]["N_id"].values[0]
    print(recommend_movies_weighted(test_id))



🎬 Testing recommendations for: The Mask
                                       Title  Final Score
N_id                                                     
81928828   Ari Shaffir: America’s Sweetheart     0.965466
81630894                       The Love Scam     0.965466
81697662  Gabriel Iglesias: Legend of Fluffy     0.965466
81937110                       Knight Flower     0.962420
81500431                         Inheritance     0.962420

🎬 Testing recommendations for: Kabhi Haa Kabhi Naa
                                       Title  Final Score
N_id                                                     
81928828   Ari Shaffir: America’s Sweetheart     0.965466
81630894                       The Love Scam     0.965466
81697662  Gabriel Iglesias: Legend of Fluffy     0.965466
81937110                       Knight Flower     0.962420
81500431                         Inheritance     0.962420

🎬 Testing recommendations for: Love on Delivery
                                       Title  

In [34]:
print(recommend_movies_weighted(999999))  # Invalid Movie ID

Movie ID not found in dataset.


In [35]:
old_movie = "Agneepath"  # Example: a 1990 movie
test_id = df[df["Title"] == old_movie]["N_id"].values[0]
print(recommend_movies_weighted(test_id))

                     Title  Final Score
N_id                                   
81595930   Public Disorder     0.953342
81971071     Black Warrant     0.953342
81759233             Asura     0.953342
81907881      Hell for You     0.953342
81566970  The Breakthrough     0.953342


In [36]:
popular_movie = "The Mask"
test_id = df[df["Title"] == popular_movie]["N_id"].values[0]
print(recommend_movies_weighted(test_id))


                                       Title  Final Score
N_id                                                     
81928828   Ari Shaffir: America’s Sweetheart     0.965466
81630894                       The Love Scam     0.965466
81697662  Gabriel Iglesias: Legend of Fluffy     0.965466
81937110                       Knight Flower     0.962420
81500431                         Inheritance     0.962420


In [41]:
# Run the interactive dropdown test (previously created)
widgets.interactive(display_recommendations, movie_title=movie_dropdown)

interactive(children=(Dropdown(description='Movie:', index=13, layout=Layout(width='60%'), options=('Ace Ventu…