# 📊 Netflix EDA + Recommendation System
End-to-end data analysis and content-based recommender system built with Python.

In [None]:

# Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Global plotting defaults: seaborn's "whitegrid" style, then a 10x5 default
# figure size. The size is applied after sns.set so seaborn cannot override it.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 5)})


In [None]:

# Load the titles CSV from the working directory; edit the path if the file
# lives somewhere else.
df = pd.read_csv(filepath_or_buffer="netflix_titles.csv")

# Preview the first five rows
df.head(n=5)


In [None]:

# Report missing values per column. A bare expression is only displayed when
# it is the last statement of a cell, so print the summary explicitly.
print(df.isnull().sum())

# Replace missing credits / origin with an explicit placeholder
for column in ("director", "cast", "country"):
    df[column] = df[column].fillna("Unknown")

# Parse the "date added" strings; values that cannot be parsed become NaT
# instead of raising.
df["date_added"] = pd.to_datetime(df["date_added"], errors="coerce")

# Drop duplicate rows, then renumber the index 0..n-1. The recommender below
# uses index values as row positions into the similarity matrix, so gaps left
# by removed rows would make it look up the wrong titles.
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)


In [None]:

# Bar chart of how many rows fall into each value of the "type" column
type_ax = sns.countplot(x="type", data=df, palette="Set2")
type_ax.set_title("Count of Movies vs TV Shows")
plt.show()


In [None]:

# Rank the values of the "country" column by number of titles.
# "Unknown" is the placeholder the cleaning cell writes for missing values,
# not a country, so it is removed before taking the top 10
# (errors="ignore" keeps this working when no placeholder is present).
country_counts = df["country"].value_counts().drop("Unknown", errors="ignore")

country_ax = country_counts.head(10).plot(kind="bar", color="coral")
country_ax.set_title("Top 10 Countries with Most Titles")
country_ax.set_ylabel("Count")
plt.show()


In [None]:

# Store the calendar year of each title's "date_added" as a new column
df["year_added"] = df["date_added"].dt.year

# Count titles per year, order chronologically and draw as a line chart
titles_per_year = df["year_added"].value_counts().sort_index()
year_ax = titles_per_year.plot(kind="line", marker="o")
year_ax.set_title("Number of Titles Added Over Time")
year_ax.set_xlabel("Year")
year_ax.set_ylabel("Count")
plt.show()


In [None]:

# TF-IDF matrix over the descriptions (missing descriptions become empty
# strings so the vectorizer does not fail on NaN)
df["description"] = df["description"].fillna("")
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df["description"])

# Pairwise similarity scores. TfidfVectorizer L2-normalises rows by default,
# so the plain dot product computed by linear_kernel equals cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Map each title to its row POSITION in df / cosine_sim. Positions (not index
# labels) are required because cosine_sim is a plain array and the recommender
# reads titles back with .iloc; labels stop matching positions as soon as rows
# have been dropped from df.
indices = pd.Series(np.arange(len(df)), index=df["title"])

# Keep only the first occurrence of each title. Series.drop_duplicates() would
# compare the values (which are all unique) and therefore never remove a
# repeated title, leaving lookups that return several rows at once.
indices = indices[~indices.index.duplicated(keep="first")]


In [None]:

def recommend(title, cosine_sim=cosine_sim, top_n=5):
    """Return the titles whose descriptions are most similar to *title*.

    Args:
        title: Exact title to look up in the module-level ``indices`` map.
        cosine_sim: Square similarity matrix; row ``i`` holds the scores of
            item ``i`` against every item. Defaults to the matrix that exists
            when this function is defined.
        top_n: Number of recommendations to return (default 5).

    Returns:
        A pandas Series of up to ``top_n`` titles ordered from most to least
        similar, or an empty list (after printing a message) when *title* is
        not in ``indices``.

    Note:
        ``indices`` must map titles to row positions of ``cosine_sim`` and
        ``df``; the result is read back from ``df`` by position.
    """
    if title not in indices:
        print(f"'{title}' not found in dataset!")
        return []

    idx = indices[title]
    # A title that appears more than once yields a Series of positions;
    # fall back to its first occurrence instead of failing further down.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable descending order: ties keep their original row order.
    ranked = np.argsort(-scores, kind="stable")
    # Exclude the queried item explicitly. Skipping "the first result" is not
    # safe: another item with an identical description also scores 1.0 and
    # may sort ahead of the queried one.
    ranked = ranked[ranked != idx][:top_n]
    return df["title"].iloc[ranked]


In [None]:

# Example query: show the recommendations for one title
recommend(title="Narcos")


In [None]:

# Summary of the findings. These lines are fixed text written by the author;
# they are not computed from df. The header's emoji was mis-encoded
# (mojibake) and is restored here.
print("🔍 Insights:")
print("- Netflix has more Movies than TV Shows.")
print("- US & India are top content producers.")
print("- Peak content addition happened around 2018-2020.")
print("- Content-based recommender suggests similar titles using descriptions.")
