In [None]:
# 📌 Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 🌐 Set Seaborn style
# Apply the "darkgrid" theme globally so every plot below shares one look.
sns.set(style="darkgrid")

# 📂 Load Dataset
# The path is relative, so the CSV must be in the current working directory.
df = pd.read_csv("netflix_titles.csv")

# 🧭 Explore Dataset
print("Shape of dataset:", df.shape)
display(df.head())  # First 5 rows, rendered as a notebook table
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() would only add a stray "None" to the cell output.
df.info()
display(df.describe())

# 🧹 Data Cleaning
# Report how many values are missing in each column before filtering.
missing_per_column = df.isnull().sum()
print("\nMissing values:\n", missing_per_column)

# Keep only rows that have a value in every column the plots rely on.
required_columns = ['type', 'title', 'country', 'release_year', 'rating']
df = df.dropna(subset=required_columns)

# 📊 Visualization 1: Count of Movies vs TV Shows
# One bar per distinct value of the 'type' column.
type_fig, type_ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='type', palette='Set2', ax=type_ax)
type_ax.set_title("Count of Movies vs TV Shows")
type_ax.set_xlabel("Type")
type_ax.set_ylabel("Count")
type_fig.tight_layout()
plt.show()

# 📊 Visualization 2: Top 10 Countries by Number of Titles
# NOTE(review): the 'country' column is assumed to hold comma-separated lists
# for co-productions (e.g. "United States, India") -- confirm against the CSV.
# Counting the raw strings would treat each combination as its own "country"
# and undercount the individual ones, so split each cell into one row per
# country first. For cells that hold a single country this is a no-op.
countries = (
    df['country']
    .str.split(',')
    .explode()
    .str.strip()
)
# Discard empty fragments left behind by leading/trailing commas.
countries = countries[countries != '']
top_countries = countries.value_counts().head(10)

plt.figure(figsize=(10,5))
sns.barplot(x=top_countries.values, y=top_countries.index, palette='viridis')
plt.title("Top 10 Countries with Most Netflix Titles")
plt.xlabel("Number of Titles")
plt.ylabel("Country")
plt.tight_layout()
plt.show()

# 📊 Visualization 3: Content Added Over the Years
# Parse 'date_added' defensively: pandas 2.x infers one format from the first
# value and raises on any entry that does not match it, so stray whitespace
# or a single malformed date would abort the whole analysis.
date_added = df['date_added']
if not pd.api.types.is_datetime64_any_dtype(date_added):
    # Only raw text needs stripping; skipping this when the column is already
    # datetime keeps the cell safe to re-run.
    date_added = date_added.str.strip()
# errors='coerce' turns unparsable entries into NaT instead of raising.
df['date_added'] = pd.to_datetime(date_added, errors='coerce')
df['year_added'] = df['date_added'].dt.year

# value_counts() ignores missing years, so NaT rows simply drop out here.
yearly_content = df['year_added'].value_counts().sort_index()

plt.figure(figsize=(10,5))
sns.lineplot(x=yearly_content.index, y=yearly_content.values, marker='o')
plt.title("Netflix Content Added Over the Years")
plt.xlabel("Year")
plt.ylabel("Number of Titles Added")
plt.tight_layout()
plt.show()

# 📊 Visualization 4: Rating Distribution
# Order the bars from the most to the least frequent rating.
ratings_by_frequency = df['rating'].value_counts().index

rating_fig, rating_ax = plt.subplots(figsize=(8, 5))
sns.countplot(
    data=df,
    x='rating',
    order=ratings_by_frequency,
    palette='coolwarm',
    ax=rating_ax,
)
rating_ax.set_title("Content Ratings Distribution")
rating_ax.set_xlabel("Rating")
rating_ax.set_ylabel("Count")
# Tilt the tick labels so neighbouring rating codes do not overlap.
rating_ax.tick_params(axis='x', labelrotation=45)
rating_fig.tight_layout()
plt.show()

# 📊 Visualization 5: Distribution of Movie Durations
# Filter only movies (copy so the new column is not written into a view of df)
movies_df = df[df['type'] == 'Movie'].copy()

# Remove ' min' and convert to a number. A direct .astype(int) raises as soon
# as one movie has a missing or non-numeric duration, so coerce such values to
# NaN and drop those rows before casting to int.
duration_text = movies_df['duration'].str.replace(' min', '', regex=False)
movies_df['duration_minutes'] = pd.to_numeric(duration_text, errors='coerce')
movies_df = (
    movies_df
    .dropna(subset=['duration_minutes'])
    .astype({'duration_minutes': int})
)

plt.figure(figsize=(10,5))
sns.histplot(movies_df['duration_minutes'], bins=30, kde=True, color='purple')
plt.title("Distribution of Movie Durations")
plt.xlabel("Duration (minutes)")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()
