In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid theme for every chart drawn in this notebook.
sns.set_style("whitegrid")

# Load, rename and clean in one chain so that re-running this cell always
# starts from the file on disk and yields the same `df`.
df = (
    # 1. Load the dataset.
    pd.read_csv("Netflix Dataset.csv")
    # 2. Give the two ambiguous columns descriptive names.
    .rename(columns={'Category': 'Type_of_Content', 'Type': 'Genre'})
    # 3. Strip stray whitespace around the date text (the fix for the earlier
    #    ValueError) and parse it. `.str.strip()` is applied WITHOUT a prior
    #    `astype(str)`: that cast turns missing values into the literal text
    #    "nan", which made the old "drop missing dates" step a no-op.
    #    errors='coerce' converts anything unparseable to NaT.
    .assign(
        Release_Date=lambda raw: pd.to_datetime(
            raw['Release_Date'].str.strip(), errors='coerce'
        )
    )
    # 4. Missing and unparseable dates are both NaT now; drop them together.
    .dropna(subset=['Release_Date'])
    # 5. Integer year used by every later grouping.
    .assign(Release_Year=lambda dated: dated['Release_Date'].dt.year.astype(int))
)

print("--- Step 1: Setup and Data Cleaning Complete ---")
print(f"Total rows for analysis: {len(df)}")

--- Step 1: Setup and Data Cleaning Complete ---
Total rows for analysis: 7779


In [7]:
# --- Objective 1: Movies vs. TV Shows Trends ---

# Number of titles for every (year, content type) pair.
content_by_year = (
    df.groupby(['Release_Year', 'Type_of_Content'])
    .size()
    .reset_index(name='Count')
)

# Only years from 2010 onward are drawn.
content_since_2010 = content_by_year[content_by_year['Release_Year'] >= 2010]

# Visualization: one line per content type.
fig, ax = plt.subplots(figsize=(14, 7))
sns.lineplot(
    data=content_since_2010,
    x='Release_Year',
    y='Count',
    hue='Type_of_Content',
    marker='o',
    palette=['#E50914', '#000000'],  # Netflix theme colors
    ax=ax,
)

ax.set_title('Netflix Content Addition Trend: Movies vs. TV Shows (2010-2021)', fontsize=16)
ax.set_xlabel('Release Year', fontsize=12)
ax.set_ylabel('Number of Titles Added', fontsize=12)
ax.legend(title='Content Type')
ax.grid(True, linestyle='--', alpha=0.7)
plt.xticks(rotation=45)  # `fig` is still the current figure here
fig.tight_layout()

# Write the chart to disk, then release the figure (it is not shown inline).
fig.savefig('1_Content_Trend_Movies_vs_TVShows.png')
plt.close(fig)

print("\n--- Step 2: Objective 1 Complete (Chart saved as 1_Content_Trend_Movies_vs_TVShows.png) ---")


--- Step 2: Objective 1 Complete (Chart saved as 1_Content_Trend_Movies_vs_TVShows.png) ---


In [15]:
# --- Objective 2a: Top 10 Overall Genres ---

# 'Genre' holds a comma-separated list per title: split it, give every entry
# its own row, and trim the whitespace left around each entry.
genre_lists = df['Genre'].str.split(',')
genres_exploded = genre_lists.explode().str.strip()

# The ten most frequent genres across the whole catalog.
genre_counts = genres_exploded.value_counts().head(10)

print("\nTop 10 Overall Genres on Netflix:")
print(genre_counts.to_markdown(numalign="left", stralign="left"))

# Visualization: horizontal bar chart, one bar per genre.
fig_top10, ax_top10 = plt.subplots(figsize=(12, 8))
sns.barplot(
    x=genre_counts.values,
    y=genre_counts.index,
    hue=genre_counts.index,
    palette='viridis',
    legend=False,
    ax=ax_top10,
)

ax_top10.set_title('Top 10 Overall Genres on Netflix', fontsize=16)
ax_top10.set_xlabel('Number of Titles (Count)', fontsize=12)
ax_top10.set_ylabel('Genre', fontsize=12)
fig_top10.tight_layout()

fig_top10.savefig('2a_Top_10_Genres_Overall.png')
plt.close(fig_top10)
print("Objective 2a: Top 10 Genres chart saved as 2a_Top_10_Genres_Overall.png")


# --- Objective 2b: Top 5 Genre Trend Over Time ---

# Names of the five most frequent genres (genre_counts is already sorted).
top_5_genres = genre_counts.index[:5].tolist()

# Same split/explode as above, but keeping 'Release_Year' next to each genre.
df_genres_yearly = (
    df[['Release_Year', 'Genre']]
    .assign(Single_Genre=lambda frame: frame['Genre'].str.split(','))
    .explode('Single_Genre')
    .assign(Single_Genre=lambda frame: frame['Single_Genre'].str.strip())
)

# Keep only the rows that belong to one of the top five genres.
is_top_5 = df_genres_yearly['Single_Genre'].isin(top_5_genres)
df_top_5_yearly = df_genres_yearly[is_top_5]

# Number of titles for every (year, genre) pair.
genre_trend = (
    df_top_5_yearly.groupby(['Release_Year', 'Single_Genre'])
    .size()
    .reset_index(name='Count')
)

# Visualization: one line per genre, from 2014 onward.
genre_trend_since_2014 = genre_trend[genre_trend['Release_Year'] >= 2014]

fig_trend, ax_trend = plt.subplots(figsize=(14, 7))
sns.lineplot(
    data=genre_trend_since_2014,
    x='Release_Year',
    y='Count',
    hue='Single_Genre',
    marker='o',
    dashes=False,
    ax=ax_trend,
)

ax_trend.set_title('Annual Addition Trend for Top 5 Genres (2014-2021)', fontsize=16)
ax_trend.set_xlabel('Release Year', fontsize=12)
ax_trend.set_ylabel('Number of Titles Added', fontsize=12)
ax_trend.legend(title='Genre')
ax_trend.grid(True, linestyle='--', alpha=0.7)
plt.xticks(rotation=45)  # `fig_trend` is still the current figure here
fig_trend.tight_layout()

fig_trend.savefig('2b_Top_5_Genre_Trend.png')
plt.close(fig_trend)

print("Objective 2b: Top 5 Genre Trend chart saved as 2b_Top_5_Genre_Trend.png")
print("\n--- Step 3: Objective 2 Complete ---")


Top 10 Overall Genres on Netflix:
| Genre                    | count   |
|:-------------------------|:--------|
| International Movies     | 2437    |
| Dramas                   | 2108    |
| Comedies                 | 1472    |
| International TV Shows   | 1198    |
| Documentaries            | 786     |
| Action & Adventure       | 721     |
| TV Dramas                | 703     |
| Independent Movies       | 675     |
| Children & Family Movies | 532     |
| Romantic Movies          | 531     |
Objective 2a: Top 10 Genres chart saved as 2a_Top_10_Genres_Overall.png
Objective 2b: Top 5 Genre Trend chart saved as 2b_Top_5_Genre_Trend.png

--- Step 3: Objective 2 Complete ---


In [13]:
# --- Objective 3: Country-wise Contributions ---

# Work on a copy that excludes titles with no 'Country' value.
df_country = df.dropna(subset=['Country']).copy()

# 'Country' holds a comma-separated list per title: split it, give every
# country its own row, and trim the whitespace left around each name.
df_country_exploded = (
    df_country[['Country']]
    .assign(Single_Country=lambda frame: frame['Country'].str.split(','))
    .explode('Single_Country')
    .assign(Single_Country=lambda frame: frame['Single_Country'].str.strip())
)

# The ten countries credited on the most titles.
country_counts = df_country_exploded['Single_Country'].value_counts().head(10)

print("\nTop 10 Overall Contributing Countries:")
print(country_counts.to_markdown(numalign="left", stralign="left"))

# Visualization: horizontal bar chart, one bar per country.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    x=country_counts.values,
    y=country_counts.index,
    hue=country_counts.index,
    palette='magma',
    legend=False,
    ax=ax,
)

ax.set_title('Top 10 Contributing Countries to Netflix Catalog', fontsize=16)
ax.set_xlabel('Number of Titles Contributed', fontsize=12)
ax.set_ylabel('Country', fontsize=12)
fig.tight_layout()

fig.savefig('3_Top_10_Countries.png')
plt.close(fig)

print("Objective 3: Top 10 Countries chart saved as 3_Top_10_Countries.png")
print("\n--- Step 4: Objective 3 Complete ---")


Top 10 Overall Contributing Countries:
| Single_Country   | count   |
|:-----------------|:--------|
| United States    | 3292    |
| India            | 990     |
| United Kingdom   | 722     |
| Canada           | 412     |
| France           | 349     |
| Japan            | 286     |
| Spain            | 215     |
| South Korea      | 212     |
| Germany          | 199     |
| Mexico           | 154     |
Objective 3: Top 10 Countries chart saved as 3_Top_10_Countries.png

--- Step 4: Objective 3 Complete ---


In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style
sns.set_style("whitegrid")
plt.rcParams['figure.dpi'] = 100  # figure resolution for everything drawn below

# First year shown on each trend chart.
CONTENT_TREND_START_YEAR = 2010
GENRE_TREND_START_YEAR = 2014


def _explode_csv_column(frame, source_col, target_col):
    """Return a copy of `frame` with one row per comma-separated entry of `source_col`.

    Each entry is written, whitespace-trimmed, to the new column `target_col`;
    the other columns of `frame` are repeated for every entry.
    """
    return (
        frame
        .assign(**{target_col: frame[source_col].str.split(',')})
        .explode(target_col)
        .assign(**{target_col: lambda exploded: exploded[target_col].str.strip()})
    )


def _save_trend_chart(data, hue, title, legend_title, filename, **lineplot_kwargs):
    """Draw 'Count' against 'Release_Year' with one line per `hue` value and save it.

    Extra keyword arguments are forwarded to `sns.lineplot`. The figure is
    written to `filename` and then closed, so it is not displayed inline.
    """
    fig, ax = plt.subplots(figsize=(14, 7))
    sns.lineplot(
        data=data, x='Release_Year', y='Count', hue=hue, marker='o', ax=ax,
        **lineplot_kwargs,
    )
    ax.set_title(title, fontsize=16)
    ax.set_xlabel('Release Year', fontsize=12)
    ax.set_ylabel('Number of Titles Added', fontsize=12)
    ax.legend(title=legend_title)
    ax.grid(True, linestyle='--', alpha=0.7)
    plt.xticks(rotation=45)  # `fig` is still the current figure here
    fig.tight_layout()
    fig.savefig(filename)
    plt.close(fig)


def _save_top_n_bar_chart(counts, palette, title, xlabel, ylabel, filename):
    """Draw `counts` (a value_counts-style Series) as horizontal bars and save it.

    `hue` is set to the same labels as `y` with `legend=False` so each bar gets
    its own palette color without triggering seaborn's FutureWarning.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.barplot(
        x=counts.values, y=counts.index, hue=counts.index,
        palette=palette, legend=False, ax=ax,
    )
    ax.set_title(title, fontsize=16)
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    fig.tight_layout()
    fig.savefig(filename)
    plt.close(fig)


# --- 1. SETUP AND DATA CLEANING ---

df = (
    # Load the dataset.
    pd.read_csv("Netflix Dataset.csv")
    # Give the two ambiguous columns descriptive names.
    .rename(columns={'Category': 'Type_of_Content', 'Type': 'Genre'})
    # Strip stray whitespace around the date text (the ValueError fix) and
    # parse it. `.str.strip()` is applied WITHOUT a prior `astype(str)`: that
    # cast turns missing values into the literal text "nan", which made the
    # old "drop missing dates" step a no-op. errors='coerce' converts anything
    # unparseable to NaT.
    .assign(
        Release_Date=lambda raw: pd.to_datetime(
            raw['Release_Date'].str.strip(), errors='coerce'
        )
    )
    # Missing and unparseable dates are both NaT now; drop them together.
    .dropna(subset=['Release_Date'])
    # Integer year used by every later grouping.
    .assign(Release_Year=lambda dated: dated['Release_Date'].dt.year.astype(int))
)

print("--- Data Setup and Cleaning Complete ---")

# --- 2. OBJECTIVE 1: MOVIES VS. TV SHOWS TREND ---

# Number of titles for every (year, content type) pair.
content_by_year = (
    df.groupby(['Release_Year', 'Type_of_Content'])
    .size()
    .reset_index(name='Count')
)

_save_trend_chart(
    content_by_year[content_by_year['Release_Year'] >= CONTENT_TREND_START_YEAR],
    hue='Type_of_Content',
    title='1. Netflix Content Addition Trend: Movies vs. TV Shows (2010-2021)',
    legend_title='Content Type',
    filename='1_Content_Trend_Movies_vs_TVShows.png',
    palette=['#E50914', '#000000'],  # Netflix theme colors
)
print("Chart 1: 1_Content_Trend_Movies_vs_TVShows.png saved.")


# --- 3. OBJECTIVE 2: GENRE POPULARITY AND TREND ---

# Split 'Genre' once; both the overall ranking and the yearly trend reuse it.
df_genres_yearly = _explode_csv_column(
    df[['Release_Year', 'Genre']], 'Genre', 'Single_Genre'
)

# a. Overall Top 10 Genres
genres_exploded = df_genres_yearly['Single_Genre'].rename('Genre')
genre_counts = genres_exploded.value_counts().head(10)

_save_top_n_bar_chart(
    genre_counts,
    palette='viridis',
    title='2a. Top 10 Overall Genres on Netflix',
    xlabel='Number of Titles (Count)',
    ylabel='Genre',
    filename='2a_Top_10_Genres_Overall.png',
)
print("Chart 2a: 2a_Top_10_Genres_Overall.png saved.")

# b. Top 5 Genre Trend Over Time
top_5_genres = genre_counts.head(5).index.tolist()
df_top_5_yearly = df_genres_yearly[df_genres_yearly['Single_Genre'].isin(top_5_genres)]
genre_trend = (
    df_top_5_yearly.groupby(['Release_Year', 'Single_Genre'])
    .size()
    .reset_index(name='Count')
)

_save_trend_chart(
    genre_trend[genre_trend['Release_Year'] >= GENRE_TREND_START_YEAR],
    hue='Single_Genre',
    title='2b. Annual Addition Trend for Top 5 Genres (2014-2021)',
    legend_title='Genre',
    filename='2b_Top_5_Genre_Trend.png',
    dashes=False,
)
print("Chart 2b: 2b_Top_5_Genre_Trend.png saved.")


# --- 4. OBJECTIVE 3: COUNTRY-WISE CONTRIBUTIONS ---

# Titles with no 'Country' value are excluded from the country ranking.
df_country = df.dropna(subset=['Country']).copy()

# One row per (title, country).
df_country_exploded = _explode_csv_column(
    df_country[['Country']], 'Country', 'Single_Country'
)
country_counts = df_country_exploded['Single_Country'].value_counts().head(10)

_save_top_n_bar_chart(
    country_counts,
    palette='magma',
    title='3. Top 10 Contributing Countries to Netflix Catalog',
    xlabel='Number of Titles Contributed',
    ylabel='Country',
    filename='3_Top_10_Countries.png',
)

print("Chart 3: 3_Top_10_Countries.png saved.")
print("\n--- Project Analysis Complete! All 4 charts saved to your directory. ---")

--- Data Setup and Cleaning Complete ---
Chart 1: 1_Content_Trend_Movies_vs_TVShows.png saved.
Chart 2a: 2a_Top_10_Genres_Overall.png saved.
Chart 2b: 2b_Top_5_Genre_Trend.png saved.
Chart 3: 3_Top_10_Countries.png saved.

--- Project Analysis Complete! All 4 charts saved to your directory. ---
