Data Source - [Netflix Data](https://www.kaggle.com/datasets/shivamb/netflix-shows)

This analysis covers the following steps:
1. Data Cleaning & Prep
- Handle missing values (e.g., director, cast, country).
- Convert date_added from string to datetime format.
- Extract useful features (e.g., month/year added, duration in minutes).

2. Exploratory Data Analysis (EDA)
- Content Distribution: Movies vs. TV shows over time.
- Release Trends: When were most shows/movies added to Netflix?
- Country Analysis: Which countries produce the most content?
- Ratings Analysis: What’s the most common rating (TV-MA, PG-13, etc.)?

3. Visualizations (Use Matplotlib/Seaborn or Plotly)
- 📈 Bar Chart: Number of Movies vs. TV Shows by year.
- 🌍 Map Visualization: Countries producing the most content (using geopandas or Plotly).
- 📅 Time Series Plot: Monthly additions of content over the years.
- 📊 Pie Chart: Distribution of ratings (TV-MA, PG-13, etc.).

4. Extra Challenge
- Recommendation System (Basic): Suggest similar content based on genre.

In [None]:
# Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
print('Happy Coding 😊')

In [None]:
# Load the Netflix titles CSV (path is relative to the working directory)
data = pd.read_csv(filepath_or_buffer='./netflix_titles.csv')

### Data Cleaning

In [None]:
# Preview the first five rows of the dataset
data.head(n=5)

In [None]:
# Look at 15 randomly chosen rows
data.sample(n=15)

In [None]:
# Count the missing values in every column
data.isna().sum()

In [None]:
# Rows that have no director and whose type is anything other than 'TV Show'
data.loc[data['director'].isna() & (data['type'] != 'TV Show')]

In [None]:
# List the distinct values found in the 'type' column
print(pd.unique(data['type']))

In [None]:
# Every row that has no duration gets its rating overwritten with 'TV-MA'
data.loc[data['duration'].isna(), 'rating'] = 'TV-MA'

In [None]:
# Hand-entered durations for three specific titles, keyed by exact title
known_durations = {
    'Louis C.K. 2017': '74 min',
    'Louis C.K.: Hilarious': '84 min',
    'Louis C.K.: Live at the Comedy Store': '66 min',
}
for show_title, runtime in known_durations.items():
    data.loc[data['title'] == show_title, 'duration'] = runtime

In [None]:
# Replace missing director / country / cast entries with the placeholder 'Unknown'
for column in ('director', 'country', 'cast'):
    data[column] = data[column].fillna('Unknown')

In [None]:
# Replace missing ratings with the placeholder 'Not Rated'
data['rating'] = data['rating'].fillna(value='Not Rated')

In [None]:
# Show the rows that have no 'date_added' value
data.loc[data['date_added'].isna()]

In [None]:
# Parse 'date_added' strings such as "September 25, 2021" into datetimes.
# Surrounding whitespace is stripped first; values that do not match the
# format become NaT because of errors='coerce'.
data['date_added'] = pd.to_datetime(
    data['date_added'].str.strip(),
    format="%B %d, %Y",
    errors='coerce',
)

In [None]:
# Confirm the dtype of the 'date_added' column after conversion
data.dtypes['date_added']

In [None]:
# Re-count missing values per column after the cleaning steps
data.isna().sum()

In [None]:
# data.to_csv('netflix_file.csv', index=False)

## Exploratory Data Analysis (EDA)

##### Movies vs. TV shows over time.

In [None]:
# Preview just the two columns used for the trend analysis
data.loc[:, ['date_added', 'type']]

In [None]:
# Derive the year each title was added from its 'date_added' timestamp
data['year_added'] = data['date_added'].dt.year

# Number of titles for every (year added, type) combination
content_trend = (
    data.groupby(['year_added', 'type'])
    .size()
    .reset_index(name='count')
)

# Line chart with one line per content type
plt.figure(figsize=(12, 6))
sns.lineplot(
    x='year_added',
    y='count',
    hue='type',
    data=content_trend,
    marker='o',
    palette='crest',
)

# Titles, axis labels and cosmetics
plt.title('Trend of Movies and TV Shows Added to Netflix Over the Years', fontsize=16)
plt.ylabel('Number of Titles', fontsize=12)
plt.xlabel('Year Added', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.3)
plt.xticks(rotation=45)
plt.legend(loc='upper left')

# Render the figure
plt.tight_layout()
plt.show()

##### When were most shows/movies added to Netflix?

In [None]:
# Number of titles for every (year added, type) combination.
content_trend = data.groupby(['year_added', 'type']).size().reset_index(name='count')
# 'year_added' is a float column whenever some 'date_added' values are NaT.
# groupby drops the NaN keys by default, so the remaining years can be cast
# to int; the table and any chart built from it then show "2019", not "2019.0".
content_trend['year_added'] = content_trend['year_added'].astype(int)

In [None]:
# Display the per-year, per-type title counts
content_trend

In [None]:
# Extract the year from 'date_added'
data['year_added'] = data['date_added'].dt.year

# Titles added per year, computed once for each content type
movies_per_year = data.loc[data['type'] == 'Movie', 'year_added'].value_counts()
shows_per_year = data.loc[data['type'] == 'TV Show', 'year_added'].value_counts()

# 'year_added' is float when any 'date_added' is NaT, so cast the winning
# year to int; otherwise the message reads "2019.0" instead of "2019".
most_movies_year = int(movies_per_year.idxmax())
most_shows_year = int(shows_per_year.idxmax())

print(f"The year with the most movies added is: {most_movies_year} with {movies_per_year.max()} movies")
print(f"The year with the most TV shows added is: {most_shows_year} with {shows_per_year.max()} TV shows")

##### Which countries produce the most content?

In [None]:
# Frequency of each distinct 'country' value, most common first
data['country'].value_counts(sort=True, ascending=False)

In [None]:
# Count each country once per title it is credited on.
# NOTE(review): assumes co-productions are stored as one comma-separated
# string (e.g. "United States, India"), the same layout 'listed_in' uses for
# genres; confirm against the dataset. Without the split, every distinct
# combination is ranked as if it were its own country.
credited_countries = data['country'].str.split(',').explode().str.strip()

# Drop empty fragments (from stray commas) and the 'Unknown' placeholder used
# for missing values, so that a non-country cannot appear in the ranking.
credited_countries = credited_countries[~credited_countries.isin(['', 'Unknown'])]

# Get the top 10 countries producing the most content
top_countries = (
    credited_countries.value_counts()
    .head(10)
    .rename_axis('country')
    .reset_index(name='count')
)

# Create a bar chart
plt.figure(figsize=(12, 8))
barplot = sns.barplot(data=top_countries, x='count', y='country', palette='crest')

# Add data labels to each bar (bars are drawn at y positions 0..n-1)
for position, total in enumerate(top_countries['count']):
    plt.text(total + 1, position, str(total), va='center', fontsize=10)

# Customize the chart
plt.title('Top 10 Countries Producing the Most Content on Netflix', fontsize=16)
plt.xlabel('Number of Contents', fontsize=12)
plt.ylabel('Country', fontsize=12)
plt.grid(axis='x', linestyle='--', alpha=0.7)

# Show the plot
plt.tight_layout()
plt.show()

##### What’s the most common rating (TV-MA, PG-13, etc.)?

In [None]:
# Table of ratings and how many titles carry each one, most common first
top_ratings = (
    data['rating']
    .value_counts()
    .rename_axis('rating')
    .reset_index(name='count')
)
top_ratings

In [None]:
# Tally the ratings once: the ten most frequent keep their own slice,
# everything from position 10 onward is pooled into 'Others'
rating_counts = data['rating'].value_counts()
top_10_ratings = rating_counts.nlargest(10)
others_count = rating_counts.iloc[10:].sum()

# Append the pooled remainder to the top ten
ratings_with_others = pd.concat([top_10_ratings, pd.Series({'Others': others_count})])

# Two-column frame ('rating', 'count') used for plotting
plot_data = ratings_with_others.reset_index()
plot_data.columns = ['rating', 'count']

# Share of each slice, in percent with one decimal, for the legend
total = plot_data['count'].sum()
percentages = plot_data['count'].div(total).mul(100).round(1)

# Slice labels are simply the rating names
labels = plot_data['rating']

# Draw the pie on a wide figure so the legend fits beside it
plt.figure(figsize=(12, 6))
slice_colors = sns.color_palette('crest', n_colors=len(plot_data))
patches, texts, autotexts = plt.pie(
    plot_data['count'],
    labels=labels,
    colors=slice_colors,
    autopct='%1.0f%%',
    pctdistance=0.90,
    labeldistance=1.1,
    textprops={'fontsize': 8},
)

# Legend entries carry the detailed information: rating, count and percentage
legend_labels = []
for rating, count, pct in zip(plot_data['rating'], plot_data['count'], percentages):
    legend_labels.append(f"{rating} ({count:,} - {pct}%)")
plt.legend(
    patches,
    legend_labels,
    title="Rating Distribution",
    loc="center left",
    bbox_to_anchor=(1, 0, 0.5, 1),
)

# Title
plt.title('Distribution of Ratings on Netflix', fontsize=16, pad=16)

# Equal axis scaling keeps the pie circular
plt.axis('equal')

# Tight layout keeps the legend from being cropped
plt.tight_layout()
plt.show()

##### Number of Movies vs. TV Shows by year.

In [None]:
# Grouped bar chart: titles added per year, one bar per content type.
# A bar plot treats x as categorical and labels ticks with the raw values, so
# float years (produced when some 'date_added' values are NaT) would be drawn
# as "2019.0". Plot from a copy whose years are cast to int; 'content_trend'
# itself is left untouched.
yearly_counts = content_trend.assign(year_added=content_trend['year_added'].astype(int))

plt.figure(figsize=(12, 6))
sns.barplot(data=yearly_counts, x='year_added', y='count', hue='type', palette='crest')

# Customize the plot
plt.title('Number of Movies and TV Shows Added to Netflix by Year', fontsize=16)
plt.xlabel('Year Added', fontsize=12)
plt.ylabel('Number of Titles', fontsize=12)
plt.legend(loc='upper left')
plt.grid(axis='y', linestyle='--', alpha=0.3)

# Show the plot
plt.tight_layout()
plt.show()

##### Most popular genres on Netflix

In [None]:
# Display the full DataFrame in its current state
data

In [None]:
# Split the comma-separated 'listed_in' column into individual genres,
# trim the whitespace left by the split, and count how often each occurs
genre_counts = data['listed_in'].str.split(',').explode().str.strip().value_counts()

# Take the ten most frequent genres once and reuse them for both axes
top_genres = genre_counts.head(10)

# Plot the top 10 genres
plt.figure(figsize=(12, 6))
sns.barplot(x=top_genres.values, y=top_genres.index, palette='crest')
plt.title('Top 10 Most Popular Genres on Netflix', fontsize=16)
# The counts cover every row of the dataset, not only movies, so the axis is
# labelled "Titles" (matching the other charts in this notebook)
plt.xlabel('Number of Titles', fontsize=12)
plt.ylabel('Genres', fontsize=12)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

##### Movie Recommendation System

In [None]:
from fuzzywuzzy import process

In [None]:
# Step 1: Preprocess the data
def _clean_genre_list(parts):
    """Strip whitespace from each genre; anything that is not a list becomes []."""
    if not isinstance(parts, list):
        return []
    return [part.strip() for part in parts]


# One list of genre names per title, taken from the comma-separated 'listed_in'
data['genres'] = data['listed_in'].str.split(',').apply(_clean_genre_list)

# Every distinct genre that appears anywhere in the dataset
unique_genres = {genre for title_genres in data['genres'] for genre in title_genres}

# Step 2: Function to match user input to closest genre
def match_genre(user_input, min_score=50):
    """Return the known genre that best fuzzy-matches ``user_input``.

    Args:
        user_input: Free-text genre name typed by the user.
        min_score: Score the best match must exceed to be accepted.
            Defaults to 50, the threshold this function has always used.

    Returns:
        The best-matching entry of ``unique_genres``, or ``None`` when the
        input is blank or not a string, when there are no genres to match
        against, or when the best score does not exceed ``min_score``.
    """
    # Blank or non-text input cannot match anything meaningful
    if not isinstance(user_input, str) or not user_input.strip():
        return None

    # Sort the candidates: extractOne keeps the first of equally scored
    # choices, and set iteration order for strings can differ between
    # interpreter sessions, so an unsorted set makes ties non-reproducible.
    best_match = process.extractOne(user_input, sorted(unique_genres))

    # extractOne returns None (not a tuple) when it finds no candidate
    if best_match is None:
        return None

    matched_genre, score = best_match
    return matched_genre if score > min_score else None

# Step 3: Recommend movies based on the matched genre
def recommend_movies(user_input, num_recommendations=10):
    """Suggest titles whose genres include the one closest to ``user_input``.

    Args:
        user_input: Free-text genre name typed by the user.
        num_recommendations: Maximum number of rows to return.

    Returns:
        A DataFrame with the 'title', 'listed_in' and 'description' columns of
        the first matching rows, or an explanatory string when no genre
        matches the input.
    """
    genre = match_genre(user_input)
    if genre:
        # Keep only the rows whose genre list contains the matched genre
        in_genre = data['genres'].apply(lambda title_genres: genre in title_genres)
        shown_columns = ['title', 'listed_in', 'description']
        return data[in_genre][shown_columns].head(num_recommendations)

    return f"No matching genre found for '{user_input}'. Please try again."



In [None]:
# Example usage: ask for a genre and look up recommendations
user_input = input("What type of movie would you like to watch? (e.g., 'Action', 'Drama', 'Comedy'): ")
recommendations = recommend_movies(user_input)

if not isinstance(recommendations, str):
    # A DataFrame came back: render it with pandas' Styler for nicer formatting
    cell_style = {
        'background-color': '#f9f9f9',
        'border': '1px solid black',
        'color': 'black',
        'text-align': 'left',
    }
    header_style = {
        'selector': 'th',
        'props': [('background-color', '#4CAF90'), ('color', 'white'), ('font-weight', 'bold')],
    }
    styled_recommendations = (
        recommendations.style
        .set_properties(**cell_style)
        .set_table_styles([header_style])
    )
    display(styled_recommendations)
else:
    # A string came back: it is the "no match" message
    print(recommendations)