# Data Understanding and Pre-processing

## 1. Importing libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")


### Reading Data

In [None]:
df = pd.read_csv("/kaggle/input/netflix-shows/netflix_titles.csv")
df.head()

### Data Exploration

In [None]:
df.info()

### --> Looking for any missing values

In [None]:
df.isna().sum()

In [None]:
# Visualise per-column completeness with missingno
# (third-party; install with `%pip install missingno` if it is not available).
import missingno

# One colour per column, in column order: grey everywhere except the four
# columns at positions 4-7, which are drawn in shades of blue.
color = ['dimgrey'] * 3 + ['darkblue', 'blue', 'blue', 'lightblue'] + ['dimgrey'] * 5

missingno.bar(df, figsize=(10, 5), fontsize=10, color=color)

plt.title('COLUMN WISE MISSING VALUES', fontsize=20)

- The columns director, cast, country, date_added, and rating have missing values.
- We drop the rows with a missing director or cast, because those values cannot be filled in sensibly.
- country, date_added, and rating have only a few missing entries, so we fill them with the mode (the most frequent value) of each column.


# Data Wrangling Operations 

### --> Managing absent values


In [None]:
# Impute the sparsely-missing categorical columns with their own most frequent
# value. Each column must use its *own* mode: filling `rating` with the mode of
# `country` would write a country name into the rating column.
for col in ('country', 'date_added', 'rating'):
    df[col] = df[col].fillna(df[col].mode()[0])

In [None]:
df = df.dropna( how='any',subset=['cast', 'director'])

In [None]:
df.isna().sum()

- Every missing value in the dataset has been filled in or eliminated. There are no remaining missing values.


In [None]:
df.duplicated().sum()

- No values in the dataset are duplicates.


### --> clearing the data

Adding some new columns:
- listed_in - Genre
* Year Added - year_add
* Month Added - month_add
* Principal Country - country_main

In [None]:
#Rename the 'listed_in' column as 'Genre' for easy understanding
# Expose 'listed_in' under the friendlier name 'Genre' and keep only the first
# genre listed for each title (everything before the first comma).
df = df.rename(columns={"listed_in": "Genre"})

df['Genre'] = df['Genre'].map(lambda genres: genres.partition(",")[0])

df['Genre'].head()

In [None]:
# Year the title was added: the last space-separated token of 'date_added'.
# Strip first so a stray leading/trailing blank in the raw string cannot turn
# the extracted token into an empty string.
df['year_add'] = df['date_added'].apply(lambda x: x.strip().split(" ")[-1])

df['year_add'].head()

In [None]:
# Month the title was added: the first space-separated token of 'date_added'.
# Strip first: a value with a leading blank would otherwise yield '' as its
# month (the heatmap section further down already guards against this).
df['month_add'] = df['date_added'].apply(lambda x: x.strip().split(" ")[0])

df['month_add'].head()

In [None]:
def _first_country(countries):
    """Return the first non-empty, whitespace-trimmed entry of a comma-separated country list."""
    for name in countries.split(","):
        name = name.strip()
        if name:
            return name
    # No usable entry at all: fall back to the trimmed raw value.
    return countries.strip()


# Principal country = first country listed. Skipping empty entries protects
# against values that start with a comma or a blank.
df['country_main'] = df['country'].apply(_first_country)

df['country_main'].head()

In [None]:
df['type'].value_counts()

In [None]:
df['rating'].value_counts()

Creating two new dataframes, one for the collection of films and the other for the collection of TV series:
* movie_df
* tv_df

In [None]:
# Split the catalogue by content type. `.copy()` makes each subset an
# independent frame: both get their 'duration' column rewritten later, and
# assigning into a plain boolean-mask slice of `df` is ambiguous (pandas emits
# SettingWithCopyWarning, which this notebook silences).
movie_df = df[df['type'] == 'Movie'].copy()

tv_df = df[df['type'] == 'TV Show'].copy()

In [None]:
movie_df.head()

### --> An examination of the running time of films


In [None]:
# Convert movie runtimes such as "90 min" into integer minutes.
#
# Step 1-2: remove the literal ' min' suffix and parse the rest as a number;
# anything that is not a plain number becomes NaN. `astype(str)` keeps the cell
# re-runnable after the column has already been converted, and `regex=False`
# makes the replacement a literal one on every pandas version.
minutes = pd.to_numeric(
    movie_df['duration'].astype(str).str.replace(' min', '', regex=False),
    errors='coerce',
)

# Step 3: drop rows whose runtime could not be parsed, then store whole minutes.
# `assign` builds a new frame, so nothing is written into a slice of `df`.
movie_df = (
    movie_df.assign(duration=minutes)
    .dropna(subset=['duration'])
    .astype({'duration': int})
)


print(movie_df['duration'])


# Descriptive Analytics 

### --> Movies Compared to TV Series

In [None]:

# Number of titles per content type.
sb.set(style="whitegrid")
sb.countplot(data=df, x="type", palette="husl")

- Netflix has a greater selection of films than TV series.


### --> Analysis of Movie and TV Show Ratings


In [None]:
# MOVIES RATINGS - up to 15 ratings, ordered from most to least frequent.
movie_rating_order = movie_df['rating'].value_counts().index[:15]

plt.figure(figsize=(12, 10))
sb.set(style="dark")
sb.countplot(data=movie_df, x="rating", order=movie_rating_order, palette="muted")

- Most films carry the 'TV-MA' classification. Under the TV Parental Guidelines, "TV-MA" marks a programme intended solely for mature audiences.
- The 'TV-14' category, the second largest, denotes programming that might not be suitable for children under the age of 14.
- The 'R' rating comes in third place. A movie rated R contains content that may not be appropriate for viewers under the age of 17.


In [None]:
# TV SHOWS RATINGS - up to 15 ratings, ordered from most to least frequent.
tv_rating_order = tv_df['rating'].value_counts().index[:15]

plt.figure(figsize=(12, 10))
sb.set(style="ticks")
sb.countplot(data=tv_df, x="rating", order=tv_rating_order, palette="plasma")

- The majority of TV shows have a "TV-14" classification, indicating that the content may be unsuitable for viewers under the age of 14.
- 'TV-MA', a TV show with content intended solely for mature audiences, has the second-highest rating count.
- TV series have the fewest 'R' rated episodes.


### --> Annual Content Analysis

In [None]:
# Titles per release year for the 15 years with the most titles, split by type.
busiest_years = df['release_year'].value_counts().index[:15]

plt.figure(figsize=(12, 10))
sb.set(style="dark")
sb.countplot(data=df, y="release_year", hue="type", order=busiest_years, palette="plasma")

- It's evident that Netflix produced the most amount of content in 2017.
- A discernible increase in the amount of information released dates back to 2015.


In [None]:
# Density of movie runtimes. `fill=True` is the current spelling of the
# deprecated `shade=True` argument and draws the same filled curve.
sb.set(style="white")
sb.kdeplot(data=movie_df['duration'], fill=True)

- Consequently, a sizable portion of Netflix's film selections fall into the **75-120 minute** range.


### --> trends in the length of films


In [None]:
# Mean movie runtime per release year. Select 'duration' before aggregating:
# averaging the whole grouped frame also tries to average the text columns,
# which raises a TypeError on pandas >= 2.0.
duration_year = movie_df.groupby('release_year')[['duration']].mean()
duration_year = duration_year.sort_index()

plt.figure(figsize=(15,6))
sb.lineplot(x=duration_year.index, y=duration_year['duration'].values)
plt.box(on=None)
plt.ylabel('Movie duration in minutes');
plt.xlabel('Year of released');
plt.title("Trends of Movie's Duration over the Years", fontsize=20, color='Blue');

- Movie runs from **1960 to 1965** averaged around **200 minutes; after 1965, runs got noticeably shorter.**
- We can see a steady trend in movie lengths starting in the year **1980**, with a running time of about **between 100 and 150 minutes.**


### --> Examination of TV Series with the highest amount of seasons


In [None]:
# Turn "1 Season" / "3 Seasons" into the integer number of seasons.
# Extracting the digits (instead of deleting ' Season' and then every 's')
# does not depend on the exact wording, and `astype(str)` keeps the cell
# re-runnable once the column is already numeric. `assign` returns a new
# frame, so nothing is written into a slice of `df`.
tv_df = tv_df.assign(
    duration=tv_df['duration'].astype(str).str.extract(r'(\d+)', expand=False).astype(int)
)
tv_df['duration']

--> Titles of TV shows and the number of seasons they have:


In [None]:
#Extract the columns from tv_df
columns=['title','duration']
tv_shows = tv_df[columns]

In [None]:
#sort the dataframe by number of seasons
tv_shows = tv_shows.sort_values(by='duration',ascending=False)
tv_shows
top20 = tv_shows[0:20]
top20

In [None]:
# Let pandas create the figure at the requested size. A separate plt.figure()
# call only leaves an empty extra figure behind, because DataFrame.plot opens
# its own figure when no `ax` is passed.
top20.plot(kind='bar', x='title', y='duration', color='black', figsize=(10, 6))

- **Naruto and Supernatural** have the most seasons.


In [None]:
# TV SHOWS AND THEIR SEASONS
# Share of the (up to) three most common season counts among TV shows.
# The counts are taken from `tv_df`: `df.duration` mixes movie runtimes with
# season counts, so its top three values need not be seasons at all. The
# labels are derived from the data instead of being hard-coded.
season_counts = tv_df['duration'].value_counts().head(3)
n_slices = len(season_counts)
labels = [f"{seasons} Season" if int(seasons) == 1 else f"{seasons} Seasons"
          for seasons in season_counts.index]

plt.figure(figsize=(8, 6))
# With `autopct` set, plt.pie returns (wedges, label texts, percentage texts).
_, _, texts = plt.pie(season_counts, labels=labels, autopct='%1.2f%%', startangle=90,
                      explode=(0.0, 0.1, 0.2)[:n_slices],
                      colors=['#FF6347', '#4682B4', '#8A2BE2'][:n_slices])
plt.axis('equal')
plt.title('Seasons Available on Netflix', fontsize=20, fontweight='bold');
# The percentage texts sit on the coloured wedges, so draw them in white.
for text in texts:
    text.set_color('white')

- We can analyse the chart and find that **35.04% of TV shows have only one season, 32.48% have two seasons, and 32.48% have three seasons**.

### --> Top nations for producing film content

In [None]:
# Movies per principal country, for the 15 countries with the most movies.
top_movie_countries = movie_df['country_main'].value_counts().index[:15]

plt.figure(figsize=(15, 8))
sb.set(style="dark")
sb.countplot(data=movie_df, x="country_main", order=top_movie_countries, palette="plasma")

- **United States** produces the most films, with ***India and UK*** coming in second and third.



### --> Top nations for producing TV show content


In [None]:
# TV shows per principal country, for the 15 countries with the most shows.
top_tv_countries = tv_df['country_main'].value_counts().index[:15]

plt.figure(figsize=(18, 8))
sb.set(style="white")
sb.countplot(data=tv_df, x="country_main", order=top_tv_countries, palette="plasma")

- **South Korea, Japan, the United States, and the United Kingdom** produce the majority of the TV shows on Netflix.


### --> Understanding the content available in different countries


In [None]:
# Different Genres from Countries
columns=['Genre','country_main']
gen_country = df[columns]

In [None]:
gen_country.head()

In [None]:
gen_country['Genre'].value_counts()

In [None]:
size = gen_country['Genre'].value_counts().tolist()
size

In [None]:
# GENRES BY COUNTRIES
import squarify

# One tile per genre, sized by its number of titles and labelled with the genre
# plus the country that contributes the most titles to it. Passing the raw
# per-row columns as `label`/`value` pairs each tile with whichever row happens
# to share its position, not with the genre the tile represents.
genre_counts = gen_country['Genre'].value_counts()
top_country = gen_country.groupby('Genre')['country_main'].agg(lambda s: s.value_counts().idxmax())
tile_labels = [f"{genre}\n{top_country[genre]}" for genre in genre_counts.index]

plt.figure(figsize=(30,15))
squarify.plot(sizes=genre_counts.tolist(), label=tile_labels, alpha=0.8)
plt.rc('font', size=8)
plt.show()

- The majority of content in the comedy and children's and family movie genres is produced in the United States.

### --> Contents published throughout years

In [None]:
# Number of titles per release year, in chronological order.
release = df['release_year'].value_counts().sort_index(ascending=True)

# The ten entries that precede the most recent release year.
recent = release.iloc[-11:-1]

plt.figure(figsize=(9,7))
plt.plot(recent)
# Marker area grows with the number of titles released that year.
plt.scatter(recent.index, recent.values, s=0.5 * recent.values, c='blue');
plt.box(on=None);
plt.xticks(rotation = 60)
plt.xticks(recent.index);
plt.title('Number of Content Released by Year', color='blue', fontsize=20);

### --> Directors who have made the most amount of films


In [None]:
# Ten most frequent values of 'director' among movies, sorted ascending for plotting.
top_movie_directors = movie_df.director.value_counts()[:10].sort_values()

plt.figure(figsize=(10,8))
sb.barplot(y=top_movie_directors.index, x=top_movie_directors.values);
plt.title('Director with most number of movies', color='pink', fontsize=18)
plt.xticks(top_movie_directors.values);
plt.xlabel('Number of Movies Released');


- **Jan Suter and Raul Campos** have directed the highest number of films released on Netflix so far: **18**.


In [None]:
# Ten most frequent values of 'director' among TV shows, sorted ascending for plotting.
top_tv_directors = tv_df.director.value_counts()[:10].sort_values()

plt.figure(figsize=(10,8))
sb.barplot(y=top_tv_directors.index, x=top_tv_directors.values);
plt.title('Director with most number of TV Shows', color='blue', fontsize=18)
plt.xticks(top_tv_directors.values);
plt.xlabel('Number of Series Released');

- Director **Alastair Fothergill** has the most TV shows published on Netflix: **3**.


### --> Netflix's Most Popular Genre

In [None]:
# Ten most common primary genres, sorted ascending so the tallest bar is last.
top_genres = df.Genre.value_counts()[:10].sort_values()

plt.figure(figsize=(18,10))
sb.barplot(x=top_genres.index, y=top_genres.values, palette='plasma');
plt.title('Most Popular Genre', color='green', fontsize=20)
plt.yticks(top_genres.values);
plt.xlabel('GENRES');
plt.ylabel('Number of contents');

- **Netflix is aware of how much drama we all like! :p**

### --> Top Netflix actors and actresses


In [None]:
# Count individual performers. This treats 'cast' as a comma-separated list of
# names per title (as 'country' and 'listed_in' are handled above), so each
# name is given its own row before counting; counting the raw column would
# rank whole cast lists rather than actors.
cast_names = df['cast'].str.split(',').explode().str.strip()
top_cast = cast_names[cast_names != ''].value_counts().head(15).sort_values()

plt.figure(figsize=(18,14))
sb.barplot(y=top_cast.index, x=top_cast.values, palette='plasma');
plt.title('Top Actor/Actresses on Netflix', color='green', fontsize=30)
# Tick positions come from the same 15 bars that are drawn.
plt.xticks(top_cast.values);
plt.ylabel('Actor/Actresses', fontsize=25);
plt.xlabel('Content counts', fontsize=25);

- The actor **'Samuel West'** has the most Netflix films and TV series.

### --> The ideal month for content releases from filmmakers


- We can examine the months when the least amount of content is added to determine which months would be ideal for directors to release their work in order to capture the interest of a larger audience.


In [None]:
# Build a month-by-year matrix of how many titles were added.
# Work on an explicit copy of the single column needed, so the new helper
# columns are not written into a slice of `df`.
net_date = df[['date_added']].copy()
added = net_date['date_added'].str.strip()   # tolerate stray leading/trailing blanks
net_date['year'] = added.str.split(', ').str[-1]
net_date['month'] = added.str.split(' ').str[0]

# December first, so that January ends up in the top row of the heatmap.
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'][::-1]
# `reindex` (instead of plain column selection) keeps this working even when a
# month never occurs in the data: the missing column is created and zero-filled.
df_copy = (
    net_date.groupby('year')['month'].value_counts().unstack()
    .reindex(columns=month_order)
    .fillna(0)
    .T
)


In [None]:
# Heatmap of titles added per month (rows) and year (columns), drawn from the
# month-by-year matrix stored in `df_copy`.
plt.figure(figsize=(10, 7), dpi=200)
plt.pcolor(df_copy, cmap='coolwarm', edgecolors='black', linewidths=2) # one coloured cell per (month, year) count
# Ticks are placed at 0.5, 1.5, ... so each label is centred on its cell.
plt.xticks(np.arange(0.5, len(df_copy.columns), 1), df_copy.columns, fontsize=7, fontfamily='serif')
plt.yticks(np.arange(0.5, len(df_copy.index), 1), df_copy.index, fontsize=7, fontfamily='serif')

# NOTE(review): 'calibri' may not be installed on every system - confirm the font is available.
plt.title('Netflix Contents Update', fontsize=12, fontfamily='calibri', fontweight='bold')
cbar = plt.colorbar()
cbar.solids.set_edgecolor("face")

# Smaller tick labels and minor ticks on the colour bar.
cbar.ax.tick_params(labelsize=8) 
cbar.ax.minorticks_on()
plt.show()

# Diagnostic Analytics

# Release Year vs. Movie Duration:

Examine the connection between a film's runtime and its year of release.

In [None]:
# One point per movie: release year against runtime, coloured by principal country.
fig, ax = plt.subplots(figsize=(15, 8))
sb.scatterplot(data=movie_df, x='release_year', y='duration', hue='country_main',
               palette='viridis', alpha=0.7, ax=ax)
ax.set_title('Release Year vs. Movie Duration', fontsize=20)
ax.set_xlabel('Release Year')
ax.set_ylabel('Movie Duration (minutes)')
ax.legend(title='Country')
plt.show()

This scatter plot helps to diagnose trends in movie duration over the years and how it varies by country.

### Genre Popularity Over Time:

Analyse the shifts in the popularity of various genres throughout time.


In [None]:
# Number of titles for every (release year, genre) pair, in long format.
genre_year = df.groupby(['release_year', 'Genre']).size().reset_index(name='counts')

fig, ax = plt.subplots(figsize=(15, 8))
sb.lineplot(data=genre_year, x='release_year', y='counts', hue='Genre',
            marker='o', palette='tab10', ax=ax)
ax.set_title('Genre Popularity Over Time', fontsize=20)
ax.set_xlabel('Release Year')
ax.set_ylabel('Number of Releases')
ax.legend(title='Genre')
plt.show()

This line plot analyses how audience tastes have changed over time by tracking the popularity of various genres.


### Country-wise Content Release Analysis:

Examine the trends in content release over time for the leading nations.

In [None]:
# Restrict to the ten principal countries with the most titles, then count
# titles for every (release year, country) pair.
top_countries = df['country_main'].value_counts().nlargest(10).index
in_top_countries = df['country_main'].isin(top_countries)
country_year = (
    df[in_top_countries]
    .groupby(['release_year', 'country_main'])
    .size()
    .reset_index(name='counts')
)

fig, ax = plt.subplots(figsize=(15, 8))
sb.lineplot(data=country_year, x='release_year', y='counts', hue='country_main',
            marker='o', palette='tab20', ax=ax)
ax.set_title('Content Release Patterns of Top Countries Over the Years', fontsize=20)
ax.set_xlabel('Release Year')
ax.set_ylabel('Number of Releases')
ax.legend(title='Country')
plt.show()

Understanding regional trends in content production is aided by this line plot, which examines the patterns of content release for the top nations.

### Director Productivity Analysis:

Examine the senior directors' historical production over time.

In [None]:
# Restrict to the ten most frequent 'director' values, then count titles for
# every (release year, director) pair.
top_directors = df['director'].value_counts().nlargest(10).index
by_top_director = df['director'].isin(top_directors)
director_year = (
    df[by_top_director]
    .groupby(['release_year', 'director'])
    .size()
    .reset_index(name='counts')
)

fig, ax = plt.subplots(figsize=(15, 8))
sb.lineplot(data=director_year, x='release_year', y='counts', hue='director',
            marker='o', palette='tab10', ax=ax)
ax.set_title('Productivity of Top Directors Over the Years', fontsize=20)
ax.set_xlabel('Release Year')
ax.set_ylabel('Number of Releases')
ax.legend(title='Director')
plt.show()

The production tendencies of eminent filmmakers are examined in this line plot across time, highlighting their contribution to Netflix's content collection.

# Predictive Analytics

Netflix Recommendation System

## Content Based Filtering

- The content of the film (actor, synopsis, director, genre, etc.) is used by this recommender system to determine how similar it is to other films. Next, suggested films are those that are most likely to be comparable.


![](https://miro.medium.com/max/998/1*O_GU8xLVlFx8WweIzKNCNw.png)

## Recommender based on the plot description


- Based on the storyline descriptions of each movie, we will determine a similarity score for each one and make recommendations for other films based on that score. The **description** feature of our dataset contains the plot description.


In [None]:
df['description'].head()

- Every description has to be converted into a word vector. For every description, we will calculate the TF-IDF (Term Frequency-Inverse Document Frequency) vector. Each word's overall importance to the document in which it appears is TF * IDF. This reduces the weight of terms that appear in many plot descriptions, and therefore their influence on the final similarity score.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel
#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

In [None]:
# Create the TF-IDF vectorizer; stop_words='english' drops common English
# words such as 'the' and 'a' from the vocabulary.
tfidf = TfidfVectorizer(stop_words='english')

# Replace missing descriptions with an empty string before vectorizing.
df['description'] = df['description'].fillna('')

# Fit the vocabulary on the descriptions and transform them in one step:
# the result has one row per row of `df`, in the same order.
tfidf_matrix = tfidf.fit_transform(df['description'])

# Show the matrix dimensions (rows, columns).
tfidf_matrix.shape

- Because TfidfVectorizer normalises every vector to unit length by default, the dot product of two vectors is already their cosine similarity. We therefore use sklearn's **linear_kernel()** instead of cosine_similarity(), since it is faster.

In [None]:
# linear_kernel computes plain dot products between the rows of its arguments.
from sklearn.metrics.pairwise import linear_kernel

# Pairwise similarity of every description with every other description.
# Row/column i corresponds to the i-th row (by position) of `df`.
# NOTE(review): the dot product equals cosine similarity only for unit-length
# vectors; this relies on the vectorizer's default normalisation - confirm.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

- Given a movie's title, we require a way to determine its index in our metadata DataFrame.

In [None]:
# Reverse map: title -> row *position* in `df`, which is also the row/column
# position in `cosine_sim` (the matrix was built from `df` in row order).
# Rows were dropped from `df` earlier, so its index labels are no longer
# 0..n-1 and must not be used to index the matrix.
indices = pd.Series(np.arange(len(df)), index=df['title'])
# Keep the first row of a repeated title so that a lookup always yields one integer.
indices = indices[~indices.index.duplicated(keep='first')]

- Let's develop a function that returns a list of the ten most comparable movies when given a movie title as input.

In [None]:
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the titles of the 10 entries most similar to `title`.

    Parameters
    ----------
    title : str
        Exact title as it appears in ``df['title']``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose row/column order matches the row
        order of ``df`` (defaults to the description-based matrix).

    Returns
    -------
    pandas.Series
        Up to 10 titles, most similar first.

    Raises
    ------
    KeyError
        If `title` is not present in ``df['title']``.
    """
    # Locate the row by *position* directly in `df`: the similarity matrix is
    # positional, whereas df's index labels have gaps after rows were dropped,
    # and the module-level `indices` is rebuilt later with normalised titles.
    positions = np.flatnonzero(df['title'].to_numpy() == title)
    if positions.size == 0:
        raise KeyError(f"title not found: {title!r}")
    idx = int(positions[0])   # first match if the title occurs more than once

    # Pair every other row position with its similarity to the query row.
    # The query itself is excluded explicitly instead of assuming it sorts first.
    sim_scores = [(pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx]

    # Highest similarity first, then keep the ten best.
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:10]

    movie_indices = [pos for pos, _ in sim_scores]

    # Translate row positions back into titles.
    return df['title'].iloc[movie_indices]

In [None]:
get_recommendations('#realityhigh')

In [None]:
get_recommendations('PK')

- These recommendations are entirely dependent on the narrative. Since they are not very precise, we may attempt to enhance model performance by including other metrics.

## Recommender system based on several features (Genre, Cast, Director)



- We must identify the three most significant actors, the director, and the genres connected to the film from the genre, cast, and director features.


In [None]:
# Columns that feed the metadata-based recommender.
features=['Genre','director','cast','description','title']
# `.copy()` gives an independent frame: its columns are overwritten and a
# 'soup' column is added below, which must not go through a slice of `df`.
filters = df[features].copy()

In [None]:
#Normalise a text value: remove every space, then lower-case what is left.
def clean_data(x):
    """Return `x` with all space characters removed and converted to lower case."""
    without_spaces = x.replace(" ", "")
    return without_spaces.lower()

In [None]:
# Collapse multi-word names, genres and titles into single lower-case tokens
# (clean_data removes the spaces) so the vectorizer treats each one as a unit.
# The free-text description is only lower-cased: removing its spaces would
# fuse the whole synopsis into one token that can never match another title.
for feature in features:
    if feature == 'description':
        filters[feature] = filters[feature].str.lower()
    else:
        filters[feature] = filters[feature].apply(clean_data)
    
filters.head()

- Now that we have all the metadata we want to give our vectorizer in a string, we can make our "metadata soup".

In [None]:
def create_soup(x):
    """Join director, cast, genre and description of one row into a single space-separated string."""
    parts = (x['director'], x['cast'], x['Genre'], x['description'])
    return ' '.join(parts)

In [None]:
filters['soup'] = filters.apply(create_soup, axis=1)

- The next actions are the same as those we took with our recommender based on a storyline description. The fact that we employ **CountVectorizer()** rather than TF-IDF is one significant distinction.



In [None]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(filters['soup'])

In [None]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [None]:
filters

In [None]:
# Give `filters` a clean 0..n-1 index so its labels equal its row positions
# (and therefore the rows of `cosine_sim2`), then map each normalised title to
# that position. drop=True discards the old labels instead of inserting them
# as an extra 'index' column, which also keeps this cell safe to re-run.
filters=filters.reset_index(drop=True)
indices = pd.Series(filters.index, index=filters['title'])
# A repeated title keeps its first position so a lookup returns a single integer.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
def get_recommendations_new(title, cosine_sim=cosine_sim):
    """Return the titles of the 10 entries most similar to `title`.

    The title is normalised (spaces removed, lower-cased) before it is looked
    up in the module-level `indices` map, so 'Black panther' and
    'blackpanther' refer to the same entry.

    Parameters
    ----------
    title : str
        Title to look up; matching ignores spaces and letter case.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix in the row order of ``df``.
        NOTE(review): the default is the description-only matrix; pass
        ``cosine_sim2`` to rank by the combined metadata "soup".

    Returns
    -------
    pandas.Series
        Up to 10 titles from ``df['title']``, most similar first.

    Raises
    ------
    KeyError
        If the normalised title is not present in `indices`.
    """
    key = title.replace(' ', '').lower()
    if key not in indices.index:
        raise KeyError(f"title not found: {title!r}")

    idx = indices[key]
    # A title that occurs more than once yields a Series; use its first position.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every other row position with its similarity to the query row.
    # The query itself is excluded explicitly instead of assuming it sorts first.
    sim_scores = [(pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx]

    # Highest similarity first, then keep the ten best.
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:10]

    movie_indices = [pos for pos, _ in sim_scores]

    # Translate row positions back into the original (un-normalised) titles.
    return df['title'].iloc[movie_indices]

In [None]:
get_recommendations_new('PK', cosine_sim2)

In [None]:
get_recommendations_new('Black panther', cosine_sim2)

In [None]:
get_recommendations_new('Naruto', cosine_sim2)