In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the dataset from netflix.csv; the path is relative to the notebook's
# working directory, so the file must sit next to the notebook.
raw_data = pd.read_csv(filepath_or_buffer='netflix.csv')

## Initial review of the raw data

In [None]:
# Print the column names, non-null counts and dtypes of the freshly loaded frame.
raw_data.info()

In [None]:
# Preview the first five rows (five is also the default for head()).
raw_data.head(n=5)

In [None]:
# Count the missing values in each column (isna is the same method as isnull).
raw_data.isna().sum()

## Challenges
1. Nested Data
    - cast
    - director
    - listed_in
    - country
2. Missing Values / Null Values
    - Director:   2634
    - Cast:       825
    - Country:    831
    - Date_Added: 10
    - Rating:     4
    - Duration:   3
3. Duration
    - Mins for Movies
    - #Seasons for TV Shows
4. Datatype
    - date_added

In [None]:
# Work on an independent copy. A plain `data = raw_data` only creates a second
# name for the same DataFrame, so the column assignments in the following cells
# would silently overwrite raw_data as well.
data = raw_data.copy()

## Challenge #1: Handling Nested Data

In [None]:
# Columns that hold several comma-separated values in a single cell.
nested_columns = ['cast', 'director', 'listed_in', 'country']

# Turn each comma-separated string into a list. assign() returns a new frame,
# so the frame that `data` pointed to before this cell is left untouched.
data = data.assign(**{name: data[name].str.split(',') for name in nested_columns})

# Explode one column at a time: every list element gets its own row, and the
# original row label is repeated in the index for each of those rows.
for name in nested_columns:
    data = data.explode(name)
    # Splitting on ',' leaves a leading space on every value after the first
    # (" Canada" vs "Canada"); strip it so identical values compare equal.
    data = data.assign(**{name: data[name].str.strip()})

In [None]:
# Inspect the row count, dtypes and non-null counts after exploding the nested columns.
data.info()

## Challenge #2: Handling Missing Values / Null Values
- Here we can observe that the size of the dataset has increased from 8K to 202K rows after exploding a few columns.
- Now let's handle the nulls:
    - We need to fill in placeholder values for Cast, Director and Country, because these columns contain many nulls and dropping those rows would lose too much data.
    - For Date_Added, Rating and Duration the number of nulls is very small, so we can simply drop those rows.

In [None]:
# Placeholder written into each column that has too many nulls to drop.
placeholders = {
    'cast': 'No Cast',
    'director': 'No Director',
    'country': 'No Country',
}

# Fill and drop in one non-inplace chain. Calling fillna(inplace=True) on
# data['col'] works on a temporary Series: it triggers a chained-assignment
# warning on pandas 2.x and has no effect on `data` under Copy-on-Write.
data = (
    data
    .fillna(placeholders)
    # Only a handful of rows lack these fields, so those rows are removed.
    .dropna(subset=['date_added', 'rating', 'duration'])
)

In [None]:
# Re-check the row count and non-null counts after the fill/drop step.
data.info()

In [None]:
# Per-column flag: True if the column still contains at least one missing value.
data.isna().any()

## Challenge #3: Duration
- We can group the data based on duration and perform further analysis
- Things to keep in mind with the duration
    - Duration in mins: Movie
    - Duration in #Seasons: TV Show

## Challenge #4: Datatype of date_added column
- We need to convert the date_added column to a datetime datatype

In [None]:
date_added = data['date_added']

# Only parse while the column is still text, so re-running this cell is harmless.
if not pd.api.types.is_datetime64_any_dtype(date_added):
    # Trim surrounding whitespace before parsing: pandas >= 2.0 infers a single
    # format from the first value and applies it strictly, so a padded date
    # string would raise instead of being parsed.
    # NOTE(review): assumes the raw file may contain such padded values - confirm.
    date_added = pd.to_datetime(date_added.str.strip())

data = data.assign(date_added=date_added)

In [None]:
# Check the dtype now reported for date_added.
data.info()

## 1. Let's explore the basic metrics

In [None]:
# Count each title once. After exploding, a title occupies one row per
# cast x director x genre x country combination, so counting raw rows would
# weight titles by how many combinations they have. explode() and dropna()
# keep the original row label, so the first row per index label stands for
# one original title (this assumes the index has not been reset since loading).
one_row_per_title = ~data.index.duplicated()
data_type = data.loc[one_row_per_title, 'type'].value_counts()

In [None]:
# Pie chart of the per-type counts, drawn through the explicit figure/axes
# interface instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('Number of show by Type', color='White', size=18)
ax.pie(data_type, labels=data_type.index)
plt.show()

In [None]:
# Show the category labels of the counts; these are the labels used for the pie slices.
data_type.index