# 1. Initializations

## 1.1 General imports

In [None]:
### data management
import pandas as pd
import numpy as np

### graphical matplotlib basics
import matplotlib.pyplot as plt
# for jupyter notebook management
%matplotlib inline

### graphical seaborn basics
import seaborn as sns

## 1.2 General dataframe functions

In [None]:
import smartcheck.dataframe_common as dfc

## 1.3 General seaborn functions

In [None]:
# None

# 2. Loading and Data Quality

## 2.1 Loading of data sets and general exploration

### 2.1.1 NETFLIX

In [None]:
# Load the raw Netflix dataset through the project helper (config key 'netflix_data').
df_netflix_raw = dfc.load_dataset_from_config('netflix_data', sep=',')

# Fail loudly when loading did not yield a DataFrame (this also covers None):
# silently skipping the block would leave `df_netflix` undefined, or stale from a
# previous run of the kernel, for every cell that follows.
if not isinstance(df_netflix_raw, pd.DataFrame):
    raise TypeError(
        "load_dataset_from_config('netflix_data') returned "
        f"{type(df_netflix_raw).__name__}, expected a pandas DataFrame"
    )

dfc.log_general_info(df_netflix_raw)
# The helper returns two counts; the duplicates index map is printed only when they differ.
nb_first, nb_total = dfc.detect_and_log_duplicates_and_missing(df_netflix_raw)
if nb_first != nb_total:
    print(dfc.duplicates_index_map(df_netflix_raw))

# Working frame used by the rest of the notebook (column names normalized by the helper).
df_netflix = dfc.normalize_column_names(df_netflix_raw)
display(df_netflix.head())

In [None]:
# Descriptive statistics and pairwise correlations, restricted to the numeric columns.
df_netflix_desc = df_netflix.select_dtypes(include="number").describe()
df_netflix_cr = df_netflix.select_dtypes(include="number").corr()
display(df_netflix_desc, df_netflix_cr)

### 2.1.2 IMDB

In [None]:
# Load the raw IMDB dataset through the project helper (config key 'imdb_data').
df_imdb_raw = dfc.load_dataset_from_config('imdb_data', sep=',')

# Fail loudly when loading did not yield a DataFrame (this also covers None):
# silently skipping the block would leave `df_imdb` undefined, or stale from a
# previous run of the kernel, for every cell that follows.
if not isinstance(df_imdb_raw, pd.DataFrame):
    raise TypeError(
        "load_dataset_from_config('imdb_data') returned "
        f"{type(df_imdb_raw).__name__}, expected a pandas DataFrame"
    )

dfc.log_general_info(df_imdb_raw)
# The helper returns two counts; the duplicates index map is printed only when they differ.
nb_first, nb_total = dfc.detect_and_log_duplicates_and_missing(df_imdb_raw)
if nb_first != nb_total:
    print(dfc.duplicates_index_map(df_imdb_raw))

# Working frame used by the rest of the notebook (column names normalized by the helper).
df_imdb = dfc.normalize_column_names(df_imdb_raw)
display(df_imdb.head())

In [None]:
# Descriptive statistics and pairwise correlations, restricted to the numeric columns.
df_imdb_desc = df_imdb.select_dtypes(include="number").describe()
df_imdb_cr = df_imdb.select_dtypes(include="number").corr()
display(df_imdb_desc, df_imdb_cr)

## 2.2 Data quality refinement

### 2.2.1 NETFLIX

In [None]:
# Keep a copy of the normalized frame as a backup, then continue with a
# version from which fully duplicated rows have been removed.
df_netflix_orig = df_netflix.copy()
df_netflix = df_netflix_orig.drop_duplicates()

In [None]:
# Analysis of variable rating
dfc.display_variable_info(df_netflix.rating)

# Most frequent rating, used as the replacement for missing values.
rating_mode = df_netflix.rating.mode()[0]
print("valeur la plus représentée :", rating_mode)

# Impute on the working frame `df_netflix`, not on `df_netflix_raw`: the working
# frame has already been derived from the raw one, so a fill applied to the raw
# frame would not reach the data used by the rest of the notebook.
# NOTE(review): assumes the normalized frame still names this column `rating` — confirm.
df_netflix = df_netflix.assign(rating=df_netflix.rating.fillna(rating_mode))

In [None]:
# Analysis of variable director (only the first 10 rows are handed to the helper)
dfc.display_variable_info(df_netflix['director'].head(10))
most_common_director = df_netflix['director'].mode()[0]
print("valeur la plus représentée :", most_common_director)
# Missing directors are NOT replaced by the most frequent value, so that the
# distribution of the data is left unaltered.

### 2.2.2 IMDB

In [None]:
# Keep a copy of the normalized frame as a backup, then continue with a
# version from which fully duplicated rows have been removed.
df_imdb_orig = df_imdb.copy()
df_imdb = df_imdb_orig.drop_duplicates()

In [None]:
# Numeric rating: values of `rate` that cannot be parsed become NaN (errors='coerce').
df_imdb['average_rating'] = pd.to_numeric(df_imdb['rate'], errors='coerce')
highest_ratings = df_imdb['average_rating'].sort_values(ascending=False).head(10)
dfc.display_variable_info(highest_ratings)

# Numeric vote count: commas are removed from `votes` before parsing;
# anything still unparseable becomes NaN.
votes_without_commas = df_imdb['votes'].str.replace(',', '')
df_imdb['num_votes'] = pd.to_numeric(votes_without_commas, errors='coerce')
highest_vote_counts = df_imdb['num_votes'].sort_values(ascending=False).head(10)
dfc.display_variable_info(highest_vote_counts)

# Drop fully duplicated rows once more now that the numeric columns exist.
df_imdb = df_imdb.drop_duplicates()

## 2.3 Data combination and rework

In [None]:
# Inner join of the two catalogues: a row is kept only when the Netflix
# (title, release_year) pair equals the IMDB (name, date) pair.
all_content = pd.merge(
    df_netflix,
    df_imdb,
    left_on=['title', 'release_year'],
    right_on=['name', 'date'],
    how='inner',
)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in display() only added a stray "None" to the output.
all_content.info()

# Movies only; .copy() so that adding a column does not write into a slice of all_content.
all_movies = all_content[all_content.type_x == 'Movie'].copy()
# Vectorised parsing of the "<n> min" strings. Unlike a per-row str.replace call,
# this does not raise on a missing duration: such rows become NaN (the column is
# then float instead of int) and simply fail numeric filters such as `duration < 120`.
all_movies['duration'] = pd.to_numeric(
    all_movies.duration_x.str.replace(' min', '', regex=False),
    errors='coerce',
)
display(all_movies.head())

# 3. Data Viz' and Analysis

## 3.1 General Configuration

In [None]:
# Global seaborn theme, set once here for the figures drawn in the cells below.
sns.set_theme(
    style="ticks",
    context="talk",
    palette="bright",
)

## 3.2 Quantitative mono variable distribution

In [None]:
# Distribution of the IMDB average rating over the movies:
# 15-bin histogram with a kernel density estimate drawn on top.
graph = sns.displot(
    data=all_movies,
    x='average_rating',
    kind='hist',
    bins=15,
    kde=True,
    height=8,
    aspect=1.5,
)
# y > 1 places the figure title above the axes area.
graph.figure.suptitle('Répartition par histogramme et estimation de la densité de noyau pour averageRating', y=1.04, fontsize=20)

plt.show()

In [None]:
# Restrict to movies whose country is exactly 'United States' and whose
# duration is strictly below 120 minutes.
is_us_movie = all_movies['country'] == 'United States'
is_under_two_hours = all_movies['duration'] < 120

# Line plot of the average rating as a function of the duration.
graph = sns.relplot(
    data=all_movies[is_us_movie & is_under_two_hours],
    x='duration',
    y='average_rating',
    kind='line',
    height=8,
    aspect=1.5,
)
# y > 1 places the figure title above the axes area.
graph.figure.suptitle('Relation entre la durée d\'un film américain de moins de 120min et sa note moyenne IMDB', y=1.04, fontsize=20)

plt.show()

## 3.3 Qualitative mono variable distribution

In [None]:
# data extraction: the `director` field is split on ', ' so that each director
# name gets its own row, then the 8 most frequent names are kept.
directors = all_content.director.str.split(', ', expand=True).stack().reset_index(drop=True)
top_directors_list = directors.value_counts().head(8)
top_directors = directors.to_frame(name='director')

# Rows belonging to one of the top-8 directors, computed once and reused for the plot.
in_top_list = top_directors['director'].isin(top_directors_list.index)
top_directors_rows = top_directors[in_top_list]

# Horizontal count plot, bars ordered by decreasing count; hue is mapped to the
# same variable as y only to colour each bar, hence no legend.
graph = sns.countplot(
    data=top_directors_rows,
    y='director',
    hue='director',
    order=top_directors_list.index,
    legend=False,
)
graph.set_xticks(np.arange(0, 11, 2))
graph.set_xlabel('Nombre de films')
graph.figure.suptitle('Nombre de film pour le top 8 des Directors', y=1.04, fontsize=20)

plt.show()

In [None]:
# data extraction: year taken from `date_added`, whose surrounding whitespace is
# stripped before the date is parsed.
added_on = pd.to_datetime(all_content['date_added'].str.strip())
all_content['year'] = added_on.dt.year

# Number of rows per year, one bar per content type.
graph = sns.catplot(
    data=all_content,
    kind='count',
    x='year',
    hue='type_x',
    height=6,
    aspect=1.5,
)
graph.figure.suptitle('Nombre de contenu par année et par type', y=1.04, fontsize=20)

plt.show()

## 3.4 Qualitative multi variable distribution

In [None]:
# data extraction: content whose `country` field mentions the United Kingdom.
# na=False makes rows with a missing country count as non-matching, so the mask
# is strictly boolean; regex=False matches the text literally.
is_uk = all_content.country.str.contains('United Kingdom', regex=False, na=False)
uk_movies = all_content.loc[(all_content.type_x == "Movie") & is_uk]
uk_series = all_content.loc[(all_content.type_x == "TV Show") & is_uk]

# Most voted first, so that the first 5 rows are the top 5.
uk_movies = uk_movies.sort_values(by='num_votes', ascending=False)
uk_series = uk_series.sort_values(by='num_votes', ascending=False)

# Two bar charts side by side: movies on the left, series on the right.
fig, axs = plt.subplots(figsize=(10, 8), nrows=1, ncols=2)
panels = [
    (axs[0], uk_movies, "Films anglais les plus populaires"),
    (axs[1], uk_series, "Séries anglaises les plus populaires"),
]
for ax, ranked, xlabel in panels:
    sns.barplot(data=ranked.head(5), x='name', y='num_votes', ax=ax)

    # Tilt the x tick labels and right-align them.
    ax.tick_params(axis='x', labelrotation=45)
    for label in ax.get_xticklabels():
        label.set_ha('right')

    # Axis labels.
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Nombre de votes")

# Human-readable y ticks (vote counts); each panel has its own hard-coded scale.
axs[0].set_yticks(
    [0, 500000, 1000000, 1500000, 2000000, 2500000],
    ['0', '500k', '1M', '1,5M', '2M', '2,5M'],
)
axs[1].set_yticks(
    [0, 5000, 10000, 20000, 30000],
    ['0', '5k', '10k', '20k', '30k'],
)

# Figure title.
fig.suptitle('Top 5 des séries et films anglais les plus populaires', y=1.02, fontsize=20)

plt.tight_layout()
plt.show()

## 3.5 Quantitative multi variable correlation

In [None]:
# data extraction: year taken from `date_added`, whose surrounding whitespace is
# stripped before the date is parsed.
added_on = pd.to_datetime(all_content['date_added'].str.strip())
all_content['year'] = added_on.dt.year

# Scatter of the average rating against the year, with a fitted trend line.
# NOTE(review): lowess=True is set, while the title below speaks of a linear
# regression — confirm that the wording matches the intended fit.
graph = sns.lmplot(
    data=all_content,
    x='year',
    y='average_rating',
    lowess=True,
    height=6,
    aspect=1.5,
)
graph.figure.suptitle('Evolution de la qualité du contenu du catalogue au fil du temps (régression linéaire par année)', y=1.04, fontsize=20)

plt.show()