# 1. Initializations

## 1.1 General imports

In [None]:
### data management
import pandas as pd
import numpy as np

### graphical matplotlib basics
import matplotlib.pyplot as plt
# for jupyter notebook management
%matplotlib inline
%config Completer.use_jedi = False

### graphical seaborn basics
import seaborn as sns

### graphical plotly basics
# import plotly.graph_objects as go
# import plotly.express as px


## 1.2 General dataframe functions

In [None]:
import smartcheck.dataframe_common as dfc

## 1.3 General seaborn functions

In [None]:
# None

# 2. Loading and Data Quality

## 2.1 Loading of data sets and general exploration

### 2.1.1 NETFLIX

In [None]:
# Read the Netflix catalogue through the project's config-driven loader.
df_netflix = dfc.load_dataset_from_config("netflix_data", sep=",")

# isinstance() is already False for None, so one check covers both
# "nothing was returned" and "something other than a DataFrame was returned".
if isinstance(df_netflix, pd.DataFrame):
    display(df_netflix.head())
    dfc.log_general_info(df_netflix)
    # The helper returns two values; the duplicate index map is only printed
    # when they differ (presumably first-occurrence vs. total duplicate
    # counts, judging by the names -- confirm against smartcheck).
    nb_first, nb_total = dfc.detect_and_log_duplicates_and_missing(df_netflix)
    if nb_first != nb_total:
        print(dfc.duplicates_index_map(df_netflix))

### 2.1.2 IMDB

In [None]:
# Read the IMDB data set through the project's config-driven loader.
df_imdb = dfc.load_dataset_from_config("imdb_data", sep=",")

# isinstance() is already False for None, so one check covers both
# "nothing was returned" and "something other than a DataFrame was returned".
if isinstance(df_imdb, pd.DataFrame):
    display(df_imdb.head())
    dfc.log_general_info(df_imdb)
    # The helper returns two values; the duplicate index map is only printed
    # when they differ (presumably first-occurrence vs. total duplicate
    # counts, judging by the names -- confirm against smartcheck).
    nb_first, nb_total = dfc.detect_and_log_duplicates_and_missing(df_imdb)
    if nb_first != nb_total:
        print(dfc.duplicates_index_map(df_imdb))

## 2.2 Data quality refinement

### 2.2.1 NETFLIX

In [None]:
# Guard first: without a DataFrame there is nothing to refine.
if not isinstance(df_netflix, pd.DataFrame):
    print("df_netflix is not loaded correctly.")
else:
    # Keep a copy of the frame as loaded, then continue with exact duplicate
    # rows removed.
    df_netflix_orig = df_netflix.copy()
    df_netflix = df_netflix.drop_duplicates()

In [None]:
# Analysis of variable rating
dfc.display_variable_info(df_netflix["rating"])

# Most frequent rating, computed once and reused for both the report line
# and the imputation of missing values.
rating_mode = df_netflix["rating"].mode()[0]
print("valeur la plus représentée :", rating_mode)
df_netflix["rating"] = df_netflix["rating"].fillna(rating_mode)

In [None]:
# Analysis of variable director (only the first 10 entries are inspected)
director_preview = df_netflix["director"][:10]
dfc.display_variable_info(director_preview)

director_mode = df_netflix["director"].mode()[0]
print("valeur la plus représentée :", director_mode)
# Missing directors are deliberately NOT replaced by the most frequent value,
# so that the distribution of the data is not distorted.

### 2.2.2 IMDB

In [None]:
# Guard first: without a DataFrame there is nothing to refine.
if not isinstance(df_imdb, pd.DataFrame):
    print("df_imdb is not loaded correctly.")
else:
    # Keep a copy of the frame as loaded, then continue with exact duplicate
    # rows removed.
    df_imdb_orig = df_imdb.copy()
    df_imdb = df_imdb.drop_duplicates()

In [None]:
# Rate -> numeric averageRating; entries that cannot be parsed become NaN
# because of errors="coerce".
rate_as_number = pd.to_numeric(df_imdb["Rate"], errors="coerce")
df_imdb["averageRating"] = rate_as_number
top_ratings = df_imdb["averageRating"].sort_values(ascending=False).head(10)
dfc.display_variable_info(top_ratings)

# Votes -> numeric numVotes; the commas are stripped before conversion.
votes_without_commas = df_imdb["Votes"].str.replace(",", "")
df_imdb["numVotes"] = pd.to_numeric(votes_without_commas, errors="coerce")
top_votes = df_imdb["numVotes"].sort_values(ascending=False).head(10)
dfc.display_variable_info(top_votes)

# NOTE(review): the previous cell already dropped duplicates and both new
# columns are derived from existing ones, so this looks like a no-op --
# confirm before removing.
df_imdb = df_imdb.drop_duplicates()

## 2.3 Data combination and rework

In [None]:
# Inner join: keep only the titles found in both sources, matched on
# (title, release year) on the Netflix side and (Name, Date) on the IMDB side.
all_content = pd.merge(
    df_netflix,
    df_imdb,
    left_on=["title", "release_year"],
    right_on=["Name", "Date"],
    how="inner",
)
display(all_content.head())

# Movies only; .copy() so that the column rewrite below works on an
# independent frame rather than on a slice of all_content.
all_movies = all_content[all_content["type"] == "Movie"].copy()

# Turn "<n> min" into a number. The vectorised .str accessor propagates a
# missing duration as NaN, whereas a per-row lambda calling .replace() raises
# AttributeError on it. pd.to_numeric (default errors="raise") still fails
# loudly on text that is not a number; the column is integer when every value
# is present and float when some are missing.
all_movies["duration"] = pd.to_numeric(
    all_movies["duration"].str.replace(" min", "", regex=False)
)
display(all_movies.head())

# 2. Data Viz' and Analysis

## 2.1 General Data Viz'

### 2.1.1 Seaborn

In [None]:
# Global seaborn look applied to every figure drawn after this cell.
sns.set_theme(
    style="ticks",
    context="talk",
    palette="bright",
)

## 2.1 Quantitative mono variable distribution

### 2.1.2 Seaborn

In [None]:
# Histogram (15 bins) with a kernel density estimate overlay of averageRating
# for the movies in all_movies.
graph = sns.displot(
    data=all_movies,
    x="averageRating",
    kind="hist",
    bins=15,
    kde=True,
    height=8,
    aspect=1.5,
)
# Use `.figure`, like the other plotting cells of this notebook; `.fig` is
# the deprecated spelling of the same FacetGrid attribute.
graph.figure.suptitle(
    "Répartition par histogramme et estimation de la densité de noyau pour averageRating",
    y=1.04,
    fontsize=20,
)

plt.show()

In [None]:
# Line plot of averageRating against duration, restricted to movies whose
# country is exactly 'United States' and whose duration is below 120.
us_short_movies = all_movies[
    (all_movies["country"] == "United States") & (all_movies["duration"] < 120)
]
graph = sns.relplot(
    data=us_short_movies,
    x="duration",
    y="averageRating",
    kind="line",
    height=8,
    aspect=1.5,
)
# Use `.figure`, like the other plotting cells of this notebook; `.fig` is
# the deprecated spelling of the same FacetGrid attribute.
graph.figure.suptitle(
    "Relation entre la durée d'un film américain de moins de 120min et sa note moyenne IMDB",
    y=1.04,
    fontsize=20,
)

plt.show()

## 2.2 Qualitative mono variable distribution

### 2.2.3 Seaborn

In [None]:
# data extraction: one row per individual director. Comma-separated credits
# are split into columns, stacked back into a single Series (which drops the
# empty slots) and re-indexed from 0.
directors = (
    all_content["director"]
    .str.split(", ", expand=True)
    .stack()
    .reset_index(drop=True)
)
top_directors_list = directors.value_counts().head(8)
top_directors = directors.to_frame(name="Director")

# Rows belonging to the 8 most frequent directors, computed once and used for
# both the data and the hue of the plot.
top8_rows = top_directors[top_directors["Director"].isin(top_directors_list.index)]

graph = sns.countplot(
    data=top8_rows,
    y="Director",
    order=top_directors_list.index,
    hue=top8_rows["Director"],
    legend=False,
)
graph.set_xticks(np.arange(0, 11, 2))
graph.set_xlabel("Nombre de films")
graph.figure.suptitle("Nombre de film pour le top 8 des Directors", y=1.04, fontsize=20)

plt.show()

In [None]:
# data extraction: year part of date_added, parsed after stripping the
# surrounding whitespace.
added_on = pd.to_datetime(all_content["date_added"].str.strip())
all_content["year"] = added_on.dt.year

# Count of titles per year, one bar colour per content type.
graph = sns.catplot(
    data=all_content,
    x="year",
    kind="count",
    hue="type",
    height=6,
    aspect=1.5,
)
graph.figure.suptitle("Nombre de contenu par année et par type", y=1.04, fontsize=20)

plt.show()

## 2.3 Qualitative multi variable distribution

### 2.3.3 Seaborn

In [None]:
# data extraction: titles whose country mentions 'United Kingdom'.
# na=False turns a missing country into an explicit non-match instead of
# leaving NaN inside the boolean mask.
is_uk = all_content["country"].str.contains("United Kingdom", na=False)
uk_movies = all_content.loc[(all_content["type"] == "Movie") & is_uk]
uk_series = all_content.loc[(all_content["type"] == "TV Show") & is_uk]

# Most voted first, so that head(5) below yields the top 5.
uk_movies = uk_movies.sort_values(by="numVotes", ascending=False)
uk_series = uk_series.sort_values(by="numVotes", ascending=False)

# One entry per panel: (frame, x-axis label, y tick positions, y tick labels).
# The tick values are hand-picked for the vote ranges of each panel.
panels = [
    (
        uk_movies,
        "Films anglais les plus populaires",
        [0, 500000, 1000000, 1500000, 2000000, 2500000],
        ["0", "500k", "1M", "1,5M", "2M", "2,5M"],
    ),
    (
        uk_series,
        "Séries anglaises les plus populaires",
        [0, 5000, 10000, 20000, 30000],
        ["0", "5k", "10k", "20k", "30k"],
    ),
]

# Two bar charts side by side, built by the same code path.
fig, axs = plt.subplots(figsize=(10, 8), nrows=1, ncols=2)
for ax, (frame, x_label, vote_ticks, vote_tick_labels) in zip(axs, panels):
    sns.barplot(data=frame.head(5), x="Name", y="numVotes", ax=ax)

    # Tilt the title names and anchor them on their right end.
    ax.tick_params(axis="x", labelrotation=45)
    for label in ax.get_xticklabels():
        label.set_ha("right")

    # Human-readable vote counts on the y axis.
    ax.set_yticks(vote_ticks, vote_tick_labels)

    # Axis labels.
    ax.set_xlabel(x_label)
    ax.set_ylabel("Nombre de votes")

# Figure title.
fig.suptitle("Top 5 des séries et films anglais les plus populaires", y=1.02, fontsize=20)

plt.tight_layout()
plt.show()

## 2.4 Quantitative multi variable correlation

### 2.4.2 Seaborn

In [None]:
# data extraction: year part of date_added. Recomputed here so that this cell
# does not depend on the earlier per-year count cell having been run.
all_content["year"] = pd.to_datetime(all_content["date_added"].str.strip()).dt.year

# lowess=True draws a locally weighted (LOWESS) smoother rather than a
# straight regression line, so the title names that method.
graph = sns.lmplot(
    data=all_content,
    x="year",
    y="averageRating",
    lowess=True,
    height=6,
    aspect=1.5,
)
graph.figure.suptitle(
    "Evolution de la qualité du contenu du catalogue au fil du temps (régression locale LOWESS par année)",
    y=1.04,
    fontsize=20,
)

plt.show()