# Projet Santé publique France

## Déjà réalisé



## Analyses univariées

In [None]:
import os
import re
from pprint import pprint

from IPython.display import display
import matplotlib.pyplot as plt
from matplotlib.cbook import boxplot_stats
from matplotlib.offsetbox import (TextArea, DrawingArea,
                                  OffsetImage, AnnotationBbox)
import matplotlib.image as mpimg
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
from tabulate import tabulate
import unicodedata
from wordcloud import WordCloud

from src.features.univar import UnivariateAnalysis
from src.visualization import visualize as viz

# Global plotting setup: start from matplotlib's defaults, apply seaborn's
# theme, then override the font size (after sns.set(), so the override is
# applied last).
plt.rcdefaults()
sns.set()

font = dict(size=16)
plt.rc('font', **font)

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Let pandas print up to 999 rows before truncating displayed frames/series.
pd.options.display.max_rows = 999

In [None]:
# Load the data produced by the second processing step and hand it to the
# univariate-analysis helper.
# NOTE(review): read_pickle executes pickled code; only load trusted files.
data = pd.read_pickle(
    '../data/interim/products_interimV2.pickle'
)
univar = UnivariateAnalysis(data)

In [None]:
# List every column name, one per line.
for column_name in data.columns:
    print(column_name)

In [None]:
# Show the (rows, columns) dimensions of the loaded dataset.
data.shape

In [None]:
# Convert the two label columns to categoricals, in place on `data` (the
# same object was passed to UnivariateAnalysis above).
# The nutriscore grade is ordered; no explicit category list is given.
grade_dtype = pd.CategoricalDtype(ordered=True)
data['nutriscore_grade'] = data['nutriscore_grade'].astype(grade_dtype)
data['main_category_en'] = data['main_category_en'].astype('category')

In [None]:
# Inspect the nutriscore grade column (cast to an ordered categorical above).
data['nutriscore_grade']

In [None]:
# Summary statistics of the dataset, as computed by DataFrame.describe().
data.describe()

In [None]:
# Preview the first rows of the dataset.
data.head()

### Catégories

In [None]:
# Word cloud of the main categories, each word weighted by its number of
# occurrences in `main_category_en`.
category_counts = data['main_category_en'].value_counts().to_dict()
wc = WordCloud(
    width=600,
    height=600,
    background_color='white',
    min_font_size=5,
    max_font_size=40,
    relative_scaling=0.2,
)
wc.generate_from_frequencies(category_counts)

In [None]:
# Show the rendered cloud in the cell output, then save it to the report
# figures. A bare `wc.to_image()` that is not the last expression of the
# cell is evaluated and thrown away, so it must go through display().
display(wc.to_image())
wc.to_file('../reports/figures/wc_categories.png')

In [None]:
# Word cloud of the PNNS level-2 groups, each word weighted by its number
# of occurrences in `pnns_groups_2`.
pnns2_counts = data['pnns_groups_2'].value_counts().to_dict()
wc = WordCloud(
    width=600,
    height=600,
    background_color='white',
    min_font_size=2,
    max_font_size=40,
    relative_scaling=0.5,
)
wc.generate_from_frequencies(pnns2_counts)

In [None]:
# Show the rendered cloud in the cell output, then save it to the report
# figures. A bare `wc.to_image()` that is not the last expression of the
# cell is evaluated and thrown away, so it must go through display().
display(wc.to_image())
wc.to_file('../reports/figures/wc_pnns_g2.png')

### Nutriscore et nutrigrade

In [None]:
# Colour palette reused by the nutriscore plots below (five colours).
nutscore = [
    "#2D7F44",
    "#97BB39",
    "#F4D113",
    "#D67C1C",
    "#C6341B",
]

# Pie chart of the nutriscore grade repartition, saved for the report.
# The options are handed to RepartitionPlot.plot as keyword arguments.
pie_options = dict(
    colors=nutscore,
    explode=(0.1, 0, 0, 0, 0),
    autopct='%1.1f%%',
)
g = viz.RepartitionPlot(data=data, var='nutriscore_grade', plot_type='pie')
g.plot(**pie_options)
plt.savefig('../reports/figures/nutriscore_pie.png')

### Valeurs _100g

Énergie, protéines, graisses, sucres

In [None]:
# Run the univariate analysis (with save=True) on every per-100g column.
per_100g_columns = [name for name in data.columns if name.endswith('_100g')]
for col in per_100g_columns:
    univar.make_analysis(col, save=True)

In [None]:
# For every per-100g column, find the values lying beyond the box-plot
# whiskers and plot which PNNS level-2 groups those outliers belong to.
#
# Fixes over the previous version:
#   * `others_cat` is now always defined (it used to be set only when more
#     than 5 classes were found, raising NameError or reusing a stale value);
#   * low AND high outliers are both plotted (the `elif` skipped the high
#     ones whenever low ones existed);
#   * the number of classes is computed on the subset actually plotted
#     (it used to come from `high` even for the `low` plot).
var = 'pnns_groups_2'
max_shown = 5  # classes beyond this are folded into an "others" category

for col in data.columns:
    if not col.endswith('_100g'):
        continue
    print('=' * 80)
    print(col)
    box_stats = boxplot_stats(data[col]).pop(0)
    whishi = box_stats.get('whishi')
    whislo = box_stats.get('whislo')
    print('whishi', whishi, 'whislo', whislo)
    low = data[data[col] < whislo]
    high = data[data[col] > whishi]

    outliers_found = False
    for side, subset, figsize in (('low', low, (5, 2)),
                                  ('high', high, (3, 2))):
        if subset.shape[0] == 0:
            continue
        outliers_found = True
        n_class = subset[var].drop_duplicates().shape[0]
        others_cat = n_class > max_shown
        g = viz.RepartitionPlot(data=subset, var=var, plot_type='bar',
                                max_class=min(n_class, max_shown))
        g.plot(orient='h', figsize=figsize, others_cat=others_cat)
        plt.savefig(f'../reports/figures/outliers_{side}_{col}.png')

    if not outliers_found:
        print("No outliers detected")
    print('=' * 80)


### Marques

In [None]:
# Horizontal bar chart of the brands repartition, without an "others" class.
g = viz.RepartitionPlot(
    data=data,
    var='brands',
    plot_type='bar',
)
g.plot(orient='h', others_cat=False)
plt.savefig('../reports/figures/brands_repartition.png')

### PNNS groups

In [None]:
# Horizontal bar chart of the PNNS level-1 groups (max_class=10).
g = viz.RepartitionPlot(
    data=data,
    var='pnns_groups_1',
    plot_type='bar',
    max_class=10,
)
g.plot(others_cat=False, orient='h')
plt.savefig('../reports/figures/pnns_1_repartition.png')

In [None]:
# Full horizontal bar chart of the PNNS level-2 groups (max_class=39,
# no "others" class).
g = viz.RepartitionPlot(
    data=data,
    var='pnns_groups_2',
    plot_type='bar',
    max_class=39,
)
g.plot(others_cat=False, orient='h')
plt.savefig('../reports/figures/pnns_2_repartition_full.png')

In [None]:
# Compact horizontal bar chart of the PNNS level-2 groups: max_class=12
# with the "others" class enabled.
g = viz.RepartitionPlot(
    data=data,
    var='pnns_groups_2',
    plot_type='bar',
    max_class=12,
)
g.plot(others_cat=True, orient='h')
# NOTE(review): the output name is spelled "repartion"; kept as-is because
# other files may reference it.
plt.savefig('../reports/figures/pnns_2_repartion_compact.png')

## Analyses bivariées

In [None]:
# TODO: stacked-bar, hue is hardcoded for now

### Marques / Nutriscore

In [None]:
# Load the Nutri-Score logo once; it is overlaid on the stacked-bar plots.
logo_path = '../reports/figures/logos/nutriscore.png'
nutriscore_logo = mpimg.imread(logo_path)

In [None]:
# Stacked bars of the brands (max_class=20), coloured with the nutriscore
# palette.
g = viz.RepartitionPlot(
    data=data,
    var='brands',
    plot_type='stacked-bar',
    max_class=20,
)
g.plot(colors=nutscore, others_cat=False)

# Overlay the Nutri-Score logo at (4000, 1) on the current axes and clear
# the axes' legend reference before saving.
ax = plt.gca()
imagebox = OffsetImage(nutriscore_logo, zoom=0.5)
ab = AnnotationBbox(imagebox, (4000, 1))
ax.add_artist(ab)
ax.legend_ = None
plt.savefig('../reports/figures/brands_nutscore_repartition.png')

In [None]:
# Same brands / nutriscore stacked bars, this time with frequency=True,
# sorted by label (descending) and with the "others" class enabled.
freq_options = dict(
    others_cat=True,
    colors=nutscore,
    frequency=True,
    sort='labels',
    ascending=False,
)
g = viz.RepartitionPlot(
    data=data,
    var='brands',
    plot_type='stacked-bar',
    max_class=20,
)
g.plot(**freq_options)

# Clear the axes' legend reference before saving.
ax = plt.gca()
ax.legend_ = None
plt.savefig('../reports/figures/brands_nutscore_repartition_freq.png')

### PNNS groups / Nutriscore

In [None]:
# Horizontal stacked bars of the PNNS level-1 groups (max_class=9),
# coloured with the nutriscore palette.
g = viz.RepartitionPlot(
    data=data,
    var='pnns_groups_1',
    plot_type='stacked-bar',
    max_class=9,
)
g.plot(orient='h', colors=nutscore, others_cat=False)

# Overlay the Nutri-Score logo at (16000, 1) on the current axes and clear
# the axes' legend reference before saving.
ax = plt.gca()
imagebox = OffsetImage(nutriscore_logo, zoom=0.5)
ab = AnnotationBbox(imagebox, (16000, 1))
ax.add_artist(ab)
ax.legend_ = None
plt.savefig('../reports/figures/pnns1_nutscore_repartition.png')

In [None]:
# PNNS level-1 / nutriscore stacked bars with frequency=True, sorted by
# label in descending order.
freq_options = dict(
    orient='h',
    colors=nutscore,
    others_cat=False,
    frequency=True,
    sort='labels',
    ascending=False,
)
g = viz.RepartitionPlot(
    data=data,
    var='pnns_groups_1',
    plot_type='stacked-bar',
    max_class=9,
)
g.plot(**freq_options)
plt.savefig('../reports/figures/pnns1_nutscore_repartition_freq.png')

In [None]:
# Contingency table: one row per nutriscore grade, one column per PNNS
# level-1 group, each cell holding the number of products.
pair_counts = data.groupby(['pnns_groups_1', 'nutriscore_grade']).size()
# After reset_index() the unnamed counts column is labelled 0.
pair_counts = pair_counts.reset_index()
cont = pair_counts.pivot(
    columns='pnns_groups_1',
    index='nutriscore_grade',
    values=0,
)
cont = cont.astype(int)

In [None]:
# Add the per-grade totals, then export the transposed table to LaTeX.
# Dropping any pre-existing 'Sum' column first keeps this cell idempotent:
# re-running it used to add the previous totals into the new ones.
cont['Sum'] = cont.drop(columns='Sum', errors='ignore').sum(axis=1)
cont.T.to_latex('../reports/latex-report/contingency_tab.tex')

In [None]:
# Horizontal stacked bars of the PNNS level-2 groups (max_class=45) with
# frequency=False, coloured with the nutriscore palette.
plot_options = dict(
    orient='h',
    colors=nutscore,
    others_cat=False,
    frequency=False,
    sort='labels',
    ascending=False,
    figsize=(10, 12),
)
g = viz.RepartitionPlot(
    data=data,
    var='pnns_groups_2',
    plot_type='stacked-bar',
    max_class=45,
)
g.plot(**plot_options)

# Anchor the legend at (1.22, 0.5) in axes coordinates, then tighten the
# layout before saving.
plt.legend(bbox_to_anchor=(1.22, 0.5), loc='upper right')
plt.tight_layout()
plt.savefig('../reports/figures/pnns_2_nutscore_repartition.png')

### Lien entre la valeur énergétique et les nutriments

In [None]:
# Pairwise relationships between energy and the fat / carbohydrates /
# proteins columns, coloured by nutriscore grade.
sns.pairplot(data[['energy_100g', 'fat_100g',
                   'carbohydrates_100g', 'proteins_100g',
                   'nutriscore_grade']], hue="nutriscore_grade",
             palette=nutscore)
# Fix: pyplot has no `figsave`; the call raised AttributeError and the
# figure was never written. The function is `savefig`.
plt.savefig('../reports/figures/relational_plot.png')

### Valeur énergétique / Nutriscore

In [None]:
# Box plot of the energy per 100g, split by nutriscore grade.
g = viz.RepartitionPlot(
    data=data,
    var='energy_100g',
    plot_type='boxplot',
)
g.plot(palette=nutscore, hue='nutriscore_grade')

# Axis labels, then export.
plt.ylabel('Energy')
plt.xlabel('Nutriscore')
plt.savefig('../reports/figures/box_plot_energy_nutscore.png')

In [None]:
# For each nutriscore grade (a..e), find the energy values lying beyond the
# box-plot whiskers and plot which PNNS level-2 groups they belong to.
#
# Fixes over the previous version:
#   * the number of classes (and the "others" flag) is computed on the
#     subset actually plotted; it used to come from `high` even for the
#     `low` plot, giving max_class=0 when there were no high outliers;
#   * the loop variable is no longer rebound from an int to a str.
var = 'pnns_groups_2'
max_shown = 5  # classes beyond this are folded into an "others" category

for score in 'abcde':
    print(f'score {score.upper()}')
    _data = data[data['nutriscore_grade'] == score]
    box_stats = boxplot_stats(_data['energy_100g']).pop(0)
    whishi = box_stats.get('whishi')
    whislo = box_stats.get('whislo')
    print('whishi', whishi, 'whislo', whislo)
    low = _data[_data['energy_100g'] < whislo]
    high = _data[_data['energy_100g'] > whishi]

    for side, subset, figsize in (('low', low, (5, 2)),
                                  ('high', high, (3, 2))):
        if subset.shape[0] == 0:
            continue
        n_class = subset[var].drop_duplicates().shape[0]
        others_cat = n_class > max_shown
        g = viz.RepartitionPlot(data=subset, var=var, plot_type='bar',
                                max_class=min(n_class, max_shown))
        g.plot(orient='h', figsize=figsize, others_cat=others_cat)
        plt.savefig(
            f'../reports/figures/outliers_{side}_{score.upper()}.png')
    print('=' * 80)

### Valeur énergétique selon le groupe PNNS 1

In [None]:
# 3x3 grid of energy distributions, one panel per PNNS level-1 group
# (the 'unknown' group is excluded, the others are sorted by name).
fig, axes = plt.subplots(3, 3, sharex=True, sharey=True, figsize=(9, 9))

pnns_groups = data['pnns_groups_1'].drop_duplicates().values.tolist()
pnns_groups.remove('unknown')
pnns_groups.sort()

for i, (group, ax) in enumerate(zip(pnns_groups, axes.reshape(-1))):
    _data = data[data['pnns_groups_1'] == group]
    # NOTE(review): seaborn deprecated distplot in 0.11; consider
    # histplot/displot once the pinned seaborn version is confirmed.
    sns.distplot(_data['energy_100g'], ax=ax, kde=True)
    ax.set_title(group)
    # Only the panels from index 6 onward (bottom row) keep an x label.
    if i < 6:
        ax.set_xlabel("")

plt.subplots_adjust(wspace=0.05, hspace=0.2)
plt.savefig('../reports/figures/dist_energy_pnns1.png')

### Groupes PNNS : catégories d'aliments et nutriments moyens pour 100 g

In [None]:
# For each PNNS level-1 group, plot the repartition of its main categories.
# At most 5 classes are requested; when the group has more, the "others"
# class is enabled.
for group in data['pnns_groups_1'].drop_duplicates().values:
    print(group)
    group_rows = data[data['pnns_groups_1'] == group]
    n_distinct = len(group_rows['main_category_en'].drop_duplicates().values)
    others_cat = n_distinct > 5
    n_class = min(n_distinct, 5)
    g = viz.RepartitionPlot(group_rows,
                            var='main_category_en', plot_type='bar',
                            max_class=n_class)
    g.plot(orient='h', figsize=(3, 1.7), others_cat=others_cat)
    plt.show()

In [None]:
# For each PNNS level-1 group, plot the repartition of its level-2 groups.
# At most 5 classes are requested; when the group has more, the "others"
# class is enabled.
for group in data['pnns_groups_1'].drop_duplicates().values:
    print(group)
    group_rows = data[data['pnns_groups_1'] == group]
    n_distinct = len(group_rows['pnns_groups_2'].drop_duplicates().values)
    others_cat = n_distinct > 5
    n_class = min(n_distinct, 5)
    g = viz.RepartitionPlot(group_rows,
                            var='pnns_groups_2', plot_type='bar',
                            max_class=n_class)
    g.plot(orient='h', figsize=(3, 1.7), others_cat=others_cat)
    plt.show()

## Version control

In [None]:
# Ask for confirmation before the commit cell below. An empty answer raises,
# so the git cell is not reached when the cells are run in sequence.
commit = input('Commit ?')
if commit == '':
    raise ValueError()

In [None]:
# IPython shell escape: commit this notebook file with a fixed message.
!git commit ./3.0-tg-uni-bi-variate-analysis.ipynb -m ":construction_worker: Analysis almost finished!"