In [1]:
# Numerical, plotting and data-quality libraries used by this notebook.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import missingno as msno
import pycountry

# Select Ray as Modin's execution engine; this is set before modin.pandas is imported below.
os.environ["MODIN_ENGINE"] = "ray"  # Modin will use Ray

# Modin is bound to the usual `pd` alias in place of plain pandas (kept below for reference).
# import pandas as pd
import modin.pandas as pd
import ray
# ignore_reinit_error=True: re-running this cell while Ray is already started does not raise.
ray.init(ignore_reinit_error=True)

from scipy import stats

import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation and dtype warnings that may matter — consider narrowing the filter.
warnings.filterwarnings("ignore")



In [2]:
# Execute the companion utilities notebook so that its definitions become available here.
# Helpers called further down (cleanCategory, getMultiModes) are not defined in this
# notebook and are presumably provided by it — TODO confirm.
%run ./module_P2_utils.ipynb

In [3]:
# Load the cleaned dataset from a pickle file located next to this notebook.
# NOTE(review): unpickling can execute arbitrary code, so this file must only come from a
# trusted source (presumably an earlier notebook of this project — confirm).
df = pd.read_pickle("./df_01_cleaned.pkl")

<center><img src="assets/SEO-analysis.jpg" alt="drawing" width="500"/></center>

# ANALYSIS


Main features that we'll analyse below:

| Feature | Type | |
|-----|-----|-----|
| code | object | 0.000000 |
| states | object | 0.000000 |
| states_en | object | 0.000000 |
| states_tags | object | 0.000000 |
| created_datetime | datetime64[ns, UTC] | 0.000000 |
| last_modified_datetime | datetime64[ns, UTC] | 0.000000 |
| creator | object | 0.000201 |
| pnns_groups_2 | object | 0.009807 |
| pnns_groups_1 | object | 0.009907 |
| countries | object | 0.307572 |
| countries_tags | object | 0.307824 |
| countries_en | object | 0.307824 |
| product_name | object | 4.101583 |
| energy_100g | float64 | 20.664821 |
| proteins_100g | float64 | 21.001209 |
| fat_100g | float64 | 21.063015 |
| carbohydrates_100g | float64 | 21.081069 |
| sugars_100g | float64 | 21.867249 |
| saturated-fat_100g | float64 | 23.229649 |
| energy-kcal_100g | float64 | 23.248005 |
| salt_100g | float64 | 25.224544 |
| sodium_100g | float64 | 25.224695 |


In particular, let's investigate how `pnns_groups_1` and `pnns_groups_2` (see the [PNNS website](https://www.mangerbouger.fr/PNNS)) relate to:

* energy_100g
* proteins_100g
* fat_100g
* carbohydrates_100g
* sugars_100g
* saturated-fat_100g
* energy-kcal_100g
* salt_100g
* sodium_100g

First, we need to check for existing outliers and remove them, so that the analysis that follows is reliable.

## Outliers analysis

### Outliers for Qualitative Values

This feature has a multi modal distribution

## Qualitative Values

With a pre-filtered dataset.

Let's look at the qualitative values, which ones are of interest


### Dispersion of values

Let's list the object features along with their unique values: we are looking for features that contain just a few distinct values, since the others are not interesting for this analysis.

In [None]:
# Names of the columns whose dtype is `object`
is_object_column = df.dtypes.eq('object')
df.columns[is_object_column]

We remove the columns with a high percentage of distinct values, because such high-cardinality features would result in too much variance.

So, we should only keep:

`countries_en`, `additives_tags`, `nutriscore_grade`, `pnns_groups_1`, `pnns_groups_2`, `ecoscore_grade_fr`

In [None]:
## Quantitative Values

### Empirical Variance (Variance Empirique)

$\upsilon = \frac{1}{n} \sum_{i=1}^{n} (x_i - \bar{x})^2$

### Standard Deviation (Ecart-Type Empirique)

$s = \sqrt{\upsilon}$


In [None]:
# For each object-typed column, print its name followed by the ratio
# (number of distinct values / number of rows), formatted as a whole percentage.
for col in df.select_dtypes('object'):
    column = df[col]
    distinct_ratio = column.unique().size / column.size
    print(f'{col} {distinct_ratio:.0%}')

In [None]:
# Contingency table: raw `countries` labels as rows, nutriscore grades as columns
pd.crosstab(index=df['countries'], columns=df['nutriscore_grade'])

## Qualitative Features

### Categorization

In [None]:
# Re-execute the utilities notebook (it is already run once near the top of this notebook).
# NOTE(review): presumably kept to pick up edits made to the helpers while working —
# confirm whether this second run is still needed.
%run ./module_P2_utils.ipynb

In [None]:
# Number of distinct raw `countries` labels
len(df['countries'].unique())

In [None]:
# One-column table of the distinct `countries` labels, in value_counts() order
countries_modalities = df['countries'].value_counts().index
tab = pd.DataFrame({'countries': countries_modalities})
tab

In [None]:
# Peek at 20 randomly chosen raw `countries` labels.
# A fixed random_state keeps the displayed sample identical after "Restart & Run All",
# so the surrounding commentary always matches what the reader sees.
df['countries'].sample(20, random_state=42)

The `countries` feature contains a lot of different values; however, we can transform and categorize them using standardized country codes of the form 'XX'.

In [None]:
# Add a `countries_cleaned` column to the dataset.
# Missing labels are first replaced by the placeholder "XX", then every label goes through
# cleanCategory (defined outside this notebook; presumably maps a raw label to a
# standardized country code — confirm).
countries_filled = df['countries'].fillna("XX")
df['countries_cleaned'] = countries_filled.apply(cleanCategory)
n_country_codes = df['countries_cleaned'].unique().size
print(f"our dataset results with {n_country_codes} different country codes")

In [None]:
# Some values list several countries separated by ','.
# The cleaning step is applied to the label as a whole and cannot process them:
# show a sample of such rows to judge the impact.
# - na=False treats missing labels as "no comma" (no fillna('') needed);
# - a single .loc call selects rows and columns at once instead of chained indexing;
# - a fixed random_state keeps the displayed sample reproducible across runs.
has_several_countries = df['countries'].str.contains(',', regex=False, na=False)
df.loc[has_several_countries, ['countries', 'countries_cleaned']].sample(25, random_state=42)

In [None]:
# Count the rows whose raw `countries` label contains a comma (several countries listed).
# Summing the boolean mask gives a plain number, so the message reads "N rows ..."
# instead of embedding a NumPy array repr ("[N] rows ...").
n_multi_country_rows = int(df['countries'].str.contains(',', regex=False, na=False).sum())
print(f"{n_multi_country_rows} rows have a country label possibly not standardized correctly")

About 100K rows contain country labels that we are not able to process correctly. This represents about 5% of the dataset, which we consider negligible.

Note: these rows are not well documented, so we cannot tell exactly what it means when several countries separated by commas appear in this column.

Now, let's encode the `countries_cleaned` feature as numerical data so that we can compute statistics on it.

In [None]:
# Convert the cleaned codes to the pandas `category` dtype (overwrites the column in place);
# the integer code of each category is then reachable through `.cat.codes`, which a later
# cell of this notebook uses.
df['countries_cleaned'] = df['countries_cleaned'].astype('category')

In [None]:
# Example of application: contingency table of cleaned country codes (rows)
# against nutriscore grades (columns)
pd.crosstab(index=df['countries_cleaned'], columns=df['nutriscore_grade'])

In [None]:
# Largest integer category code used by `countries_cleaned`
np.max(df['countries_cleaned'].cat.codes.unique())

What are the most represented countries ?

In [None]:
# Most frequent cleaned country code; mode() returns a Series, which holds several
# entries when values are tied for the highest count.
df['countries_cleaned'].mode()

In [None]:
# Country codes of the first three entries of the per-country row counts
country_counts = df.value_counts(subset='countries_cleaned')
most_countries = country_counts.head(3).index
# Shape (rows, columns) of the part of the dataset that falls outside those three codes
in_top_countries = df['countries_cleaned'].isin(most_countries)
df.loc[~in_top_countries].shape

In [None]:
# Relative frequency of the 20 first country codes returned by value_counts(), as a bar chart
country_shares = df['countries_cleaned'].value_counts(normalize=True)
country_shares.head(20).plot.bar(figsize=(8, 8), title='distribution of countries in dataset (top 20)')

The diagram above shows that our dataset is mostly composed of products coming from France, the US, Spain and Italy. Each of the other countries represents less than 5% of the dataset.

TODO: compare the distribution of nutriscore grades between FR and US --> do people eat better in France?

In [None]:
# Empirical cumulative distribution of `nutriscore_score`, one curve per `nutriscore_grade`
sns.displot(df, x='nutriscore_score', hue='nutriscore_grade', kind='ecdf')

In [None]:
# Keep a categorical copy of the grade in a new column; `nutriscore_grade` itself is untouched.
# NOTE(review): astype('category') builds an unordered categorical; if the grades are meant
# to be ranked, an ordered categorical dtype may be more appropriate — confirm.
df['nutriscore_grade_cat'] = df['nutriscore_grade'].astype('category')

In [None]:
# Category labels of the categorical grade column (rows without a grade are dropped first)
graded_rows = df['nutriscore_grade_cat'].dropna()
graded_rows.cat.categories

In [None]:
# Distinct integer codes behind the grade categories.
# Rows without a grade are dropped first, so the -1 code that pandas assigns to missing
# values does not show up in the result.
df['nutriscore_grade_cat'].dropna().cat.codes.unique()

In [None]:
# Number of distinct product names over the whole dataset.
# NOTE(review): no country filter is applied here; restricting to country FR, as an earlier
# version of this comment announced, would need a filter on `countries_cleaned`.
df['product_name'].unique().size

In [None]:
# Two-column table pairing every column name of df with the result of getMultiModes.
# NOTE(review): getMultiModes is not defined in this notebook (presumably it comes from the
# utilities notebook); it is assumed to return one entry per column passed in — confirm.
pd.DataFrame({'columns': df.columns, 'multimode': getMultiModes(df, df.columns)})

## Analysis

In [None]:
# Contingency table: nutriscore grades (rows) against PNNS level-1 groups (columns)
pd.crosstab(index=df['nutriscore_grade'], columns=df['pnns_groups_1'])

In [None]:
# Contingency table: nutriscore grades (rows) against PNNS level-2 groups (columns)
pd.crosstab(index=df['nutriscore_grade'], columns=df['pnns_groups_2'])

In [None]:
# sns.displot is a figure-level function: it creates its own figure and returns a FacetGrid.
# A preceding plt.figure(...) call therefore only produced an extra, empty figure and its
# figsize never applied to the plots, so it is not used here. Each plot is saved through
# the grid it returns rather than through pyplot's implicit "current figure".

# 1) Empirical CDF of the nutriscore score, one curve per nutriscore grade.
ecdf_grid = sns.displot(df, x='nutriscore_score', hue='nutriscore_grade', kind='ecdf')
ecdf_grid.savefig("nutriscore_grade_multivariate_analysis.png", format='png', dpi=150)

# 2) Bivariate histogram of nutriscore score against ecoscore score, coloured by grade.
score_grid = sns.displot(df, x='nutriscore_score', y='ecoscore_score_fr', hue='nutriscore_grade')
score_grid.savefig("nutriscore_ecoscore_multivariate_analysis.png", format='png', dpi=150)

In [None]:
# Share of each nutriscore grade among graded products, drawn as a pie chart
grade_shares = df['nutriscore_grade'].value_counts(normalize=True)
grade_shares.plot(kind='pie')

In [None]:
# Occurrences of each nutriscore score (missing scores excluded), most frequent first
score_counts = df['nutriscore_score'].value_counts(dropna=True)
score_counts.sort_values(ascending=False)


We can see that some columns still have a lot of empty values, but also that several rows have almost all (or all) of the features filled in,
which is good for further analysis.



In [None]:
# most filled features
# NOTE(review): the columns are picked by hard-coded positions, so this selection silently
# changes if the column order of df changes; which columns these positions correspond to
# cannot be told from this cell alone — confirm against the loaded dataset.
df.columns[[0, 1, 11, 12, 13, 25, 26, 27, 28, 29]]

In [None]:
# Positional index of the `nutriscore_grade` column within df.columns
df.columns.get_loc('nutriscore_grade')