In [None]:
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
from tqdm import tqdm

In [None]:
def pretty_nutriments(nuts):
    """Print one line per nutriment: name, quantity, unit and variance.

    Args:
        nuts: mapping from a nutriment name to a mapping that provides
            ``'quantity'``, ``'unit'`` and ``'variance'`` entries.

    The name is padded to 25 characters; quantity and variance are printed
    with two decimals, and the columns are separated by tabs.
    """
    row_template = '{:25}{:.2f}\t{}\t{:.2f}'
    rows = []
    for label, info in nuts.items():
        rows.append(row_template.format(label, info['quantity'], info['unit'], info['variance']))
    print('\n'.join(rows))

In [None]:
# Elasticsearch connection shared by every query in this notebook.
# NOTE(review): the host is only the scheme ('http://') — it looks like a
# redacted placeholder; confirm the real endpoint before running.
client = Elasticsearch(hosts='http://')

In [None]:
# Materialise every hit yielded by scan() on the 'analysis' index as a list.
analysis = list(Search(using=client, index='analysis').scan())

In [None]:
# Index the analysis documents by their `id` field for direct lookup.
restaurant_analysis = dict((doc.id, doc) for doc in analysis)

In [None]:
# Restaurants whose `city` matches 'genève', paired with their analysis;
# restaurants without a (truthy) entry in restaurant_analysis are skipped.
geneva = [
    (hit, hit_analysis)
    for hit in Search(using=client, index='restaurants').query('match', city='genève').scan()
    for hit_analysis in [restaurant_analysis.get(hit.meta.id)]
    if hit_analysis
]
len(geneva)

In [None]:
# Restaurants whose `city` matches 'lausanne', paired with their analysis;
# restaurants without a (truthy) entry in restaurant_analysis are skipped.
lausanne = [
    (hit, hit_analysis)
    for hit in Search(using=client, index='restaurants').query('match', city='lausanne').scan()
    for hit_analysis in [restaurant_analysis.get(hit.meta.id)]
    if hit_analysis
]
len(lausanne)

In [None]:
# Restaurants whose `city` matches 'paris', paired with their analysis;
# restaurants without a (truthy) entry in restaurant_analysis are skipped.
paris = [
    (hit, hit_analysis)
    for hit in Search(using=client, index='restaurants').query('match', city='paris').scan()
    for hit_analysis in [restaurant_analysis.get(hit.meta.id)]
    if hit_analysis
]
len(paris)

In [None]:
# Restaurants whose `city` matches 'lyon', paired with their analysis;
# restaurants without a (truthy) entry in restaurant_analysis are skipped.
lyon = [
    (hit, hit_analysis)
    for hit in Search(using=client, index='restaurants').query('match', city='lyon').scan()
    for hit_analysis in [restaurant_analysis.get(hit.meta.id)]
    if hit_analysis
]
len(lyon)

In [None]:
# Restaurants whose `city` matches 'bordeaux', paired with their analysis;
# restaurants without a (truthy) entry in restaurant_analysis are skipped.
bordeaux = [
    (hit, hit_analysis)
    for hit in Search(using=client, index='restaurants').query('match', city='bordeaux').scan()
    for hit_analysis in [restaurant_analysis.get(hit.meta.id)]
    if hit_analysis
]
len(bordeaux)

In [None]:
# Map each city label to its list of (restaurant, analysis) pairs.
cities = dict(
    geneva=geneva,
    lausanne=lausanne,
    paris=paris,
    lyon=lyon,
    bordeaux=bordeaux,
)

In [None]:
# Flatten every analysed restaurant into one row of nutriment quantities,
# tagged with its city and country, and remember the unit of each nutriment.
units = {}      # nutriment name -> unit string (last value seen wins)
records = []    # one dict per restaurant, turned into a DataFrame below
for city, group in cities.items():
    print(city)
    # Geneva and Lausanne are the Swiss cities; every other city is French.
    country = 'CH' if city in ('geneva', 'lausanne') else 'FR'
    for _restaurant, agg in group:
        # Skip restaurants whose analysis list is empty.
        if not len(agg.analysis):
            continue
        # Convert the totals once and reuse them for both units and quantities.
        totals = agg.total.to_dict()
        record = {}
        for nutriment, total in totals.items():
            units[nutriment] = total['unit']
            record[nutriment] = total['quantity']
        record['city'] = city
        record['country'] = country
        records.append(record)

# Nutriments missing for a restaurant become NaN in its row.
rests = pd.DataFrame(records)

In [None]:
# Outlier removal: keep only restaurants whose every numeric nutriment lies
# within 3 standard deviations of its column mean.
#
# The previous expression tested `type(x) is str` on each column (a Series),
# which is never true, so no z-score was ever computed and `.all(axis=1)`
# merely dropped rows containing a zero value.
nutriment_values = rests.select_dtypes(include=[np.number])
z_scores = (nutriment_values - nutriment_values.mean()) / nutriment_values.std()
# A NaN z-score (nutriment missing for that restaurant, or a zero-variance
# column) compares False here, so missing values never mark a row as an outlier.
is_outlier = z_scores.abs() >= 3
rests_robust = rests[~is_outlier.any(axis=1)]

In [None]:
def country_criteria(name):
    """Bar-plot nutriment ``name`` per country and annotate a CH-vs-FR t-test.

    Reads the notebook-level ``rests_robust`` DataFrame and ``units`` mapping.
    The title shows the nutriment name and unit, the number of CH and FR
    restaurants that have a value for it, and the p-value of a t-test with
    ``equal_var=False`` between the two groups.

    Args:
        name: column of ``rests_robust`` (also a key of ``units``) to compare.
    """
    sns.barplot(x='country', y=name, data=rests_robust)
    # Drop missing values per group: a single NaN would make ttest_ind return
    # a NaN p-value, and would inflate the counts shown in the title.
    swiss = rests_robust.loc[rests_robust.country == 'CH', name].dropna()
    french = rests_robust.loc[rests_robust.country == 'FR', name].dropna()
    test = stats.ttest_ind(swiss, french, equal_var=False)
    pval = ', pval = {:.2f}'.format(test.pvalue)
    count = ', CH/FR = {}/{}'.format(len(swiss), len(french))
    plt.title('{} ({}) per country{}{}'.format(name, units[name], count, pval))
    plt.show()

In [None]:
def city_criteria(name):
    """Bar-plot nutriment ``name`` per city from ``rests_robust``.

    The title carries the nutriment name and its unit taken from ``units``.

    Args:
        name: column of ``rests_robust`` (also a key of ``units``) to plot.
    """
    axes = sns.barplot(data=rests_robust, y=name, x='city')
    title_template = '{} ({}) per city'
    axes.set_title(title_template.format(name, units[name]))
    plt.show()

In [None]:
def criteria(name):
    """Show both views of nutriment ``name``: per country, then per city."""
    for draw in (country_criteria, city_criteria):
        draw(name)

In [None]:
# Energy (kcal): per-country plot, then per-city plot.
criteria('Énergie (kCal)')

In [None]:
# Magnesium: per-country plot, then per-city plot.
criteria('Magnésium')

In [None]:
# Fat: per-country plot, then per-city plot.
criteria('Matières grasses')

In [None]:
# Salt: per-country plot, then per-city plot.
criteria('Sel')

In [None]:
# Calcium: per-country plot, then per-city plot.
criteria('Calcium')

In [None]:
# Proteins: per-country plot, then per-city plot.
criteria('Protéines')

In [None]:
# Saturated fatty acids: per-country plot, then per-city plot.
criteria('Acides gras saturées')

In [None]:
# Dietary fibre: per-country plot, then per-city plot.
criteria('Fibres alimentaires')

In [None]:
# Carbohydrates: per-country plot, then per-city plot.
criteria('Glucides')

In [None]:
# Sugars: per-country plot, then per-city plot.
criteria('Sucres')

In [None]:
# Iron: per-country plot, then per-city plot.
criteria('Fer')

In [None]:
# Chlorine: per-country plot, then per-city plot.
criteria('Chlore')