In [10]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from numpy import percentile
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.stats import shapiro
import pickle 
from sklearn.neighbors import KDTree
import seaborn as sns


In [11]:
df=(
    pd.read_csv("Data/df.csv",sep="\t",low_memory=False)
    .drop(columns=['url',
    'creator',
    'image_url',
    'image_small_url',
    'image_ingredients_url',
    'image_ingredients_small_url',
    'image_nutrition_url',
    'image_nutrition_small_url',
    'created_datetime',
    'last_modified_datetime',
    'created_t',
    'last_modified_t',
    'states',
    'states_tags',
    'states_en'],axis=1))

In [12]:
df.shape

(1209317, 164)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1209317 entries, 0 to 1209316
Columns: 164 entries, code to carnitine_100g
dtypes: float64(121), object(43)
memory usage: 1.5+ GB


Here we find ourselves with lot of data on both rows and columns. 

We have already deleted some columns on the 16 columns import. 

There are two things we should already look at: missing data and redundant data. 
   

In [14]:
df.head(n=4)

Unnamed: 0,code,product_name,generic_name,quantity,packaging,packaging_tags,brands,brands_tags,categories,categories_tags,...,carbon-footprint-from-meat-or-fish_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,glycemic-index_100g,water-hardness_100g,choline_100g,phylloquinone_100g,beta-glucan_100g,inositol_100g,carnitine_100g
0,17,Vitória crackers,,,,,,,,,...,,,,,,,,,,
1,31,Cacao,,130 g,,,,,,,...,,,,,,,,,,
2,3327986,Filetes de pollo empanado,,,,,,,,,...,,,,,,,,,,
3,100,moutarde au moût de raisin,,100g,,,courte paille,courte-paille,"Epicerie, Condiments, Sauces, Moutardes","en:groceries,en:condiments,en:sauces,en:mustards",...,,18.0,,,,,,,,


In [15]:
df[['nutrition-score-fr_100g','nutrition-score-uk_100g']].isna().sum()

nutrition-score-fr_100g     869666
nutrition-score-uk_100g    1209271
dtype: int64

here we can see that the nutrition_score_fr is fuller than the english one dc we will exploit it

In [16]:
df=df.loc[~df['nutrition-score-fr_100g'].isna(),:]
df.shape

(339651, 164)

How is the score of a product calculated?

To classify each product, international research teams have developed a score that takes into account, per 100 grams of product, the content :

- in nutrients and foods to be promoted: fiber, protein, fruit and vegetables
- in nutrients to be limited: energy, saturated fatty acids, sugars, salt

In [17]:
var_importante=['code','product_name','generic_name','quantity',
                'fiber_100g','proteins_100g','fruits-vegetables-nuts_100g',
                'energy_100g','saturated-fat_100g','sugars_100g','salt_100g','nutrition-score-fr_100g','carbohydrates_100g'
               ,'fat_100g','nutriscore_grade']
var_nutri=['fiber_100g','proteins_100g','fruits-vegetables-nuts_100g',
                'energy_100g','saturated-fat_100g','sugars_100g','salt_100g','carbohydrates_100g','fat_100g']

In [18]:
df=df[var_importante].copy()

In [19]:
df['Numb_nan_line']=df[var_nutri].apply(lambda x: x.count(),axis=1)

In [20]:
df.head()

Unnamed: 0,code,product_name,generic_name,quantity,fiber_100g,proteins_100g,fruits-vegetables-nuts_100g,energy_100g,saturated-fat_100g,sugars_100g,salt_100g,nutrition-score-fr_100g,carbohydrates_100g,fat_100g,nutriscore_grade,Numb_nan_line
3,100,moutarde au moût de raisin,,100g,0.0,5.1,,936.0,2.2,22.0,4.6,18.0,29.0,8.2,d,8
13,949,Salade de carottes râpées,,,,0.9,,134.0,0.1,3.9,0.42,1.0,5.3,0.3,b,7
21,1281,Tarte noix de coco,,,4.4,4.6,,1594.0,15.5,21.9,0.1,14.0,27.3,22.0,d,8
30,1885,Compote de poire,,,3.6,0.6,,657.0,0.0,27.0,0.0,-2.0,36.0,0.0,a,8
34,2257,Salade de macedoine de légumes,,,,1.9,,598.0,1.0,1.0,0.27,1.0,3.9,12.7,b,7


In [21]:
# We want to delete the columns with only have Nan values
df1=df.loc[df['Numb_nan_line']!=0,:].copy()
df.shape

(339651, 16)

In [22]:
df1[df1.duplicated(['code'])]

Unnamed: 0,code,product_name,generic_name,quantity,fiber_100g,proteins_100g,fruits-vegetables-nuts_100g,energy_100g,saturated-fat_100g,sugars_100g,salt_100g,nutrition-score-fr_100g,carbohydrates_100g,fat_100g,nutriscore_grade,Numb_nan_line
146885,10430652874676217,"Hackfleisch vom Rind und Schwein gemischt, zum...",,300 g,,18.0,,1046.0,8.6,0.0,0.18,11.0,0.0,20.0,d,7
354605,2559656003005,chorizo bio,,300g.,,36.0,,1954.0,13.5,1.0,3.5,25.0,2.0,35.0,e,7
360379,2602638029651,Louis auvergne saucisson bio,,,,25.0,,1648.0,13.3,1.39,4.5,24.0,3.1,31.299999,e,7
478695,3263851538918,Pointes d'asperges pelées main,,190 g,,1.7,,67.0,0.0,1.2,0.9,-3.0,1.6,0.0,a,7
520361,3324498002153,Marrons entiers naturels,Marrons entiers naturels,200g,,2.7,100.0,953.0,0.2,7.2,0.0,-3.0,49.0,1.0,a,8
535827,3350033435445,Crevettes tropicales cuites décortiquées,,100 g,,20.0,,356.0,0.2,0.0,1.0,0.0,0.0,0.6,b,7
563723,3428420053203,"Buche Creme au beurre, parfum chocolat",,500 g,,4.6,,1594.0,13.0,34.0,0.24,22.0,42.0,21.0,e,7
572357,3450970032995,Jus de pomme,Jus de pomme à base de jus de pomme concentré,1 L,0.0,0.5,,172.0,0.1,9.5,0.01,3.0,10.0,0.5,c,8
783870,4300175485890,Weizenmehl Type 1050 / Farine de Blé type 1050,Bio Weizenmehl Type 1050,1 kg,5.5,9.8,,1433.0,0.3,0.5,0.01,-6.0,68.0,1.8,a,8
880313,5425021250251,Filet de poulet sauce Chakchouka,,,,5.9,,481.0,0.3,0.5,0.79,1.0,15.0,3.7,b,7


In [23]:
#Here we can observe one thing, we have the same products but at the end nutriscore is different
#So we are going to delete this kind of raw to avoid any misunderstanding with our ML's algo
df.loc[df['code']=='8850123125027',:]

Unnamed: 0,code,product_name,generic_name,quantity,fiber_100g,proteins_100g,fruits-vegetables-nuts_100g,energy_100g,saturated-fat_100g,sugars_100g,salt_100g,nutrition-score-fr_100g,carbohydrates_100g,fat_100g,nutriscore_grade,Numb_nan_line
1183091,8850123125027,ขนมปังไส้หมูหยองน้ำสลัด,,60 g,3.33,11.7,,1260.0,2.5,16.7,0.917,9.0,45.0,7.5,c,8
1183092,8850123125027,ขนมปังไส้หมูหยองน้ำสลัด,,60 g,3.33,11.7,,1260.0,2.5,16.7,0.917,-1.0,45.0,7.5,a,8


In [24]:
df1.shape

(338627, 16)

In [25]:
df1.isna().sum()

code                                0
product_name                     1925
generic_name                   261812
quantity                       141647
fiber_100g                     197210
proteins_100g                     470
fruits-vegetables-nuts_100g    333547
energy_100g                       435
saturated-fat_100g                487
sugars_100g                       482
salt_100g                          77
nutrition-score-fr_100g             0
carbohydrates_100g                726
fat_100g                          474
nutriscore_grade                    9
Numb_nan_line                       0
dtype: int64

In [26]:
#We are going to delete all the products with the same code, since we don't know which one is correct. 
# Furthermore, the sample is negligeable 
df1.drop_duplicates(subset ="code", 
                     keep = False, inplace = True)

In [27]:
df1.shape

(338595, 16)

In [28]:
df1.loc[df1['code']=='8850123125027',:]

Unnamed: 0,code,product_name,generic_name,quantity,fiber_100g,proteins_100g,fruits-vegetables-nuts_100g,energy_100g,saturated-fat_100g,sugars_100g,salt_100g,nutrition-score-fr_100g,carbohydrates_100g,fat_100g,nutriscore_grade,Numb_nan_line


In [29]:
#Split le dataset 70/30
train, test = train_test_split(df1, test_size=0.3)

In [30]:
print("Le jeu de données train est de  ",train.shape)
print("Le jeu de données test est de  ",test.shape)

Le jeu de données train est de   (237016, 16)
Le jeu de données test est de   (101579, 16)


# Preprocessing : Outliers 

 Here we will apply two methods for cleaning our columns: 
        - The first approach is to clean the columns where we are able to judge for ourselves the anomalies. For example, our columns with a rate of 100g can not be more than 100 nor less than 100. Then we will apply the boxplot method to determine the outliers of each of these columns. 
        - The second method consists in working on columns where we are not really expert in the field. For example, the energy rate I am not able to determine an outlier for this. To remedy this, I will go through the percentile method and remove 1% and the 99% to remove outliers.  

In [31]:
#We want to keep a copy of the original dataset to make a comparaison after
train_origine=train.copy()

In [32]:
train.isna().sum()

code                                0
product_name                     1341
generic_name                   183242
quantity                        99165
fiber_100g                     138350
proteins_100g                     318
fruits-vegetables-nuts_100g    233440
energy_100g                       294
saturated-fat_100g                327
sugars_100g                       326
salt_100g                          48
nutrition-score-fr_100g             0
carbohydrates_100g                494
fat_100g                          321
nutriscore_grade                    1
Numb_nan_line                       0
dtype: int64

## Boxplot 

In [33]:
boxo=['fiber_100g','proteins_100g','fruits-vegetables-nuts_100g',
                'saturated-fat_100g','sugars_100g','salt_100g','carbohydrates_100g','fat_100g']
export_list=[]
export_list.append(['Name','Q1','Q3'])

In [34]:
for i in range(len(boxo)): 
    train.loc[(train[boxo[i]]<0)|(train[boxo[i]]>100),
              boxo[i]]=train.loc[(train[boxo[i]]<0)|(train[boxo[i]]>100),boxo[i]]=np.nan
    Q1 = train[boxo[i]].quantile(0.25)
    Q3 = train[boxo[i]].quantile(0.75)
    IQR = Q3 - Q1
    export_list.append([boxo[i],Q1 - 1.5 * IQR,Q3 + 1.5 * IQR])
    train.loc[(train[boxo[i]] < (Q1 - 1.5 * IQR)) | (train[boxo[i]] > (Q3 + 1.5 * IQR)),
              boxo[i]]=train.loc[(train[boxo[i]] < (Q1 - 1.5 * IQR)) | (train[boxo[i]] > (Q3 + 1.5 * IQR)),
                                 boxo[i]]=np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [35]:
#We are going to store the values of our IQR for each columns as a list 
export_list

[['Name', 'Q1', 'Q3'],
 ['fiber_100g', -5.550000000000001, 9.25],
 ['proteins_100g', -13.499999999999998, 27.299999999999997],
 ['fruits-vegetables-nuts_100g', -81.77499999999999, 161.625],
 ['saturated-fat_100g', -11.799999999999999, 20.199999999999996],
 ['sugars_100g', -21.950000000000003, 38.45],
 ['salt_100g', -1.75, 3.13],
 ['carbohydrates_100g', -66.9, 119.5],
 ['fat_100g', -32.4, 57.2]]

# Comparaison entre les deux méthodes

In [36]:
colo=['energy_100g']
for i in range(len(colo)): 
    Q1 = train[colo[i]].quantile(0.25)
    Q3 = train[colo[i]].quantile(0.75)
    IQR = Q3 - Q1
    methode_boxo=(train.loc[(train[colo[i]] < (Q1 - 1.5 * IQR)) | (train[colo[i]] > (Q3 + 1.5 * IQR)),
                                 colo[i]])
    Q001=train[colo[i]].quantile(0.01)
    Q99=train[colo[i]].quantile(0.99)
    methode_percent=(train.loc[(train[colo[i]] <=Q001) | (train[colo[i]] >= Q99),
                                 colo[i]])

In [37]:
print('Nombre outliers avec la methode boxo',len(methode_boxo))
print('Nombre outliers avec la methode percentile',len(methode_percent))

Nombre outliers avec la methode boxo 2528
Nombre outliers avec la methode percentile 4745


In [38]:
colo=['energy_100g']
for i in range(len(colo)): 
    Q001=train[colo[i]].quantile(0.01)
    Q99=train[colo[i]].quantile(0.99)
    (train.loc[(train[colo[i]] <=Q001) | (train[colo[i]] >= Q99),
                                 colo[i]])
    export_list.append([colo[i],Q001,Q99])
    train.loc[(train[colo[i]] <=Q001) | (train[colo[i]] >= Q99),
              boxo[i]]=train.loc[(train[colo[i]] <=Q001) | (train[colo[i]] >= Q99),
                                 colo[i]]=np.nan

In [39]:
#Here we just want to check if energy was added to our list with her values
export_list

[['Name', 'Q1', 'Q3'],
 ['fiber_100g', -5.550000000000001, 9.25],
 ['proteins_100g', -13.499999999999998, 27.299999999999997],
 ['fruits-vegetables-nuts_100g', -81.77499999999999, 161.625],
 ['saturated-fat_100g', -11.799999999999999, 20.199999999999996],
 ['sugars_100g', -21.950000000000003, 38.45],
 ['salt_100g', -1.75, 3.13],
 ['carbohydrates_100g', -66.9, 119.5],
 ['fat_100g', -32.4, 57.2],
 ['energy_100g', 0.0, 3698.5135000000055]]

# Imputation des données par Knn

In [40]:
knn=['fiber_100g','proteins_100g',
                'energy_100g','saturated-fat_100g','sugars_100g','salt_100g','carbohydrates_100g','fat_100g']

In [41]:
scaler = StandardScaler()
data=(scaler.fit_transform(train[knn]))

In [42]:
with open('StandardScaler_pickle', 'wb') as file: 
    pickle.dump(scaler, file)

In [43]:
#Initialize KNNImputer
imputer = KNNImputer(n_neighbors=2)

In [44]:
train[var_nutri]

Unnamed: 0,fiber_100g,proteins_100g,fruits-vegetables-nuts_100g,energy_100g,saturated-fat_100g,sugars_100g,salt_100g,carbohydrates_100g,fat_100g
874796,4.0,7.000,,2293.0,,,0.16000,53.00,34.000
675852,,3.900,,674.0,0.7000,0.90,1.50000,22.80,5.600
664355,,1.400,,1197.0,0.3000,,0.00000,66.30,1.400
422771,,20.000,,1159.0,15.5000,0.00,1.50000,0.00,21.900
84149,0.0,3.400,,359.0,0.8000,14.20,0.12319,15.40,1.420
1140578,,6.600,,1870.0,9.5000,31.00,0.35000,63.30,17.500
970792,2.6,4.800,,2050.0,10.0000,,0.83000,69.00,21.000
735339,,,,957.0,5.4000,0.60,,0.60,12.500
298436,,20.000,,448.0,1.4000,0.00,0.15000,0.00,3.000
1140652,1.4,0.900,,79.0,0.0000,1.90,0.02000,3.20,0.000


In [45]:
%%time
#3. Impute/Fill Missing Values
df_filled = imputer.fit_transform(data)

CPU times: user 1h 5min 32s, sys: 31min 47s, total: 1h 37min 20s
Wall time: 46min 48s


In [46]:
with open('imputer_pickle', 'wb') as file: 
    pickle.dump(imputer, file)

In [47]:
df_filled

array([[ 0.83825503, -0.09850001,  1.49347596, ..., -0.72687842,
         1.03767706,  1.57405707],
       [ 0.35145024, -0.52675896, -0.60873259, ...,  1.10647918,
        -0.10132823, -0.51413411],
       [-0.37875693, -0.87212908,  0.07036257, ..., -0.94578679,
         1.53929198, -0.82295112],
       ...,
       [-0.15748203, -0.99646233,  0.52612243, ..., -0.82265083,
         2.50857794, -0.88912619],
       [-0.48939438, -0.78924025, -1.30340929, ..., -0.57364256,
        -0.50865463, -0.7788344 ],
       [-0.93194419,  2.11186877, -0.9190648 , ..., -0.79528729,
        -0.96123951, -0.8303039 ]])

# Kdtree

In [48]:
X=df_filled.copy()

In [49]:
tree = KDTree(X, leaf_size=2)              # doctest: +SKIP
dist, ind = tree.query(X, k=5)                # doctest: +SKIP
print(ind)  # indices of 5 closest neighbors
print(dist)  # distances to 5 closest neighbors


[[     0 180787   3254 193912 169954]
 [     1    584  59223 154095 182351]
 [     2  51490  78255  20390 223673]
 ...
 [237013 208860 196800 138807 137281]
 [237014 110394 214165  49616   3430]
 [ 86581  75243 178187 115336  87254]]
[[0.         0.14936229 0.29806429 0.29871837 0.30257219]
 [0.         0.         0.16064925 0.20511922 0.27773599]
 [0.         0.         0.01349386 0.05186745 0.05860025]
 ...
 [0.         0.01463444 0.01463444 0.11270839 0.11889164]
 [0.         0.20646049 0.25055492 0.2571536  0.27504566]
 [0.         0.         0.         0.         0.        ]]


In [50]:
#First we are going to store the mean distance of each product in a list 
dist_mean=[]
for i in range(len(dist)):
    dist_mean.append(dist[i].mean())

In [51]:
#It seems that everything is ok, if we check the size
print('shape of distance de nos produits',dist.shape)
print('shape of mean distance de nos produits',len(dist_mean))

shape of distance de nos produits (237016, 5)
shape of mean distance de nos produits 237016


In [52]:
#We are going to inverse_tranform to get back our values with their initial states
data=(scaler.inverse_transform(df_filled))

In [53]:
#The for some reasons of convenience, we are going to to transform our array as a dataframe 
dataframe=pd.DataFrame(data, columns=['fiber_100g','proteins_100g',
                'energy_100g','saturated-fat_100g','sugars_100g','salt_100g','carbohydrates_100g','fat_100g']) 

In [54]:
dataframe.head()

Unnamed: 0,fiber_100g,proteins_100g,energy_100g,saturated-fat_100g,sugars_100g,salt_100g,carbohydrates_100g,fat_100g
0,4.0,7.0,2293.0,19.5,15.5,0.16,53.0,34.0
1,2.9,3.9,674.0,0.7,0.9,1.5,22.8,5.6
2,1.25,1.4,1197.0,0.3,18.5,0.0,66.3,1.4
3,0.0,20.0,1159.0,15.5,0.0,1.5,0.0,21.9
4,0.0,3.4,359.0,0.8,14.2,0.12319,15.4,1.42


In [55]:
print("dataframe",dataframe.shape)
print("train",train.shape)

dataframe (237016, 8)
train (237016, 16)


In [56]:
dataframe['mean_tree']=dist_mean
dataframe['code']=train['code'].values
dataframe['product_name']=train['product_name'].values
dataframe['nutrition-score-fr_100g']=train['nutrition-score-fr_100g'].values
dataframe['nutriscore_grade']=train['nutriscore_grade'].values

In [57]:
#Now it seems that we took care of some redundants articles with their code but there are others with different names
#Since we are using knn and its an algorithm using the distance we need to drop those products
#For this, since I don't know who is who ? We are going to use those columns as a reference and keep only one of them each time
dataframe.drop_duplicates(subset =['fiber_100g','proteins_100g',
                'energy_100g','saturated-fat_100g','sugars_100g','salt_100g','carbohydrates_100g','fat_100g'], 
                     keep = "first", inplace = True) 

In [58]:
print("dataframe",dataframe.shape)

dataframe (191039, 13)


In [59]:
train_export=dataframe.copy()
train_export.head()

Unnamed: 0,fiber_100g,proteins_100g,energy_100g,saturated-fat_100g,sugars_100g,salt_100g,carbohydrates_100g,fat_100g,mean_tree,code,product_name,nutrition-score-fr_100g,nutriscore_grade
0,4.0,7.0,2293.0,19.5,15.5,0.16,53.0,34.0,0.209743,5413415206304,Chocolat Lait Cannelle de Ceylan,22.0,e
1,2.9,3.9,674.0,0.7,0.9,1.5,22.8,5.6,0.128701,3760054096042,Taboule Oriental,6.0,c
2,1.25,1.4,1197.0,0.3,18.5,0.0,66.3,1.4,0.024792,3701269300432,Pâte à tartiner châtaigne cacao,12.0,d
3,0.0,20.0,1159.0,15.5,0.0,1.5,0.0,21.9,0.054727,3177870001042,Camembert de Normandie,14.0,d
4,0.0,3.4,359.0,0.8,14.2,0.12319,15.4,1.42,0.05276,56920052421,Yoplait Crémeux vanille,2.0,b


In [60]:
zero_tree=train_export.loc[train_export['mean_tree']==0,['mean_tree','product_name']]
print('Ici nous avons',len(zero_tree['product_name'].unique()),'produits avec un mean tree à 0')

Ici nous avons 2020 produits avec un mean tree à 0


# Verification 

In [61]:
train_export['verif']=train_export['proteins_100g']+train_export['carbohydrates_100g']+train_export['fat_100g']

In [62]:
train_export=train_export.loc[(train_export['verif']>=0)&(train_export['verif']<=100),:]

In [63]:
train_export.head()

Unnamed: 0,fiber_100g,proteins_100g,energy_100g,saturated-fat_100g,sugars_100g,salt_100g,carbohydrates_100g,fat_100g,mean_tree,code,product_name,nutrition-score-fr_100g,nutriscore_grade,verif
0,4.0,7.0,2293.0,19.5,15.5,0.16,53.0,34.0,0.209743,5413415206304,Chocolat Lait Cannelle de Ceylan,22.0,e,94.0
1,2.9,3.9,674.0,0.7,0.9,1.5,22.8,5.6,0.128701,3760054096042,Taboule Oriental,6.0,c,32.3
2,1.25,1.4,1197.0,0.3,18.5,0.0,66.3,1.4,0.024792,3701269300432,Pâte à tartiner châtaigne cacao,12.0,d,69.1
3,0.0,20.0,1159.0,15.5,0.0,1.5,0.0,21.9,0.054727,3177870001042,Camembert de Normandie,14.0,d,41.9
4,0.0,3.4,359.0,0.8,14.2,0.12319,15.4,1.42,0.05276,56920052421,Yoplait Crémeux vanille,2.0,b,20.22


In [64]:
outlier_kd=['mean_tree']
for i in range(len(outlier_kd)): 
    train_export=train_export.loc[(train_export[outlier_kd[i]]>=0)|(train_export[outlier_kd[i]]<=100),:]
    Q1 = train_export[outlier_kd[i]].quantile(0.25)
    Q3 = train_export[outlier_kd[i]].quantile(0.75)
    IQR = Q3 - Q1
    export_list.append([outlier_kd[i],Q1 - 1.5 * IQR,Q3 + 1.5 * IQR])
    train_export=train_export.loc[(train_export[outlier_kd[i]] > (Q1 - 1.5 * IQR)) | (train_export[outlier_kd[i]] < (Q3 + 1.5 * IQR)),
                                 :]

# Export 

In [65]:
column_names = export_list.pop(0)
export_outliers = pd.DataFrame(export_list, columns=column_names)
export_outliers

Unnamed: 0,Name,Q1,Q3
0,fiber_100g,-5.55,9.25
1,proteins_100g,-13.5,27.3
2,fruits-vegetables-nuts_100g,-81.775,161.625
3,saturated-fat_100g,-11.8,20.2
4,sugars_100g,-21.95,38.45
5,salt_100g,-1.75,3.13
6,carbohydrates_100g,-66.9,119.5
7,fat_100g,-32.4,57.2
8,energy_100g,0.0,3698.5135
9,mean_tree,-0.215461,0.52624


In [66]:
export_outliers.to_csv('Data/export_outliers.csv',index=False)
train_export.to_csv('Data/train_export.csv',index=False)
test.to_csv('Data/test_export.csv',index=False)

Generalities:

- fields that end with _t are dates in the UNIX timestamp format (number of seconds since Jan 1st 1970)
- fields that end with _datetime are dates in the iso8601 format: yyyy-mm-ddThh:mn:ssZ
- fields that end with _tags are comma separated list of tags (e.g. categories_tags is the set of normalized tags computer from the categories field)
- fields that end with a language 2 letter code (e.g. fr for French) is the set of tags in that language
- fields that end with _100g correspond to the amount of a nutriment (in g, or kJ for energy) for 100 g or 100 ml of product
- fields that end with _serving correspond to the amount of a nutriment (in g, or kJ for energy) for 1 serving of the product

 List of fields:

# general information:

- code : barcode of the product (can be EAN-13 or internal codes for some food stores), for products without a barcode, Open Food Facts assigns a number starting with the 200 reserved prefix
- url : url of the product page on Open Food Facts
- creator : contributor who first added the product
- created_t : date that the product was added (UNIX timestamp format)
- created_datetime : date that the product was added (iso8601 format: yyyy-mm-ddThh:mn:ssZ)
- last_modified_t : date that the product page was last modified
- last_modified_datetime
- product_name : name of the product
- generic_name
- quantity : quantity and unit

# tags:

- packaging : shape, material
- packaging_tags
- brands
- brands_tags
- categories
- categories_tags
- categories_fr
- origins : origins of ingredients
- origins_tags
- manufacturing_places : places where manufactured or transformed
- manufacturing_places_tags
- labels
- labels_tags
- labels_fr
- emb_codes
- emb_codes_tags
- first_packaging_code_geo : coordinates corresponding to the first packaging code indicated
- cities
- cities_tags
- purchase_places
- stores
- countries : list of countries where the product is sold
- countries_tags
- countries_fr

# ingredients:

- ingredients_text
- traces
- traces_tags

# misc. data:

- serving_size : serving size in g
- no_nutriments : indicates if the nutrition facts are indicated on the food label
- additives_n : number of food additives
- additives
- additives_tags
- ingredients_from_palm_oil_n
- ingredients_from_palm_oil
- ingredients_from_palm_oil_tags
- ingredients_that_may_be_from_palm_oil_n
- ingredients_that_may_be_from_palm_oil
- ingredients_that_may_be_from_palm_oil_tags
- nutrition_grade_fr : nutrition grade ('a' to 'e'). see http://fr.openfoodfacts.org/score-nutritionnel-experimental-france
- main_category
- main_category_fr
- image_url
- image_small_url

# nutrition facts:

- energy_100g
- proteins_100g
- casein_100g
- serum-proteins_100g
- nucleotides_100g
- carbohydrates_100g
- sugars_100g
- sucrose_100g
- glucose_100g
- fructose_100g
- lactose_100g
- maltose_100g
- maltodextrins_100g
- starch_100g
- polyols_100g
- fat_100g
- saturated-fat_100g
- butyric-acid_100g
- caproic-acid_100g
- caprylic-acid_100g
- capric-acid_100g
- lauric-acid_100g
- myristic-acid_100g
- palmitic-acid_100g
- stearic-acid_100g
- arachidic-acid_100g
- behenic-acid_100g
- lignoceric-acid_100g
- cerotic-acid_100g
- montanic-acid_100g
- melissic-acid_100g
- monounsaturated-fat_100g
- polyunsaturated-fat_100g
- omega-3-fat_100g
- alpha-linolenic-acid_100g
- eicosapentaenoic-acid_100g
- docosahexaenoic-acid_100g
- omega-6-fat_100g
- linoleic-acid_100g
- arachidonic-acid_100g
- gamma-linolenic-acid_100g
- dihomo-gamma-linolenic-acid_100g
- omega-9-fat_100g
- oleic-acid_100g
- elaidic-acid_100g
- gondoic-acid_100g
- mead-acid_100g
- erucic-acid_100g
- nervonic-acid_100g
- trans-fat_100g
- cholesterol_100g
- fiber_100g
- sodium_100g
- alcohol_100g : % vol of alcohol
- vitamin-a_100g
- vitamin-d_100g
- vitamin-e_100g
- vitamin-k_100g
- vitamin-c_100g
- vitamin-b1_100g
- vitamin-b2_100g
- vitamin-pp_100g
- vitamin-b6_100g
- vitamin-b9_100g
- vitamin-b12_100g
- biotin_100g
- pantothenic-acid_100g
- silica_100g
- bicarbonate_100g
- potassium_100g
- chloride_100g
- calcium_100g
- phosphorus_100g
- iron_100g
- magnesium_100g
- zinc_100g
- copper_100g
- manganese_100g
- fluoride_100g
- selenium_100g
- chromium_100g
- molybdenum_100g
- iodine_100g
- caffeine_100g
- taurine_100g
- ph_100g : pH (no unit)
- fruits-vegetables-nuts_100g : % of fruits, vegetables and nuts (excluding potatoes, yams, manioc)

- carbon-footprint_100g : carbon footprint (indicated on some products)

- nutrition-score-fr_100g : experimental nutrition score derived from the UK FSA score and adapted for the French market (formula defined by the team of Professor Hercberg)
- nutrition-score-uk_100g : nutrition score defined by the UK Food Standards Administration (FSA)