###### imports

In [74]:
import pandas as pd
import numpy as np
from functools import partialmethod


###### defaults

[Set default .head() to 3 rows](https://medium.com/dunder-data/pandas-trick-1-change-the-default-number-of-rows-returned-from-the-head-method-bc7c21ce0d53)

In [75]:
# set default .head() to 3 rows per https://medium.com/dunder-data/
# pandas-trick-1-change-the-default-number-of-rows-returned-from-the-head-
# method-bc7c21ce0d53
pd.DataFrame.head = partialmethod(pd.DataFrame.head, n=3)

###### data cleaning

In [2]:
whfoods = pd.read_csv('./Data/whfoods.csv')

In [3]:
# remove extraneous rows, columns.

whfoods = whfoods.drop([0,175,176]).dropna(how='all',axis=1).dropna(how='all',
                                                                    axis=0)

In [4]:
whfoods.index = range(whfoods.shape[0])

In [5]:
whfoods

Unnamed: 0,"Asparagus, Cooked",Unnamed: 1,Unnamed: 2,"Avocado, cubed, raw",Unnamed: 4,Unnamed: 5,"Beet Greens, boiled",Unnamed: 7,Unnamed: 8,"Beets, sliced, cooked",...,Unnamed: 341,"Sage, dried",Unnamed: 343,Unnamed: 344,"Thyme, fresh",Unnamed: 346,Unnamed: 347,"Turmeric, ground",Unnamed: 349,Unnamed: 350
0,nutrient,amount,DRI/DV,nutrient,amount,DRI/DV,nutrient,amount,DRI/DV,nutrient,...,DRI/DV,nutrient,amount,DRI/DV,nutrient,amount,DRI/DV,nutrient,amount,DRI/DV
1,,,(%),,,(%),,,(%),,...,(%),,,(%),,,(%),,,(%)
2,Protein,4.32 g,9,Protein,3.00 g,6,Protein,3.70 g,7,Protein,...,0,Protein,0.15 g,0,Protein,0.27 g,1,Protein,0.34 g,1
3,Carbohydrates,7.40 g,3,Carbohydrates,12.80 g,6,Carbohydrates,7.86 g,3,Carbohydrates,...,0,Carbohydrates,0.85 g,0,Carbohydrates,1.17 g,1,Carbohydrates,2.86 g,1
4,Fat - total,0.40 g,1,Fat - total,21.99 g,28,Fat - total,0.29 g,0,Fat - total,...,0,Fat - total,0.18 g,0,Fat - total,0.08 g,0,Fat - total,0.43 g,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,Artificial Sweeteners (Total),-- mg,,Artificial Sweeteners (Total),-- mg,,Artificial Sweeteners (Total),-- mg,,Artificial Sweeteners (Total),...,,Artificial Sweeteners (Total),-- mg,,Artificial Sweeteners (Total),-- mg,,Artificial Sweeteners (Total),-- mg,
169,Aspartame,-- mg,,Aspartame,-- mg,,Aspartame,-- mg,,Aspartame,...,,Aspartame,-- mg,,Aspartame,-- mg,,Aspartame,-- mg,
170,Saccharin,-- mg,,Saccharin,-- mg,,Saccharin,-- mg,,Saccharin,...,,Saccharin,-- mg,,Saccharin,-- mg,,Saccharin,-- mg,
171,Alcohol,0.00 g,,Alcohol,0.00 g,,Alcohol,0.00 g,,Alcohol,...,,Alcohol,0.00 g,,Alcohol,0.00 g,,Alcohol,0.00 g,


In [6]:
# investigating dimensions of dataset

nfoods = len(whfoods.columns)/3 # there are 117 foods in the dataset.
nfoods
nnutrients = whfoods.shape[0]
whfoods.shape # There are 

(173, 351)

In [7]:
whfoods

Unnamed: 0,"Asparagus, Cooked",Unnamed: 1,Unnamed: 2,"Avocado, cubed, raw",Unnamed: 4,Unnamed: 5,"Beet Greens, boiled",Unnamed: 7,Unnamed: 8,"Beets, sliced, cooked",...,Unnamed: 341,"Sage, dried",Unnamed: 343,Unnamed: 344,"Thyme, fresh",Unnamed: 346,Unnamed: 347,"Turmeric, ground",Unnamed: 349,Unnamed: 350
0,nutrient,amount,DRI/DV,nutrient,amount,DRI/DV,nutrient,amount,DRI/DV,nutrient,...,DRI/DV,nutrient,amount,DRI/DV,nutrient,amount,DRI/DV,nutrient,amount,DRI/DV
1,,,(%),,,(%),,,(%),,...,(%),,,(%),,,(%),,,(%)
2,Protein,4.32 g,9,Protein,3.00 g,6,Protein,3.70 g,7,Protein,...,0,Protein,0.15 g,0,Protein,0.27 g,1,Protein,0.34 g,1
3,Carbohydrates,7.40 g,3,Carbohydrates,12.80 g,6,Carbohydrates,7.86 g,3,Carbohydrates,...,0,Carbohydrates,0.85 g,0,Carbohydrates,1.17 g,1,Carbohydrates,2.86 g,1
4,Fat - total,0.40 g,1,Fat - total,21.99 g,28,Fat - total,0.29 g,0,Fat - total,...,0,Fat - total,0.18 g,0,Fat - total,0.08 g,0,Fat - total,0.43 g,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,Artificial Sweeteners (Total),-- mg,,Artificial Sweeteners (Total),-- mg,,Artificial Sweeteners (Total),-- mg,,Artificial Sweeteners (Total),...,,Artificial Sweeteners (Total),-- mg,,Artificial Sweeteners (Total),-- mg,,Artificial Sweeteners (Total),-- mg,
169,Aspartame,-- mg,,Aspartame,-- mg,,Aspartame,-- mg,,Aspartame,...,,Aspartame,-- mg,,Aspartame,-- mg,,Aspartame,-- mg,
170,Saccharin,-- mg,,Saccharin,-- mg,,Saccharin,-- mg,,Saccharin,...,,Saccharin,-- mg,,Saccharin,-- mg,,Saccharin,-- mg,
171,Alcohol,0.00 g,,Alcohol,0.00 g,,Alcohol,0.00 g,,Alcohol,...,,Alcohol,0.00 g,,Alcohol,0.00 g,,Alcohol,0.00 g,


In [8]:
whfoods.iloc[:,0].values

array(['nutrient', nan, 'Protein', 'Carbohydrates', 'Fat - total',
       'Dietary Fiber', 'Calories', 'MACRONUTRIENT AND CALORIE DETAIL',
       'nutrient', nan, 'Carbohydrate:', 'Starch', 'Total Sugars',
       'Monosaccharides', 'Fructose', 'Glucose', 'Galactose',
       'Disaccharides', 'Lactose', 'Maltose', 'Sucrose', 'Soluble Fiber',
       'Insoluble Fiber', 'Other Carbohydrates', 'Fat:',
       'Monounsaturated Fat', 'Polyunsaturated Fat', 'Saturated Fat',
       'Trans Fat', 'Calories from Fat', 'Calories from Saturated Fat',
       'Calories from Trans Fat', 'Cholesterol', 'Water',
       'MICRONUTRIENTS', 'nutrient', nan, 'Vitamins',
       'Water-Soluble Vitamins', 'B-Complex Vitamins', 'Vitamin B1',
       'Vitamin B2', 'Vitamin B3', 'Vitamin B3 (Niacin Equivalents)',
       'Vitamin B6', 'Vitamin B12', 'Biotin', 'Choline', 'Folate',
       'Folate (DFE)', 'Folate (food)', 'Pantothenic Acid', 'Vitamin C',
       'Fat-Soluble Vitamins', 'Vitamin A (Retinoids and Carotenoids

There are under 173 nutrient categories since some of the rows correspond to supercategories such as 'Minerals','INDIVIDUAL FATTY ACIDS', 'Monounsaturated Fats', 'INDIVIDUAL AMINO ACIDS', 'OTHER COMPONENTS', etc. as well as corresponding extraneous rows such as: 'nutrient', nan.

The dataset could be simplified by removing rows with all 0.00 g, mg etc or all -- mgsuch as with the sweeteners, caffeine, alcohol.

I'll create two dictionaries, one of the foods and the raw nutrient values and another of the foods and DRI/DV.  These can be used to generate data frames.  The raw nutrient values and the DRI/DV info can be used to generate the 100% DRI/DV values for each nutrient.  Rounding errors can be overcome by using the food with the highest DRI/DV for a given nutrient to generate the recommendation.

In [27]:
[print((i,j)) for i,j in enumerate(['a','e','i'])]

(0, 'a')
(1, 'e')
(2, 'i')


[None, None, None]

In [38]:
# gather all the foods into a list
foods = [food for inx,food in enumerate(whfoods.columns) if inx%3 ==0]
len(foods) # verify the number of foods

In [46]:
nnutrients

173

In [47]:
nnutrients

173

In [44]:
# collect a nested dictionary of... {food: {nutrient:nutrient_val}}
nutrient_vals = \
{food:
    {whfoods.loc[i,food]: 
     whfoods.iloc[i,int(np.where(whfoods.columns.values==food)[0][0])+1] 
     for i in range (2, nnutrients)
    } for food in foods
}

In [76]:
nutrient_drv = \
{food:
    {whfoods.loc[i,food]: 
     whfoods.iloc[i,int(np.where(whfoods.columns.values==food)[0][0])+2] 
     for i in range (2, nnutrients)
    } for food in foods
}

In [78]:
nutrient_vals_df = pd.DataFrame(nutrient_vals)
nutrient_vals_df.head()

In [83]:
# Transposing so that the foods are like "observations"
nutrient_vals_df = nutrient_vals_df.T

In [79]:
nutrient_drv_df = pd.DataFrame(nutrient_drv)

In [84]:
nutrient_drv_df = nutrient_drv_df.T

In [128]:
nutrient_drv_df.index[np.where(nutrient_drv_df['Protein'].isna().values==1)[0][0]]

'Cheese, grass fed, cheddar, whole milk'

In [130]:
nutrient_drv_df.loc['Cheese, grass fed, cheddar, whole milk',:]

Protein                          NaN
Carbohydrates                    NaN
Fat - total                      NaN
Dietary Fiber                    NaN
Calories                         NaN
                                ... 
Artificial Sweeteners (Total)    NaN
Aspartame                        NaN
Saccharin                        NaN
Alcohol                          NaN
Caffeine                         NaN
Name: Cheese, grass fed, cheddar, whole milk, Length: 161, dtype: object

In [110]:
for nutrient in nutrient_drv_df.columns.values[[0,1]]:
    I
    try: nutrient_drv_df[[nutrient]] = nutrient_drv_df[[nutrient]].astype("int")
    except: pass

In [None]:
nutrient_drv_df

In [98]:
nutrient_drv_df.nunique()[10:40]

Total Sugars                        0
Monosaccharides                     0
Fructose                            0
Glucose                             0
Galactose                           0
Disaccharides                       0
Lactose                             0
Maltose                             0
Sucrose                             0
Soluble Fiber                       0
Insoluble Fiber                     0
Other Carbohydrates                 0
Fat:                                0
Monounsaturated Fat                 0
Polyunsaturated Fat                 0
Saturated Fat                       0
Trans Fat                           0
Calories from Fat                   0
Calories from Saturated Fat         0
Calories from Trans Fat             0
Cholesterol                         0
Water                               0
MICRONUTRIENTS                      0
Vitamins                            0
Water-Soluble Vitamins              0
B-Complex Vitamins                  0
Vitamin B1  

To simplify & clean the dataset, I'll collect the "nutrient_drv" columns that don't carry any information, i.e. have the same values for all the foods.

In [None]:
[nutrient_drv_df[]for ]