###### imports

In [543]:
import pandas as pd # for dataframe analysis
import numpy as np # for arrays functionality, i.e. .where()...
import re
from functools import partialmethod # for changing default of pandas .head()
import difflib # compare naming of nutrients between constraints and foods DFs

###### attributions

[1) Set default .head() to 3 rows -- Ted Petrou, Dunder Data](https://medium.com/dunder-data/pandas-trick-1-change-the-default-number-of-rows-returned-from-the-head-method-bc7c21ce0d53)

###### set defaults

In [544]:
pd.DataFrame.head = partialmethod(pd.DataFrame.head, n=3)

###### data cleaning

In [545]:
whfoods = pd.read_csv('./Data/whfoods.csv')

In [546]:
whfoods.index = range(whfoods.shape[0])

In [547]:
whfoods.head()

Unnamed: 0,"Asparagus, Cooked",Unnamed: 1,Unnamed: 2,"Avocado, cubed, raw",Unnamed: 4,Unnamed: 5,"Beet Greens, boiled",Unnamed: 7,Unnamed: 8,"Beets, sliced, cooked",...,Unnamed: 341,"Sage, dried",Unnamed: 343,Unnamed: 344,"Thyme, fresh",Unnamed: 346,Unnamed: 347,"Turmeric, ground",Unnamed: 349,Unnamed: 350
0,BASIC MACRONUTRIENTS AND CALORIES,,,BASIC MACRONUTRIENTS AND CALORIES,,,BASIC MACRONUTRIENTS AND CALORIES,,,BASIC MACRONUTRIENTS AND CALORIES,...,,BASIC MACRONUTRIENTS AND CALORIES,,,BASIC MACRONUTRIENTS AND CALORIES,,,BASIC MACRONUTRIENTS AND CALORIES,,
1,nutrient,amount,DRI/DV,nutrient,amount,DRI/DV,nutrient,amount,DRI/DV,nutrient,...,DRI/DV,nutrient,amount,DRI/DV,nutrient,amount,DRI/DV,nutrient,amount,DRI/DV
2,,,(%),,,(%),,,(%),,...,(%),,,(%),,,(%),,,(%)


In [548]:
# investigating dimensions of dataset

nfoods = len(whfoods.columns)/3 # there are 117 foods in the dataset.
nfoods
whfoods.shape
nnutrients = whfoods.shape[0]

174 "nutrients," though many of these will be not real nutrients.

**Initial assessment of dataset**\
\
*SHAPE:* There are three columns per food group, 174 rows, 351 columns.  \
\
*INFORMATION:* There is no serving size in grams of the food, though from inspecting the excel spreadsheet it appears possible to reconstruct this value from the amounts of the macronutrients, micronutrients, water, and ash.\
\
*FORMATTING:* The three columns indicate first the description of the component of the food, the amount in grams, milligrams, or micrograms, and the DRI/DV if applicable.

**Action Items**
1) Generate two dataframes -- one for amount, one for DRI/DV
2) Standardize missing value indicators
3) Eliminate rows with one unique value.
4) Separate out units for the amount df into a reference dictionary of {nutrient1: 'mg', nutrient2: 'g', nutrient3: 'mcg', ...}
5) Build the LP problem with PuLP

In [549]:
# initial pass on removing extraneous rows, columns.
whfoods = whfoods.dropna(how='all',axis=1).dropna(how='all', axis=0)

There are under 173 nutrient categories since some of the rows correspond to supercategories such as 'Minerals','INDIVIDUAL FATTY ACIDS', 'Monounsaturated Fats', 'INDIVIDUAL AMINO ACIDS', 'OTHER COMPONENTS', etc. as well as corresponding extraneous rows such as: 'nutrient', nan.

The dataset could be filtered by removing rows with all 0.00 g, mg etc or all -- mg such as with the sweeteners, caffeine, alcohol.

I'll create two dictionaries, one of the foods and the raw nutrient values and another of the foods and DRI/DV.  These can be used to generate data frames.  The raw nutrient values and the DRI/DV info can be used to generate the 100% DRI/DV values for each nutrient.  Rounding errors can be minimized by using the food with the highest DRI/DV for a given nutrient to generate the recommendation.

###### column name formatting

In [550]:
# gather all the foods into a list
foods = [food for inx,food in enumerate(whfoods.columns) if inx%3 ==0]
len(foods) # verify the number of foods

117

In [551]:
new_cols = []

for i, food in enumerate(foods):
    # Replace spaces, then commas, then dunders, and finally make lowercase
    food = re.sub(r'_+', '_', 
                  re.sub(r',+', '_', 
                         re.sub(r' +', '_', food))).lower()

    new_cols = new_cols + [food, f'nv_{food}', f'drv_{food}']

whfoods.columns = new_cols
foods = [food for inx,food in enumerate(whfoods.columns) if inx%3 ==0]

In [552]:
whfoods.columns

Index(['asparagus_cooked', 'nv_asparagus_cooked', 'drv_asparagus_cooked',
       'avocado_cubed_raw', 'nv_avocado_cubed_raw', 'drv_avocado_cubed_raw',
       'beet_greens_boiled', 'nv_beet_greens_boiled', 'drv_beet_greens_boiled',
       'beets_sliced_cooked',
       ...
       'drv_rosemary_fresh', 'sage_dried', 'nv_sage_dried', 'drv_sage_dried',
       'thyme_fresh', 'nv_thyme_fresh', 'drv_thyme_fresh', 'turmeric_ground',
       'nv_turmeric_ground', 'drv_turmeric_ground'],
      dtype='object', length=351)

###### separate nutrient val, drv info

In [553]:
# collect a nested dictionary of... {food: {nutrient:nutrient_val}}
nutrient_vals = \
{food:
    {whfoods.loc[i,food]: 
     whfoods.iloc[i,int(np.where(whfoods.columns.values==food)[0][0])+1] 
     for i in range (2, nnutrients)
    } for food in foods
}

# collect a nested dictionary of... {food: {nutrient:nutrient_drv}}
nutrient_drv = \
{food:
    {whfoods.loc[i,food]: 
     whfoods.iloc[i,int(np.where(whfoods.columns.values==food)[0][0])+2] 
     for i in range (2, nnutrients)
    } for food in foods
}

# convert to DatFrame
nv_df = pd.DataFrame(nutrient_vals)
drv_df = pd.DataFrame(nutrient_drv)

# Transposing so that the foods are like "observations" in long format
nv_df = nv_df.T
drv_df = drv_df.T

###### Cleaning

Remove all columns that have uniform values

In [554]:
# credit: ChatGPT
def remove_columns_with_same_value(df):
    unique_counts = df.nunique()
    columns_to_remove = unique_counts[(unique_counts == 1)|(unique_counts == 0)].index
    df = df.drop(columns=columns_to_remove)
    return df

In [555]:
nv_df = remove_columns_with_same_value(nv_df)

In [556]:
nv_df.head()

Unnamed: 0,Protein,Carbohydrates,Fat - total,Dietary Fiber,Calories,Starch,Total Sugars,Monosaccharides,Fructose,Glucose,...,Sugar Alcohols (Total),Glycerol,Inositol,Mannitol,Sorbitol,Xylitol,Artificial Sweeteners (Total),Aspartame,Saccharin,Caffeine
asparagus_cooked,4.32 g,7.40 g,0.40 g,3.60 g,39.6,-- g,2.34 g,2.18 g,1.42 g,0.76 g,...,-- g,-- g,-- g,-- g,-- g,-- g,-- mg,-- mg,-- mg,0.00 mg
avocado_cubed_raw,3.00 g,12.80 g,21.99 g,10.05 g,240.0,-- g,0.99 g,0.89 g,0.18 g,0.56 g,...,-- g,-- g,-- g,-- g,-- g,-- g,-- mg,-- mg,-- mg,0.00 mg
beet_greens_boiled,3.70 g,7.86 g,0.29 g,4.18 g,38.88,-- g,0.86 g,-- g,-- g,-- g,...,-- g,-- g,-- g,-- g,-- g,-- g,-- mg,-- mg,-- mg,0.00 mg


In [557]:
drv_df = remove_columns_with_same_value(drv_df)

In [558]:
drv_df.head()

Unnamed: 0,Protein,Carbohydrates,Fat - total,Dietary Fiber,Calories,Vitamin B1,Vitamin B2,Vitamin B3,Vitamin B6,Vitamin B12,...,Iron,Magnesium,Manganese,Molybdenum,Phosphorus,Potassium,Selenium,Sodium,Zinc,Omega-3 Fatty Acids
asparagus_cooked,9,3,1,13,2,24,19,12,8,0,...,9,6,12,--,14,9,20,2,10,2
avocado_cubed_raw,6,6,28,36,13,8,15,16,23,0,...,5,10,9,--,11,15,1,1,9,8
beet_greens_boiled,7,3,0,15,2,14,32,5,11,0,...,15,23,32,--,8,28,2,23,7,0


Reformatting columns for nv_df & drv_df

In [559]:
def format_cols(df):
    pattern = r'[^a-zA-Z0-9_]+'
    form_cols = [re.sub(pattern, '_', col) for col in df.columns]
    form_cols = [re.sub('_+$', '', col).lower() for col in form_cols]
    df.columns = form_cols
    
    return df

In [560]:
nv_df = format_cols(nv_df)
drv_df = format_cols(drv_df)

###### Formatting

Replacing '--' with '0.00', assuming that this is correct.  Another interpretation is that -- represents no record, so this may need to be instead replaced with NaN.

In [561]:
for col in nv_df.columns:
    nv_df[col] = nv_df[col].str.replace('--','0.00')

In [562]:
nv_df = remove_columns_with_same_value(nv_df)

In [563]:
nv_df.head()

Unnamed: 0,protein,carbohydrates,fat_total,dietary_fiber,calories,starch,total_sugars,monosaccharides,fructose,glucose,...,tryptophan,tyrosine,valine,ash,organic_acids_total,acetic_acid,citric_acid,malic_acid,sugar_alcohols_total,xylitol
asparagus_cooked,4.32 g,7.40 g,0.40 g,3.60 g,39.6,0.00 g,2.34 g,2.18 g,1.42 g,0.76 g,...,0.05 g,0.10 g,0.22 g,1.13 g,0.00 g,0.00 g,0.00 g,0.00 g,0.00 g,0.00 g
avocado_cubed_raw,3.00 g,12.80 g,21.99 g,10.05 g,240.0,0.00 g,0.99 g,0.89 g,0.18 g,0.56 g,...,0.04 g,0.07 g,0.16 g,2.37 g,0.00 g,0.00 g,0.00 g,0.00 g,0.00 g,0.00 g
beet_greens_boiled,3.70 g,7.86 g,0.29 g,4.18 g,38.88,0.00 g,0.86 g,0.00 g,0.00 g,0.00 g,...,0.06 g,0.09 g,0.11 g,3.80 g,0.00 g,0.00 g,0.00 g,0.00 g,0.00 g,0.00 g


###### Strip units, create {nutrient: unit} dictionary

The whfoods data contain the following units:\
mg (ATE), mcg (RE), mcg (RAE), IU, g, mg, mcg

In [564]:
# Generate a dictionary of units for the different info types (serving_size, 
# calories, etc.)

units_dict = {}
units_pattern = re.compile(r"[mcgiu\)\(]+",re.I) # generate pattern object
for r in range(1,len(nv_df.columns)):  # for each of the nutrient labels
    s = nv_df.iat[0,r] # get nutrient value
    m = units_pattern.search(s) # check pattern object against nutrient value, 
                        # generating match object
    try:
        units_dict[nv_df.columns[r]] = m.group()
    except:
        i=0
        while not m:
            # loop through until we have a unit or decide there are no units
            i = i+1 # go to next value
            s = nv_df.iat[i,r] # store value
            m = units_pattern.search(s) # search value against the pattern

            try: # store match if there is one
                units[nv_df.columns[r]] = m.group() 
            except: 
                if i==nv_df.shape[0]-1: # if no units found by end, None units
                    m = "None"
                    units_dict[nv_df.columns[r]] = m

In [565]:
units_dict

{'carbohydrates': 'g',
 'fat_total': 'g',
 'dietary_fiber': 'g',
 'calories': 'None',
 'starch': 'g',
 'total_sugars': 'g',
 'monosaccharides': 'g',
 'fructose': 'g',
 'glucose': 'g',
 'galactose': 'g',
 'disaccharides': 'g',
 'lactose': 'g',
 'maltose': 'g',
 'sucrose': 'g',
 'soluble_fiber': 'g',
 'insoluble_fiber': 'g',
 'other_carbohydrates': 'g',
 'monounsaturated_fat': 'g',
 'polyunsaturated_fat': 'g',
 'saturated_fat': 'g',
 'trans_fat': 'g',
 'calories_from_fat': 'None',
 'calories_from_saturated_fat': 'None',
 'calories_from_trans_fat': 'None',
 'cholesterol': 'mg',
 'water': 'g',
 'vitamin_b1': 'mg',
 'vitamin_b2': 'mg',
 'vitamin_b3': 'mg',
 'vitamin_b3_niacin_equivalents': 'mg',
 'vitamin_b6': 'mg',
 'vitamin_b12': 'mcg',
 'biotin': 'mcg',
 'choline': 'mg',
 'folate': 'mcg',
 'folate_dfe': 'mcg',
 'folate_food': 'mcg',
 'pantothenic_acid': 'mg',
 'vitamin_c': 'mg',
 'vitamin_a_international_units_iu': 'IU',
 'vitamin_a_mcg_retinol_activity_equivalents_rae': 'mcg',
 'vitamin

In [566]:
# convert drv from str to int
for nutrient in drv_df.columns.values[[0,1]]:
    try: 
        drv_df[[nutrient]] = drv_df[[nutrient]].astype("int")
    except: pass

In [567]:
nv_df.head()

Unnamed: 0,protein,carbohydrates,fat_total,dietary_fiber,calories,starch,total_sugars,monosaccharides,fructose,glucose,...,tryptophan,tyrosine,valine,ash,organic_acids_total,acetic_acid,citric_acid,malic_acid,sugar_alcohols_total,xylitol
asparagus_cooked,4.32 g,7.40 g,0.40 g,3.60 g,39.6,0.00 g,2.34 g,2.18 g,1.42 g,0.76 g,...,0.05 g,0.10 g,0.22 g,1.13 g,0.00 g,0.00 g,0.00 g,0.00 g,0.00 g,0.00 g
avocado_cubed_raw,3.00 g,12.80 g,21.99 g,10.05 g,240.0,0.00 g,0.99 g,0.89 g,0.18 g,0.56 g,...,0.04 g,0.07 g,0.16 g,2.37 g,0.00 g,0.00 g,0.00 g,0.00 g,0.00 g,0.00 g
beet_greens_boiled,3.70 g,7.86 g,0.29 g,4.18 g,38.88,0.00 g,0.86 g,0.00 g,0.00 g,0.00 g,...,0.06 g,0.09 g,0.11 g,3.80 g,0.00 g,0.00 g,0.00 g,0.00 g,0.00 g,0.00 g


In [568]:
def float_match(float_pattern, x):
    float_val = np.nan
    try:
         float_val = float(re.match(float_pattern,x)[0])
    except: 
        float_val = float(x)
            
    return float_val

In [569]:
float_pattern = r'^\d+\.\d+'
for col in nv_df.columns:
    nv_df[col] = nv_df[col].apply(lambda x: float_match(float_pattern, x))

In [570]:
nv_df = remove_columns_with_same_value(nv_df)
drv_df = remove_columns_with_same_value(drv_df)

##### Removing duplicate nutrient columns

The columns that need investigating are: 
* 'Folate', 'Folate (DFE)', and 'Folate (food)' - same for almost all foods.  Keeping 'Folate (DFE)'
* 'Vitamin B3', 'Vitamin B3 (Niacin Equivalents)'
*  'Vitamin A International Units (IU)',
       'Vitamin A mcg Retinol Activity Equivalents (RAE)',
       'Vitamin A mcg Retinol Equivalents (RE)',
       'Retinol mcg Retinol Equivalents (RE)',
       'Carotenoid mcg Retinol Equivalents (RE)', 'Alpha-Carotene'
* 'Vitamin E mg Alpha-Tocopherol Equivalents (ATE)',
       'Vitamin E International Units (IU)', 'Vitamin E mg',

In [571]:
def find_duplicate_columns(df):
    """
    Find columns with the same values in a DataFrame.
    Returns a list of column names with duplicates.
    """
    duplicate_columns = []
    for column1 in df.columns:
        for column2 in df.columns:
            if column1 != column2 and (df[column1] == df[column2]).all():
                duplicate_columns.append(column1)
                break
    return duplicate_columns

In [572]:
dup_cols_nv = find_duplicate_columns(nv_df)
dup_cols_drv = find_duplicate_columns(drv_df)

In [573]:
dup_cols_nv

['folate',
 'folate_food',
 'vitamin_e_mg_alpha_tocopherol_equivalents_ate',
 'vitamin_e_mg',
 'acetic_acid',
 'sugar_alcohols_total',
 'xylitol']

In [574]:
nv_df[dup_cols_nv].head()

Unnamed: 0,folate,folate_food,vitamin_e_mg_alpha_tocopherol_equivalents_ate,vitamin_e_mg,acetic_acid,sugar_alcohols_total,xylitol
asparagus_cooked,268.2,268.2,2.7,2.7,0.0,0.0,0.0
avocado_cubed_raw,121.5,121.5,3.11,3.11,0.0,0.0,0.0
beet_greens_boiled,20.16,20.16,2.61,2.61,0.0,0.0,0.0


I'll drop ['vitamin_e_mg',
 'acetic_acid',
 'sugar_alcohols_total',
 'xylitol'] from nv_df since the first of these is a duplicate and the others have no informative values.

In [575]:
nv_df = nv_df.drop(['folate_food', 'vitamin_e_mg', 'acetic_acid', 
                    'sugar_alcohols_total', 'xylitol'],axis=1)

In [576]:
nv_df.shape # there are now 125 nutrients in nv_df

(117, 124)

###### Vitamin D

In [577]:
nv_df[~(nv_df.filter(regex='vitamin_d',axis=1) == 0).all(axis=1)].filter(
    regex='vitamin_d',axis=1)['vitamin_d_international_units_iu'].values/\
nv_df[~(nv_df.filter(regex='vitamin_d',axis=1) == 0).all(axis=1)].filter(
    regex='vitamin_d',axis=1)['vitamin_d_mcg']

mushrooms_crimini_raw                       30.857143
mushrooms_shiitake_cooked                   39.803922
chicken_pasture-raised_breast_roasted       51.545455
lamb_grass-fed_lean_loin_roasted            20.636364
turkey_pasture-raised_light_meat_roasted    33.352941
cheese_grass_fed_cheddar_whole_milk         40.000000
cow's_milk_grass-fed                        39.132075
eggs_pasture-raised_large_hard_boiled       39.545455
yogurt_grass_fed_whole_milk                 19.600000
cod_pacific_fillet_baked                    40.029412
salmon_wild_coho_broiled                    39.924278
sardines_atlantic_canned                    40.250575
scallops_steamed                                  inf
shrimp_large_steamed                        41.272727
tuna_yellowfin_fillet_baked                 40.964758
Name: vitamin_d_mcg, dtype: float64

###### Folate

In [578]:
folate_cols = [col for col in nv_df.columns if 'folate' in col]
folate_cols

['folate', 'folate_dfe']

folate_cols

In [579]:
nv_df[folate_cols].apply(lambda x: x.nunique() == 1,axis = 1).sum()

116

One food does not have the same values for each folate category. 

In [580]:
nv_df[nv_df[folate_cols].\
                 apply(lambda x: x.nunique() != 1,axis = 1).values][folate_cols]

Unnamed: 0,folate,folate_dfe
sea_vegetables_dulse_dried,0.0,45.6


I'll drop ['Folate', 'Folate (food)'] columns and keep the descriptive 'Folate (DFE)' but rename it to the more concise 'folate'

In [581]:
nv_df = nv_df.drop(columns = ['folate'])

In [582]:
nv_df.rename({'folate_dfe':'folate'},axis=1,inplace=True)

###### Vitamin E

In [583]:
ve_cols = [col for col in nv_df.columns if 'vitamin_e' in col]
ve_cols

['vitamin_e_mg_alpha_tocopherol_equivalents_ate',
 'vitamin_e_international_units_iu']

In [584]:
nv_df[ve_cols]

Unnamed: 0,vitamin_e_mg_alpha_tocopherol_equivalents_ate,vitamin_e_international_units_iu
asparagus_cooked,2.70,4.02
avocado_cubed_raw,3.11,4.63
beet_greens_boiled,2.61,3.88
beets_sliced_cooked,0.07,0.10
bell_peppers_sliced_red_raw,1.45,2.17
...,...,...
peppermint_leaves_fresh,0.00,0.00
rosemary_fresh,0.00,0.00
sage_dried,0.10,0.16
thyme_fresh,0.00,0.00


I'll get rid of vitamin_e_international_units_iu and keep only the more modern alpha tocopherol equivalent and simplify the name to vitamin_e.

In [585]:
nv_df = nv_df.drop(columns=['vitamin_e_international_units_iu']).rename({'vitamin_e_mg_alpha_tocopherol_equivalents_ate':'vitamin_e'},axis=1)

In [586]:
nv_df.columns.values

array(['protein', 'carbohydrates', 'fat_total', 'dietary_fiber',
       'calories', 'starch', 'total_sugars', 'monosaccharides',
       'fructose', 'glucose', 'galactose', 'disaccharides', 'lactose',
       'maltose', 'sucrose', 'soluble_fiber', 'insoluble_fiber',
       'other_carbohydrates', 'monounsaturated_fat',
       'polyunsaturated_fat', 'saturated_fat', 'trans_fat',
       'calories_from_fat', 'calories_from_saturated_fat',
       'calories_from_trans_fat', 'cholesterol', 'water', 'vitamin_b1',
       'vitamin_b2', 'vitamin_b3', 'vitamin_b3_niacin_equivalents',
       'vitamin_b6', 'vitamin_b12', 'biotin', 'choline', 'folate',
       'pantothenic_acid', 'vitamin_c',
       'vitamin_a_international_units_iu',
       'vitamin_a_mcg_retinol_activity_equivalents_rae',
       'vitamin_a_mcg_retinol_equivalents_re',
       'retinol_mcg_retinol_equivalents_re',
       'carotenoid_mcg_retinol_equivalents_re', 'alpha_carotene',
       'beta_carotene', 'beta_carotene_equivalents', 'cryp

###### Vitamin A

In [587]:
va_cols =  nv_df.filter(regex='vitamin_a|carotenoid|retinol').columns
va_cols

Index(['vitamin_a_international_units_iu',
       'vitamin_a_mcg_retinol_activity_equivalents_rae',
       'vitamin_a_mcg_retinol_equivalents_re',
       'retinol_mcg_retinol_equivalents_re',
       'carotenoid_mcg_retinol_equivalents_re'],
      dtype='object')

In [588]:
va_df = nv_df[nv_df[va_cols].\
                 apply(lambda x: x.nunique() != 1,axis = 1).values][
            va_cols]

To compare, I'll extract the floating point numbers from the data.

In [589]:
sum(va_df[va_cols[2]] == va_df[va_cols[4]])
# 104 foods are the same for 'Vitamin A mcg Retinol Equivalents (RE)' and 'Carotenoid mcg Retinol Equivalents (RE)'

va_df[va_df[va_cols[2]] != va_df[va_cols[4]]][[va_cols[2],va_cols[4]]]

Unnamed: 0,vitamin_a_mcg_retinol_equivalents_re,carotenoid_mcg_retinol_equivalents_re
chicken_pasture-raised_breast_roasted,0.0,6.8
lamb_grass-fed_lean_loin_roasted,0.0,34.23
turkey_pasture-raised_light_meat_roasted,3.4,0.0
cheese_grass_fed_cheddar_whole_milk,77.11,3.37
cow's_milk_grass-fed,2.44,57.34
eggs_pasture-raised_large_hard_boiled,1.0,75.0
yogurt_grass_fed_whole_milk,2.04,68.19
cod_pacific_fillet_baked,0.0,2.27
salmon_wild_coho_broiled,0.0,57.83
sardines_atlantic_canned,0.0,29.03


In [590]:
va_df

Unnamed: 0,vitamin_a_international_units_iu,vitamin_a_mcg_retinol_activity_equivalents_rae,vitamin_a_mcg_retinol_equivalents_re,retinol_mcg_retinol_equivalents_re,carotenoid_mcg_retinol_equivalents_re
asparagus_cooked,1810.80,90.54,181.08,0.0,181.08
avocado_cubed_raw,219.00,10.95,21.90,0.0,21.90
beet_greens_boiled,11021.76,551.09,1102.18,0.0,1102.18
beets_sliced_cooked,59.50,2.98,5.95,0.0,5.95
bell_peppers_sliced_red_raw,2880.52,144.03,288.05,0.0,288.05
...,...,...,...,...,...
parsley_chopped_fresh,2560.89,128.04,256.09,0.0,256.09
peppermint_leaves_fresh,322.85,16.14,32.28,0.0,32.28
rosemary_fresh,122.81,6.14,12.28,0.0,12.28
sage_dried,82.60,4.13,8.26,0.0,8.26


No two columns are the same, so for now I'll keep all columns that pertain to Vitamin A

Background:

Retinol Equivalents or RE is used by the World Health Organization and Food and Agriculture Organization while Retinol Activity Equivalents or RAE is used by the FDA.  They are based on different conversion rates of carotenoids into vitamin A while having the same conversion for the animal based retinol, with the RE being more optimistic for conversion rates while also corresponding to a lower recommended minimum intake for mcg RE than the FDA's recommendation for mcg RAE.  \
\
Provitamin A, or Carotenoids, are found only in plants while preformed vitamin A or retinol is found only in animals.  Carotenoids are not toxic at high doses while retinol is.\
\
Constraining the vitamin A contributed by animal products while counting plant sources of carotenoid precursors (Beta- Carotene and Cryptoxanthin) to vitamin A towards minimum requirements looks like a promising potential solution.  

In [591]:
carotenoid_cols = nv_df.filter(regex='carotene|cryptoxanthin').columns.values

In [592]:
va_df[carotenoid_cols] = nv_df[carotenoid_cols]

To avoid deleting useful information, I'll simply create a new column 'vitamin_a' which will be a duplicate of 'vitamin_a_mcg_retinol_activity_equivalents_rae' since this is the new FDA standard.

In [593]:
nv_df['vitamin_a'] = nv_df['vitamin_a_mcg_retinol_activity_equivalents_rae']

###### Vitamin B3

In [594]:
vb3_cols = nv_df.filter(regex='vitamin_b3').columns.values
vb3_cols

array(['vitamin_b3', 'vitamin_b3_niacin_equivalents'], dtype=object)

In [595]:
unique_rows = nv_df[nv_df[vb3_cols].\
                 apply(lambda x: x.nunique() != 1,axis = 1).values][vb3_cols]


In [596]:
unique_rows

Unnamed: 0,vitamin_b3,vitamin_b3_niacin_equivalents
asparagus_cooked,1.95,2.82
avocado_cubed_raw,2.61,3.23
beet_greens_boiled,0.72,1.68
beets_sliced_cooked,0.56,1.13
bell_peppers_sliced_red_raw,0.90,1.08
...,...,...
oregano_leaf_dried,0.09,0.16
parsley_chopped_fresh,0.40,0.63
peppermint_leaves_fresh,0.13,0.20
rosemary_fresh,0.04,0.07


Vitamin_b3 and vitamin_b3_niacin_equivalents are largely different.  60 mg tryptophan contributes 1 mg of niacin according to [Niacin - Harvard Health](https://www.hsph.harvard.edu/nutritionsource/niacin-vitamin-b3/#:~:text=RDA%3A%20Niacin%20is%20measured%20in,mg%20NE%20for%20lactating%20women.)    

In [597]:
nv_df.rename(columns = {'vitamin_b3_niacin_equivalents':'niacin'}, 
             inplace = True)

In [598]:
nv_df.head()

Unnamed: 0,protein,carbohydrates,fat_total,dietary_fiber,calories,starch,total_sugars,monosaccharides,fructose,glucose,...,serine,threonine,tryptophan,tyrosine,valine,ash,organic_acids_total,citric_acid,malic_acid,vitamin_a
asparagus_cooked,4.32,7.4,0.4,3.6,39.6,0.0,2.34,2.18,1.42,0.76,...,0.21,0.17,0.05,0.1,0.22,1.13,0.0,0.0,0.0,90.54
avocado_cubed_raw,3.0,12.8,21.99,10.05,240.0,0.0,0.99,0.89,0.18,0.56,...,0.17,0.11,0.04,0.07,0.16,2.37,0.0,0.0,0.0,10.95
beet_greens_boiled,3.7,7.86,0.29,4.18,38.88,0.0,0.86,0.0,0.0,0.0,...,0.12,0.11,0.06,0.09,0.11,3.8,0.0,0.0,0.0,551.09


###### Export nv_df and drv_df to csv

In [599]:
nv_df.to_csv('./Data/whfoods_nv.csv')
drv_df.to_csv('./Data/whfoods_drv.csv')

Note: can relate nv_df to drv_df for deriving drv in mass units

###### Merge constraints

In [600]:
constraints = pd.read_csv('constraints.csv')
aa_constraints = pd.read_csv('AminoAcids.csv')

In [601]:
constraints.columns, aa_constraints.columns

(Index(['nutrient', 'Min', 'Max', 'notes:'], dtype='object'),
 Index(['AminoAcid', 'Min'], dtype='object'))

In [602]:
# format, make compatible, and merge amino acid constraints with the rest.
constraints.rename(columns = lambda x: x.lower(), inplace = True)
aa_constraints.rename(columns = lambda x: x.lower(), inplace = True)

constraints.set_index('nutrient',inplace=True)
constraints.drop(columns = ['notes:'],inplace = True)
constraints.dropna(how = 'all', inplace = True)
aa_constraints.rename(columns = {'aminoacid':'nutrient'}, inplace = True)
aa_constraints.set_index('nutrient', inplace = True)
aa_constraints.index.name = None
all_constraints = pd.concat([constraints,aa_constraints],axis = 0)

###### Make constraints and nutrients compatible

In [603]:
nv_df['phenylalanine_tyrosine'] = nv_df['phenylalanine'] + nv_df['tyrosine']
nv_df['cysteine_methionine'] = nv_df['cysteine'] + nv_df['methionine']

In [604]:
constraints.loc['soluble_fiber'] = {'min':'6 g', 'max': np.nan}

In [605]:
nv_df.rename({'dietary_fiber':'fiber'}, axis=1, inplace=True)

In [606]:
all_constraints.rename({'zink':'zinc', 'total_fat':'fat_total',}, 
                       axis=0, inplace=True)

In [607]:
# Credit: Chat-gpt
def find_exact_matches(array1, array2):
    exact_matches = []
    for element1 in array1:
        match_found = any(element1 == element2 for element2 in array2)
        exact_matches.append(match_found)
    return exact_matches


In [608]:
# Credit: Chat-gpt
all_constraints.index.values[[bool(val) for val in 1- np.array(
    find_exact_matches(all_constraints.index.values, nv_df.columns.values))]]

array(['riboflavin', 'thiamin', 'vitamin_a_rae', 'vitamin_d', 'irom',
       'phosphorous', 'carbohydrate', 'fat', 'saturated_fatty_acids',
       'cystine + methionine', 'phenylalanine + tyrosine'], dtype=object)

In [609]:
all_constraints.rename(index={
    'total_fat': 'fat_total',
    'riboflavin':'vitamin_b2',
    'thiamin':'vitamin_b1',
    'vitamin_d':'vitamin_d_mcg',
    'vitamin_e':'vitamin_e_mg_alpha_tocopherol_equivalents_ate',
    'irom':'iron',
    'phosphorous':'phosphorus',
    'carbohydrate':'carbohydrates',
    'fat':'fat_total',
    'saturated_fatty_acids':'saturated_fat',
    'cystine + methionine':'cysteine_methionine',
    'phenylalanine + tyrosine':'phenylalanine_tyrosine',
    'zink':'zinc'
}, inplace=True)

In [610]:
# Credit: Chat-gpt
all_constraints.index.values[[bool(val) for val in 1- np.array(
    find_exact_matches(all_constraints.index.values, nv_df.columns.values))]]

array(['vitamin_a_rae', 'vitamin_e_mg_alpha_tocopherol_equivalents_ate'],
      dtype=object)

In [611]:
all_constraints.drop(index=['vitamin_a_rae', 'vitamin_e_mg_alpha_tocopherol_equivalents_ate'], inplace = True)

In [612]:
np.nan

nan

In [613]:
constraints = all_constraints.copy()
constraints_units_dict = {}
def getUnit(vals):
    '''Take tuple of strings being min and max vals with units or NaN values and 
    return a unit'''
    for val in vals:
        try:
            return val.split(' ')[1]
        except: 
            pass
    return None

def getNumber(val):
    try:
        return val.split(' ')[0]
    except:
        return val
    pass

constraints_units_dict = \
{index:getUnit((constraints.loc[index,'min'], constraints.loc[index,'max'])) 
 for index in constraints.index}
constraints['min'] = [getNumber(val) for val in constraints['min'].values ]
constraints['max'] = [getNumber(val) for val in constraints['max'].values ]

In [616]:
constraints_units_dict

{'calories': None,
 'protein': 'g',
 'fat_total': None,
 'saturated_fat': None,
 'cholesterol': 'g',
 'sodium': 'mg',
 'choline': 'mg',
 'folate': 'mcg',
 'niacin': 'mg',
 'pantothenic_acid': 'mg',
 'vitamin_b2': 'mg',
 'vitamin_b1': 'mg',
 'vitamin_a': 'mcg',
 'vitamin_b12': 'mcg',
 'vitamin_b6': 'mg',
 'vitamin_c': 'mg',
 'vitamin_d_mcg': 'mcg',
 'vitamin_k': 'mcg',
 'calcium': 'mg',
 'copper': 'mcg',
 'iron': 'mg',
 'magnesium': 'mg',
 'manganese': 'mg',
 'phosphorus': 'mg',
 'potassium': 'g',
 'selenium': 'mcg',
 'zinc': 'mg',
 'carbohydrates': 'g',
 'fiber': 'g',
 'molybdenum': 'mcg',
 'chloride': 'g',
 'biotin': 'mcg',
 'chromium': 'mcg',
 'cysteine_methionine': 'mg/kg/d',
 'histidine': 'mg/kg/d',
 'isoleucine': 'mg/kg/d',
 'leucine': 'mg/kg/d',
 'lysine': 'mg/kg/d',
 'methionine': 'mg/kg/d',
 'phenylalanine_tyrosine': 'mg/kg/d',
 'phenylalanine': 'mg/kg/d',
 'threonine': 'mg/kg/d',
 'tryptophan': 'mg/kg/d',
 'valine': 'mg/kg/d'}

In [617]:
constraints

Unnamed: 0,min,max
calories,2000.0,2000.0
protein,56.0,160.0
fat_total,22.2,78.0
saturated_fat,0.0,12.0
cholesterol,0.0,
sodium,1500.0,2300.0
choline,550.0,3500.0
folate,400.0,1000.0
niacin,16.0,35.0
pantothenic_acid,5.0,


In [622]:
nv_df_cnstr = nv_df[constraints.index.values]

In [630]:
nv_df_cnstr.to_csv('./Data_formatted/nv_df.csv')
constants.to_csv('')