In [2]:
import pandas as pd
import numpy as np

In [3]:
df_codes = pd.read_csv('data/nutrition_codes.csv')
df_codes.head()

Unnamed: 0,Numeric Code,GDD Variable Label
0,v01,Fruits
1,v02,Non-starchy vegetables
2,v03,Potatoes
3,v04,Other starchy vegetables
4,v05,Beans and legumes


In [4]:
df_codes.rename({'GDD Variable Label': 'label', 'Numeric Code': 'value'}, axis=1, inplace=True)
df_codes.head()

Unnamed: 0,value,label
0,v01,Fruits
1,v02,Non-starchy vegetables
2,v03,Potatoes
3,v04,Other starchy vegetables
4,v05,Beans and legumes


In [5]:
df_codes

Unnamed: 0,value,label
0,v01,Fruits
1,v02,Non-starchy vegetables
2,v03,Potatoes
3,v04,Other starchy vegetables
4,v05,Beans and legumes
5,v06,Nuts and seeds
6,v07,Refined grains
7,v08,Whole grains
8,v09,Total processed meats
9,v10,Unprocessed red meats


In [6]:
df_codes['label'].unique()

array(['Fruits', 'Non-starchy vegetables', 'Potatoes',
       'Other starchy vegetables', 'Beans and legumes', 'Nuts and seeds',
       'Refined grains', 'Whole grains', 'Total processed meats',
       'Unprocessed red meats', 'Total seafoods', 'Eggs', 'Cheese',
       'Yoghurt (including fermented milk)', 'Sugar-sweetened beverages',
       'Fruit juices', 'Coffee', 'Tea', 'Total carbohydrates',
       'Total protein', 'Saturated fat', 'Monounsaturated fatty acids',
       'Total omega-6 fat', 'Seafood omega-3 fat', 'Plant omega-3 fat',
       'Dietary cholesterol', 'Dietary fiber', 'Added sugars', 'Calcium',
       'Dietary sodium', 'Iodine', 'Iron', 'Magnesium', 'Potassium',
       'Selenium', 'Vitamin A w/ supplements', 'Vitamin B1', 'Vitamin B2',
       'Vitamin B3', 'Vitamin B6', 'Vitamin B9 (Folate)', 'Vitamin B12',
       'Vitamin C', 'Vitamin D', 'Vitamin E', 'Zinc', 'Total Milk'],
      dtype=object)

In [7]:
df_codes['label'].nunique()

47

In [8]:
df_serving_size = pd.read_csv('data/nutrition_serving_sizes.csv')
df_serving_size.head()

Unnamed: 0,GDD Variable Label,GDD Variable Unit
0,Added sugars,% of total kcal per day (energy contribution)
1,Beans and legumes,grams per day
2,Calcium,milligrams (mg) per day
3,Cheese,grams per day
4,Coffee,cups/day (1 cup=8 oz)


In [9]:
df_serving_size.rename({'GDD Variable Label': 'label', 'GDD Variable Unit': 'serving_size'}, axis=1, inplace=True)
df_serving_size.head()

Unnamed: 0,label,serving_size
0,Added sugars,% of total kcal per day (energy contribution)
1,Beans and legumes,grams per day
2,Calcium,milligrams (mg) per day
3,Cheese,grams per day
4,Coffee,cups/day (1 cup=8 oz)


In [10]:
df_serving_size['label'].unique()

array(['Added sugars', 'Beans and legumes', 'Calcium', 'Cheese', 'Coffee',
       'Dietary cholesterol', 'Dietary fiber', 'Dietary sodium', 'Eggs',
       'Fruit juices', 'Fruits', 'Iodine', 'Iron', 'Magnesium',
       'Monounsaturated fat', 'Non-starchy vegetables', 'Nuts and seeds',
       'Other starchy vegetables', 'Plant omega-3 (n-3) fat',
       'Plant protein', 'Potassium', 'Potatoes', 'Reduced fat milk',
       'Refined grains', 'Saturated fat', 'Seafood omega-3 (n-3) fat',
       'Selenium', 'Sugar-sweetened beverages', 'Tea',
       'Total animal protein', 'Total carbohydrates', 'Total energy',
       'Total Milk', 'Total omega-6 fatty acids', 'Total processed meats',
       'Total protein', 'Total seafoods', 'Unprocessed red meats',
       'Vitamin A with supplements', 'Vitamin A without supplements',
       'Vitamin B1', 'Vitamin B12', 'Vitamin B2', 'Vitamin B3',
       'Vitamin B6', 'Vitamin B9 (Folate)', 'Vitamin C', 'Vitamin D',
       'Vitamin E', 'Whole fat milk', 'Wh

In [11]:
df_serving_size['label'].nunique()

53

##### Find differences in the variable names between the two datasets

In [12]:
# get unique foods in both datasets
unique_codes_list = df_codes['label'].unique()
print('\nUnique unique_codes_list:', len(unique_codes_list))
unique_serving_sizes = df_serving_size['label'].unique()
print('Unique unique_serving_sizes:', len(unique_serving_sizes))

# find the foods not in df_codes
difference_1 = list(set(unique_serving_sizes) - set(unique_codes_list))
print('\nfoods not in df_codes', len(difference_1), ':', difference_1)

# find the foods not in df_serving_size
difference_2 = list(set(unique_codes_list) - set(unique_serving_sizes))
print('\nfoods not in df_serving_size', len(difference_2), ':', difference_2)


Unique unique_codes_list: 47
Unique unique_serving_sizes: 53

foods not in df_codes 11 : ['Vitamin A without supplements', 'Whole fat milk', 'Seafood omega-3 (n-3) fat', 'Plant protein', 'Monounsaturated fat', 'Vitamin A with supplements', 'Plant omega-3 (n-3) fat', 'Total omega-6 fatty acids', 'Reduced fat milk', 'Total energy', 'Total animal protein']

foods not in df_serving_size 5 : ['Seafood omega-3 fat', 'Vitamin A w/ supplements', 'Monounsaturated fatty acids', 'Total omega-6 fat', 'Plant omega-3 fat']


In [13]:
print('Total omega-6 fatty acids' in df_codes['label'].values)

False


##### Create dictionary to fix differences in variable labels

In [14]:
values_names = [
    {
        "code": "Total omega-6 fatty acids",
        "serving": "Total omega-6 fat"
    },
    {
        "code": "Plant omega-3 (n-3) fat",
        "serving": "Plant omega-3 fat"
    },
    {
        "code": "Vitamin A without supplements",
        "serving": "Vitamin A w/ supplements"
    },
    {
        "code": "Seafood omega-3 (n-3) fat",
        "serving": "Seafood omega-3 fat"
    },
    {
        "code": "Monounsaturated fat",
        "serving": "Monounsaturated fatty acids"
    }
]

##### Reassing names in the df_serving_size dataframe

In [15]:
for name in values_names:
  df_serving_size['label'] = np.where(df_serving_size['label'] == name['serving'], name['code'], df_serving_size['label'])

df_serving_size.head()



Unnamed: 0,label,serving_size
0,Added sugars,% of total kcal per day (energy contribution)
1,Beans and legumes,grams per day
2,Calcium,milligrams (mg) per day
3,Cheese,grams per day
4,Coffee,cups/day (1 cup=8 oz)


### Merge datasets

In [16]:
data = pd.merge(df_codes, df_serving_size, on="label")

In [17]:
data

Unnamed: 0,value,label,serving_size
0,v01,Fruits,grams per day
1,v02,Non-starchy vegetables,grams per day
2,v03,Potatoes,grams per day
3,v04,Other starchy vegetables,grams per day
4,v05,Beans and legumes,grams per day
5,v06,Nuts and seeds,grams per day
6,v07,Refined grains,grams per day
7,v08,Whole grains,grams per day
8,v09,Total processed meats,grams per day
9,v10,Unprocessed red meats,grams per day


In [18]:
food_groups = [
    {
        "category": "Foods",
        "values": ['Fruits', 'Non-starchy vegetables', 'Potatoes', 'Other starchy vegetables', 'Beans and legumes', 
                   'Nuts and seeds', 'Refined grains', 'Whole grains', 'Unprocessed red meats', 'Total processed meats', 
                  'Total seafoods', 'Eggs', 'Cheese', 'Yoghurt (including fermented milk)']
    }, 
    {
        "category": "Beverages",
        "values": ['Sugar-sweetened beverages', 'Fruit juices', 'Coffee', 'Tea', 'Whole Fat Milk', 'Reduced Fat Milk',
                  'Total Milk']
    },
    {
        "category": "Macronutrients",
        "values": ['Total carbohydrates', 'Total protein', 'Total Animal Protein', 'Plant Protein', 'Saturated fat', 
                  'Monounsaturated Fat', 'Total Omega-6 Fatty Acids', 'Seafood Omega-3 Fatty Acids', 'Plant Omega-3 Fatty Acids',
                  'Dietary cholesterol', 'Dietary fiber', 'Added sugars']
    },
    {
        "category": "Micronutrients",
        "values": ['Calcium', 'Dietary sodium', 'Iodine', 'Iron', 'Magnesium', 'Potassium', 'Selenium', 
                   'Vitamin A w/ supplements', 'Vitamin B1', 'Vitamin B2', 'Vitamin B3', 'Vitamin B6', 'Vitamin B9 (Folate)',
                  'Vitamin B12', 'Vitamin C', 'Vitamin D', 'Vitamin E', 'Zinc']
    }
]

In [19]:
data.head()
data['food_group'] = ''
data.head()

Unnamed: 0,value,label,serving_size,food_group
0,v01,Fruits,grams per day,
1,v02,Non-starchy vegetables,grams per day,
2,v03,Potatoes,grams per day,
3,v04,Other starchy vegetables,grams per day,
4,v05,Beans and legumes,grams per day,


In [20]:
for group in food_groups:
#     print(group['category'], group['values'])
    for index, row in data.iterrows():
#         print(row['label'])
        if row['label'] in group['values']:
            row['food_group'] = group['category']
#     for nutrient in group['values']:
#         print(nutrient)
data

Unnamed: 0,value,label,serving_size,food_group
0,v01,Fruits,grams per day,Foods
1,v02,Non-starchy vegetables,grams per day,Foods
2,v03,Potatoes,grams per day,Foods
3,v04,Other starchy vegetables,grams per day,Foods
4,v05,Beans and legumes,grams per day,Foods
5,v06,Nuts and seeds,grams per day,Foods
6,v07,Refined grains,grams per day,Foods
7,v08,Whole grains,grams per day,Foods
8,v09,Total processed meats,grams per day,Foods
9,v10,Unprocessed red meats,grams per day,Foods


In [21]:
for index, row in data.iterrows():
    row['value'] = row['value'].replace('v', '')
    
data['value'] = data['value'].astype(int)


In [22]:
data

Unnamed: 0,value,label,serving_size,food_group
0,1,Fruits,grams per day,Foods
1,2,Non-starchy vegetables,grams per day,Foods
2,3,Potatoes,grams per day,Foods
3,4,Other starchy vegetables,grams per day,Foods
4,5,Beans and legumes,grams per day,Foods
5,6,Nuts and seeds,grams per day,Foods
6,7,Refined grains,grams per day,Foods
7,8,Whole grains,grams per day,Foods
8,9,Total processed meats,grams per day,Foods
9,10,Unprocessed red meats,grams per day,Foods


In [23]:
data.shape

(42, 4)

In [24]:
# save as json
# data.to_json('final_datasets/nutrition_filters.json', orient="records")