# Setup

In [1]:
import json
import pandas as pd
import numpy as np
import os
import boto3

In [2]:
# Read the AWS credentials and region from the environment instead of hardcoding them.
# os.environ[...] raises KeyError if a variable is not set.
AWS_ACCESS_KEY_ID = os.environ["AWS_ACCESS_KEY_ID"]
AWS_SECRET_ACCESS_KEY = os.environ["AWS_SECRET_ACCESS_KEY"]
AWS_DEFAULT_REGION = os.environ["AWS_DEFAULT_REGION"]

In [3]:
# Display which AWS region is configured in the environment.
os.environ['AWS_DEFAULT_REGION']

'eu-west-3'

In [4]:
# Name of the DynamoDB table that the scan below reads.
targettabname = "Recipes"

# DynamoDB client configured explicitly with the values read from the environment.
dynamoclient = boto3.client(
    'dynamodb',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=AWS_DEFAULT_REGION,
)

# Download data from DynamoDB -> JSON

In [5]:
# Parameters of the table scan, kept together so they are easy to review.
scan_kwargs = {
    "TableName": targettabname,
    "Select": 'ALL_ATTRIBUTES',
    "ReturnConsumedCapacity": 'NONE',
    "ConsistentRead": True,
}

dynamopaginator = dynamoclient.get_paginator('scan')
dynamoresponse = dynamopaginator.paginate(**scan_kwargs)

# Show what kind of object paginate() handed back.
type(dynamoresponse)

botocore.paginate.PageIterator

In [15]:
# Merge every page of the scan into a single response-shaped dict.
# Assigning each page to ds in turn would keep only the LAST page, silently
# dropping every item returned on earlier pages of a multi-page scan.
ds = None
for page in dynamoresponse:
    if ds is None:
        # Start from a shallow copy of the first page so the usual keys
        # ('Items', 'Count', 'ScannedCount', 'ResponseMetadata') are all present.
        ds = dict(page)
        ds["Items"] = list(page.get("Items", []))
    else:
        ds["Items"].extend(page.get("Items", []))
        ds["Count"] = ds.get("Count", 0) + page.get("Count", 0)
        ds["ScannedCount"] = ds.get("ScannedCount", 0) + page.get("ScannedCount", 0)
        # Keep the metadata of the most recent request.
        ds["ResponseMetadata"] = page.get("ResponseMetadata", ds.get("ResponseMetadata"))
    # The merged result is complete, so it must not advertise a continuation key.
    ds.pop("LastEvaluatedKey", None)

In [14]:
# Persist the raw scan result so it can be inspected without querying DynamoDB again.
with open("foo.json", mode="w") as json_file:
    json.dump(ds, fp=json_file)

# Data exploration

In [17]:
# List the top-level keys of the scan response held in ds.
ds.keys()

dict_keys(['Items', 'Count', 'ScannedCount', 'ResponseMetadata'])

In [21]:
# Keep only the list of items from the scan response.
# The isinstance guard makes this cell safe to re-run: once ds has been rebound
# to the item list, indexing it with "Items" again would raise TypeError.
if isinstance(ds, dict):
    ds = ds["Items"]

In [25]:
# Take the first item as a sample recipe for exploring the record layout.
first_recipe = ds[0]

In [26]:
# Top-level attributes of a recipe item.
first_recipe.keys()

dict_keys(['date', 'nutriScore', 'id', 'data'])

In [30]:
# Attributes nested under the 'data' attribute (its value sits behind the 'M' type tag).
first_recipe['data']['M'].keys()

dict_keys(['mealType', 'ingredients', 'title', 'numServings', 'cookingMethod', 'nutritionTable'])

In [55]:
# Look at the first entry of the recipe's ingredient list.
first_recipe['data']['M']['ingredients']['L'][0]

{'M': {'hover': {'BOOL': False},
  'amount': {'S': '1'},
  'unit': {'S': 'serving'},
  'nutrition': {'M': {'kcal': {'M': {'name': {'S': 'Calories'},
      'amount': {'N': '146.4'},
      'unit': {'S': 'kcal'},
      'percentOfDailyNeeds': {'N': '7.32'}}},
    'sodium': {'M': {'name': {'S': 'Sodium'},
      'amount': {'N': '92.72'},
      'unit': {'S': 'mg'},
      'percentOfDailyNeeds': {'N': '4.03'}}},
    'sugars': {'M': {'name': {'S': 'Sugar'},
      'amount': {'N': '11.74'},
      'unit': {'S': 'g'},
      'percentOfDailyNeeds': {'N': '13.04'}}},
    'carbs': {'M': {'name': {'S': 'Net Carbohydrates'},
      'amount': {'N': '11.39'},
      'unit': {'S': 'g'},
      'percentOfDailyNeeds': {'N': '4.14'}}},
    'protein': {'M': {'name': {'S': 'Protein'},
      'amount': {'N': '8'},
      'unit': {'S': 'g'},
      'percentOfDailyNeeds': {'N': '16.01'}}},
    'fat': {'M': {'name': {'S': 'Fat'},
      'amount': {'N': '7.81'},
      'unit': {'S': 'g'},
      'percentOfDailyNeeds': {'N': '1

In [28]:
# Look at the recipe-level nutrition table.
first_recipe["data"]['M']["nutritionTable"]['M']

{'kcal': {'M': {'percentOfDailyNeeds': {'N': '17.82'},
   'amount': {'N': '356.4'},
   'unit': {'S': ''}}},
 'sodium': {'M': {'percentOfDailyNeeds': {'N': '6.12'},
   'amount': {'N': '140.72'},
   'unit': {'S': 'mg'}}},
 'sugars': {'M': {'percentOfDailyNeeds': {'N': '28.82'},
   'amount': {'N': '25.94'},
   'unit': {'S': 'g'}}},
 'carbs': {'M': {'percentOfDailyNeeds': {'N': '12.58'},
   'amount': {'N': '34.59'},
   'unit': {'S': 'g'}}},
 'protein': {'M': {'percentOfDailyNeeds': {'N': '68.43'},
   'amount': {'N': '34.21'},
   'unit': {'S': 'g'}}},
 'fat': {'M': {'percentOfDailyNeeds': {'N': '15.63'},
   'amount': {'N': '10.16'},
   'unit': {'S': 'g'}}},
 'saturates': {'M': {'percentOfDailyNeeds': {'N': '32.21'},
   'amount': {'N': '5.15'},
   'unit': {'S': 'g'}}},
 'fibre': {'M': {'percentOfDailyNeeds': {'N': '10.4'},
   'amount': {'N': '2.6'},
   'unit': {'S': 'g'}}}}

In [39]:
# Keep the fibre entry as a small sample for trying out the extraction helper below.
fibre_dict = first_recipe["data"]['M']["nutritionTable"]['M']['fibre']

# Transform Recipes.json -> csv

recipe_model = {
    Id: number,
    title: string,
    kcal: float,
    sodium: float,
    sugars: float,
    carbs: float,
    protein: float,
    fat: float,
    saturates: float,
    fibre: float,
    nutriScore: float,
}

In [51]:
def convert(amount: float, unit_ratio: dict = None, unit_from: str = 'mg') -> "float | None":
    """
    Convert an amount expressed in ``unit_from`` into the base unit of ``unit_ratio``.

    Parameters
    ----------
    amount : the quantity to convert.
    unit_ratio : maps a unit name to how much of the base unit one of that unit
        is worth. For example ``"mg": 0.001`` means 1 mg = 0.001 g. When omitted,
        ``{"mg": 0.001, "g": 1}`` is used (base unit: gram).
    unit_from : the unit ``amount`` is currently expressed in.

    Returns
    -------
    ``amount * unit_ratio[unit_from]``, or ``None`` (after printing a message)
    when ``unit_from`` is not a key of ``unit_ratio``.
    """
    # Build the default table per call instead of sharing one mutable dict
    # object across every invocation through the signature.
    if unit_ratio is None:
        unit_ratio = {"mg": 0.001, "g": 1}
    # Only the lookup is guarded, so an unrelated KeyError cannot be mistaken
    # for an unknown unit.
    try:
        ratio = unit_ratio[unit_from]
    except KeyError:
        print(f"Unit {unit_from!r} doesn't exist in the ratio dict")
        return None
    return amount * ratio
def extract_reqFields(data: dict, req_fields: list) -> dict:
    """Pull the requested fields out of a DynamoDB-formatted attribute map.

    Each value in ``data`` is expected to be a dict keyed by a type tag, e.g.
        {'percentOfDailyNeeds': {'N': '10.4'},
         'amount': {'N': '2.6'},
         'unit': {'S': 'g'}}

    Values tagged 'N' are converted with ``float``; values tagged 'M', 'S' or
    'BOOL' are returned as stored. Any other tag prints a message and makes the
    whole call return ``None``. A requested field missing from ``data`` raises
    ``KeyError``.
    """
    passthrough_tags = {'M', 'S', 'BOOL'}
    extracted = {}
    for name in req_fields:
        for tag, raw_value in data[name].items():
            if tag == 'N':
                extracted[name] = float(raw_value)
            elif tag in passthrough_tags:
                extracted[name] = raw_value
            else:
                print(f"Type of {name} field not supported: {tag}")
                return None
    return extracted

In [52]:
# Try extract_reqFields on the fibre entry, requesting only its amount and unit.
extract_reqFields(fibre_dict['M'], req_fields= ["amount", "unit"])

{'amount': 2.6, 'unit': 'g'}

In [77]:
def preprocess_nutrition(nutri_table: dict, req_nutri: list) -> list:
    """
    Turn a recipe's nutrition table into a flat list of amounts.

    ``req_nutri`` names the nutrients to extract; the returned list has one
    entry per name, in the same order. Example:
    ["kcal" ,"sodium", "sugars", "carbs", "protein", "fat", "saturates", "fibre"].

    Amounts in "mg" are scaled to grams; amounts in "g" or with an empty unit
    string are kept unchanged.
    """
    # Conversion factors applied to each amount according to its unit.
    gram_ratios = {'': 1, "mg": 0.001, "g": 1}
    wanted_fields = ["amount", "unit"]

    amounts = []
    for nutrient_name in req_nutri:
        fields = extract_reqFields(nutri_table[nutrient_name]['M'], req_fields=wanted_fields)
        amounts.append(
            convert(amount=fields["amount"], unit_ratio=gram_ratios, unit_from=fields["unit"])
        )
    return amounts

In [86]:
def preprocess_recipe(recipe: dict, req_nutri: list) -> list:
    """
    Flatten one recipe item into a single row.

    ``recipe`` is a dict with the keys ['date', 'nutriScore', 'id', 'data'].
    The row is laid out as [id, nutriScore, <one amount per name in req_nutri>].
    """
    row = [
        int(recipe["id"]["N"]),
        float(recipe["nutriScore"]["N"]),
    ]
    nutrition_map = recipe["data"]["M"]["nutritionTable"]['M']
    row += preprocess_nutrition(nutrition_map, req_nutri=req_nutri)
    return row

In [78]:
# Try preprocess_nutrition on the first recipe's nutrition table.
preprocess_nutrition(first_recipe["data"]['M']["nutritionTable"]['M'], ["kcal" ,"sodium", "sugars", "carbs", "protein", "fat", "saturates", "fibre"])

[356.4, 0.14072, 25.94, 34.59, 34.21, 10.16, 5.15, 2.6]

In [87]:
# Nutrients to export, in the order they appear in each row.
nutrition_list = [
    "kcal", "sodium", "sugars", "carbs",
    "protein", "fat", "saturates", "fibre",
]

# One flat row per recipe item.
allRecipes_inf = [
    preprocess_recipe(recipe, req_nutri=nutrition_list)
    for recipe in ds
]

# Preview the first row.
allRecipes_inf[0]

[-297529017418066210,
 60.0,
 356.4,
 0.14072,
 25.94,
 34.59,
 34.21,
 10.16,
 5.15,
 2.6]

In [88]:
# Number of recipe rows that were produced.
len(allRecipes_inf)

8

In [89]:
# Column names, matching the layout of each row: id, nutriScore, then the nutrients.
df_cols = [
    "id", "nutriScore",
    "kcal", "sodium", "sugars", "carbs",
    "protein", "fat", "saturates", "fibre",
]

recipe_df = pd.DataFrame(data=allRecipes_inf, columns=df_cols)
recipe_df

Unnamed: 0,id,nutriScore,kcal,sodium,sugars,carbs,protein,fat,saturates,fibre
0,-297529017418066210,60.0,356.4,0.14072,25.94,34.59,34.21,10.16,5.15,2.6
1,2984524080050943643,69.09,368.5,1.55038,10.12,40.6,26.65,8.26,6.1,7.54
2,-134860378839050690,69.09,368.5,1.55038,10.12,40.6,26.65,8.26,6.1,7.54
3,-6989815825829559279,81.82,505.25,0.28775,6.25,15.25,54.5,21.5,3.25,8.75
4,1662862941331579196,72.73,115.0,0.002,23.0,25.0,0.0,0.0,0.0,5.0
5,6525688955145446406,80.0,58.0,0.001,0.0,0.0,1.0,3.0,0.0,4.0
6,-7950659694363478456,61.82,397.0,1.428,0.0,1.0,28.0,29.0,10.0,0.0
7,6646347900262682700,80.0,58.32,0.00192,0.0,0.93,1.98,3.69,0.4,4.13


In [91]:
# Write the table to recipes.csv; index = False leaves the DataFrame index out of the file.
recipe_df.to_csv("recipes.csv", index = False)

# Upload csv to S3