# Setup

In [1]:
import json
import pandas as pd
import numpy as np
import os
import boto3

In [7]:
AWS_ACCESS_KEY_ID = os.environ["AWS_ACCESS_KEY_ID"]
AWS_SECRET_ACCESS_KEY = os.environ["AWS_SECRET_ACCESS_KEY"]
AWS_DEFAULT_REGION = os.environ["AWS_DEFAULT_REGION"]

In [11]:
s3_client = boto3.client('s3', region_name=AWS_DEFAULT_REGION,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
dynamo_client = boto3.client('dynamodb', region_name=AWS_DEFAULT_REGION,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
targettabname = "Recipes"
bucket_name = "toeat-mlbucket"
file_name = "recipes.csv"

In [12]:
dynamo_client.describe_table(TableName='Recipes')

{'Table': {'AttributeDefinitions': [{'AttributeName': 'id',
    'AttributeType': 'N'}],
  'TableName': 'Recipes',
  'KeySchema': [{'AttributeName': 'id', 'KeyType': 'HASH'}],
  'TableStatus': 'ACTIVE',
  'CreationDateTime': datetime.datetime(2022, 8, 11, 17, 58, 43, 579000, tzinfo=tzlocal()),
  'ProvisionedThroughput': {'LastDecreaseDateTime': datetime.datetime(2022, 10, 2, 15, 40, 9, 572000, tzinfo=tzlocal()),
   'NumberOfDecreasesToday': 0,
   'ReadCapacityUnits': 1,
   'WriteCapacityUnits': 1},
  'TableSizeBytes': 82844,
  'ItemCount': 22,
  'TableArn': 'arn:aws:dynamodb:eu-central-1:382406425689:table/Recipes',
  'TableId': '3500a6df-73c6-4066-bbce-35f4447eb585'},
 'ResponseMetadata': {'RequestId': 'VNAPEJKQF45G6A5VRHH9T2BLI3VV4KQNSO5AEMVJF66Q9ASUAAJG',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'server': 'Server',
   'date': 'Sat, 07 Jan 2023 15:21:41 GMT',
   'content-type': 'application/x-amz-json-1.0',
   'content-length': '509',
   'connection': 'keep-alive',
   'x-amzn-reques

# Download data from DynamoDB -> Json

In [6]:
dynamopaginator = dynamo_client.get_paginator('scan')
dynamoresponse = dynamopaginator.paginate(
    TableName=targettabname,
    Select='ALL_ATTRIBUTES',
    ReturnConsumedCapacity='NONE',
    ConsistentRead=True
)
type(dynamoresponse)

botocore.paginate.PageIterator

In [7]:
with open("Recipes.json", "w") as f:
    for page in dynamoresponse:
        ds = page
        json.dump(ds, f)

# Data exploration

In [17]:
ds.keys()

dict_keys(['Items', 'Count', 'ScannedCount', 'ResponseMetadata'])

In [25]:
first_recipe = ds[0]

In [26]:
first_recipe.keys()

dict_keys(['date', 'nutriScore', 'id', 'data'])

In [30]:
first_recipe['data']['M'].keys()

dict_keys(['mealType', 'ingredients', 'title', 'numServings', 'cookingMethod', 'nutritionTable'])

In [55]:
first_recipe['data']['M']['ingredients']['L'][0]

{'M': {'hover': {'BOOL': False},
  'amount': {'S': '1'},
  'unit': {'S': 'serving'},
  'nutrition': {'M': {'kcal': {'M': {'name': {'S': 'Calories'},
      'amount': {'N': '146.4'},
      'unit': {'S': 'kcal'},
      'percentOfDailyNeeds': {'N': '7.32'}}},
    'sodium': {'M': {'name': {'S': 'Sodium'},
      'amount': {'N': '92.72'},
      'unit': {'S': 'mg'},
      'percentOfDailyNeeds': {'N': '4.03'}}},
    'sugars': {'M': {'name': {'S': 'Sugar'},
      'amount': {'N': '11.74'},
      'unit': {'S': 'g'},
      'percentOfDailyNeeds': {'N': '13.04'}}},
    'carbs': {'M': {'name': {'S': 'Net Carbohydrates'},
      'amount': {'N': '11.39'},
      'unit': {'S': 'g'},
      'percentOfDailyNeeds': {'N': '4.14'}}},
    'protein': {'M': {'name': {'S': 'Protein'},
      'amount': {'N': '8'},
      'unit': {'S': 'g'},
      'percentOfDailyNeeds': {'N': '16.01'}}},
    'fat': {'M': {'name': {'S': 'Fat'},
      'amount': {'N': '7.81'},
      'unit': {'S': 'g'},
      'percentOfDailyNeeds': {'N': '1

In [28]:
first_recipe["data"]['M']["nutritionTable"]['M']

{'kcal': {'M': {'percentOfDailyNeeds': {'N': '17.82'},
   'amount': {'N': '356.4'},
   'unit': {'S': ''}}},
 'sodium': {'M': {'percentOfDailyNeeds': {'N': '6.12'},
   'amount': {'N': '140.72'},
   'unit': {'S': 'mg'}}},
 'sugars': {'M': {'percentOfDailyNeeds': {'N': '28.82'},
   'amount': {'N': '25.94'},
   'unit': {'S': 'g'}}},
 'carbs': {'M': {'percentOfDailyNeeds': {'N': '12.58'},
   'amount': {'N': '34.59'},
   'unit': {'S': 'g'}}},
 'protein': {'M': {'percentOfDailyNeeds': {'N': '68.43'},
   'amount': {'N': '34.21'},
   'unit': {'S': 'g'}}},
 'fat': {'M': {'percentOfDailyNeeds': {'N': '15.63'},
   'amount': {'N': '10.16'},
   'unit': {'S': 'g'}}},
 'saturates': {'M': {'percentOfDailyNeeds': {'N': '32.21'},
   'amount': {'N': '5.15'},
   'unit': {'S': 'g'}}},
 'fibre': {'M': {'percentOfDailyNeeds': {'N': '10.4'},
   'amount': {'N': '2.6'},
   'unit': {'S': 'g'}}}}

In [39]:
fibre_dict = first_recipe["data"]['M']["nutritionTable"]['M']['fibre']

# Transform Recipes.json -> csv

recipe_model = {
    Id: number,
    title: string,
    nutriScore: float,
    mealType: string,
    kcal: float,
    sodium: float,
    sugars: float,
    carbs: float,
    protein: float,
    fat: float,
    saturates: float,
    fibre: float,
}

In [8]:
def convert(amount: float, unit_ratio: dict = {"mg": 0.001, "g": 1}, unit_from: str = 'mg') -> float:
    """
    Convert an amount from unit_from to unit_to. Unit ratio is a dict that shows how much of the key is equal to 1g.
    For example, "mg": 0.001 means 1 mg = 0.001g.
    """
    try:
        to_amount = amount * unit_ratio[unit_from]
    except KeyError:
        print("Units don't exist in the ration dict")
        return None
    return to_amount
def extract_reqFields(data: dict, req_fields: list) -> dict:
    """Read in a nutrition dictionary follow format returned by AWS DynamoDB API. Then return the needed fields in appropriate type.
    Symbols that 
    Example: 
    {'M': {'percentOfDailyNeeds': {'N': '10.4'},
      'amount': {'N': '2.6'},
      'unit': {'S': 'g'}}}
    Supported types: 'S', 'N', 'M', 'BOOL'
    """
    result = {}
    for field in req_fields:
        field_data = data[field]
        field_keys = field_data.keys()
        for key in field_keys:
            if key in set(['M', 'S', 'BOOL']):
                result[field] = field_data[key]
            elif key == 'N':
                result[field] = float(field_data[key])
            else:
                print(f"Type of {field} field not supported: {key}")
                return None
    return result

In [9]:
def preprocess_nutrition(nutri_table: dict, req_nutri: list) -> list:
    """
    Preprocess nutritions of a recipe, convert to g, and adding to a list.
    The input req_nutri provides the list of required nutritions. The returned list will follow the order provided in req_nutri.
    Example: ["kcal" ,"sodium", "sugars", "carbs", "protein", "fat", "saturates", "fibre"].
    """
    result = []
    for nutri in req_nutri:
        nutri_detail = extract_reqFields(nutri_table[nutri]['M'], req_fields = ["amount", "unit"])
        org_amount = nutri_detail["amount"]
        transformed_amount = convert(amount = org_amount, unit_ratio = {'': 1, "mg": 0.001, "g": 1}, unit_from = nutri_detail["unit"])
        result.append(transformed_amount)
    return result

In [10]:
def preprocess_recipe(recipe: dict, req_nutri: list) -> list:
    """
    Preprocess a recipe and put relevant data into a list.
    Recipe is a dict with the following keys: ['date', 'nutriScore', 'id', 'data']
    The returned list follows the Recipes table modelling.
    """
    result = []
    recipe_data = recipe["data"]["M"]
    recipe_id = int(recipe["id"]["N"])
    recipe_title = recipe_data["title"]["S"]
    recipe_score = float(recipe["nutriScore"]["N"])
    recipe_mealType = recipe_data["mealType"]["S"]
    result.extend([recipe_id, recipe_title, recipe_score, recipe_mealType])
    nutri_table = recipe_data["nutritionTable"]['M']
    result.extend(preprocess_nutrition(nutri_table, req_nutri = req_nutri))
    return result

In [11]:
ds = ds["Items"]

In [12]:
nutrition_list =  ["kcal" ,"sodium", "sugars", "carbs", "protein", "fat", "saturates", "fibre"]
allRecipes_inf = []
for recipe in ds:
    allRecipes_inf.append(preprocess_recipe(recipe, req_nutri = nutrition_list))
allRecipes_inf[0]

[-7350875131303561697,
 'Tuna Pasta Salad',
 81.82,
 'Breakfast',
 486.67,
 0.20700000000000002,
 5.0,
 54.67,
 24.67,
 16.0,
 1.67,
 3.33]

In [13]:
df_cols = ["id", "title", "nutriScore", "mealType", "kcal" ,"sodium", "sugars", "carbs", "protein", "fat", "saturates", "fibre"]
recipe_df = pd.DataFrame(allRecipes_inf, columns=df_cols)
recipe_df

Unnamed: 0,id,title,nutriScore,mealType,kcal,sodium,sugars,carbs,protein,fat,saturates,fibre
0,-7350875131303561697,Tuna Pasta Salad,81.82,Breakfast,486.67,0.207,5.0,54.67,24.67,16.0,1.67,3.33
1,-3720415708234728595,Air Fryer Hot Dogs,74.55,Lunch,222.75,0.50225,2.5,25.5,8.75,9.0,2.75,1.5
2,4730887800733880567,Steak and potato,80.0,Dinner,710.0,0.38,3.0,49.0,50.0,31.0,13.0,6.0
3,-8968252886510129462,Crocket,74.55,Snack,242.0,0.055,35.0,38.0,15.0,0.0,0.0,3.0
4,-5195237535349084959,Broccoli Cheese Soup,70.91,Lunch,141.0,0.21733,1.75,9.33,2.5,9.92,6.08,0.5
5,-6689477957593181560,Stampot,63.64,Dinner,1395.0,3.236,3.0,65.0,77.0,82.0,28.0,12.0
6,1838749280961719950,Test Minh,63.64,Lunch,115.0,0.01,24.0,27.0,0.0,0.0,0.0,0.0
7,-3622805872274632641,Almond butter,49.09,Snack,2231.8,3.2172,47.0,306.8,85.2,60.6,6.8,34.6
8,1126925830304381983,Sear tuna and potatoes,69.09,Dinner,368.5,1.55038,10.12,40.6,26.65,8.26,6.1,7.54
9,4080139527318834029,Seared tuna steak & sweet potato wedges,76.36,Dinner,351.5,0.4435,5.0,22.0,34.0,11.5,8.5,3.5


In [14]:
recipe_df.to_csv("recipes.csv", index = False)

# Upload csv to S3

In [14]:
s3_client.upload_file("recipes.csv", bucket_name, file_name)