# In [None]:  (Jupyter notebook cell marker left over from the export)
import pandas as pd
import numpy as np
from io import StringIO
import requests
import os
import json
import boto3
import logging
from botocore.exceptions import ClientError
from datetime import datetime, timedelta

import psycopg2
import psycopg2.extras as extras
from sqlalchemy import create_engine

# Postgres connection settings, read once at import time.
# A missing variable raises KeyError here, before any handler code runs.
ipaddress, dbname, username, password, port = (
    os.environ[setting]
    for setting in ('ipaddress', 'dbname', 'username', 'password', 'port')
)

def lambda_handler(event, context):
    """Fetch Tasty recipes for every cuisine tag, clean them and append them to Postgres.

    For each tag in the built-in list the recipes are downloaded with
    ``get_data`` and labelled with a ``tag`` column. The per-tag frames are
    concatenated, reduced with ``creat_subset``, cleaned with ``clean_data``
    and the flat/list columns are appended to the ``tasty_clean`` table.

    Args:
        event: Invocation payload (not used).
        context: Runtime context (not used).

    Returns:
        None. A confirmation line is printed after a successful upload.
    """
    # Function-scope import so the handler does not depend on a file-level change.
    from urllib.parse import quote

    tags = ['ethiopian', 'hawaiian', 'lebanese', 'swedish', 'persian', 'west_african',
        'indigenous', 'laotian', 'venezuelan', 'kenyan', 'peruvian', 'latin_american', 'brazilian',
        'korean', 'japanese', 'german', 'haitian', 'taiwanese', 'filipino', 'south_african',
        'jamaican', 'american', 'bbq', 'chinese', 'french', 'caribbean', 'vietnamese', 'fusion',
        'cuban', 'african', 'british', 'thai', 'puerto_rican', 'dominican', 'greek', 'indian',
        'seafood', 'middle_eastern', 'mexican', 'italian', 'soul_food']

    # Download one DataFrame per tag and record which tag produced each row.
    frames = []
    for tag in tags:
        frame = get_data(tag)
        frame["tag"] = tag
        frames.append(frame)

    # Stack the per-tag frames into one DataFrame (same order as the tag list).
    df_total = pd.concat(frames)

    df = creat_subset(df_total)

    df = clean_data(df)

    # Columns uploaded to the DB: the raw dict-valued columns are left out in
    # favour of their cleaned list counterparts.
    df_db = df[['name', 'original_video_url','keywords','num_servings','total_time_minutes','yields', 'country',
               'id', 'prep_time_minutes', 'description', 'cook_time_minutes', 'nutrition_fiber', 'nutrition_protein',
               'nutrition_fat', 'nutrition_calories', 'nutrition_sugar', 'nutrition_carbohydrates',
               'user_ratings_count_positive', 'user_ratings_score', 'user_ratings_count_negative',
               'total_time_tier_display_tier', 'topics_clean', 'tags_clean', 'credits_clean', 'instructions_clean',
               'ingredients']]

    # Percent-encode the credentials so characters such as '@', ':' or '/' in
    # the user name or password cannot corrupt the connection URL.
    # NOTE(review): if the environment already stores a percent-encoded
    # password, store the raw value instead to avoid double encoding.
    safe_user = quote(username, safe='')
    safe_password = quote(password, safe='')
    postgres_str = f'postgresql://{safe_user}:{safe_password}@{ipaddress}:{port}/{dbname}'

    # Create the engine and always release its pooled connections afterwards,
    # even when the upload fails.
    cnx = create_engine(postgres_str)
    try:
        df_db.to_sql('tasty_clean', con=cnx, index=False, if_exists='append')
    finally:
        cnx.dispose()

    # NOTE: a CSV export of the cleaned frame to S3 (bucket
    # 'tasty-datalake-bucket', object 'tasty_data.csv') used to be sketched
    # here as commented-out code and is currently disabled.

    print("Data was successfully loaded to DB.")

def get_data(tag, timeout=30):
    """Download the recipes listed under one Tasty tag as a flat DataFrame.

    Args:
        tag: Tag name sent as the ``tags`` query parameter of ``recipes/list``.
        timeout: Seconds to wait for the API before giving up, so a stalled
            request cannot block the caller indefinitely.

    Returns:
        The DataFrame built by ``pd.json_normalize`` from the ``results``
        list of the JSON response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
        KeyError: If a successful response has no ``results`` entry.
    """
    # Credentials for Tasty. The key is taken from the ``rapidapi_key``
    # environment variable when it is set; the literal is only a fallback so
    # existing deployments keep working.
    # TODO(security): rotate this key and remove the literal from source.
    headers = {
        'x-rapidapi-host': "tasty.p.rapidapi.com",
        'x-rapidapi-key': os.environ.get(
            'rapidapi_key', "362fc9c239mshcfe50bd2bfb56f4p1ca4e5jsncbaacdf9bb2a"
        ),
    }

    # Query to select data from the API.
    url = "https://tasty.p.rapidapi.com/recipes/list"
    querystring = {"from": "0", "size": "500", "tags": tag}

    response = requests.get(url, headers=headers, params=querystring, timeout=timeout)
    # Fail with a clear HTTP error instead of an obscure KeyError/JSON error
    # when the API rejects the request.
    response.raise_for_status()

    # json_normalize already returns a DataFrame with nested keys flattened.
    return pd.json_normalize(response.json()['results'])

    
def creat_subset(df):
    """Keep the recipes that have instructions and project them onto the working columns.

    Rows whose ``instructions`` value is missing are dropped, only the columns
    listed below are retained (in this order), and every dotted column name
    is renamed to its underscore form.

    Args:
        df: DataFrame containing at least all of the columns listed below.

    Returns:
        A new DataFrame with the filtered rows and the renamed columns.
    """
    wanted = [
        'name',
        'original_video_url',
        'topics',
        'keywords',
        'tags',
        'num_servings',
        'total_time_minutes',
        'yields',
        'country',
        'tips_and_ratings_enabled',
        'aspect_ratio',
        'credits',
        'sections',
        'instructions',
        'id',
        'prep_time_minutes',
        'description',
        'cook_time_minutes',
        'nutrition.fiber',
        'nutrition.protein',
        'nutrition.fat',
        'nutrition.calories',
        'nutrition.sugar',
        'nutrition.carbohydrates',
        'user_ratings.count_positive',
        'user_ratings.score',
        'user_ratings.count_negative',
        'total_time_tier.display_tier',
    ]

    # Dotted names become underscore names, e.g.
    # 'nutrition.fiber' -> 'nutrition_fiber'.
    underscored = {col: col.replace('.', '_') for col in wanted if '.' in col}

    has_instructions = df['instructions'].notna()
    return df.loc[has_instructions, wanted].rename(columns=underscored)
    
# clean data
def _pluck(records, key):
    """Return ``record[key]`` for every dict in ``records`` that contains ``key``."""
    return [record[key] for record in records if key in record]


def _ingredient_names(sections):
    """Return the ingredient names of all components across all ``sections``, in order."""
    names = []
    for section in sections:
        names.extend(
            component['ingredient']['name'] for component in section['components']
        )
    return names


def clean_data(df):
    """Drop incomplete/duplicate recipes and flatten the nested columns into plain lists.

    Steps:
      * reset the index (via ``reset_index``, so the previous index is kept
        as a regular column),
      * drop rows with a missing ``topics``, ``tags``, ``credits``,
        ``instructions`` or ``sections`` value,
      * drop rows that repeat an earlier ``name``,
      * add ``topics_clean``/``tags_clean`` (the ``name`` of each entry),
        ``credits_clean`` (the ``type`` of each entry), ``instructions_clean``
        (the ``display_text`` of each step) and ``ingredients`` (ingredient
        names of every component in every section),
      * lower-case ``name``.

    Args:
        df: DataFrame with at least the columns ``name``, ``topics``, ``tags``,
            ``credits``, ``instructions`` and ``sections``; the nested columns
            hold lists of dicts.

    Returns:
        A new, cleaned DataFrame; the input frame is not modified.
    """
    df = df.reset_index()

    # One dropna over all required columns is equivalent to dropping them one
    # by one. The explicit copy makes the result an independent frame so the
    # column assignments below do not trigger chained-assignment warnings.
    df = (
        df.dropna(subset=['topics', 'tags', 'credits', 'instructions', 'sections'])
        .drop_duplicates(subset=['name'])
        .copy()
    )

    # Reduce each list of dicts to the single field that is kept.
    df['topics_clean'] = [_pluck(row, 'name') for row in df['topics']]
    df['tags_clean'] = [_pluck(row, 'name') for row in df['tags']]
    df['credits_clean'] = [_pluck(row, 'type') for row in df['credits']]

    # Collect the ingredients of ALL sections of a recipe. Recipes without any
    # section get an empty list rather than another recipe's ingredients.
    df['ingredients'] = [_ingredient_names(sections) for sections in df['sections']]

    df['instructions_clean'] = [_pluck(row, 'display_text') for row in df['instructions']]

    # Change name to lowercase (after de-duplication, which compares the
    # original spelling).
    df['name'] = df['name'].str.lower()

    return df