In [None]:
import psycopg2
import psycopg2.extras as extras
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from io import StringIO
import requests
import os
import json

# Postgres connection settings, read once when the module is imported.
# Every lookup is a plain os.environ[...] access, so a missing variable
# raises KeyError at import time.
ipaddress, dbname, username, password, port = (
    os.environ[key]
    for key in ('ipaddress', 'dbname', 'username', 'password', 'port')
)

def lambda_handler(event, context):
    """Fetch Tasty recipes for a fixed list of tags and append them to Postgres.

    ``get_data`` is called once per tag, the resulting frames are
    concatenated, reduced to the stored columns by ``creat_subset`` and
    appended to the ``tasty`` table, which is created if it does not exist.

    Args:
        event: Invocation payload; not read by this handler.
        context: Runtime context; not read by this handler.

    Returns:
        None. A confirmation line is printed after a successful load.

    Raises:
        Nothing is caught here: errors from ``get_data``, pandas, psycopg2
        or SQLAlchemy propagate to the caller. The database connection and
        the engine are released in every case.
    """
    # Local import so the block does not depend on a file-level import.
    from urllib.parse import quote_plus

    tags = ['ethiopian', 'hawaiian', 'lebanese', 'swedish', 'persian', 'west_african',
        'indigenous', 'laotian', 'venezuelan', 'kenyan', 'peruvian', 'latin_american', 'brazilian',
        'korean', 'japanese', 'german', 'haitian', 'taiwanese', 'filipino', 'south_african',
        'jamaican', 'american', 'bbq', 'chinese', 'french', 'caribbean', 'vietnamese', 'fusion',
        'cuban', 'african', 'british', 'thai', 'puerto_rican', 'dominican', 'greek', 'indian',
        'seafood', 'middle_eastern', 'mexican', 'italian', 'soul_food']

    # One DataFrame per tag, labelled with the tag it was requested for.
    # NOTE(review): creat_subset does not select the "tag" column, so this
    # label is dropped before the insert - confirm whether that is intended.
    frames = []
    for t in tags:
        frame = get_data(t)
        frame["tag"] = t
        frames.append(frame)

    # Stack the per-tag frames into a single frame (in tag order).
    df_total = pd.concat(frames)

    df = creat_subset(df_total)

    # Serialise the nested columns to JSON text so they fit the jsonb columns.
    for column in ('topics', 'tags', 'credits', 'sections', 'instructions'):
        df[column] = df[column].apply(json.dumps)

    # Table definition; column names match the frame returned by creat_subset.
    create_table_sql = "CREATE TABLE IF NOT EXISTS tasty \
    (name TEXT, original_video_url TEXT, topics jsonb, keywords TEXT, tags jsonb, num_servings NUMERIC, \
    total_time_minutes NUMERIC, yields TEXT, country TEXT, tips_and_ratings_enabled BOOLEAN, aspect_ratio TEXT, \
    credits jsonb, sections jsonb, instructions jsonb, id INT, prep_time_minutes NUMERIC, description TEXT, \
    cook_time_minutes NUMERIC, nutrition_fiber NUMERIC, nutrition_protein NUMERIC, nutrition_fat NUMERIC, \
    nutrition_calories NUMERIC, nutrition_sugar NUMERIC, nutrition_carbohydrates NUMERIC, \
    user_ratings_count_positive NUMERIC, user_ratings_score NUMERIC, user_ratings_count_negative NUMERIC, \
    total_time_tier_display_tier TEXT);"

    # Keyword arguments let psycopg2 do the quoting (a password with spaces
    # or quotes would break a hand-formatted DSN) and pass the configured
    # port, which was previously read from the environment but never used.
    conn = psycopg2.connect(
        host=ipaddress,
        dbname=dbname,
        user=username,
        password=password,
        port=port,
    )
    try:
        conn.set_session(autocommit=True)

        # Credentials are URL-escaped because characters such as "@" or "/"
        # would otherwise corrupt the connection URL; the port now comes
        # from the environment instead of a hard-coded 5432.
        engine = create_engine(
            "postgresql://{}:{}@{}:{}/{}".format(
                quote_plus(username), quote_plus(password), ipaddress, port, dbname
            )
        )
        try:
            # Create the table if it does not exist yet.
            with conn.cursor() as cur:
                cur.execute(create_table_sql)

            # Append the new rows to the table.
            df.to_sql('tasty', engine, if_exists='append', index=False)

            # NOTE(review): de-duplication is disabled, as it was before
            # this change, so repeated runs and recipes that match several
            # tags leave duplicate ids in the table. The statement that was
            # prepared for it is:
            #   DELETE FROM tasty x USING tasty y
            #   WHERE x.id = y.id AND x.ctid > y.ctid;
        finally:
            engine.dispose()
    finally:
        # Always release the connection, even when the load fails.
        conn.close()

    print("Data was successfully loaded into the DB.")
    

def get_data(tag, timeout=30):
    """Request recipes for one tag from the Tasty API and return them as a DataFrame.

    Sends a single GET to ``/recipes/list`` with ``from=0`` and ``size=500``
    and flattens the ``results`` list of the JSON response with
    ``pd.json_normalize`` (nested keys become dotted column names).

    Args:
        tag: Value passed as the ``tags`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block indefinitely.

    Returns:
        A DataFrame with one row per entry of ``results``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
        KeyError: If a successful response has no ``results`` key.
    """
    # Credentials for Tasty.
    # NOTE(security): the API key is committed in source; it should be
    # rotated and supplied through configuration instead.
    headers = {
    'x-rapidapi-host': "tasty.p.rapidapi.com",
    'x-rapidapi-key': "362fc9c239mshcfe50bd2bfb56f4p1ca4e5jsncbaacdf9bb2a"
    }

    # Endpoint and query selecting the recipes for this tag.
    url = "https://tasty.p.rapidapi.com/recipes/list"
    querystring = {"from":"0","size":"500","tags":tag}

    response = requests.get(url, headers=headers, params=querystring, timeout=timeout)

    # Fail with the real HTTP error rather than a KeyError on 'results'
    # when the API rejects the request.
    response.raise_for_status()

    # json_normalize already returns a DataFrame; no extra wrapping needed.
    return pd.json_normalize(response.json()['results'])

    
def creat_subset(df):
    """Keep recipes that have instructions and project them onto the stored columns.

    Rows whose ``instructions`` value is missing are dropped. The remaining
    rows are narrowed to a fixed list of columns, and the dotted column
    names are renamed to their underscore form. The input frame is not
    modified; a new frame is returned.

    Raises:
        KeyError: If ``instructions`` or any selected column is absent.
    """
    # Columns that are kept under their existing name.
    plain_columns = [
        'name', 'original_video_url', 'topics', 'keywords', 'tags',
        'num_servings', 'total_time_minutes', 'yields', 'country',
        'tips_and_ratings_enabled', 'aspect_ratio', 'credits', 'sections',
        'instructions', 'id', 'prep_time_minutes', 'description',
        'cook_time_minutes',
    ]

    # Dotted source columns mapped to their underscore names. The key order
    # also fixes the order of these columns in the result.
    renamed_columns = {
        'nutrition.fiber': 'nutrition_fiber',
        'nutrition.protein': 'nutrition_protein',
        'nutrition.fat': 'nutrition_fat',
        'nutrition.calories': 'nutrition_calories',
        'nutrition.sugar': 'nutrition_sugar',
        'nutrition.carbohydrates': 'nutrition_carbohydrates',
        'user_ratings.count_positive': 'user_ratings_count_positive',
        'user_ratings.score': 'user_ratings_score',
        'user_ratings.count_negative': 'user_ratings_count_negative',
        'total_time_tier.display_tier': 'total_time_tier_display_tier',
    }

    has_instructions = df['instructions'].notna()
    with_instructions = df[has_instructions]

    selected = with_instructions[plain_columns + list(renamed_columns)]
    return selected.rename(columns=renamed_columns)