In [1]:
from datetime import datetime, timedelta
import requests
import pandas as pd
import boto3
import os
from dotenv import load_dotenv
import io

In [3]:
 # retrieve the data from the API
url = "https://pokeapi.co/api/v2/pokemon?limit=10000"
response = requests.get(url, timeout=1000)
if response.status_code == 200:
    data = response.json()["results"]
    pokemon_list = []
    for pokemon in data:
        pokemon_url = pokemon["url"]
        response = requests.get(pokemon_url, timeout=1000)
        if response.status_code == 200:
            pokemon_data = response.json()
            types = [t["type"]["name"] for t in pokemon_data["types"]]
            pokemon_dict = {
                "id": pokemon_data["id"],
                "name": pokemon_data["name"],
                "height": pokemon_data["height"],
                "weight": pokemon_data["weight"],
                "types": types,
            }
            pokemon_list.append(pokemon_dict)
else:
    print(f"Error: HTTP status code {response.status_code}")

In [88]:
 # create a DataFrame from the data
df = pd.DataFrame(pokemon_list)

# explode the "types" column and select relevant columns
df = df.explode("types")


In [35]:
load_dotenv(override=True)

print(os.getenv("AWS_ACCESS_KEY_ID"))
print(os.getenv("AWS_SECRET_ACCESS_KEY"))
print(os.getenv("AWS_S3_BUCKET"))



AKIA6QXGBFMYB7JAA3BB
6iXdYwG/VvjYMqtXw/Dm45TkyYHpr7mfxb3Vq7tr
pokemon-de-zoomcamp-project


In [36]:
session = boto3.Session(
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
)
s3 = session.client("s3")


csv_buffer = io.StringIO()
df.to_csv(csv_buffer)
csv_data = csv_buffer.getvalue().encode()

s3.put_object(Bucket="pokemon-de-zoomcamp-project", Key="data/pokemon_info.csv", Body=csv_data)

{'ResponseMetadata': {'RequestId': '2D4QVS0MCBQV6VYX',
  'HostId': 'BhuRhYLw+D99/gil2+3Th5W+1qn1qa3bTcTymbaF2BRdh+IsEYaMN3wv/6WvAKHp2LVs9AGKDug=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'BhuRhYLw+D99/gil2+3Th5W+1qn1qa3bTcTymbaF2BRdh+IsEYaMN3wv/6WvAKHp2LVs9AGKDug=',
   'x-amz-request-id': '2D4QVS0MCBQV6VYX',
   'date': 'Mon, 03 Apr 2023 10:11:20 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"0cc18cab62c4673af21c1a822cc4ccac"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"0cc18cab62c4673af21c1a822cc4ccac"',
 'ServerSideEncryption': 'AES256'}

In [3]:
# retrieve the data from the API
url = "https://pokeapi.co/api/v2/generation?limit=10000"
response = requests.get(url, timeout=1000)
if response.status_code == 200:
    data = response.json()["results"]
    generations = []
    for generation in data:
        generation_url = generation["url"]
        response = requests.get(generation_url, timeout=1000)
        if response.status_code == 200:
            pokemon_data = response.json()["pokemon_species"]
            pokemon_names = [pokemon["name"] for pokemon in pokemon_data]
            generations.append(
                {
                    "generation_name": generation["name"],
                    "pokemon_names": pokemon_names,
                }
            )
else:
    print(f"Error: HTTP status code {response.status_code}")

In [5]:
# create a DataFrame from the data
df = pd.DataFrame(generations)

# explode the "types" column and select relevant columns
df = df.explode("pokemon_names")

df.head()

Unnamed: 0,generation_name,pokemon_names
0,generation-i,bulbasaur
0,generation-i,charmander
0,generation-i,squirtle
0,generation-i,caterpie
0,generation-i,weedle


In [37]:
# Retrieve the data from the API
response = requests.get("https://pokeapi.co/api/v2/move?limit=10000", timeout=1000)
if response.status_code == 200:
    data = response.json()["results"]

    # Create a dictionary to store the data
    moves_dict = {}

    # Loop through the data and add it to the dictionary
    for move in data:
        move_id = int(move["url"].split("/")[-2])
        move_response = requests.get(move["url"], timeout=1000)
        move_data = move_response.json()
        move_name = move_data["name"]
        move_accuracy = move_data["accuracy"]

        # Loop through the list of pokemon that can learn this move
        for pokemon in move_data["learned_by_pokemon"]:
            pokemon_name = pokemon["name"]
            if pokemon_name not in moves_dict:
                moves_dict[pokemon_name] = []
            moves_dict[pokemon_name].append((move_id, move_name, move_accuracy))

    # Convert the dictionary to a Pandas dataframe
    data = []
    for pokemon_name, moves in moves_dict.items():
        for move in moves:
            data.append((pokemon_name, move[0], move[1], move[2]))
    df = pd.DataFrame(
        data,
        columns=["learned_by_pokemon", "move_id", "move_name", "move_accuracy"],
    )

   
else:
    print(f"Error: HTTP status code {response.status_code}")

In [43]:
 # create a DataFrame from the data
#df = pd.DataFrame(moves_dict)

df.head()
# explode the "pokemon_names" column and select relevant columns
#df = df.explode("pokemon_names")

#csv_buffer = io.StringIO()
#df.to_csv(csv_buffer)
#csv_data = csv_buffer.getvalue().encode()

# s3.put_object(
#     Bucket="pokemon-de-zoomcamp-project",
#     Key="data/pokemon_moves.csv",
#     Body=csv_data,
# )

Unnamed: 0,learned_by_pokemon,move_id,move_name,move_accuracy
0,clefairy,1,pound,100.0
1,clefairy,3,double-slap,85.0
2,clefairy,5,mega-punch,85.0
3,clefairy,7,fire-punch,100.0
4,clefairy,8,ice-punch,100.0


In [7]:
df.head()

Unnamed: 0,learned_by_pokemon,move_id,move_name,move_accuracy
0,clefairy,1,pound,100.0
1,clefairy,3,double-slap,85.0
2,clefairy,5,mega-punch,85.0
3,clefairy,7,fire-punch,100.0
4,clefairy,8,ice-punch,100.0


In [8]:
# retrieve the data from the API
url = "https://pokeapi.co/api/v2/pokemon-habitat?limit=3000"
response = requests.get(url, timeout=1000)
if response.status_code == 200:
    data = response.json()["results"]
    habitat_list = []
    for habitat in data:
        habitat_url = habitat["url"]
        response = requests.get(habitat_url, timeout=1000)
        if response.status_code == 200:
            habitat_data = response.json()
            for pokemon in habitat_data["pokemon_species"]:
                pokemon_dict = {
                    "habitat": habitat_data["name"],
                    "pokemon_name": pokemon["name"],
                }
                habitat_list.append(pokemon_dict)
else:
    print(f"Error: HTTP status code {response.status_code}")

In [9]:
# create a DataFrame from the data
df = pd.DataFrame(habitat_list)

df.head()

Unnamed: 0,habitat,pokemon_name
0,cave,zubat
1,cave,diglett
2,cave,gastly
3,cave,onix
4,cave,misdreavus


In [10]:
# retrieve the data from the API
url = "https://pokeapi.co/api/v2/pokemon?limit=10000"
response = requests.get(url, timeout=1000)
if response.status_code == 200:
    data = response.json()["results"]
    pokemon_list = []
    for pokemon in data:
        pokemon_url = pokemon["url"]
        response = requests.get(pokemon_url, timeout=1000)
        if response.status_code == 200:
            pokemon_data = response.json()
            base_stats = [bs["stat"]["name"] for bs in pokemon_data["stats"]]
            pokemon_dict = {
                "id": pokemon_data["id"],
                "name": pokemon_data["name"],
                "base_stats": base_stats,
            }
            pokemon_list.append(pokemon_dict)

else:
    print(f"Error: HTTP status code {response.status_code}")

In [11]:
# create a DataFrame from the data
df = pd.DataFrame(pokemon_list)

# explode the "types" column and select relevant columns
df = df.explode("base_stats")

df.head()

Unnamed: 0,id,name,base_stats
0,1,bulbasaur,hp
0,1,bulbasaur,attack
0,1,bulbasaur,defense
0,1,bulbasaur,special-attack
0,1,bulbasaur,special-defense


In [50]:
load_dotenv("/workspaces/data-engineer-zoomcamp-project/airflow/.env")





In [None]:
# UPLOAD TO REDSHIFT

In [2]:
# Create Redshift client
redshift = boto3.client(
    "redshift",
    region_name = "us-east-1",
    aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
)

In [3]:
import psycopg2

In [4]:
# Define connection string
conn_string = "postgresql://{}:{}@{}:{}/{}".format(
    os.getenv("REDSHIFT_USER"),
    os.getenv("REDSHIFT_PW"),
    os.getenv("AWS_REDSHIFT_HOST"),
    os.getenv("REDSHIFT_PORT"),
    os.getenv("REDSHIFT_DB")
)

# Connect to the database
conn = psycopg2.connect(conn_string)

OperationalError: invalid integer value "None" for connection option "port"


In [1]:

def execute_sql(
        sql_query,
        conn_string,
        print_results = False
    ):
    """Execute a SQL query on the database associated with
       a connection string
    
    Parameters:
    - sql_query : str
        SQL query to execute
    - conn_string : str
        connection string of the format 'postgresql://MasterUsername:MasterUserPassword@ClusterEndpoint:DatabasePort,DatabaseName'
    - print_results : bool
        select if to print query results or not
    """
    
    # Connect to the database
    conn = psycopg2.connect(conn_string)
    
    # Define cursor
    cur = conn.cursor()
    
    # Execute query
    cur.execute(sql_query)
    conn.commit()
    if print_results:
        print(cur.fetchall())

    # Close cursor
    cur.close()
    
    # Close connection
    conn.close()

In [54]:
aws_s3_bucket = os.getenv("AWS_S3_BUCKET")


session = boto3.Session(
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
)
s3 = session.client("s3")


# Get object containing file to be staged
obj = s3.get_object(
    Bucket = aws_s3_bucket,
    Key = "data/pokemon_info.csv"
)

import io
import pandas as pd

# Print colummns info for the dataset
pd.read_csv(io.BytesIO(obj["Body"].read())).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1990 entries, 0 to 1989
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1990 non-null   int64 
 1   id          1990 non-null   int64 
 2   name        1990 non-null   object
 3   height      1990 non-null   int64 
 4   weight      1990 non-null   int64 
 5   types       1990 non-null   object
dtypes: int64(4), object(2)
memory usage: 93.4+ KB
