<a href="https://colab.research.google.com/github/yuki-shi/pokedex-flask/blob/main/serebii_scraping_johto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports!! 🐈

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from collections import OrderedDict
import re

## Um scrape **daqueles**!! 🐐

In [2]:
def scrape_serebii(gen):
  
  url = f'https://serebii.net/pokemon/gen{gen}pokemon.shtml'
  soup = BeautifulSoup(requests.get(url).text, 'html.parser')

  header = []
  h = soup.find_all('td', class_='fooevo')

  for i in h:
    header.append(i.text)

  header = [x.strip('\r\n\t') for x in header]
  header.remove('Base Stats')

  poke_dict = OrderedDict.fromkeys(header)

  for i in poke_dict.keys():
    poke_dict[i] = []

  tbody = soup.find_all('tr')
  tbody = tbody[2:]

  for index, tr in enumerate(tbody):
    if index % 2 == 0:
      for i, key in enumerate(poke_dict.keys()):
        poke_dict.setdefault(key, []).append(tr.find_all('td', class_='fooinfo')[i].text.strip('\r\n\t'))

  df = pd.DataFrame(poke_dict)
  return df

In [3]:
def format_id(id):
  
  if len(id) == 1:
    return re.sub(r'([0-9]{1})', r'00\1', id)
  elif len(id) == 2:  
    return re.sub(r'([0-9]{2})', r'0\1', id)
  else:
    return id

In [4]:
def get_johtodex():
  
  url = f'https://www.serebii.net/heartgoldsoulsilver/johtodex.shtml'
  soup = BeautifulSoup(requests.get(url).text, 'html.parser')

  poke_dict = OrderedDict()

  tbody = soup.find_all('tr')
  tbody = tbody[1:] #aqui a tablea não é pivotada

  for index, tr in enumerate(tbody):
    poke_dict[index+1] = tr.find_all('td')[2].text.strip('\r\n\t')

  df = pd.DataFrame([poke_dict]).transpose().reset_index()
  df.rename(columns={'index':'JohtoID', 0: 'Name'}, inplace=True)

  return df

In [5]:
def get_stats(id_list):

  stats_dict = OrderedDict()

  for id in id_list:
    try:
      url = f'https://serebii.net/pokedex-dp/{id}.shtml'
      soup = BeautifulSoup(requests.get(url).text, 'html.parser')

      last_table = soup.find_all(class_='dextable')[-1]

      for td in last_table.find_all('td', class_=('fooinfo'))[1:7]:
        stats_dict.setdefault(id, []).append(td.text)

    except:
      raise Exception(f'O id {id} deu ruim!')

  return(pd.DataFrame(stats_dict).transpose()
                                 .reset_index()
                                 .rename(columns={'index':'id', 0:'HP', 1:'Attk', 2:'Defense', 3:'Sp.Attk', 4:'Sp.Def', 5:'Spd'}))

In [8]:
nationalID = [format_id(str(x)) for x in range(1,252)] # criação do ID nacional
nationalID.append('469') # yanmega é um outlier por si só, teremos q adcioná-lo a mão

df_stats = get_stats(nationalID) # stats com base no ID nacional, pegando os status especificamente da Gen IV

df_johto = get_johtodex() # pokedex de Johto

df_gen = [] # pokedex nacional

for i in range(1,5):
  df_gen.append(scrape_serebii(i))

## Joins joins joins!! 🐌

In [9]:
df_national = pd.concat([df_gen[0], df_gen[1], df_gen[2], df_gen[3]],axis=0, join='outer', ignore_index=True)

In [10]:
df_national.rename(columns={'No.':'NatID'}, inplace=True)
df_national.query('Name == "Chikorita"')

Unnamed: 0,NatID,Pic,Name,Type,Abilities,HP,Att,Def,S.Att,S.Def,Spd
151,#152,,Chikorita,,Overgrow Leaf Guard,45,49,65,49,65,45


In [11]:
print(f'Nacional: {df_national.shape[0]}, Johto: {df_johto.shape[0]}')

Nacional: 493, Johto: 256


In [12]:
df_johto.loc[df_johto['Name'] == 'Ho-oh', 'Name'] = 'Ho-Oh'

In [13]:
df_johto2 = df_johto.join(df_national.set_index('Name'), on='Name', how='left')

In [14]:
df_johto2[df_johto2['NatID'].isnull()]

Unnamed: 0,JohtoID,Name,NatID,Pic,Type,Abilities,HP,Att,Def,S.Att,S.Def,Spd
94,95,Nidoran (F),,,,,,,,,,
97,98,Nidoran (M),,,,,,,,,,


In [15]:
df_national[df_national['Name'].str.contains('Nidoran')]

Unnamed: 0,NatID,Pic,Name,Type,Abilities,HP,Att,Def,S.Att,S.Def,Spd
28,#029,,Nidoran♀,,Poison Point Rivalry Hustle,55,47,52,40,40,41
31,#032,,Nidoran♂,,Poison Point Rivalry Hustle,46,57,40,40,40,50


In [16]:
df_johto2.loc[df_johto2['Name'] == 'Nidoran (F)', 'NatID'] = '#029'
df_johto2.loc[df_johto2['Name'] == 'Nidoran (M)', 'NatID'] = '#032'

In [17]:
df_johto2[df_johto2['NatID'].isnull()]

Unnamed: 0,JohtoID,Name,NatID,Pic,Type,Abilities,HP,Att,Def,S.Att,S.Def,Spd


In [18]:
df_johto2 = df_johto2[['JohtoID', 'NatID', 'Name']]
df_johto2.head(10)

Unnamed: 0,JohtoID,NatID,Name
0,1,#152,Chikorita
1,2,#153,Bayleef
2,3,#154,Meganium
3,4,#155,Cyndaquil
4,5,#156,Quilava
5,6,#157,Typhlosion
6,7,#158,Totodile
7,8,#159,Croconaw
8,9,#160,Feraligatr
9,10,#016,Pidgey


In [19]:
df_johto2['NatID'] = df_johto2['NatID'].str.replace('#', '').astype(int)
df_stats['id'].dtype
df_stats['id'] = df_stats['id'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [20]:
df_johto3 = df_johto2.join(df_stats.set_index('id'), on='NatID', how='outer')

## API !!! 🐿

### Call

In [21]:
def get_pokeapi(df, id):

  ids = df[id]

  for index, id in enumerate(ids):

    r = requests.get(f'https://pokeapi.co/api/v2/pokemon/{id}')
    json = r.json() if r and r.status_code == 200 else None

    try:

      if json['past_types']:
        for slot in json['past_types']:
          for i, key in enumerate(slot['types']):
            if len(slot['types']) == 1:
              df.loc[index, 'Type 1'] = key['type']['name']
            else:
              df.loc[index, f'Type {i+1}'] = key['type']['name']

      else:
        for i, key in enumerate(json['types']):
          if len(json['types']) == 1:
            df.loc[index, 'Type 1'] = key['type']['name']
          else:
            df.loc[index, f'Type {i+1}'] = key['type']['name']

    except:
      
      raise Exception(f'O pokemon {json["name"]} #{id} deu ruim!')

  return df

In [22]:
df_pkm = get_pokeapi(df_johto3, 'NatID')
df_pkm[(df_pkm['Type 1'] == 'fairy') | (df_pkm['Type 2'] == 'fairy')]

Unnamed: 0,JohtoID,NatID,Name,HP,Attk,Defense,Sp.Attk,Sp.Def,Spd,Type 1,Type 2


In [23]:
df_pkm.head(10)

Unnamed: 0,JohtoID,NatID,Name,HP,Attk,Defense,Sp.Attk,Sp.Def,Spd,Type 1,Type 2
0,1,152,Chikorita,45,49,65,49,65,45,grass,
1,2,153,Bayleef,60,62,80,63,80,60,grass,
2,3,154,Meganium,80,82,100,83,100,80,grass,
3,4,155,Cyndaquil,39,52,43,60,50,65,fire,
4,5,156,Quilava,58,64,58,80,65,80,fire,
5,6,157,Typhlosion,78,84,78,109,85,100,fire,
6,7,158,Totodile,50,65,64,44,48,43,water,
7,8,159,Croconaw,65,80,80,59,63,58,water,
8,9,160,Feraligatr,85,105,100,79,83,78,water,
9,10,16,Pidgey,40,45,40,35,35,56,normal,flying


In [24]:
df_pkm.query('Name == "Yanmega"')

Unnamed: 0,JohtoID,NatID,Name,HP,Attk,Defense,Sp.Attk,Sp.Def,Spd,Type 1,Type 2
101,102,469,Yanmega,86,76,86,116,56,95,bug,flying


In [25]:
df_pkm.to_csv('pokemon-johto.csv', index=False)