### Parse the Pokemon

In [1]:
import requests
from bs4 import BeautifulSoup
# Build the page URL for each of the 809 pokemon; the number in the
# file name is zero-padded to three digits (001 ... 809)
urls = ['https://serebii.net/pokedex-sm/{:03d}.shtml'.format(x)
        for x in range(1, 810)]

In [2]:
import re
def find_name(soup):
    """Parse the name and the number of the pokemon from the page title.

    Parameters
    ----------
    soup : object
        Parsed page. Must provide ``find('title')`` returning an element
        with a ``string`` attribute (e.g. a ``BeautifulSoup`` document).

    Returns
    -------
    list
        ``[name, number]``: the text before ``" - #"`` in the title and the
        three digits that follow it (kept as a string, e.g. ``'001'``).

    Raises
    ------
    ValueError
        If the page has no usable title or the title does not contain
        ``"<name> - #<three digits>"``.
    """
    title_tag = soup.find('title')
    title = title_tag.string if title_tag is not None else None
    match = re.search(r'(.*?) - #([0-9]{3})', title) if title else None
    if match is None:
        # Fail with the offending title instead of an opaque AttributeError
        raise ValueError('Unexpected page title: {!r}'.format(title))
    return [match.group(1), match.group(2)]

In [3]:
def find_abilities(soup):
    """Parse the abilities of the pokemon.

    Two page layouts are handled:

    * If the page contains an ``<i>`` element whose text matches
      ``(Hidden``, the ``<b>`` elements before it are collected, walking
      backwards until one contains ``'Abilit'``. The result is therefore in
      reverse document order.
    * Otherwise the ``<b>`` elements after the ``<b>Abilities</b>`` header
      are collected until one contains any of the stop words.

    In both cases the walk also stops at a ``<b>`` whose ``string`` is
    ``None`` or when the document runs out of ``<b>`` elements.

    Parameters
    ----------
    soup : object
        Parsed page (e.g. a ``BeautifulSoup`` document).

    Returns
    -------
    list
        Ability names without duplicates; empty if nothing was found.
    """
    # Substrings marking the first <b> that is no longer an ability name
    stop_words = ['Abiliti', 'Stat', 'Form', '-', 'Rotom',
                  'Hoopa', 'Necrozma', 'Type', 'Style']
    abilities = []
    # Raw string: '\(' is an invalid escape in a normal string literal
    hidden_marker = soup.find('i', string=re.compile(r'\(Hidden'))
    if hidden_marker is not None:
        tag = hidden_marker.find_previous('b')
        # `tag is not None` guards against walking past the start of the page
        while (tag is not None and tag.string is not None
               and 'Abilit' not in tag.string):
            if tag.string not in abilities:
                abilities.append(tag.string)
            tag = tag.find_previous('b')
    else:
        header = soup.find('b', string='Abilities')
        tag = header.find_next('b') if header is not None else None
        # `tag is not None` guards against walking past the end of the page
        while (tag is not None and tag.string is not None
               and all(x not in tag.string for x in stop_words)):
            if tag.string not in abilities:
                abilities.append(tag.string)
            tag = tag.find_next('b')
    return abilities

In [4]:
def _type_page_links(anchors):
    """Return the ``href`` of every anchor whose link contains ``'shtml'``."""
    hrefs = (anchor.get('href') for anchor in anchors)
    # Anchors without an href attribute give None here; skip them
    return [href for href in hrefs if href and 'shtml' in href]


def find_type(soup):
    """Parse the types of the pokemon.

    The types are taken from the links in the table row that follows the
    row holding the ``Type`` header cell: each type is the file name
    (without ``.shtml``) of a linked page.

    Parameters
    ----------
    soup : object
        Parsed page (e.g. a ``BeautifulSoup`` document).

    Returns
    -------
    list
        Type names as they appear in the link targets, e.g.
        ``['grass', 'poison']``.
    """
    type_row = soup.find('td', string='Type').find_parent('tr').find_next('tr')
    links = _type_page_links(type_row.find_all('a'))
    if len(links) > 2:
        # More than two links: the row holds more than one type listing
        # (presumably alternate forms -- verify). Keep only the links of
        # the first cell that follows.
        first_cell = soup.find('td', string=re.compile('Type')) \
            .find_parent('tr').find_next('tr').find_next('td')
        links = _type_page_links(first_cell.find_all('a'))
    # Raw string: '\.' is an invalid escape in a normal string literal
    return [re.search(r'.*/(.*)\.shtml', link).group(1) for link in links]

In [5]:
def find_stats(soup):
    """Read the stat values from the row after the one holding 'Speed'.

    Returns the ``string`` of every cell in that row except the first.
    """
    header_row = soup.find('td', string='Speed').find_parent('tr')
    value_cells = header_row.find_next('tr').find_all('td')
    # The leading cell of the value row is not part of the result
    return [cell.string for cell in value_cells[1:]]

In [6]:
def parse(url, timeout=30):
    """Download one pokemon page and build its data row.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds ``requests`` waits for the server before giving up
        (default 30), so a stalled connection cannot hang the whole scrape.

    Returns
    -------
    list
        The two items returned by ``find_name``, followed by the list of
        types, the list of abilities and the values from ``find_stats``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # Hard-coded types for pokemon that are not parsed from the page
    # (presumably because the page lists several forms -- verify)
    special_type = {'Wormadam': ['bug', 'grass'],
                    'Rotom': ['electric', 'ghost'],
                    'Shaymin': ['grass'],
                    'Darmanitan': ['fire'],
                    'Meloetta': ['normal', 'psychic'],
                    'Hoopa': ['psychic', 'ghost'],
                    'Necrozma': ['psychic'],
                    'Oricorio': ['fire', 'flying']
                    }
    # Hard-coded abilities, used instead of find_abilities for these names
    special_ability = {'Shaymin': ['Natural Cure'],
                       'Kyurem': ['Pressure'],
                       'Genesect': ['Download'],
                       'Basculin': ['Reckless', 'Adaptability', 'Mold Breaker'],
                       'Meowstic': ['Infiltrator', 'Prankster', 'Competitive']
                       }
    r = requests.get(url, timeout=timeout)
    # Stop on 4xx/5xx instead of trying to parse an error page
    r.raise_for_status()
    soup = BeautifulSoup(r.content)
    pokemon = []
    name = find_name(soup)
    pokemon.extend(name)
    if name[0] in special_type:
        pokemon.append(special_type[name[0]])
    else:
        pokemon.append(find_type(soup))
    if name[0] in special_ability:
        pokemon.append(special_ability[name[0]])
    else:
        pokemon.append(find_abilities(soup))
    pokemon.extend(find_stats(soup))
    return pokemon

In [7]:
# Scrape every page listed in `urls`, one request per page. Rows are added
# one at a time, so rows already scraped are kept if a later page fails.
pokemon_data = []
pokemon_data.extend(parse(url) for url in urls)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


### Parse the alternate forms of pokemon

In [8]:
# Deoxys: one row per alternate forme. Types and abilities are read once
# from the page and reused; stats come from each forme's own table.
url = 'https://www.serebii.net/pokedex-sm/386.shtml'
r = requests.get(url)
soup = BeautifulSoup(r.content)
formes = ['Attack', 'Defense', 'Speed']
types = find_type(soup)
abilities = find_abilities(soup)
for forme in formes:
    stats_header = soup.find('td', string='Stats - {} Forme'.format(forme))
    # The values are in the second <tr> after the header's row
    value_row = stats_header.find_parent('tr').find_next('tr').find_next('tr')
    stats = [cell.string for cell in value_row.find_all('td')[1:]]
    deoxys = ['Deoxys-' + forme, '386', types, abilities] + stats
    pokemon_data.append(deoxys)

In [9]:
# Wormadam: one row per alternate cloak. Abilities are read once from the
# page; types and stats come from each cloak's own rows.
url = 'https://www.serebii.net/pokedex-sm/413.shtml'
r = requests.get(url)
soup = BeautifulSoup(r.content)
cloaks = ['Sandy', 'Trash']
abilities = find_abilities(soup)
for cloak in cloaks:
    wormadam = ['Wormadam-'+cloak, '413']
    links = soup.find('td', string=cloak+' Cloak').find_parent('tr').find_all('a')
    types = []
    for li in links:
        # Raw string: '\.' is an invalid escape in a normal string literal.
        # The type is the file name of the linked page without '.shtml'.
        match = re.search(r'.*/(.*)\.shtml', li.get('href'))
        types.append(match.group(1))
    wormadam.append(types)
    wormadam.append(abilities)
    # The values are in the second <tr> after the stats header's row
    stats = soup.find('td', string='Stats - '+cloak+' Cloak')\
        .find_parent('tr').find_next('tr').find_next('tr').find_all('td')
    stats = [stat.string for stat in stats[1:]]
    wormadam.extend(stats)
    pokemon_data.append(wormadam)

In [10]:
# Rotom: one row per appliance form. Every form gets the same abilities
# and the same stats (read from the single "Stats - Alternate Forms"
# table); only the types are looked up per form.
url = 'https://www.serebii.net/pokedex-sm/479.shtml'
r = requests.get(url)
soup = BeautifulSoup(r.content)
forms = ['Frost', 'Heat', 'Mow', 'Fan', 'Wash']
abilities = find_abilities(soup)
# The values are in the second <tr> after the stats header's row
stats = soup.find('td', string='Stats - Alternate Forms').find_parent('tr') \
        .find_next('tr').find_next('tr').find_all('td')
stats = [stat.string for stat in stats[1:]]
for form in forms:
    rotom = ['Rotom-'+form, '479']
    links = soup.find('td', string=form+' Rotom').find_parent('tr').find_all('a')
    types = []
    for li in links:
        # Raw string: '\.' is an invalid escape in a normal string literal
        match = re.search(r'.*/(.*)\.shtml', li.get('href'))
        types.append(match.group(1))
    rotom.append(types)
    rotom.append(abilities)
    rotom.extend(stats)
    pokemon_data.append(rotom)

In [11]:
# Giratina (Origin Forme): types are parsed from the page, the ability is
# hard-coded and the stats come from the "Stats - Origin Forme" table
url = 'https://www.serebii.net/pokedex-sm/487.shtml'
r = requests.get(url)
soup = BeautifulSoup(r.content)
types = find_type(soup)
ability = ['Levitate']
stats_header = soup.find('td', string='Stats - Origin Forme')
# The values are in the second <tr> after the header's row
value_row = stats_header.find_parent('tr').find_next('tr').find_next('tr')
stats = [cell.string for cell in value_row.find_all('td')[1:]]
giratina = ['Giratina-Origin', '487', types, ability] + stats
pokemon_data.append(giratina)

In [12]:
# Shaymin (Sky Forme): types come from the links in the "Sky Forme" row,
# the ability is hard-coded and the stats come from "Stats - Sky Forme"
url = 'https://www.serebii.net/pokedex-sm/492.shtml'
r = requests.get(url)
soup = BeautifulSoup(r.content)
shaymin = ['Shaymin-Sky', '492']
links = soup.find('td', string='Sky Forme').find_parent('tr').find_all('a')
types = []
for li in links:
    # Raw string: '\.' is an invalid escape in a normal string literal
    match = re.search(r'.*/(.*)\.shtml', li.get('href'))
    types.append(match.group(1))
shaymin.append(types)
ability = ['Serene Grace']
shaymin.append(ability)
# The values are in the second <tr> after the stats header's row
stats = soup.find('td', string='Stats - Sky Forme').find_parent('tr') \
        .find_next('tr').find_next('tr').find_all('td')
stats = [stat.string for stat in stats[1:]]
shaymin.extend(stats)
pokemon_data.append(shaymin)

In [13]:
# Darmanitan (Zen Mode): types come from the links in the "Zen Mode" row,
# the ability is hard-coded and the stats come from "Stats - Zen Mode"
url = 'https://www.serebii.net/pokedex-sm/555.shtml'
darmanitan = ['Darmanitan-Zen', '555']
r = requests.get(url)
soup = BeautifulSoup(r.content)
links = soup.find('td', string='Zen Mode').find_parent('tr').find_all('a')
types = []
for li in links:
    # Raw string: '\.' is an invalid escape in a normal string literal
    match = re.search(r'.*/(.*)\.shtml', li.get('href'))
    types.append(match.group(1))
darmanitan.append(types)
ability = ['Zen Mode']
darmanitan.append(ability)
# The values are in the second <tr> after the stats header's row
stats = soup.find('td', string='Stats - Zen Mode').find_parent('tr') \
        .find_next('tr').find_next('tr').find_all('td')
stats = [stat.string for stat in stats[1:]]
darmanitan.extend(stats)
pokemon_data.append(darmanitan)

In [14]:
# Therian formes of Tornadus (641), Thundurus (642) and Landorus (645)
nums = [641, 642, 645]
# Kept in its own name so the list of base-form pages in `urls` is not
# overwritten (re-running the main scraping cell would otherwise only
# visit these three pages)
therian_urls = ['https://serebii.net/pokedex-sm/'+ str(x)+'.shtml' for x in nums]
for url in therian_urls:
    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    pokemon = []
    name = find_name(soup)
    pokemon.extend([name[0]+'-Therian', name[1]])
    pokemon.append(find_type(soup))
    # The ability name is the first <b> after the "Therian Forme Ability" label
    ability = soup.find('b', string='Therian Forme Ability').find_next('b').text
    pokemon.append([ability])
    # The values are in the second <tr> after the stats header's row
    stats = soup.find('td', string='Stats - Therian Forme').find_parent('tr')\
            .find_next('tr').find_next('tr').find_all('td')
    stats = [stat.string for stat in stats[1:]]
    pokemon.extend(stats)
    pokemon_data.append(pokemon)

In [15]:
# Kyurem: one row per fused form. Each form has its own ability label and
# its own stats table; name and types are parsed from the page.
url = 'https://www.serebii.net/pokedex-sm/646.shtml'
forms = ['Black', 'White']
r = requests.get(url)
soup = BeautifulSoup(r.content)
for form in forms:
    name = find_name(soup)
    types = find_type(soup)
    # The ability name is the first <b> after the "<form> Kyurem Ability" label
    ability_label = soup.find('b', string='{} Kyurem Ability'.format(form))
    ability = ability_label.find_next('b').text
    stats_header = soup.find('td', string='Stats - {} Kyurem'.format(form))
    # The values are in the second <tr> after the header's row
    value_row = stats_header.find_parent('tr').find_next('tr').find_next('tr')
    stats = [cell.string for cell in value_row.find_all('td')[1:]]
    pokemon = [name[0] + '-' + form, name[1], types, [ability]] + stats
    pokemon_data.append(pokemon)

In [16]:
# Meloetta (Pirouette Forme): types come from the links in the
# "Pirouette Forme" row, abilities are parsed from the page and the stats
# come from the "Stats - Pirouette Forme" table
url = 'https://www.serebii.net/pokedex-sm/648.shtml'
meloetta = ['Meloetta-Pirouette', '648']
r = requests.get(url)
soup = BeautifulSoup(r.content)
links = soup.find('td', string='Pirouette Forme').find_parent('tr').find_all('a')
types = []
for li in links:
    # Raw string: '\.' is an invalid escape in a normal string literal
    match = re.search(r'.*/(.*)\.shtml', li.get('href'))
    types.append(match.group(1))
meloetta.append(types)
meloetta.append(find_abilities(soup))
# The values are in the second <tr> after the stats header's row
stats = soup.find('td', string='Stats - Pirouette Forme').find_parent('tr').\
        find_next('tr').find_next('tr').find_all('td')
stats = [stat.string for stat in stats[1:]]
meloetta.extend(stats)
pokemon_data.append(meloetta)

In [17]:
# Greninja (Ash-Greninja): types are parsed from the page; the ability and
# the stats come from the Ash-Greninja specific label and table
url = 'https://www.serebii.net/pokedex-sm/658.shtml'
r = requests.get(url)
soup = BeautifulSoup(r.content)
types = find_type(soup)
# The ability name is the first <b> after the "Ash-Greninja Ability" label
ability = soup.find('b', string='Ash-Greninja Ability').find_next('b').text
stats_header = soup.find('td', string='Stats - Ash-Greninja')
# The values are in the second <tr> after the header's row
value_row = stats_header.find_parent('tr').find_next('tr').find_next('tr')
stats = [cell.string for cell in value_row.find_all('td')[1:]]
pokemon = ['Greninja-Ash', '658', types, [ability]] + stats
pokemon_data.append(pokemon)

In [18]:
# Pumpkaboo & Gourgeist: one row per alternate size, each with the stats
# from its own "Stats - <size> Size" table
nums = [710, 711]
# Kept in its own name so the list of base-form pages in `urls` is not
# overwritten
size_urls = ['https://serebii.net/pokedex-sm/'+ str(x)+'.shtml' for x in nums]
sizes = ['Small', 'Large', 'Super']
for url in size_urls:
    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    # Name, types and abilities do not depend on the size: parse them once
    # per page; the three rows of a page share these list objects
    name = find_name(soup)
    types = find_type(soup)
    abilities = find_abilities(soup)
    for size in sizes:
        pokemon = [name[0]+'-'+size, name[1], types, abilities]
        # The values are in the second <tr> after the stats header's row
        stats = soup.find('td', string='Stats - '+size+' Size').\
                find_parent('tr').find_next('tr').find_next('tr').find_all('td')
        stats = [stat.string for stat in stats[1:]]
        pokemon.extend(stats)
        pokemon_data.append(pokemon)

In [19]:
# Oricorio: one row per alternate style. Types come from the links in each
# "<style> Style" row; abilities and stats are the ones parsed from the
# page by find_abilities / find_stats.
url = 'https://www.serebii.net/pokedex-sm/741.shtml'
r = requests.get(url)
soup = BeautifulSoup(r.content)
styles = ['Pom-Pom', 'Pa\'u', 'Sensu']
for style in styles:
    oricorio = ['Oricorio-'+style, '741']
    links = soup.find('td', string=style+' Style').find_parent('tr').find_all('a')
    types = []
    for li in links:
        # Raw string: '\.' is an invalid escape in a normal string literal
        match = re.search(r'.*/(.*)\.shtml', li.get('href'))
        types.append(match.group(1))
    oricorio.append(types)
    oricorio.append(find_abilities(soup))
    oricorio.extend(find_stats(soup))
    pokemon_data.append(oricorio)

In [20]:
# Lycanroc: one row per alternate form. Types are parsed from the page,
# abilities are hard-coded per form and stats come from each form's table.
url = 'https://serebii.net/pokedex-sm/745.shtml'
forms = ['Midnight', 'Dusk']
r = requests.get(url)
soup = BeautifulSoup(r.content)
for form in forms:
    types = find_type(soup)
    ability = (['Keen Eye', 'Vital Spirit', 'No Guard'] if form == 'Midnight'
               else ['Tough Claws'])
    stats_header = soup.find('td', string='Stats - {} Form'.format(form))
    # The values are in the second <tr> after the header's row
    value_row = stats_header.find_parent('tr').find_next('tr').find_next('tr')
    stats = [cell.string for cell in value_row.find_all('td')[1:]]
    pokemon = ['Lycanroc-' + form, '745', types, ability] + stats
    pokemon_data.append(pokemon)

In [21]:
# Wishiwashi (School Form): types and abilities are parsed from the page;
# the stats come from the "Stats - School Form" table
url = 'https://www.serebii.net/pokedex-sm/746.shtml'
r = requests.get(url)
soup = BeautifulSoup(r.content)
types = find_type(soup)
abilities = find_abilities(soup)
stats_header = soup.find('td', string='Stats - School Form')
# The values are in the second <tr> after the header's row
value_row = stats_header.find_parent('tr').find_next('tr').find_next('tr')
stats = [cell.string for cell in value_row.find_all('td')[1:]]
pokemon = ['Wishiwashi-School', '746', types, abilities] + stats
pokemon_data.append(pokemon)

In [22]:
# Minior: types and abilities are parsed from the page; the stats come
# from the "Stats - Cores" table.
# NOTE(review): the row is labelled "Minior-Meteor" but its stats are read
# from the "Cores" table -- confirm the label matches the intended form.
url = 'https://www.serebii.net/pokedex-sm/774.shtml'
r = requests.get(url)
soup = BeautifulSoup(r.content)
types = find_type(soup)
abilities = find_abilities(soup)
stats_header = soup.find('td', string='Stats - Cores')
# The values are in the second <tr> after the header's row
value_row = stats_header.find_parent('tr').find_next('tr').find_next('tr')
stats = [cell.string for cell in value_row.find_all('td')[1:]]
pokemon = ['Minior-Meteor', '774', types, abilities] + stats
pokemon_data.append(pokemon)

In [23]:
# Necrozma: one scraped row each for Dusk Mane and Dawn Wings, plus a
# hard-coded row for Ultra Necrozma
url = 'https://www.serebii.net/pokedex-sm/800.shtml'
forms = ['Dusk Mane', 'Dawn Wings']
r = requests.get(url)
soup = BeautifulSoup(r.content)
for form in forms:
    # Spaces in the form name become hyphens in the row name
    pokemon = ['Necrozma-'+form.replace(' ', '-'), '800']
    links = soup.find('td', string=form).find_parent('tr').find_all('a')
    types = []
    for li in links:
        # Raw string: '\.' is an invalid escape in a normal string literal
        match = re.search(r'.*/(.*)\.shtml', li.get('href'))
        types.append(match.group(1))
    pokemon.append(types)
    pokemon.append(find_abilities(soup))
    # The values are in the second <tr> after the stats header's row
    stats = soup.find('td', string='Stats - '+form+' Necrozma')\
            .find_parent('tr').find_next('tr').find_next('tr').find_all('td')
    stats = [stat.string for stat in stats[1:]]
    pokemon.extend(stats)
    pokemon_data.append(pokemon)

# Ultra Necrozma is not scraped; all of its values are entered by hand
necrozma_ultra = ['Necrozma-Ultra', '800',
                  ['psychic', 'dragon'], ['Neuroforce'],
                  '97', '167', '97', '167', '97', '129']
pokemon_data.append(necrozma_ultra)

In [24]:
# Castform: one row per weather form. Types come from the links in each
# "<form> Form" row; abilities and stats are the ones parsed from the page
# by find_abilities / find_stats.
url = 'https://www.serebii.net/pokedex-sm/351.shtml'
r = requests.get(url)
soup = BeautifulSoup(r.content)
forms = ['Sunny', 'Rainy', 'Snowy']
for form in forms:
    castform = ['Castform-'+form, '351']
    links = soup.find('td', string=form+' Form').find_parent('tr').find_all('a')
    types = []
    for li in links:
        # Raw string: '\.' is an invalid escape in a normal string literal
        match = re.search(r'.*/(.*)\.shtml', li.get('href'))
        types.append(match.group(1))
    castform.append(types)
    castform.append(find_abilities(soup))
    castform.extend(find_stats(soup))
    pokemon_data.append(castform)

In [25]:
# Arceus, Silvally: one extra row per listed type. Each row has that single
# type and reuses the abilities and stats parsed from the page.
nums = [493, 773]
# Kept in its own name so the list of base-form pages in `urls` is not
# overwritten
typed_form_urls = ['https://serebii.net/pokedex-sm/'+ str(x)+'.shtml' for x in nums]
types = ['Bug', 'Dark', 'Dragon', 'Electric',
         'Fairy', 'Fire', 'Fighting', 'Flying',
         'Ghost', 'Grass', 'Ground', 'Ice',
         'Poison', 'Psychic', 'Rock', 'Steel', 'Water']
for url in typed_form_urls:
    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    name = find_name(soup)
    ability = find_abilities(soup)
    stats = find_stats(soup)
    for t in types:
        pokemon = [name[0]+'-'+t, name[1]]
        # Type names are stored lower-case, like the ones find_type returns
        pokemon.append([t.lower()])
        pokemon.append(ability)
        pokemon.extend(stats)
        pokemon_data.append(pokemon)

### Parse Mega Evolutions, Primal Forms and Alolan Forms

In [26]:
def find_mega(tag):
    """Build the data row for one table row of a mega/primal/alolan list page.

    Parameters
    ----------
    tag : object
        Table row element providing ``find_all('a')`` and
        ``find_all('td')`` (e.g. a ``BeautifulSoup`` ``<tr>`` tag).

    Returns
    -------
    list or None
        ``[name, number, types, abilities, <six stat strings>]``, or
        ``None`` if the row has fewer than two links and therefore cannot
        describe a pokemon.
    """
    features = tag.find_all('a')
    # The second link is required below; rows without it are skipped
    if len(features) < 2:
        return None
    # Get name: leading run of letters and spaces from the second link's
    # text. For multi-word names the first two words are swapped and all
    # words joined with '-', e.g. 'Mega Venusaur' -> 'Venusaur-Mega'.
    name = re.search(r'[a-zA-Z ]*', features[1].text).group(0)
    if ' ' in name:
        name = name.split(' ')
        name[0], name[1] = name[1], name[0]
        name = '-'.join(name)
    # Number: file name of the page the second link points to.
    # Raw strings: '\.' is an invalid escape in a normal string literal.
    num = re.search(r'/.*/(.*)\.shtml', features[1].get('href')).group(1)
    result = [name, num]
    abilities = []
    types = []
    for f in features[2:]:
        li = f.get('href')
        if li is None:
            # Anchor without an href attribute: nothing to classify
            continue
        if 'ability' in li:
            abilities.append(f.text)
        else:
            # Any other link is treated as a type link; the type is the
            # file name of the linked page
            match = re.search(r'.*/(.*)\.shtml', li)
            types.append(match.group(1))
    result.append(types)
    result.append(abilities)
    # Stats are normally the six cells before the last one; if any of those
    # has no plain string, use the last six cells instead
    stats = tag.find_all('td')[-7:-1]
    stats = [i.string for i in stats]
    if None in stats:
        stats = tag.find_all('td')[-6:]
        stats = [i.string for i in stats]
    result.extend(stats)
    return result

def parse_mega(url, timeout=30):
    """Download a form list page and build one data row per listed pokemon.

    Parameters
    ----------
    url : str
        Address of the list page.
    timeout : float, optional
        Seconds ``requests`` waits for the server before giving up
        (default 30), so a stalled connection cannot hang the cell.

    Returns
    -------
    list
        The rows returned by ``find_mega``; table rows for which it
        returns ``None`` are left out.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    r = requests.get(url, timeout=timeout)
    # Stop on 4xx/5xx instead of trying to parse an error page
    r.raise_for_status()
    soup = BeautifulSoup(r.content)
    # All <tr> elements under the parent of the row that holds the 'HP' cell
    megas = soup.find('td', string='HP').find_parent('tr').find_parent().find_all('tr')
    results = []
    # Only every second row, starting with the third, is handed to find_mega
    for row in megas[2::2]:
        data = find_mega(row)
        if data is not None:
            results.append(data)
    return results

In [27]:
# List pages parsed with parse_mega. Kept in its own name so the list of
# base-form pages in `urls` is not overwritten.
form_list_urls = [
    'https://www.serebii.net/xy/megaevolutions.shtml',
    'https://www.serebii.net/omegarubyalphasapphire/megaevolutions.shtml',
    'https://www.serebii.net/omegarubyalphasapphire/primal.shtml',
    'https://www.serebii.net/sunmoon/alolaforms.shtml',
]
for url in form_list_urls:
    pokemon_data.extend(parse_mega(url))

### Create data frame and create csv file

In [28]:
import pandas as pd
# Column labels, in the same order as the fields of every scraped row
headers = ['Name', 'Number', 'Types', 'Abilities']
headers += ['HP', 'Attack', 'Defense', 'Sp. Attack', 'Sp. Defense', 'Speed']
df = pd.DataFrame(data=pokemon_data, columns=headers)

In [29]:
# Preview the first rows of the table as a quick sanity check
df.head()

Unnamed: 0,Name,Number,Types,Abilities,HP,Attack,Defense,Sp. Attack,Sp. Defense,Speed
0,Bulbasaur,1,"[grass, poison]","[Chlorophyll, Overgrow]",45,49,49,65,65,45
1,Ivysaur,2,"[grass, poison]","[Chlorophyll, Overgrow]",60,62,63,80,80,60
2,Venusaur,3,"[grass, poison]","[Chlorophyll, Overgrow]",80,82,83,100,100,80
3,Charmander,4,[fire],"[Solar Power, Blaze]",39,52,43,60,50,65
4,Charmeleon,5,[fire],"[Solar Power, Blaze]",58,64,58,80,65,80


In [30]:
# Write the table to pokemon_data.csv without the DataFrame index column
df.to_csv('pokemon_data.csv', index=False)