This notebook downloads Pokémon sprites from Bulbapedia's list of Pokémon by National Pokédex number: https://bulbapedia.bulbagarden.net/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number

In [1]:
# External libraries
import requests
import bs4

# Builtins
import os
import concurrent.futures
import functools
import zipfile
import pathlib
import urllib.request
import urllib.parse

### Get sprite URLs and prepare for downloading

In [2]:
# Fetch the National Pokédex list page. A timeout keeps the cell from hanging
# forever, and raise_for_status() stops us from parsing an HTTP error page.
source = requests.get(
    "https://bulbapedia.bulbagarden.net/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number",
    timeout=30,
)
source.raise_for_status()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results could differ between machines.
soup = bs4.BeautifulSoup(source.content, "html.parser")

In [3]:
# Make directories (recursively) if they do not exist.
# This must be the same directory the download and zip cells use; the previous
# name ("bulbapedia_pokemon_sprites") did not match, so downloads had nowhere to go.
pathlib.Path("./data/pokemon_sprites_bulbapedia/").mkdir(parents=True, exist_ok=True)

# Get pokemon info from the table.
# Each entry of sprite_urls is a tuple: (id string, URL-quoted name, sprite URL).
sprite_urls = []
visited = set()
for tr in soup("tr"):
    cells = tr("td")
    # Only rows with more than two cells and an image are treated as pokemon rows.
    if len(cells) > 2 and tr.img:
        td = cells[1]
        # The second cell holds the number, e.g. "#001" -> "001".
        id_str = td.text.strip().strip("#")
        attrs = tr.img.attrs

        # Deal with sprites with the same code: append the first free "_<n>" suffix
        # so every id (and therefore every file name) stays unique.
        if id_str in visited:
            offset = 1
            while id_str + f"_{offset}" in visited:
                offset += 1
            id_str += f"_{offset}"

        # Save. The name is percent-quoted with no safe characters so it can be
        # used directly as part of a file name; "src" is prefixed with "http:".
        name = urllib.parse.quote(attrs["alt"], safe="")
        sprite_urls.append((id_str, name, "http:" + attrs["src"]))
        visited.add(id_str)

Example: the first 10 entries of `sprite_urls`
```python
[('001', 'Bulbasaur', 'http://cdn.bulbagarden.net/upload/e/ec/001MS.png'),
 ('002', 'Ivysaur', 'http://cdn.bulbagarden.net/upload/6/6b/002MS.png'),
 ('003', 'Venusaur', 'http://cdn.bulbagarden.net/upload/e/e5/003XYMS.png'),
 ('004', 'Charmander', 'http://cdn.bulbagarden.net/upload/b/bb/004MS.png'),
 ('005', 'Charmeleon', 'http://cdn.bulbagarden.net/upload/d/dc/005MS.png'),
 ('006', 'Charizard', 'http://cdn.bulbagarden.net/upload/6/62/006XYMS.png'),
 ('007', 'Squirtle', 'http://cdn.bulbagarden.net/upload/9/92/007MS.png'),
 ('008', 'Wartortle', 'http://cdn.bulbagarden.net/upload/f/f3/008MS.png'),
 ('009', 'Blastoise', 'http://cdn.bulbagarden.net/upload/5/59/009XYMS.png'),
 ('010', 'Caterpie', 'http://cdn.bulbagarden.net/upload/6/69/010MS.png')]
```

### Download Sprites using 20 threads

In [4]:
# Download images
def download_sprite(sprite_url, overwrite=False, timeout=30):
    """Download one sprite into ./data/pokemon_sprites_bulbapedia/.

    Args:
        sprite_url: A ``(id, name, url)`` tuple; the file is saved as
            ``{id}_{name}.png``.
        overwrite: When False (default), an already existing file is left
            untouched and no request is made.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Other ``requests`` exceptions (e.g. timeouts) propagate as well.
    """
    sprite_id, name, url = sprite_url
    path = pathlib.Path(f"./data/pokemon_sprites_bulbapedia/{sprite_id}_{name}.png")
    if path.exists() and not overwrite:
        return

    # Fetch BEFORE touching the file system: opening the file first would leave
    # an empty file behind on a failed request, and later runs would then skip
    # it as "already downloaded".
    response = requests.get(url, timeout=timeout)
    # Do not save an HTML error page under a .png name.
    response.raise_for_status()

    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_bytes(response.content)

In [5]:
# Multithreading: download every sprite using a pool of worker threads.
no_threads = 20
with concurrent.futures.ThreadPoolExecutor(max_workers=no_threads) as executor:
    partial = functools.partial(download_sprite, overwrite=False)
    # executor.map only re-raises a worker's exception when its result is
    # retrieved from the iterator. Consuming it with list() makes a failed
    # download surface here instead of being silently dropped.
    list(executor.map(partial, sprite_urls))
print("Finish downloading all sprites")

Finish downloading all sprites


### Make a zip file for archiving

In [6]:
# Archive every file in the sprite directory into a sibling .zip file.
path = "./data/pokemon_sprites_bulbapedia/"
# Drop the trailing "/" so the archive sits next to the directory.
archive_path = f"{path[:-1]}.zip"
entries = os.listdir(path)

with zipfile.ZipFile(archive_path, "w") as zip_file:
    for entry in entries:
        # Store each file under its bare name (no directory prefix), deflated.
        zip_file.write(path + entry, entry, compress_type=zipfile.ZIP_DEFLATED)

print(f"Zip {len(entries)} files to {archive_path} successfully")

Zip 954 files to ./data/pokemon_sprites_bulbapedia.zip successfully
