In [1]:
import io
import json
import os
import re
import time
import unicodedata
import urllib

import bs4
import requests
from IPython.display import clear_output

In [2]:
def get_all_pokemon_base_entries():
    """Scrape the National Pokédex list page into a dict keyed by Pokémon name.

    Downloads the list page, parses it with html5lib, and walks the tables at
    positions 1 through 7 inside the ``mw-content-text`` element. The first row
    of every table is skipped (presumably a header row - confirm against the
    live page) and each remaining row is handed to
    ``get_pokemon_name_and_wikilink``.

    Returns:
        dict: ``{name: {"wikilink": url, "pokedex_number": number}}``. A name
        that appears in more than one row keeps the values of its last row.
    """
    list_url = "https://bulbapedia.bulbagarden.net/wiki/List_of_Pokémon_by_National_Pokédex_number"
    html = requests.get(list_url).content
    content_root = bs4.BeautifulSoup(html, "html5lib").find(id="mw-content-text")

    entries = {}
    for table in content_root.find_all("table")[1:8]:
        rows = table.find("tbody").find_all("tr")
        for row in rows[1:]:
            name, dex_number, link = get_pokemon_name_and_wikilink(row)
            entries[name] = {"wikilink": link, "pokedex_number": dex_number}

    return entries


def get_pokemon_name_and_wikilink(tr_tag):
    """Extract name, Pokédex number and detail-page URL from one table row.

    Args:
        tr_tag: a row element exposing ``find_all``; its direct ``td`` children
            are read. Cell 1 must carry the number as text and cell 3 must
            have a direct ``a`` child linking to the detail page.

    Returns:
        tuple: ``(base_name, number, details_url)`` where ``base_name`` is the
        link text, ``number`` is an ``int`` and ``details_url`` is the link's
        ``href`` prefixed with the Bulbapedia host.

    Raises:
        ValueError: if cell 1 contains no digits.
        IndexError: if the row has too few cells or cell 3 has no direct link.
    """
    cols = tr_tag.find_all("td", recursive=False)

    # Pull out the digit run instead of slicing fixed positions: a fixed
    # slice silently drops the last digit whenever the cell text has no
    # trailing character (e.g. "#0025" would parse as 2).
    number_text = cols[1].string.replace(" ", "")
    number_match = re.search(r"\d+", number_text)
    if number_match is None:
        raise ValueError(f"no Pokédex number found in cell text {number_text!r}")
    number = int(number_match.group())

    details_link_tag = cols[3].find_all("a", recursive=False)[0]
    details_url = "https://bulbapedia.bulbagarden.net" + details_link_tag["href"]

    base_name = details_link_tag.string

    return base_name, number, details_url

In [3]:
# Scrape the National Pokédex list once; create_pokemon_file() reads this
# module-level dict (name -> {"wikilink", "pokedex_number"}).
pokemon_list = get_all_pokemon_base_entries()

In [4]:
# Pieces of the MediaWiki API query string; they are joined with "&" when the
# request URL is built inside create_pokemon_file().
baseurl = "http://bulbapedia.bulbagarden.net/w/api.php?"
action = "action=query"
content = "prop=revisions"
rvprop = "rvprop=content"
dataformat = "format=json"

# Directory that receives one raw file per Pokémon.
dir_path = "./pokemon_files_raw"


def create_pokemon_file(output_dir=dir_path):
    """Download the latest wiki revision of every Pokémon page to disk.

    Iterates over the module-level ``pokemon_list`` and, for each entry whose
    file does not exist yet, queries the wiki API for the page
    ``<name>_(Pokémon)`` and writes ``str()`` of the first revision dict to
    ``<output_dir>/[<pokedex_number>]<name>``. Existing files are skipped, so
    an interrupted run can be resumed.

    Args:
        output_dir: target directory. The default is bound when this cell is
            executed, because the name ``dir_path`` is rebound further down
            the notebook; looking it up at call time would send Pokémon files
            to the wrong directory when this function is re-run later.

    Raises:
        KeyError: if the API response has no ``query``/``pages``/``revisions``
            entry for a page (for example when the page does not exist).
    """
    # The target directory does not change during the loop, so make sure it
    # exists once up front.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print('created', output_dir)

    total = len(pokemon_list)
    for name, info in pokemon_list.items():
        pokedex_number = info["pokedex_number"]
        file_path = f"{output_dir}/[{pokedex_number}]{name}"
        if os.path.isfile(file_path):
            continue

        title = "titles=" + name + "_(Pok%C3%A9mon)"
        query = "{}{}&{}&{}&{}&{}".format(baseurl, action, title, content, rvprop, dataformat)
        print(query)

        response_text = requests.get(query).content.decode("utf-8")
        pages = json.loads(response_text)["query"]["pages"]

        # "pages" is keyed by page id; only one title was requested, so take
        # the first (only) key.
        page_id = next(iter(pages))
        revision = pages[page_id]["revisions"][0]

        # The context manager closes the file; no explicit close() is needed.
        with io.open(file_path, 'w', encoding='utf-8') as f:
            f.write(str(revision))

        clear_output(wait=True)
        print(f"{int(pokedex_number)} pokemon files created. done. {total - int(pokedex_number)} left")
        


In [5]:
# Download the raw wiki revision for every entry in pokemon_list; files that
# already exist on disk are skipped, so this cell can be re-run to resume.
create_pokemon_file()

In [6]:
def _first_link_title(cols, index):
    """Return the ``title`` of the first direct ``<a>`` child of ``cols[index]``.

    Returns ``None`` when the cell does not exist, has no direct link, or the
    link carries no ``title`` attribute.
    """
    try:
        return cols[index].find_all("a", recursive=False)[0]["title"]
    except (IndexError, KeyError):
        return None


def get_all_episodes_entries():
    """Scrape the anime episode list page into a dict keyed by episode tag.

    Downloads the list page, parses it with html5lib, and walks every table
    inside ``mw-content-text`` except the last five. Table ``i`` is labelled
    with ``season_names[i]``. The first row of each table is skipped. A row is
    ignored when neither its first nor its second cell has a direct link with
    a ``title`` attribute.

    Returns:
        dict: ``{episode_tag: {"english_title": str, "season": str}}`` where
        ``english_title`` is ``"No english title"`` when the third cell has no
        direct ``<i>`` child.

    Raises:
        IndexError: if the page yields more season tables than there are
            entries in ``season_names``.
    """
    base_page_url = "https://bulbapedia.bulbagarden.net/wiki/List_of_anime_episodes"
    base_page_content = requests.get(base_page_url).content

    root = bs4.BeautifulSoup(base_page_content, "html5lib").find(id="mw-content-text")
    episode_entries = {}
    all_seasons_table_tags = root.find_all("table")[:-5]

    # Names are stored verbatim in every entry, so they must not carry stray
    # whitespace.
    season_names = ["Indigo League arc", "Orange League arc", "Johto League arc", "Hoenn League arc",
                    "Kanto Battle Frontier arc", "Diamond & Pearl", "Best Wishes!", "Best Wishes! Season 2",
                    "Best Wishes! Season 2: Episode N", "Best Wishes! Season 2: Decolora Adventure!",
                    "XY", "XY&Z", "Sun & Moon"]

    for i, season_table in enumerate(all_seasons_table_tags):
        season = season_names[i]
        season_tr_tags = season_table.find("tbody").find_all("tr")[1:]

        for tr_tag in season_tr_tags:
            cols = tr_tag.find_all("td", recursive=False)

            # column 0 contains the info about the link and the episode number
            # needed for query; fall back to column 1 when column 0 has none
            episode_tag = _first_link_title(cols, 0)
            if episode_tag is None:
                episode_tag = _first_link_title(cols, 1)
            if episode_tag is None:
                continue

            # column 2 contains the english title
            try:
                english_title = cols[2].find_all("i", recursive=False)[0].text
            except IndexError:
                english_title = "No english title"

            episode_entries[episode_tag] = {"english_title": english_title, "season": season}

        print(f"All episodes from season: {season} fetched.")
    return episode_entries


In [7]:
# Scrape the anime episode list once; create_episode_files() reads this
# module-level dict (episode tag -> {"english_title", "season"}).
episodes_list = get_all_episodes_entries()

All episodes from season: Indigo League arc fetched.
All episodes from season: Orange League arc fetched.
All episodes from season: Johto League arc fetched.
All episodes from season: Hoenn League arc fetched.
All episodes from season: Kanto Battle Frontier arc fetched.
All episodes from season: Diamond & Pearl fetched.
All episodes from season: Best Wishes! fetched.
All episodes from season: Best Wishes! Season 2 fetched.
All episodes from season: Best Wishes! Season 2: Episode N fetched.
All episodes from season: 	Best Wishes! Season 2: Decolora Adventure! fetched.
All episodes from season: XY fetched.
All episodes from season: XY&Z fetched.
All episodes from season: Sun & Moon fetched.


In [10]:
# Pieces of the MediaWiki API query string; they are joined with "&" when the
# request URL is built inside create_episode_files().
baseurl = "http://bulbapedia.bulbagarden.net/w/api.php?"
action = "action=query"
content = "prop=revisions"
rvprop = "rvprop=content"
dataformat = "format=json"

# Directory that receives one raw file per episode.
dir_path = "./episodes_files_raw"


def create_episode_files(output_dir=dir_path):
    """Download the latest wiki revision of every episode page to disk.

    Iterates over the module-level ``episodes_list`` and, for each tag whose
    file does not exist yet, queries the wiki API for that page and writes
    ``str()`` of the first revision dict to ``<output_dir>/[<n>]<tag>``, where
    ``n`` is the 1-based position of the tag in ``episodes_list``. Existing
    files are skipped, so an interrupted run can be resumed.

    An episode whose download, parsing or writing fails is counted as skipped
    and the loop moves on to the next one.

    Args:
        output_dir: target directory. The default is bound when this cell is
            executed, so rebinding the shared name ``dir_path`` in another
            cell cannot redirect the output of this function.
    """
    # The target directory does not change during the loop, so make sure it
    # exists once up front.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print('created', output_dir)

    total = len(episodes_list)
    skipped_counter = 0
    # The position is counted for every tag, including ones skipped below, so
    # the number in a file name does not depend on what is already on disk.
    for counter, tag in enumerate(episodes_list, start=1):
        file_path = f"{output_dir}/[{counter}]{tag}"
        if os.path.isfile(file_path):
            continue

        title = "titles=" + tag
        query = "{}{}&{}&{}&{}&{}".format(baseurl, action, title, content, rvprop, dataformat)
        print(query)
        try:
            response_text = requests.get(query).content.decode("utf-8")
            pages = json.loads(response_text)["query"]["pages"]

            # "pages" is keyed by page id; only one title was requested, so
            # take the first (only) key.
            page_id = next(iter(pages))
            revision = pages[page_id]["revisions"][0]

            # The context manager closes the file; no explicit close() needed.
            with io.open(file_path, 'w', encoding='utf-8') as f:
                f.write(str(revision))
            clear_output(wait=True)
            print(f"{counter} episode file created. {total - counter} left. Skipped {skipped_counter} unaired episodes.")
        except Exception:
            # Best-effort: any failure for one episode is counted and the run
            # continues. Catching Exception rather than everything lets
            # KeyboardInterrupt stop a long download.
            skipped_counter += 1
            continue


In [11]:
# Download the raw wiki revision for every entry in episodes_list; files that
# already exist on disk are skipped, so this cell can be re-run to resume.
create_episode_files()

1100 episode file created. 0 left. Skipped 3 unaired episodes.
